source: lib/format_dpdk.h @ c7e547e

Branches: cachetimestamps, develop, dpdk-ndag, etsilive, rc-4.0.3, rc-4.0.4, ringdecrementfix, ringperformance
Last change on this file since c7e547e was c7e547e, checked in by Shane Alcock <salcock@…>, 3 years ago

Added a dpdkndag format for faster ndag reading

Instead of joining a multicast group and receiving nDAG packets
via the networking stack, this new format uses DPDK to sniff
the multicast direct from the wire. This should save some effort
shuffling the packets back through the kernel's networking stack.

  • Property mode set to 100644
File size: 9.5 KB
Line 
1#ifndef LIBTRACE_FORMAT_DPDK_H_
2#define LIBTRACE_FORMAT_DPDK_H_
3
4#include <libtrace.h>
5#include "libtrace_int.h"
6
/* We can deal with any minor differences by checking the RTE VERSION.
 * Typically DPDK backports some fixes (typically for building against
 * newer kernels) to the older version of DPDK.
 *
 * These get released with the rX suffix. The following macros were added
 * in these new releases.
 *
 * Below this is a log of versions that required changes to the libtrace
 * code (that we still attempt to support).
 *
 * DPDK 16.04 or newer is recommended.
 * However 1.6 and newer are still likely supported.
 */
20#include <rte_eal.h>
21#include <rte_version.h>
/* Fallback definitions, used only when rte_version.h has not already
 * supplied the macro in question. */

/* Packs a four-part version number into a single integer, one byte per
 * part, so that two versions can be compared with the relational operators. */
#if !defined(RTE_VERSION_NUM)
#  define RTE_VERSION_NUM(a, b, c, d) \
        (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
#endif

/* Treat a missing patch release number as 0. */
#if !defined(RTE_VER_PATCH_RELEASE)
#  define RTE_VER_PATCH_RELEASE 0
#endif

/* The packed version of the DPDK release being built against. */
#if !defined(RTE_VERSION)
#  define RTE_VERSION \
        RTE_VERSION_NUM(RTE_VER_MAJOR, RTE_VER_MINOR, \
                        RTE_VER_PATCH_LEVEL, RTE_VER_PATCH_RELEASE)
#endif
32
33/* 1.6.0r2 :
34 *      rte_eal_pci_set_blacklist() is removed
35 *      device_list is renamed to pci_device_list
36 *      In the 1.7.0 release rte_eal_pci_probe is called by rte_eal_init
37 *      as such we do apply the whitelist before rte_eal_init.
38 *      This also works correctly with DPDK 1.6.0r2.
39 *
40 * Replaced by:
41 *      rte_devargs (we can simply whitelist)
42 */
43#if RTE_VERSION <= RTE_VERSION_NUM(1, 6, 0, 1)
44#       define DPDK_USE_BLACKLIST 1
45#else
46#       define DPDK_USE_BLACKLIST 0
47#endif
48
49/*
50 * 1.7.0 :
51 *      rte_pmd_init_all is removed
52 *
53 * Replaced by:
54 *      Nothing, no longer needed
55 */
56#if RTE_VERSION < RTE_VERSION_NUM(1, 7, 0, 0)
57#       define DPDK_USE_PMD_INIT 1
58#else
59#       define DPDK_USE_PMD_INIT 0
60#endif
61
62/* 1.7.0-rc3 :
63 *
64 * Since 1.7.0-rc3 rte_eal_pci_probe is called as part of rte_eal_init.
65 * Somewhere between 1.7 and 1.8 calling it twice broke so we should not call
66 * it twice.
67 */
68#if RTE_VERSION < RTE_VERSION_NUM(1, 7, 0, 3)
69#       define DPDK_USE_PCI_PROBE 1
70#else
71#       define DPDK_USE_PCI_PROBE 0
72#endif
73
74/* 1.8.0-rc1 :
75 * LOG LEVEL is a command line option which overrides what
76 * we previously set it to.
77 */
78#if RTE_VERSION >= RTE_VERSION_NUM(1, 8, 0, 1)
79#       define DPDK_USE_LOG_LEVEL 1
80#else
81#       define DPDK_USE_LOG_LEVEL 0
82#endif
83
84/* 1.8.0-rc2
85 * rx/tx_conf thresholds can be set to NULL in rte_eth_rx/tx_queue_setup
86 * this uses the default values, which are better tuned per device
87 * See issue #26
88 */
89#if RTE_VERSION >= RTE_VERSION_NUM(1, 8, 0, 2)
90#       define DPDK_USE_NULL_QUEUE_CONFIG 1
91#else
92#       define DPDK_USE_NULL_QUEUE_CONFIG 0
93#endif
94
95/* 2.0.0-rc1
96 * Unifies RSS hash between cards
97 */
98#if RTE_VERSION >= RTE_VERSION_NUM(2, 0, 0, 1)
99#       define RX_RSS_FLAGS (ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP | \
100                             ETH_RSS_SCTP)
101#else
102#       define RX_RSS_FLAGS (ETH_RSS_IPV4_UDP | ETH_RSS_IPV6 | ETH_RSS_IPV4 | \
103                             ETH_RSS_IPV4_TCP | ETH_RSS_IPV6_TCP |\
104                             ETH_RSS_IPV6_UDP)
105#endif
106
107/* v16.07-rc1 - deprecated
108 * rte_mempool_avail_count to replace rte_mempool_count
109 * rte_mempool_in_use_count to replace rte_mempool_free_count
110 */
111#if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1)
112#define rte_mempool_avail_count rte_mempool_count
113#define rte_mempool_in_use_count rte_mempool_free_count
114#endif
115
116#include <rte_per_lcore.h>
117#include <rte_debug.h>
118#include <rte_errno.h>
119#include <rte_common.h>
120#include <rte_log.h>
121#include <rte_memcpy.h>
122#include <rte_prefetch.h>
123#include <rte_branch_prediction.h>
124#include <rte_pci.h>
125#include <rte_ether.h>
126#include <rte_ethdev.h>
127#include <rte_ring.h>
128#include <rte_mempool.h>
129#include <rte_mbuf.h>
130#include <rte_launch.h>
131#include <rte_lcore.h>
132#include <rte_per_lcore.h>
133#include <rte_cycles.h>
134#include <pthread.h>
135#ifdef __FreeBSD__
136#include <pthread_np.h>
137#endif
138
139
/* 16.04-rc3 ETH_LINK_SPEED_X are replaced with ETH_SPEED_NUM_X.
 * ETH_LINK_SPEED_ are reused as flags, ugly.
 * We use the new way in this code; when the new names are missing they
 * are mapped onto the old ETH_LINK_SPEED_X values.
 */
#if !defined(ETH_SPEED_NUM_1G)
#  define ETH_SPEED_NUM_1G  ETH_LINK_SPEED_1000
#  define ETH_SPEED_NUM_10G ETH_LINK_SPEED_10G
#  define ETH_SPEED_NUM_20G ETH_LINK_SPEED_20G
#  define ETH_SPEED_NUM_40G ETH_LINK_SPEED_40G
#endif
150
/* The default size of memory buffers to use: the max size of a standard
 * ethernet packet less the size of the MAC CHECKSUM, rounded up to the
 * next power of 2, plus the RTE_PKTMBUF_HEADROOM.
 */
#define RX_MBUF_SIZE (2048 + RTE_PKTMBUF_HEADROOM)

/* The minimum number of memory buffers per queue, tx or rx.
 * The memory pool is used with 128 per thread buffers and so needs at
 * least 128*1.5 = 192 buffers. Our code allocates 128*2 to be safe.
 */
#define MIN_NB_BUF 128

/* Number of receive memory buffers to use.
 * By default the driver limits this to 4k and it must be a multiple of 128.
 * A modification can be made to the driver to remove this limit; the value
 * can then be increased in the driver and here.
 * Should be at least MIN_NB_BUF.
 * We choose 2K rather than 4K because it enables the usage of sse vector
 * drivers which are significantly faster than using the larger buffer.
 */
#define NB_RX_MBUF (4096/2)

/* Number of send memory buffers to use.
 * Same limits apply as those to NB_RX_MBUF.
 */
#define NB_TX_MBUF 1024

/* The size of the PCI blacklist. It needs to be big enough to contain
 * every PCI device address (every bus:device.function tuple listed by lspci).
 */
#define BLACK_LIST_SIZE 50

/* The maximum number of characters the mempool name can be */
#define MEMPOOL_NAME_LEN 20

/* Single threaded libtrace reads packets as a batch/burst;
 * this is the maximum size of said burst */
#define BURST_SIZE 32
188
189
/* ~~~~~~~~~~~~~~~~~~~~~~ Advanced settings ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * THESE MAY REQUIRE MODIFICATIONS TO INTEL DPDK
 *
 * Make sure you understand what these are doing before enabling them.
 * They might make traces incompatible with other builds etc.
 *
 * These are also included to show how to do some things which aren't
 * obvious in the DPDK documentation.
 */
199
/* Set to 1 to print verbose messages to stderr */
#define DEBUG 0

/* Use clock_gettime() for nanosecond resolution rather than gettimeofday().
 * Only turn on if you know clock_gettime is a vsyscall on your system,
 * otherwise it could be a large overhead. Again gettimeofday() should be
 * a vsyscall also; if it's not you should seriously consider updating your
 * kernel.
 */
#ifndef HAVE_CLOCK_GETTIME
/* DON'T CHANGE THIS !!! */
#  define USE_CLOCK_GETTIME 0
#else
/* You can turn this on (set to 1) to prefer clock_gettime */
#  define USE_CLOCK_GETTIME 1
#endif

/* This is fairly safe to turn on - currently there appears to be a 'bug'
 * in DPDK that will remove the checksum by making the packet appear 4bytes
 * smaller than what it really is. Most formats don't include the checksum,
 * hence writing out a port such as int: ring: and dpdk: assumes there
 * is no checksum and will attempt to write the checksum as part of the
 * packet
 */
#define GET_MAC_CRC_CHECKSUM 0

/* This requires a modification of the pmd drivers (inside Intel DPDK)
 * TODO this requires updating (packet sizes are wrong TS most likely also)
 */
#define HAS_HW_TIMESTAMPS_82580 0
230
#if HAS_HW_TIMESTAMPS_82580
/* Width in bits of the 82580 hardware timestamp counter */
#  define TS_NBITS_82580 40
/* The maximum on the +ve or -ve side that we can be, make it half way:
 * half of the 2^TS_NBITS_82580 range, computed with an integer shift rather
 * than going via double. */
#  define MAXSKEW_82580 ((uint64_t)1 << (TS_NBITS_82580 - 1))
/* Non-zero when v1 and v2 differ by strictly less than var.
 * The difference is always taken as larger minus smaller, so unsigned
 * operands close to 0 or to the type's maximum cannot wrap around (the
 * previous (v1)-(var) / (v1)+(var) form did). Arguments are evaluated more
 * than once, so they must be free of side effects. */
#  define WITHIN_VARIANCE(v1, v2, var) \
        ((((v1) > (v2)) ? ((v1) - (v2)) : ((v2) - (v1))) < (var))
#endif
237
/* Timestamp layout as per the Intel 82580 specification.
 * There is a mismatch in the 82580 datasheet: it states the timestamp is
 * stored in Big Endian, however it is actually Little Endian. */
struct hw_timestamp_82580 {
        uint64_t reserved;
        /* Little Endian; only the lower 40 bits are valid */
        uint64_t timestamp;
};
244
/* Run states, numbered from 0 in declaration order.
 * NOTE(review): presumably tracks whether the DPDK input has been started
 * and/or paused - confirm against the code that uses it. */
enum paused_state {
        DPDK_NEVER_STARTED = 0,
        DPDK_RUNNING = 1,
        DPDK_PAUSED = 2
};
250
/* Per stream state, aligned to CACHE_LINE_SIZE. */
struct dpdk_per_stream_t {
        uint16_t queue_id;
        /* System timestamp of our most recent packet in nanoseconds */
        uint64_t ts_last_sys;
        struct rte_mempool *mempool;
        int lcore;
#if HAS_HW_TIMESTAMPS_82580
        /* Timestamping only relevant to RX */

        /* System timestamp of the first packet in nanoseconds */
        uint64_t ts_first_sys;
        /* Number of times the NIC clock has wrapped around completely */
        uint32_t wrap_count;
#endif
} ALIGN_STRUCT(CACHE_LINE_SIZE);

typedef struct dpdk_per_stream_t dpdk_per_stream_t;

/* Initialiser for a stream that has nothing assigned yet: queue_id and
 * lcore are -1 (queue_id is unsigned, so it holds the converted value),
 * mempool is NULL and every timestamp/counter is 0. */
#if HAS_HW_TIMESTAMPS_82580
#  define DPDK_EMPTY_STREAM { \
        .queue_id = -1, \
        .ts_last_sys = 0, \
        .mempool = NULL, \
        .lcore = -1, \
        .ts_first_sys = 0, \
        .wrap_count = 0 }
#else
#  define DPDK_EMPTY_STREAM { \
        .queue_id = -1, \
        .ts_last_sys = 0, \
        .mempool = NULL, \
        .lcore = -1 }
#endif
271
272
273libtrace_eventobj_t dpdk_trace_event(libtrace_t *trace,
274                libtrace_packet_t *packet);
275int dpdk_pstart_input (libtrace_t *libtrace);
276int dpdk_start_input (libtrace_t *libtrace);
277int dpdk_config_input (libtrace_t *libtrace,
278                trace_option_t option, void *data);
279int dpdk_init_input (libtrace_t *libtrace);
280int dpdk_pause_input(libtrace_t * libtrace);
281int dpdk_fin_input(libtrace_t * libtrace);
282int dpdk_read_packet (libtrace_t *libtrace, libtrace_packet_t *packet);
283int dpdk_pregister_thread(libtrace_t *libtrace, libtrace_thread_t *t,
284                bool reading);
285void dpdk_punregister_thread(libtrace_t *libtrace, libtrace_thread_t *t);
286int dpdk_read_packet_stream (libtrace_t *libtrace,
287                dpdk_per_stream_t *stream,
288                libtrace_message_queue_t *mesg,
289                struct rte_mbuf* pkts_burst[],
290                size_t nb_packets);
291int dpdk_prepare_packet(libtrace_t *libtrace,
292                libtrace_packet_t *packet, void *buffer,
293                libtrace_rt_types_t rt_type, uint32_t flags);
294#endif
Note: See TracBrowser for help on using the repository browser.