1 | #ifndef LIBTRACE_FORMAT_DPDK_H_ |
---|
2 | #define LIBTRACE_FORMAT_DPDK_H_ |
---|
3 | |
---|
4 | #include <libtrace.h> |
---|
5 | #include "libtrace_int.h" |
---|
6 | |
---|
7 | /* We can deal with any minor differences by checking the RTE VERSION |
---|
8 | * Typically DPDK backports some fixes (typically for building against |
---|
9 | * newer kernels) to the older version of DPDK. |
---|
10 | * |
---|
11 | * These get released with the rX suffix. The following macros were added |
---|
12 | * in these new releases. |
---|
13 | * |
---|
14 | * Below this is a log of versions that required changes to the libtrace |
---|
15 | * code (that we still attempt to support). |
---|
16 | * |
---|
17 | * DPDK 16.04 or newer is recommended. |
---|
18 | * However 1.6 and newer are still likely supported. |
---|
19 | */ |
---|
20 | #include <rte_eal.h> |
---|
21 | #include <rte_version.h> |
---|
22 | #ifndef RTE_VERSION_NUM /* only defined here if <rte_version.h> did not already provide it */ |
---|
23 | # define RTE_VERSION_NUM(a,b,c,d) ((a) << 24 | (b) << 16 | (c) << 8 | (d)) /* packs a.b.c.d into one integer, fields 8 bits apart, a most significant */ |
---|
24 | #endif |
---|
25 | #ifndef RTE_VER_PATCH_RELEASE /* a missing patch-release component is treated as 0 */ |
---|
26 | # define RTE_VER_PATCH_RELEASE 0 |
---|
27 | #endif |
---|
28 | #ifndef RTE_VERSION /* composed from the RTE_VER_* components when not already defined */ |
---|
29 | # define RTE_VERSION RTE_VERSION_NUM(RTE_VER_MAJOR,RTE_VER_MINOR, \ |
---|
30 | RTE_VER_PATCH_LEVEL, RTE_VER_PATCH_RELEASE) |
---|
31 | #endif |
---|
32 | |
---|
33 | /* 1.6.0r2 : |
---|
34 | * rte_eal_pci_set_blacklist() is removed |
---|
35 | * device_list is renamed to pci_device_list |
---|
36 | * In the 1.7.0 release rte_eal_pci_probe is called by rte_eal_init |
---|
37 | * as such we do apply the whitelist before rte_eal_init. |
---|
38 | * This also works correctly with DPDK 1.6.0r2. |
---|
39 | * |
---|
40 | * Replaced by: |
---|
41 | * rte_devargs (we can simply whitelist) |
---|
42 | */ |
---|
43 | #if RTE_VERSION <= RTE_VERSION_NUM(1, 6, 0, 1) /* 1.6.0r1 and older */ |
---|
44 | # define DPDK_USE_BLACKLIST 1 |
---|
45 | #else /* 1.6.0r2 and newer */ |
---|
46 | # define DPDK_USE_BLACKLIST 0 |
---|
47 | #endif |
---|
48 | |
---|
49 | /* |
---|
50 | * 1.7.0 : |
---|
51 | * rte_pmd_init_all is removed |
---|
52 | * |
---|
53 | * Replaced by: |
---|
54 | * Nothing, no longer needed |
---|
55 | */ |
---|
56 | #if RTE_VERSION < RTE_VERSION_NUM(1, 7, 0, 0) /* before 1.7.0, i.e. before rte_pmd_init_all was removed */ |
---|
57 | # define DPDK_USE_PMD_INIT 1 |
---|
58 | #else /* 1.7.0 and newer */ |
---|
59 | # define DPDK_USE_PMD_INIT 0 |
---|
60 | #endif |
---|
61 | |
---|
62 | /* 1.7.0-rc3 : |
---|
63 | * |
---|
64 | * Since 1.7.0-rc3 rte_eal_pci_probe is called as part of rte_eal_init. |
---|
65 | * Somewhere between 1.7 and 1.8 calling it twice broke so we should not call |
---|
66 | * it twice. |
---|
67 | */ |
---|
68 | #if RTE_VERSION < RTE_VERSION_NUM(1, 7, 0, 3) /* before 1.7.0-rc3, where rte_eal_init does not yet call rte_eal_pci_probe */ |
---|
69 | # define DPDK_USE_PCI_PROBE 1 |
---|
70 | #else /* 1.7.0-rc3 and newer: the probe must not be repeated */ |
---|
71 | # define DPDK_USE_PCI_PROBE 0 |
---|
72 | #endif |
---|
73 | |
---|
74 | /* 1.8.0-rc1 : |
---|
75 | * LOG LEVEL is a command line option which overrides what |
---|
76 | * we previously set it to. |
---|
77 | */ |
---|
78 | #if RTE_VERSION >= RTE_VERSION_NUM(1, 8, 0, 1) /* 1.8.0-rc1 and newer */ |
---|
79 | # define DPDK_USE_LOG_LEVEL 1 |
---|
80 | #else /* older than 1.8.0-rc1 */ |
---|
81 | # define DPDK_USE_LOG_LEVEL 0 |
---|
82 | #endif |
---|
83 | |
---|
84 | /* 1.8.0-rc2 |
---|
85 | * rx/tx_conf thresholds can be set to NULL in rte_eth_rx/tx_queue_setup |
---|
86 | * this uses the default values, which are better tuned per device |
---|
87 | * See issue #26 |
---|
88 | */ |
---|
89 | #if RTE_VERSION >= RTE_VERSION_NUM(1, 8, 0, 2) /* 1.8.0-rc2 and newer: NULL rx/tx_conf is accepted */ |
---|
90 | # define DPDK_USE_NULL_QUEUE_CONFIG 1 |
---|
91 | #else /* older than 1.8.0-rc2 */ |
---|
92 | # define DPDK_USE_NULL_QUEUE_CONFIG 0 |
---|
93 | #endif |
---|
94 | |
---|
95 | /* 2.0.0-rc1 |
---|
96 | * Unifies RSS hash between cards |
---|
97 | */ |
---|
98 | #if RTE_VERSION >= RTE_VERSION_NUM(2, 0, 0, 1) /* 2.0.0-rc1 and newer: unified flag names; this branch also lists SCTP */ |
---|
99 | # define RX_RSS_FLAGS (ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP | \ |
---|
100 | ETH_RSS_SCTP) |
---|
101 | #else /* older releases: separate IPv4/IPv6 flag names; no SCTP flag is listed here */ |
---|
102 | # define RX_RSS_FLAGS (ETH_RSS_IPV4_UDP | ETH_RSS_IPV6 | ETH_RSS_IPV4 | \ |
---|
103 | ETH_RSS_IPV4_TCP | ETH_RSS_IPV6_TCP |\ |
---|
104 | ETH_RSS_IPV6_UDP) |
---|
105 | #endif |
---|
106 | |
---|
107 | /* v16.07-rc1 - deprecated |
---|
108 | * rte_mempool_avail_count to replace rte_mempool_count |
---|
109 | * rte_mempool_in_use_count to replace rte_mempool_free_count |
---|
110 | */ |
---|
111 | #if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1) /* before 16.07-rc1: map the new names onto the old functions */ |
---|
112 | #define rte_mempool_avail_count rte_mempool_count |
---|
113 | #define rte_mempool_in_use_count rte_mempool_free_count |
---|
114 | #endif /* from 16.07-rc1 on, the new names are expected to come from the DPDK headers */ |
---|
115 | |
---|
116 | /* 17.05-rc1 deprecated, 17.08 removed |
---|
117 | * rte_set_log_level -> rte_log_set_global_level |
---|
118 | */ |
---|
119 | #if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 1) /* before 17.05-rc1: map the new name onto the old function */ |
---|
120 | #define rte_log_set_global_level rte_set_log_level |
---|
121 | #endif |
---|
122 | |
---|
123 | /* 17.11-rc1 increases port size from 8 to 16bits |
---|
124 | */ |
---|
125 | #if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 1) /* 17.11-rc1 and newer */ |
---|
126 | typedef uint16_t portid_t; /* 16-bit port identifier */ |
---|
127 | #else /* older than 17.11-rc1 */ |
---|
128 | typedef uint8_t portid_t; /* 8-bit port identifier */ |
---|
129 | #endif |
---|
130 | |
---|
131 | |
---|
132 | #include <rte_per_lcore.h> |
---|
133 | #include <rte_debug.h> |
---|
134 | #include <rte_errno.h> |
---|
135 | #include <rte_common.h> |
---|
136 | #include <rte_log.h> |
---|
137 | #include <rte_memcpy.h> |
---|
138 | #include <rte_prefetch.h> |
---|
139 | #include <rte_branch_prediction.h> |
---|
140 | #include <rte_pci.h> |
---|
141 | #include <rte_ether.h> |
---|
142 | #include <rte_ethdev.h> |
---|
143 | #include <rte_ring.h> |
---|
144 | #include <rte_mempool.h> |
---|
145 | #include <rte_mbuf.h> |
---|
146 | #include <rte_launch.h> |
---|
147 | #include <rte_lcore.h> |
---|
148 | #include <rte_per_lcore.h> |
---|
149 | #include <rte_cycles.h> |
---|
150 | #include <pthread.h> |
---|
151 | #ifdef __FreeBSD__ |
---|
152 | #include <pthread_np.h> |
---|
153 | #endif |
---|
154 | |
---|
155 | |
---|
156 | /* 16.04-rc3 ETH_LINK_SPEED_X are replaced with ETH_SPEED_NUM_X. |
---|
157 | * ETH_LINK_SPEED_ are reused as flags, ugly. |
---|
158 | * We use the new way in this code. |
---|
159 | */ |
---|
160 | #ifndef ETH_SPEED_NUM_1G /* new names absent: alias them to the older ETH_LINK_SPEED_* constants */ |
---|
161 | #define ETH_SPEED_NUM_1G ETH_LINK_SPEED_1000 |
---|
162 | #define ETH_SPEED_NUM_10G ETH_LINK_SPEED_10G |
---|
163 | #define ETH_SPEED_NUM_20G ETH_LINK_SPEED_20G |
---|
164 | #define ETH_SPEED_NUM_40G ETH_LINK_SPEED_40G |
---|
165 | #endif /* only the 1G, 10G, 20G and 40G speeds are aliased here */ |
---|
166 | |
---|
167 | /* The default size of memory buffers to use - This is the max size of a standard |
---|
168 | * ethernet packet less the size of the MAC CHECKSUM, rounded up to the |
---|
169 | * next power of 2, plus the RTE_PKTMBUF_HEADROOM. */ |
---|
170 | #define RX_MBUF_SIZE (2048 + RTE_PKTMBUF_HEADROOM) |
---|
171 | |
---|
172 | /* The minimum number of memory buffers per queue tx or rx. Based on |
---|
173 | * the requirement of the memory pool with 128 per thread buffers, needing |
---|
174 | * at least 128*1.5 = 192 buffers. Our code allocates 128*2 to be safe. |
---|
175 | */ |
---|
176 | #define MIN_NB_BUF 128 |
---|
177 | |
---|
178 | /* Number of receive memory buffers to use |
---|
179 | * By default this is limited by driver to 4k and must be a multiple of 128. |
---|
180 | * A modification can be made to the driver to remove this limit. |
---|
181 | * This can be increased in the driver and here. |
---|
182 | * Should be at least MIN_NB_BUF. |
---|
183 | * We choose 2K rather than 4K because it enables the usage of sse vector |
---|
184 | * drivers which are significantly faster than using the larger buffer. |
---|
185 | */ |
---|
186 | #define NB_RX_MBUF (4096/2) /* = 2048 */ |
---|
187 | |
---|
188 | /* Number of send memory buffers to use. |
---|
189 | * Same limits apply as those to NB_RX_MBUF. |
---|
190 | */ |
---|
191 | #define NB_TX_MBUF 1024 |
---|
192 | |
---|
193 | /* The size of the PCI blacklist needs to be big enough to contain |
---|
194 | * every PCI device address (listed by lspci every bus:device.function tuple). |
---|
195 | */ |
---|
196 | #define BLACK_LIST_SIZE 50 |
---|
197 | |
---|
198 | /* The maximum number of characters the mempool name can be |
---|
199 | * (whether this includes the terminating NUL is not visible here - TODO confirm) */ #define MEMPOOL_NAME_LEN 20 |
---|
200 | |
---|
201 | /* For single threaded libtrace we read packets as a batch/burst |
---|
202 | * this is the maximum size of said burst */ |
---|
203 | #define BURST_SIZE 32 |
---|
204 | |
---|
205 | |
---|
206 | /* ~~~~~~~~~~~~~~~~~~~~~~ Advanced settings ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
---|
207 | * THESE MAY REQUIRE MODIFICATIONS TO INTEL DPDK |
---|
208 | * |
---|
209 | * Make sure you understand what these are doing before enabling them. |
---|
210 | * They might make traces incompatible with other builds etc. |
---|
211 | * |
---|
212 | * These are also included to show how to do some things which aren't |
---|
213 | * obvious in the DPDK documentation. |
---|
214 | */ |
---|
215 | |
---|
216 | /* Print verbose messages to stderr (set to 0 here, i.e. switched off) */ |
---|
217 | #define DEBUG 0 |
---|
218 | |
---|
219 | /* Use clock_gettime() for nanosecond resolution rather than gettimeofday() |
---|
220 | * only turn on if you know clock_gettime is a vsyscall on your system |
---|
221 | * otherwise could be a large overhead. Again gettimeofday() should be |
---|
222 | * vsyscall also if it's not you should seriously consider updating your |
---|
223 | * kernel. |
---|
224 | */ |
---|
225 | #ifdef HAVE_CLOCK_GETTIME |
---|
226 | /* Set to 1 to prefer clock_gettime (this is the current setting), or to 0 to turn it off */ |
---|
227 | #define USE_CLOCK_GETTIME 1 |
---|
228 | #else |
---|
229 | /* DON'T CHANGE THIS !!! HAVE_CLOCK_GETTIME is not defined in this branch */ |
---|
230 | #define USE_CLOCK_GETTIME 0 |
---|
231 | #endif |
---|
232 | |
---|
233 | /* This is fairly safe to turn on - currently there appears to be a 'bug' |
---|
234 | * in DPDK that will remove the checksum by making the packet appear 4 bytes |
---|
235 | * smaller than what it really is. Most formats don't include the checksum |
---|
236 | * hence writing out to a port such as int: ring: and dpdk: assumes there |
---|
237 | * is no checksum and will attempt to write the checksum as part of the |
---|
238 | * packet. |
---|
239 | */ |
---|
240 | #define GET_MAC_CRC_CHECKSUM 0 /* 0 = not enabled */ |
---|
241 | |
---|
242 | /* This requires a modification of the pmd drivers (inside Intel DPDK) |
---|
243 | * TODO this requires updating (packet sizes are wrong, TS most likely also) |
---|
244 | */ |
---|
245 | #define HAS_HW_TIMESTAMPS_82580 0 |
---|
246 | /* The macros below exist only when HAS_HW_TIMESTAMPS_82580 is non-zero */ |
---|
247 | #if HAS_HW_TIMESTAMPS_82580 |
---|
248 | # define TS_NBITS_82580 40 /* number of timestamp bits; used as the shift count in MAXSKEW_82580 */ |
---|
249 | /* The maximum on the +ve or -ve side that we can be, make it half way, |
---|
 * i.e. half of 2^TS_NBITS_82580, computed via double and cast back to uint64_t */ |
---|
250 | # define MAXSKEW_82580 ((uint64_t) (.5 * (double)(1ull<<TS_NBITS_82580))) |
---|
251 | #define WITHIN_VARIANCE(v1,v2,var) (((v1) - (var) < (v2)) && ((v1) + (var) > (v2))) /* true when v1-var < v2 < v1+var; arguments are evaluated more than once. NOTE(review): with unsigned operands (v1)-(var) wraps around when v1 < var - confirm callers cannot hit that */ |
---|
252 | #endif |
---|
253 | |
---|
254 | /* As per Intel 82580 specification - mismatch in 82580 datasheet |
---|
255 | * it states ts is stored in Big Endian, however it is actually Little. |
 * Note this struct is declared regardless of HAS_HW_TIMESTAMPS_82580. */ |
---|
256 | struct hw_timestamp_82580 { |
---|
257 | uint64_t reserved; /* first 8 bytes; contents are not described in this header */ |
---|
258 | uint64_t timestamp; /* Little Endian only lower 40 bits are valid */ |
---|
259 | }; |
---|
260 | |
---|
261 | enum paused_state { /* three states numbered 0..2 in declaration order; presumably the start/pause lifecycle of a DPDK input - confirm against the users of this enum */ |
---|
262 | DPDK_NEVER_STARTED, /* 0 */ |
---|
263 | DPDK_RUNNING, /* 1 */ |
---|
264 | DPDK_PAUSED, /* 2 */ |
---|
265 | }; |
---|
266 | |
---|
267 | struct dpdk_per_stream_t /* per-stream state; declared with ALIGN_STRUCT(CACHE_LINE_SIZE), both defined outside this header */ |
---|
268 | { |
---|
269 | uint16_t queue_id; /* presumably the DPDK queue index used by this stream - TODO confirm */ |
---|
270 | uint64_t ts_last_sys; /* System timestamp of our most recent packet in nanoseconds */ |
---|
271 | struct rte_mempool *mempool; /* NULL in DPDK_EMPTY_STREAM; presumably the mbuf pool for this stream - TODO confirm */ |
---|
272 | int lcore; /* -1 in DPDK_EMPTY_STREAM; presumably the DPDK lcore id tied to this stream - TODO confirm */ |
---|
273 | #if HAS_HW_TIMESTAMPS_82580 |
---|
274 | /* Timestamping only relevant to RX */ |
---|
275 | uint64_t ts_first_sys; /* System timestamp of the first packet in nanoseconds */ |
---|
276 | uint32_t wrap_count; /* Number of times the NIC clock has wrapped around completely */ |
---|
277 | #endif |
---|
278 | } ALIGN_STRUCT(CACHE_LINE_SIZE); |
---|
279 | /* DPDK_EMPTY_STREAM is a positional initializer in field order: |
 * queue_id, ts_last_sys, mempool, lcore [, ts_first_sys, wrap_count]. |
 * The -1 given for the uint16_t queue_id converts to 65535. |
 * Keep both variants in step with the struct members above. */ |
---|
280 | #if HAS_HW_TIMESTAMPS_82580 |
---|
281 | #define DPDK_EMPTY_STREAM {-1, 0, NULL, -1, 0, 0} |
---|
282 | #else |
---|
283 | #define DPDK_EMPTY_STREAM {-1, 0, NULL, -1} |
---|
284 | #endif |
---|
285 | |
---|
286 | typedef struct dpdk_per_stream_t dpdk_per_stream_t; /* lets the struct be named without the struct keyword */ |
---|
287 | |
---|
288 | /* Function prototypes for the DPDK format. Only declarations are visible |
 * in this header; return-value conventions and ownership rules are not |
 * shown here and must be checked against the definitions. */ |
---|
289 | libtrace_eventobj_t dpdk_trace_event(libtrace_t *trace, |
---|
290 | libtrace_packet_t *packet); |
---|
291 | int dpdk_pstart_input (libtrace_t *libtrace); |
---|
292 | int dpdk_start_input (libtrace_t *libtrace); |
---|
293 | int dpdk_config_input (libtrace_t *libtrace, |
---|
294 | trace_option_t option, void *data); /* data is untyped; presumably its meaning depends on option - TODO confirm */ |
---|
295 | int dpdk_init_input (libtrace_t *libtrace); |
---|
296 | int dpdk_pause_input(libtrace_t * libtrace); |
---|
297 | int dpdk_fin_input(libtrace_t * libtrace); |
---|
298 | int dpdk_read_packet (libtrace_t *libtrace, libtrace_packet_t *packet); |
---|
299 | int dpdk_pregister_thread(libtrace_t *libtrace, libtrace_thread_t *t, |
---|
300 | bool reading); |
---|
301 | void dpdk_punregister_thread(libtrace_t *libtrace, libtrace_thread_t *t); |
---|
302 | void dpdk_get_stats(libtrace_t *trace, libtrace_stat_t *stats); |
---|
303 | int dpdk_get_framing_length (const libtrace_packet_t *packet) ; /* the only prototype here taking a const packet */ |
---|
304 | int dpdk_read_packet_stream (libtrace_t *libtrace, |
---|
305 | dpdk_per_stream_t *stream, |
---|
306 | libtrace_message_queue_t *mesg, |
---|
307 | struct rte_mbuf* pkts_burst[], /* array parameter, i.e. a struct rte_mbuf ** */ |
---|
308 | size_t nb_packets); /* presumably the number of slots in pkts_burst - TODO confirm */ |
---|
309 | int dpdk_prepare_packet(libtrace_t *libtrace, |
---|
310 | libtrace_packet_t *packet, void *buffer, |
---|
311 | libtrace_rt_types_t rt_type, uint32_t flags); |
---|
312 | #endif |
---|