source: lib/format_linux_common.h @ b359a11

Branches: cachetimestamps, develop, rc-4.0.4, ringdecrementfix, ringperformance
Last change on this file since b359a11 was f9df20e, checked in by Shane Alcock <salcock@…>, 3 years ago

Fix bug where two ring: trace inputs had same fanout group.

We use rand() to assign the fanout group when we create the
input, but the rand() seed state is usually equal in both threads
at the point where we call rand().

So we end up with two inputs trying to be assigned to the same
fanout group, which fails and the second input will immediately
halt.

Also added code to try incrementing the fanout group number if
we end up clashing with another existing group. This will also
resolve any issues with the RNG producing one-off clashes.

  • Property mode set to 100644
File size: 14.0 KB
Line 
1/*
2 *
3 * Copyright (c) 2007-2016 The University of Waikato, Hamilton, New Zealand.
4 * All rights reserved.
5 *
6 * This file is part of libtrace.
7 *
8 * This code has been developed by the University of Waikato WAND
9 * research group. For further information please see http://www.wand.net.nz/
10 *
11 * libtrace is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
15 *
16 * libtrace is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 * GNU Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public License
22 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
23 *
24 *
25 */
26
27
28/* Various definitions required for the linux format. They were moved here
29 * because format_linux.c had a lot of header information before the actual
30 * code. The linux headers have been copied into here rather than included to
31 * support RT on machines that don't have the linux headers (a Mac, for
32 * example).
33 */
34
35#ifndef FORMAT_LINUX_COMMON_H
36#define FORMAT_LINUX_COMMON_H
37
38#include "libtrace.h"
39#include "libtrace_int.h"
40
41#ifdef HAVE_NETPACKET_PACKET_H
42
43#include <sys/socket.h>
44#include <netpacket/packet.h>
45#include <net/ethernet.h>
46#include <net/if_arp.h>
47
48#include <net/if.h>
49#include <sys/ioctl.h>
50#include <poll.h>
51#include <sys/mman.h>
52
53#include <fcntl.h>
54
55/* MAX_ORDER is defined in linux/mmzone.h. 11 is default for 3.0 kernels.
56 * max_order will be decreased by one if the ring buffer fails to allocate.
57 * Used to get the correct sized buffers from the kernel.
58 */
59#define MAX_ORDER 11
60/* Number of frames in the ring used by both TX and RX rings. More frames
61 * hopefully means less packet loss, especially if traffic comes in bursts.
62 */
63#define CONF_RING_FRAMES        0x100
64
65/* The maximum frames allowed to be waiting in the TX_RING before the kernel is
66 * notified to write them out. Make sure this is less than CONF_RING_FRAMES.
67 * Performance doesn't seem to increase any more when setting this above 10.
68 */
69#define TX_MAX_QUEUE            10
70
71#else   /* HAVE_NETPACKET_PACKET_H */
72
/* Stand-in definition of the link-layer socket address, compiled only when
 * <netpacket/packet.h> is not available. libtrace_linuxnative_header embeds
 * this structure, so the member order and widths must not change. */
struct sockaddr_ll {
        uint16_t sll_family;    /* address family */
        uint16_t sll_protocol;  /* link-layer protocol */
        int32_t  sll_ifindex;   /* interface index */
        uint16_t sll_hatype;    /* hardware (ARPHRD) type */
        uint8_t  sll_pkttype;   /* one of the PACKET_* packet types */
        uint8_t  sll_halen;     /* number of valid bytes in sll_addr */
        uint8_t  sll_addr[8];   /* link-layer address */
};
83
84/* Packet types.  */
85#define PACKET_HOST             0               /* To us.  */
86#define PACKET_BROADCAST        1               /* To all.  */
87#define PACKET_MULTICAST        2               /* To group.  */
88#define PACKET_OTHERHOST        3               /* To someone else.  */
89#define PACKET_OUTGOING         4               /* Originated by us . */
90#define PACKET_LOOPBACK         5
91#define PACKET_FASTROUTE        6
92
93/* Packet socket options.  */
94
95#define PACKET_ADD_MEMBERSHIP           1
96#define PACKET_DROP_MEMBERSHIP          2
97#define PACKET_RECV_OUTPUT              3
98#define PACKET_RX_RING                  5
99#define PACKET_STATISTICS               6
100
101#endif /* HAVE_NETPACKET_PACKET_H */
102
/* Socket-level capture counters; stored in linux_format_data_t as `stats`. */
struct tpacket_stats {
        unsigned int tp_packets;        /* packet count */
        unsigned int tp_drops;          /* dropped packet count */
};
107
/* The timestamp format used by a capture. */
typedef enum {
        TS_NONE,        /* no timestamp format selected */
        TS_TIMEVAL,     /* timeval-style timestamps */
        TS_TIMESPEC     /* timespec-style timestamps */
} timestamptype_t;
109
110/* linux/if_packet.h defines. They are reproduced here rather than pulled in
111 * by including the header, which means that we can interpret a ring frame
112 * even on a kernel that doesn't support the format directly.
113 */
114#define PACKET_RX_RING  5
115#define PACKET_VERSION  10
116#define PACKET_HDRLEN   11
117#define PACKET_TX_RING  13
118#define PACKET_FANOUT   18
119#define TP_STATUS_USER  0x1
120#define TP_STATUS_SEND_REQUEST  0x1
121#define TP_STATUS_AVAILABLE     0x0
122#define TO_TP_HDR2(x)   ((struct tpacket2_hdr *) (x))
123#define TO_TP_HDR3(x)   ((struct tpacket3_hdr *) (x))
124#define TPACKET_ALIGNMENT       16
125#define TPACKET_ALIGN(x)        (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
126#define TPACKET2_HDRLEN         (TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
127#define TPACKET3_HDRLEN         (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
128
129
130/* Since 3.1 kernel we have packet_fanout support */
131// schedule to socket by skb's rxhash - the implementation is bi-directional
132#define PACKET_FANOUT_HASH              0
133// schedule round robin
134#define PACKET_FANOUT_LB                1
135// schedule to the socket selected by the CPU that the packet arrived on
136#define PACKET_FANOUT_CPU               2
137// Defragment IP packets before fanout is applied, so fragments do not upset the hashing !! TODO figure out if this needs to be on
138#define PACKET_FANOUT_FLAG_DEFRAG       0x8000
139/* Included but unused by libtrace since Linux 3.10 */
140// if one socket is full, roll over to the next
141#define PACKET_FANOUT_ROLLOVER          3
142// This flag makes any other system roll over
143#define PACKET_FANOUT_FLAG_ROLLOVER     0x1000
144/* Included but unused by libtrace since Linux 3.12 */
145// schedule random
146#define PACKET_FANOUT_RND               4
147
148
/* Ring frame header versions; values count up from zero in this order. */
enum tpacket_versions {
        TPACKET_V1 = 0,
        TPACKET_V2 = 1,
        TPACKET_V3 = 2
};
154
/* Per-frame header used with TPACKET_V2 rings. This is a local copy of the
 * layout so that the system header does not have to be included. */
struct tpacket2_hdr {
        uint32_t        tp_status;      /* frame status - in use by kernel or libtrace etc. */
        uint32_t        tp_len;         /* wire length */
        uint32_t        tp_snaplen;     /* captured length */
        uint16_t        tp_mac;         /* byte offset from frame start to the mac (link layer) header */
        uint16_t        tp_net;         /* byte offset from frame start to the net (network layer) header */
        uint32_t        tp_sec;         /* timestamp, seconds part */
        uint32_t        tp_nsec;        /* timestamp, nanoseconds part */
        uint16_t        tp_vlan_tci;    /* VLAN tag control information (not used) */
        uint16_t        tp_padding;
};
173
/* Variant-specific tail of a TPACKET_V3 frame header. */
struct tpacket_hdr_variant1 {
        uint32_t        tp_rxhash;
        uint32_t        tp_vlan_tci;
};

/* Per-frame header used with TPACKET_V3 rings. The fields shared with
 * tpacket2_hdr keep their names but appear in a different order. */
struct tpacket3_hdr {
        uint32_t                tp_next_offset;
        uint32_t                tp_sec;
        uint32_t                tp_nsec;
        uint32_t                tp_snaplen;
        uint32_t                tp_len;
        uint32_t                tp_status;
        uint16_t                tp_mac;
        uint16_t                tp_net;
        /* Packet header variants; the union is anonymous, so hv1 is
         * reached directly as a member of tpacket3_hdr */
        union {
                struct tpacket_hdr_variant1 hv1;
        };
};
193
/* Describes the layout of a ring buffer: how many blocks and frames it has
 * and how large each one is. */
struct tpacket_req {
        /* Minimal size of contiguous block */
        unsigned int tp_block_size;
        /* Number of blocks */
        unsigned int tp_block_nr;
        /* Size of frame */
        unsigned int tp_frame_size;
        /* Total number of frames */
        unsigned int tp_frame_nr;
};
200
/* Fall back to 16 when no header has supplied IF_NAMESIZE */
#ifndef IF_NAMESIZE
#define IF_NAMESIZE 16
#endif

/* Statistic counters for a network card, as read from /proc/net/dev.
 * One name field followed by eight receive and eight transmit counters.
 */
struct linux_dev_stats {
        char if_name[IF_NAMESIZE];

        /* Receive-side counters */
        uint64_t rx_bytes;
        uint64_t rx_packets;
        uint64_t rx_errors;
        uint64_t rx_drops;
        uint64_t rx_fifo;
        uint64_t rx_frame;
        uint64_t rx_compressed;
        uint64_t rx_multicast;

        /* Transmit-side counters */
        uint64_t tx_bytes;
        uint64_t tx_packets;
        uint64_t tx_errors;
        uint64_t tx_drops;
        uint64_t tx_fifo;
        uint64_t tx_colls;
        uint64_t tx_carrier;
        uint64_t tx_compressed;
};
227
228/* Note that this structure is passed over the wire in rt encapsulation, and
229 * thus we need to be careful with data sizes.  timeval's and timespec's
230 * can also change their size on 32/64 machines.
231 */
232struct linux_format_data_t {
233        /* The snap length for the capture */
234        int snaplen;
235        /* Flag indicating whether the interface should be placed in
236         * promiscuous mode */
237        int promisc;
238        /* The timestamp format used by the capture */
239        timestamptype_t timestamptype;
240        /* A BPF filter that is applied to every captured packet */
241        libtrace_filter_t *filter;
242        /* Statistics for the capture process, e.g. dropped packet counts */
243        struct tpacket_stats stats;
244        /* Statistics for the NIC rather than the socket */
245        struct linux_dev_stats dev_stats;
246        /* Flag indicating whether the statistics are current or not */
247        int stats_valid;
248        /* Used to determine buffer size for the ring buffer */
249        uint32_t max_order;
250        /* Used for the parallel case, fanout is the mode */
251        uint16_t fanout_flags;
252        /* The group lets Linux know which sockets to group together
253         * so we use a random here to try avoid collisions */
254        uint16_t fanout_group;
255        /* When running in parallel mode this is malloc'd with an array
256         * file descriptors from packet fanout will use, here we assume/hope
257         * that every ring can get setup the same */
258        libtrace_list_t *per_stream;
259
260};
261
262struct linux_format_data_out_t {
263        /* The file descriptor used to write the packets */
264        int fd;
265        /* The tx ring mmap location */
266        char * tx_ring;
267        /* The current frame number within the tx ring */
268        int txring_offset;
269        /* The current ring buffer layout */
270        struct tpacket_req req;
271        /* Our sockaddr structure, here so we can cache the interface number */
272        struct sockaddr_ll sock_hdr;
273        /* The (maximum) number of packets that haven't been written */
274        int queue;
275        /* The format this trace is using linuxring or linuxnative */
276        libtrace_rt_types_t format;
277        /* Used to determine buffer size for the ring buffer */
278        uint32_t max_order;
279};
280
281struct linux_per_stream_t {
282        /* File descriptor for the memory mapped stream */
283        int fd;
284        /* Memory mapped buffer */
285        char *rx_ring;
286        /* Offset within the mapped buffer */
287        int rxring_offset;
288        /* The ring buffer layout */
289        struct tpacket_req req;
290        uint64_t last_timestamp;
291} ALIGN_STRUCT(CACHE_LINE_SIZE);
292
293#define ZERO_LINUX_STREAM {-1, MAP_FAILED, 0, {0,0,0,0}, 0}
294
295
296/* Format header for encapsulating packets captured using linux native */
297struct libtrace_linuxnative_header {
298        /* Timestamp of the packet, as a timeval */
299        struct {
300                uint32_t tv_sec;
301                uint32_t tv_usec;
302        } tv;
303        /* Timestamp of the packet, as a timespec */
304        struct {
305                uint32_t tv_sec;
306                uint32_t tv_nsec;
307        } ts;
308        /* The timestamp format used by the process that captured this packet */
309        uint8_t timestamptype;
310        /* Wire length */
311        uint32_t wirelen;
312        /* Capture length */
313        uint32_t caplen;
314        /* The linux native header itself */
315        struct sockaddr_ll hdr;
316};
317
/* Helper macros to make addressing data in the above structures easier.
 * The argument is parenthesised before `->` is applied, so any
 * pointer-valued expression (e.g. `p + 1` or `&obj`) can be passed, not
 * just a plain identifier. */
#define DATA(x) ((struct linux_format_data_t *)(x)->format_data)
#define DATA_OUT(x) ((struct linux_format_data_out_t *)(x)->format_data)
#define STREAM_DATA(x) ((struct linux_per_stream_t *)(x)->data)

/* Shorthands that require a variable named `libtrace` to be in scope */
#define FORMAT_DATA DATA(libtrace)
#define FORMAT_DATA_OUT DATA_OUT(libtrace)

/* Head node of the per-stream list, and the stream data stored in it */
#define FORMAT_DATA_HEAD (FORMAT_DATA->per_stream->head)
#define FORMAT_DATA_FIRST ((struct linux_per_stream_t *)FORMAT_DATA_HEAD->data)

/* Get the sockaddr_ll structure from a frame: it sits at the aligned size
 * of a tpacket2_hdr past the start of the frame */
#define GET_SOCKADDR_HDR(x)  ((struct sockaddr_ll *) (((char *) (x))\
        + TPACKET_ALIGN(sizeof(struct tpacket2_hdr))))
332
333/* Common functions */
334#ifdef HAVE_NETPACKET_PACKET_H
335int linuxcommon_init_input(libtrace_t *libtrace);
336int linuxcommon_init_output(libtrace_out_t *libtrace);
337int linuxcommon_probe_filename(const char *filename);
338int linuxcommon_config_input(libtrace_t *libtrace, trace_option_t option,
339                             void *data);
340void linuxcommon_close_input_stream(libtrace_t *libtrace,
341                                    struct linux_per_stream_t *stream);
342int linuxcommon_start_input_stream(libtrace_t *libtrace,
343                                   struct linux_per_stream_t *stream);
344int linuxcommon_pause_input(libtrace_t *libtrace);
345int linuxcommon_get_fd(const libtrace_t *libtrace);
346int linuxcommon_fin_input(libtrace_t *libtrace);
347int linuxcommon_pregister_thread(libtrace_t *libtrace,
348                                 libtrace_thread_t *t,
349                                 bool reading);
350int linuxcommon_pstart_input(libtrace_t *libtrace,
351                             int (*start_stream)(libtrace_t *, struct linux_per_stream_t*));
352#endif /* HAVE_NETPACKET_PACKET_H */
353
354void linuxcommon_get_statistics(libtrace_t *libtrace, libtrace_stat_t *stat);
355
356static inline libtrace_direction_t linuxcommon_get_direction(uint8_t pkttype)
357{
358        switch (pkttype) {
359                case PACKET_OUTGOING:
360                case PACKET_LOOPBACK:
361                        return TRACE_DIR_OUTGOING;
362                case PACKET_OTHERHOST:
363                        return TRACE_DIR_OTHER;
364                default:
365                        return TRACE_DIR_INCOMING;
366        }
367}
368
369static inline libtrace_direction_t
370linuxcommon_set_direction(struct sockaddr_ll * skadr,
371                          libtrace_direction_t direction)
372{
373        switch (direction) {
374                case TRACE_DIR_OUTGOING:
375                        skadr->sll_pkttype = PACKET_OUTGOING;
376                        return TRACE_DIR_OUTGOING;
377                case TRACE_DIR_INCOMING:
378                        skadr->sll_pkttype = PACKET_HOST;
379                        return TRACE_DIR_INCOMING;
380                case TRACE_DIR_OTHER:
381                        skadr->sll_pkttype = PACKET_OTHERHOST;
382                        return TRACE_DIR_OTHER;
383                default:
384                        return -1;
385        }
386}
387
388static inline libtrace_linktype_t linuxcommon_get_link_type(uint16_t linktype)
389{
390        /* Convert the ARPHRD type into an appropriate libtrace link type */
391        switch (linktype) {
392                case LIBTRACE_ARPHRD_ETHER:
393                case LIBTRACE_ARPHRD_LOOPBACK:
394                        return TRACE_TYPE_ETH;
395                case LIBTRACE_ARPHRD_PPP:
396                case LIBTRACE_ARPHRD_IPGRE:
397                        return TRACE_TYPE_NONE;
398                case LIBTRACE_ARPHRD_IEEE80211_RADIOTAP:
399                        return TRACE_TYPE_80211_RADIO;
400                case LIBTRACE_ARPHRD_IEEE80211:
401                        return TRACE_TYPE_80211;
402                case LIBTRACE_ARPHRD_SIT:
403                case LIBTRACE_ARPHRD_NONE:
404                        return TRACE_TYPE_NONE;
405                default: /* shrug, beyond me! */
406                        printf("unknown Linux ARPHRD type 0x%04x\n",linktype);
407                        return (libtrace_linktype_t)~0U;
408        }
409}
410
411#ifdef HAVE_NETPACKET_PACKET_H
412/**
413 * Converts a socket, either packet_mmap or standard raw socket into a
414 * fanout socket.
415 * NOTE: This means we can read from the socket with multiple queues,
416 * each must be setup (identically) and then this called upon them
417 *
418 * @return 0 success, -1 error
419 */
420static inline int linuxcommon_to_packet_fanout(libtrace_t *libtrace,
421                                        struct linux_per_stream_t *stream)
422{
423        int fanout_opt;
424        int attempts = 0;
425        while (attempts < 5) {
426                fanout_opt = ((int)FORMAT_DATA->fanout_flags << 16) |
427                                 (int)FORMAT_DATA->fanout_group;
428
429                if (setsockopt(stream->fd, SOL_PACKET, PACKET_FANOUT,
430                                &fanout_opt, sizeof(fanout_opt)) == -1) {
431                        trace_set_err(libtrace, TRACE_ERR_INIT_FAILED,
432                              "Converting the fd to a socket fanout failed %s",
433                              libtrace->uridata);
434                        FORMAT_DATA->fanout_group ++;
435                        attempts ++;
436                        continue;
437                }
438                return 0;
439        }
440        return -1;
441}
442#endif /* HAVE_NETPACKET_PACKET_H */
443
444
445#endif /* FORMAT_LINUX_COMMON_H */
Note: See TracBrowser for help on using the repository browser.