Changeset c7e547e


Ignore:
Timestamp:
12/05/17 14:56:48 (3 years ago)
Author:
Shane Alcock <salcock@…>
Branches:
cachetimestamps, develop, dpdk-ndag, etsilive, master, rc-4.0.3, rc-4.0.4, ringdecrementfix, ringperformance
Children:
d83ba86a
Parents:
18c908d
Message:

Added a dpdkndag format for faster ndag reading

Instead of joining a multicast group and receiving nDAG packets
via the networking stack, this new format uses DPDK to sniff
the multicast direct from the wire. This should save some effort
shuffling the packets back through the kernel's networking stack.

Location:
lib
Files:
2 added
5 edited

Legend:

Unmodified
Added
Removed
  • lib/Makefile.am

    rea75ec2 rc7e547e  
    3434
    3535if HAVE_DPDK
    36 NATIVEFORMATS+= format_dpdk.c
     36NATIVEFORMATS+= format_dpdk.c format_dpdkndag.c
    3737# So we also make libtrace.mk in dpdk otherwise automake tries to expand
    3838# it too early which I cannot seem to stop unless we use a path that
  • lib/format_dpdk.c

    rdb84bb2 rc7e547e  
    4242#include "libtrace_arphrd.h"
    4343#include "hash_toeplitz.h"
     44#include "format_dpdk.h"
    4445
    4546#ifdef HAVE_INTTYPES_H
     
    5960#endif
    6061
    61 /* We can deal with any minor differences by checking the RTE VERSION
    62  * Typically DPDK backports some fixes (typically for building against
    63  * newer kernels) to the older version of DPDK.
    64  *
    65  * These get released with the rX suffix. The following macros were added
    66  * in these new releases.
    67  *
    68  * Below this is a log of versions that required changes to the libtrace
    69  * code (that we still attempt to support).
    70  *
    71  * DPDK 16.04 or newer is recommended.
    72  * However 1.6 and newer are still likely supported.
    73  */
    74 #include <rte_eal.h>
    75 #include <rte_version.h>
    76 #ifndef RTE_VERSION_NUM
    77 #       define RTE_VERSION_NUM(a,b,c,d) ((a) << 24 | (b) << 16 | (c) << 8 | (d))
    78 #endif
    79 #ifndef RTE_VER_PATCH_RELEASE
    80 #       define RTE_VER_PATCH_RELEASE 0
    81 #endif
    82 #ifndef RTE_VERSION
    83 #       define RTE_VERSION RTE_VERSION_NUM(RTE_VER_MAJOR,RTE_VER_MINOR, \
    84         RTE_VER_PATCH_LEVEL, RTE_VER_PATCH_RELEASE)
    85 #endif
    86 
    87 /* 1.6.0r2 :
    88  *      rte_eal_pci_set_blacklist() is removed
    89  *      device_list is renamed to pci_device_list
    90  *      In the 1.7.0 release rte_eal_pci_probe is called by rte_eal_init
    91  *      as such we do apply the whitelist before rte_eal_init.
    92  *      This also works correctly with DPDK 1.6.0r2.
    93  *
    94  * Replaced by:
    95  *      rte_devargs (we can simply whitelist)
    96  */
    97 #if RTE_VERSION <= RTE_VERSION_NUM(1, 6, 0, 1)
    98 #       define DPDK_USE_BLACKLIST 1
    99 #else
    100 #       define DPDK_USE_BLACKLIST 0
    101 #endif
    102 
    103 /*
    104  * 1.7.0 :
    105  *      rte_pmd_init_all is removed
    106  *
    107  * Replaced by:
    108  *      Nothing, no longer needed
    109  */
    110 #if RTE_VERSION < RTE_VERSION_NUM(1, 7, 0, 0)
    111 #       define DPDK_USE_PMD_INIT 1
    112 #else
    113 #       define DPDK_USE_PMD_INIT 0
    114 #endif
    115 
    116 /* 1.7.0-rc3 :
    117  *
    118  * Since 1.7.0-rc3 rte_eal_pci_probe is called as part of rte_eal_init.
    119  * Somewhere between 1.7 and 1.8 calling it twice broke so we should not call
    120  * it twice.
    121  */
    122 #if RTE_VERSION < RTE_VERSION_NUM(1, 7, 0, 3)
    123 #       define DPDK_USE_PCI_PROBE 1
    124 #else
    125 #       define DPDK_USE_PCI_PROBE 0
    126 #endif
    127 
    128 /* 1.8.0-rc1 :
    129  * LOG LEVEL is a command line option which overrides what
    130  * we previously set it to.
    131  */
    132 #if RTE_VERSION >= RTE_VERSION_NUM(1, 8, 0, 1)
    133 #       define DPDK_USE_LOG_LEVEL 1
    134 #else
    135 #       define DPDK_USE_LOG_LEVEL 0
    136 #endif
    137 
    138 /* 1.8.0-rc2
    139  * rx/tx_conf thresholds can be set to NULL in rte_eth_rx/tx_queue_setup
    140  * this uses the default values, which are better tuned per device
    141  * See issue #26
    142  */
    143 #if RTE_VERSION >= RTE_VERSION_NUM(1, 8, 0, 2)
    144 #       define DPDK_USE_NULL_QUEUE_CONFIG 1
    145 #else
    146 #       define DPDK_USE_NULL_QUEUE_CONFIG 0
    147 #endif
    148 
    149 /* 2.0.0-rc1
    150  * Unifies RSS hash between cards
    151  */
    152 #if RTE_VERSION >= RTE_VERSION_NUM(2, 0, 0, 1)
    153 #       define RX_RSS_FLAGS (ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP | \
    154                              ETH_RSS_SCTP)
    155 #else
    156 #       define RX_RSS_FLAGS (ETH_RSS_IPV4_UDP | ETH_RSS_IPV6 | ETH_RSS_IPV4 | \
    157                              ETH_RSS_IPV4_TCP | ETH_RSS_IPV6_TCP |\
    158                              ETH_RSS_IPV6_UDP)
    159 #endif
    160 
    161 /* v16.07-rc1 - deprecated
    162  * rte_mempool_avail_count to replace rte_mempool_count
    163  * rte_mempool_in_use_count to replace rte_mempool_free_count
    164  */
    165 #if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1)
    166 #define rte_mempool_avail_count rte_mempool_count
    167 #define rte_mempool_in_use_count rte_mempool_free_count
    168 #endif
    169 
    170 #include <rte_per_lcore.h>
    171 #include <rte_debug.h>
    172 #include <rte_errno.h>
    173 #include <rte_common.h>
    174 #include <rte_log.h>
    175 #include <rte_memcpy.h>
    176 #include <rte_prefetch.h>
    177 #include <rte_branch_prediction.h>
    178 #include <rte_pci.h>
    179 #include <rte_ether.h>
    180 #include <rte_ethdev.h>
    181 #include <rte_ring.h>
    182 #include <rte_mempool.h>
    183 #include <rte_mbuf.h>
    184 #include <rte_launch.h>
    185 #include <rte_lcore.h>
    186 #include <rte_per_lcore.h>
    187 #include <rte_cycles.h>
    188 #include <pthread.h>
    189 #ifdef __FreeBSD__
    190 #include <pthread_np.h>
    191 #endif
    192 
    193 /* 16.04-rc3 ETH_LINK_SPEED_X are replaced with ETH_SPEED_NUM_X.
    194  * ETH_LINK_SPEED_ are reused as flags, ugly.
    195  * We use the new way in this code.
    196  */
    197 #ifndef ETH_SPEED_NUM_1G
    198         #define ETH_SPEED_NUM_1G ETH_LINK_SPEED_1000
    199         #define ETH_SPEED_NUM_10G ETH_LINK_SPEED_10G
    200         #define ETH_SPEED_NUM_20G ETH_LINK_SPEED_20G
    201         #define ETH_SPEED_NUM_40G ETH_LINK_SPEED_40G
    202 #endif
    203 
    204 /* The default size of memory buffers to use - This is the max size of standard
    205  * ethernet packet less the size of the MAC CHECKSUM */
    206 #define RX_MBUF_SIZE 1514
    207 
    208 /* The minimum number of memory buffers per queue tx or rx. Based on
    209  * the requirement of the memory pool with 128 per thread buffers, needing
    210  * at least 128*1.5 = 192 buffers. Our code allocates 128*2 to be safe.
    211  */
    212 #define MIN_NB_BUF 128
    213 
    214 /* Number of receive memory buffers to use
    215  * By default this is limited by driver to 4k and must be a multiple of 128.
    216  * A modification can be made to the driver to remove this limit.
    217  * This can be increased in the driver and here.
    218  * Should be at least MIN_NB_BUF.
    219  * We choose 2K rather than 4K because it enables the usage of sse vector
    220  * drivers which are significantly faster than using the larger buffer.
    221  */
    222 #define NB_RX_MBUF (4096/2)
    223 
    224 /* Number of send memory buffers to use.
    225  * The same limits apply as those for NB_RX_MBUF.
    226  */
    227 #define NB_TX_MBUF 1024
    228 
    229 /* The size of the PCI blacklist needs to be big enough to contain
    230  * every PCI device address (listed by lspci every bus:device.function tuple).
    231  */
    232 #define BLACK_LIST_SIZE 50
    233 
    234 /* The maximum number of characters the mempool name can be */
    235 #define MEMPOOL_NAME_LEN 20
    236 
    237 /* For single threaded libtrace we read packets as a batch/burst
    238  * this is the maximum size of said burst */
    239 #define BURST_SIZE 32
    240 
    24162#define MBUF(x) ((struct rte_mbuf *) x)
    24263/* Get the original placement of the packet data */
     
    25980#endif
    26081
    261 /* ~~~~~~~~~~~~~~~~~~~~~~ Advanced settings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    262  * THESE MAY REQUIRE MODIFICATIONS TO INTEL DPDK
    263  *
    264  * Make sure you understand what these are doing before enabling them.
    265  * They might make traces incompatible with other builds etc.
    266  *
    267  * These are also included to show how to do some things which aren't
    268  * obvious in the DPDK documentation.
    269  */
    270 
    271 /* Print verbose messages to stderr */
    272 #define DEBUG 0
    273 
    274 /* Use clock_gettime() for nanosecond resolution rather than gettimeofday()
    275  * only turn on if you know clock_gettime is a vsyscall on your system
    276  * otherwise could be a large overhead. Again gettimeofday() should be
    277  * vsyscall also if it's not you should seriously consider updating your
    278  * kernel.
    279  */
    280 #ifdef HAVE_CLOCK_GETTIME
    281 /* You can turn this on (set to 1) to prefer clock_gettime */
    282 #define USE_CLOCK_GETTIME 1
    283 #else
    284 /* DON'T CHANGE THIS !!! */
    285 #define USE_CLOCK_GETTIME 0
    286 #endif
    287 
    288 /* This is fairly safe to turn on - currently there appears to be a 'bug'
    289  * in DPDK that will remove the checksum by making the packet appear 4bytes
    290  * smaller than what it really is. Most formats don't include the checksum
    291  * hence writing out a port such as int: ring: and dpdk: assumes there
    292  * is no checksum and will attempt to write the checksum as part of the
    293  * packet
    294  */
    295 #define GET_MAC_CRC_CHECKSUM 0
    296 
    297 /* This requires a modification of the pmd drivers (inside Intel DPDK)
    298  * TODO this requires updating (packet sizes are wrong TS most likely also)
    299  */
    300 #define HAS_HW_TIMESTAMPS_82580 0
    301 
    302 #if HAS_HW_TIMESTAMPS_82580
    303 # define TS_NBITS_82580     40
    304 /* The maximum on the +ve or -ve side that we can be, make it half way */
    305 # define MAXSKEW_82580 ((uint64_t) (.5 * (double)(1ull<<TS_NBITS_82580)))
    306 #define WITHIN_VARIANCE(v1,v2,var) (((v1) - (var) < (v2)) && ((v1) + (var) > (v2)))
    307 #endif
    30882
    30983static pthread_mutex_t dpdk_lock = PTHREAD_MUTEX_INITIALIZER;
    31084/* Memory pools Per NUMA node */
    31185static struct rte_mempool * mem_pools[4][RTE_MAX_LCORE] = {{0}};
    312 
    313 /* As per Intel 82580 specification - mismatch in 82580 datasheet
    314  * it states ts is stored in Big Endian, however it's actually Little */
    315 struct hw_timestamp_82580 {
    316         uint64_t reserved;
    317         uint64_t timestamp; /* Little Endian only lower 40 bits are valid */
    318 };
    319 
    320 enum paused_state {
    321         DPDK_NEVER_STARTED,
    322         DPDK_RUNNING,
    323         DPDK_PAUSED,
    324 };
    325 
    326 struct dpdk_per_stream_t
    327 {
    328         uint16_t queue_id;
    329         uint64_t ts_last_sys; /* System timestamp of our most recent packet in nanoseconds */
    330         struct rte_mempool *mempool;
    331         int lcore;
    332 #if HAS_HW_TIMESTAMPS_82580
    333         /* Timestamping only relevant to RX */
    334         uint64_t ts_first_sys; /* Sytem timestamp of the first packet in nanoseconds */
    335         uint32_t wrap_count; /* Number of times the NIC clock has wrapped around completely */
    336 #endif
    337 } ALIGN_STRUCT(CACHE_LINE_SIZE);
    338 
    339 #if HAS_HW_TIMESTAMPS_82580
    340 #define DPDK_EMPTY_STREAM {-1, 0, NULL, -1, 0, 0}
    341 #else
    342 #define DPDK_EMPTY_STREAM {-1, 0, NULL, -1}
    343 #endif
    344 
    345 typedef struct dpdk_per_stream_t dpdk_per_stream_t;
    34686
    34787/* Used by both input and output however some fields are not used
     
    833573}
    834574
    835 static int dpdk_init_input (libtrace_t *libtrace) {
     575int dpdk_init_input (libtrace_t *libtrace) {
    836576        dpdk_per_stream_t stream = DPDK_EMPTY_STREAM;
    837577        char err[500];
     
    916656 *
    917657 */
    918 static int dpdk_config_input (libtrace_t *libtrace,
     658int dpdk_config_input (libtrace_t *libtrace,
    919659                              trace_option_t option,
    920660                              void *data) {
     
    15121252}
    15131253
    1514 static int dpdk_start_input (libtrace_t *libtrace) {
     1254int dpdk_start_input (libtrace_t *libtrace) {
    15151255        char err[500];
    15161256        err[0] = 0;
     
    15421282}
    15431283
    1544 static int dpdk_pstart_input (libtrace_t *libtrace) {
     1284int dpdk_pstart_input (libtrace_t *libtrace) {
    15451285        char err[500];
    15461286        int i=0, phys_cores=0;
     
    16151355 *                in any other manner including statistics functions.
    16161356 */
    1617 static int dpdk_pregister_thread(libtrace_t *libtrace, libtrace_thread_t *t, bool reading)
     1357int dpdk_pregister_thread(libtrace_t *libtrace, libtrace_thread_t *t, bool reading)
    16181358{
    16191359#if DEBUG
     
    16651405 * they are destroyed.
    16661406 */
    1667 static void dpdk_punregister_thread(libtrace_t *libtrace UNUSED, libtrace_thread_t *t UNUSED)
     1407void dpdk_punregister_thread(libtrace_t *libtrace UNUSED, libtrace_thread_t *t UNUSED)
    16681408{
    16691409        struct rte_config *cfg = rte_eal_get_configuration();
     
    17011441}
    17021442
    1703 static int dpdk_pause_input(libtrace_t * libtrace) {
     1443int dpdk_pause_input(libtrace_t * libtrace) {
    17041444        libtrace_list_node_t *tmp = FORMAT_DATA_HEAD(libtrace);
    17051445        /* This stops the device, but can be restarted using rte_eth_dev_start() */
     
    17561496}
    17571497
    1758 static int dpdk_fin_input(libtrace_t * libtrace) {
     1498int dpdk_fin_input(libtrace_t * libtrace) {
    17591499        libtrace_list_node_t * n;
    17601500        /* Free our memory structures */
     
    18581598}
    18591599
    1860 static int dpdk_prepare_packet(libtrace_t *libtrace UNUSED,
     1600int dpdk_prepare_packet(libtrace_t *libtrace UNUSED,
    18611601                               libtrace_packet_t *packet, void *buffer,
    18621602                               libtrace_rt_types_t rt_type, uint32_t flags) {
     
    21161856/** Reads at least one packet or returns an error
    21171857 */
    2118 static inline int dpdk_read_packet_stream (libtrace_t *libtrace,
     1858int dpdk_read_packet_stream (libtrace_t *libtrace,
    21191859                                           dpdk_per_stream_t *stream,
    21201860                                           libtrace_message_queue_t *mesg,
     
    21811921}
    21821922
    2183 static int dpdk_read_packet (libtrace_t *libtrace, libtrace_packet_t *packet) {
     1923int dpdk_read_packet (libtrace_t *libtrace, libtrace_packet_t *packet) {
    21841924        int nb_rx; /* Number of rx packets we've received */
    21851925        dpdk_per_stream_t *stream = FORMAT_DATA_FIRST(libtrace);
     
    21991939        if (FORMAT(libtrace)->burst_size != FORMAT(libtrace)->burst_offset) {
    22001940                packet->buffer = FORMAT(libtrace)->burst_pkts[FORMAT(libtrace)->burst_offset++];
     1941                packet->trace = libtrace;
    22011942                dpdk_prepare_packet(libtrace, packet, packet->buffer, packet->type, 0);
    22021943                return 1; // TODO should be bytes read, which essentially useless anyway
     
    22101951                FORMAT(libtrace)->burst_offset = 1;
    22111952                packet->buffer = FORMAT(libtrace)->burst_pkts[0];
     1953                packet->trace = libtrace;
    22121954                dpdk_prepare_packet(libtrace, packet, packet->buffer, packet->type, 0);
    22131955                return 1;
     
    22832025 * create a select()able file descriptor in DPDK.
    22842026 */
    2285 static libtrace_eventobj_t dpdk_trace_event(libtrace_t *trace,
     2027libtrace_eventobj_t dpdk_trace_event(libtrace_t *trace,
    22862028                                            libtrace_packet_t *packet) {
    22872029        libtrace_eventobj_t event = {0,0,0.0,0};
  • lib/libtrace.h.in

    ra9d0e40 rc7e547e  
    396396        TRACE_FORMAT_PCAPNG     =18,    /**< PCAP-NG trace file */
    397397        TRACE_FORMAT_NDAG       =19,    /**< DAG multicast over a network */
     398        TRACE_FORMAT_DPDK_NDAG       =20,    /**< DAG multicast over a network, received via DPDK */
    398399};
    399400
  • lib/libtrace_int.h

    ra9d0e40 rc7e547e  
    12431243/** Constructor for Intel's DPDK format module */
    12441244void dpdk_constructor(void);
     1245
     1246/** Constructor for receiving network DAG via Intel's DPDK format module */
     1247void dpdkndag_constructor(void);
     1248
    12451249#endif
    12461250
  • lib/trace.c

    rea75ec2 rc7e547e  
    152152#endif
    153153#ifdef HAVE_DPDK
    154         dpdk_constructor();
     154                dpdk_constructor();
     155                dpdkndag_constructor();
    155156#endif
    156157        }
Note: See TracChangeset for help on using the changeset viewer.