--- //depot/vendor/freebsd/src/contrib/libpcap/pcap-bpf.c 2007/02/27 02:39:57 +++ //depot/projects/zcopybpf/src/contrib/libpcap/pcap-bpf.c 2007/03/04 02:08:48 @@ -30,6 +30,8 @@ #endif #include /* optionally get BSD define */ +#include +#include #include #include #include @@ -139,6 +141,118 @@ return (0); } +#ifdef BIOCGETBUFMODE +/* + * Selection routine for zero-copy BPF: identify the next completed buffer, + * if any. Try shared memory first, and if that doesn't work, make a system + * call, which may dislodge a buffer. + * + * Return (1) if the buffer is found, (0) if a retry is required, and (-1) if + * there is an unrecoverable error. + * + * XXXRW: Check to make sure the version comparison we're doing here is + * really the right thing -- maybe use serial number arithmetic? + */ +static int +pcap_next_zbuf(pcap_t *p, u_int *cc) +{ + struct bpf_zbuf_header *bzh; + struct bpf_zbuf bz; + struct timeval tv; /* only filled in, and only passed, when to_ms != 0 */ + fd_set r_set; + int r; + + FD_ZERO(&r_set); + FD_SET(p->fd, &r_set); + p->bzh = NULL; + p->buffer = NULL; + if (p->to_ms != 0) { + tv.tv_sec = p->to_ms / 1000; + tv.tv_usec = (p->to_ms % 1000) * 1000; + } + r = select(p->fd + 1, &r_set, NULL, NULL, p->to_ms != 0 ? &tv : NULL); + if (r < 0 && errno == EINTR) + return (0); + else if (r < 0) { + (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "select: %s", strerror(errno)); + return (-1); + } + /* + * Handle timeouts here + */ + if (r == 0) { + if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) { + (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "BIOCROTZBUF: %s", strerror(errno)); + return (-1); + } + /* + * select(2) woke us up due to a timeout, and there was no + * data to be processed in the store buffer. Tell pcap + * to wait again. + */ + if (bz.bz_bufa == NULL) + return (0); + } + /* XXXCSJP should we check FD_ISSET()? */ + /* + * If we have made it this far, chances are select(2) returned because + * there is data ready to be processed in the hold buffer. Compare the + * user generation numbers against the kernel's. 
If there are any + * differences, process the packet data. + */ + bzh = (struct bpf_zbuf_header *)p->zbuf1; + if (bzh->bzh_kernel_gen > bzh->bzh_user_gen) { + p->bzh = bzh; + p->buffer = (u_char *)p->zbuf1; + p->buffer += sizeof(*bzh); + *cc = bzh->bzh_kernel_len; + return (1); + } + bzh = (struct bpf_zbuf_header *)p->zbuf2; + if (bzh->bzh_kernel_gen > bzh->bzh_user_gen) { + p->bzh = bzh; + p->buffer = (u_char *)p->zbuf2; + p->buffer += sizeof(*bzh); + *cc = bzh->bzh_kernel_len; + return (1); + } + /* + * If the generation numbers were the same for both buffers, then it + * is possible that we woke up because of BIOCIMMEDIATE. In either + * case, manually rotate the buffers. + */ + if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) { + (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "BIOCROTZBUF: %s", strerror(errno)); + return (-1); + } + /* + * It's possible that we were unable to rotate the buffer because the + * user generation numbers have not been modified, in which case retry. + */ + if (bz.bz_bufa == NULL) + return (0); + p->bzh = (struct bpf_zbuf_header *)bz.bz_bufa; + p->buffer = (u_char *)bz.bz_bufa; + p->buffer += sizeof(*bzh); + *cc = bz.bz_buflen; + return (1); +} + +static int +pcap_ack_zbuf(pcap_t *p) +{ + struct bpf_zbuf bz; + + p->bzh->bzh_user_gen++; + p->bzh = NULL; + p->buffer = NULL; + return (0); +} +#endif + static int pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user) { @@ -147,6 +261,9 @@ register u_char *bp, *ep; u_char *datap; struct bpf_insn *fcode; +#ifdef BIOCSETBUFMODE + int i; +#endif #ifdef PCAP_FDDIPAD register int pad; #endif @@ -167,7 +284,19 @@ } cc = p->cc; if (p->cc == 0) { - cc = read(p->fd, (char *)p->buffer, p->bufsize); +#ifdef BIOCSETBUFMODE + if (p->zbuf1 != NULL) { + if (p->buffer != NULL) + pcap_ack_zbuf(p); + i = pcap_next_zbuf(p, &cc); + if (i == 0) + goto again; + if (i < 0) + return (-1); + } else +#endif + cc = read(p->fd, (char *)p->buffer, p->bufsize); + if (cc < 0) { /* Don't choke when we get ptraced 
*/ switch (errno) { @@ -598,6 +727,11 @@ struct bpf_insn total_insn; struct bpf_program total_prog; struct utsname osinfo; +#ifdef BIOCSETBUFMODE + struct bpf_zbuf bz; + u_int bufmode; + size_t zbufmax; /* BIOCGETZMAX copies out a size_t, not a u_int */ +#endif #ifdef HAVE_DAG_API if (strstr(device, "dag")) { @@ -636,6 +769,75 @@ } /* + * XXXRW: Depending on the availability of zero-copy BPF, we take one + * of two strategies here: if it is available and usable, we go ahead + * and set it up; otherwise we play the song-and-dance to try to + * probe an acceptable read buffer size. Zero-copy BPF requires that + * buffers be mapped into memory before selecting the interface to + * attach to, so we do that here also. + */ +#ifdef BIOCSETBUFMODE + if (getenv("BPF_ZERO_COPY")) { + bufmode = BPF_BUFMODE_ZBUF; + if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) < 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETBUFMODE: %s", + pcap_strerror(errno)); + goto bad; + } + + if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s", + pcap_strerror(errno)); + goto bad; + } + + /* + * XXXRW: This logic should be revisited. 
+ */ + p->zbufsize = 32768; + if (p->zbufsize % getpagesize() != 0) + p->zbufsize = getpagesize(); + if (p->zbufsize > zbufmax) + p->zbufsize = zbufmax; + + p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE, + MAP_ANON, -1, 0); + p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE, + MAP_ANON, -1, 0); + if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s", + pcap_strerror(errno)); + if (p->zbuf1 != MAP_FAILED) + munmap(p->zbuf1, p->zbufsize); + if (p->zbuf2 != MAP_FAILED) + munmap(p->zbuf2, p->zbufsize); + p->zbuf1 = p->zbuf2 = NULL; /* nothing left for bad: to unmap */ + goto bad; + } + + bzero(&bz, sizeof(bz)); + bz.bz_bufa = p->zbuf1; + bz.bz_bufb = p->zbuf2; + bz.bz_buflen = p->zbufsize; + + if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s", + pcap_strerror(errno)); + goto bad; + } + + (void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name)); + if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s", + device, pcap_strerror(errno)); + goto bad; + } + + v = p->zbufsize - sizeof(struct bpf_zbuf_header); + } else { +#endif + + /* * Try finding a good size for the buffer; 32768 may be too * big, so keep cutting it in half until we find a size * that works, or run out of sizes to try. If the default @@ -670,6 +870,9 @@ "BIOCSBLEN: %s: No buffer size worked", device); goto bad; } +#ifdef BIOCSETBUFMODE + } +#endif /* Get the data link layer type. */ if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) { @@ -844,7 +1047,8 @@ } #endif /* set timeout */ - if (to_ms != 0) { + p->to_ms = to_ms; + if (to_ms != 0 && getenv("BPF_ZERO_COPY") == NULL) { /* * XXX - is this seconds/nanoseconds in AIX? 
* (Treating it as such doesn't fix the timeout @@ -859,6 +1063,9 @@ goto bad; } } +#ifdef BIOCSETBUFMODE + p->timeout = to_ms; +#endif #ifdef _AIX #ifdef BIOCIMMEDIATE @@ -931,16 +1138,22 @@ goto bad; } p->bufsize = v; - p->buffer = (u_char *)malloc(p->bufsize); - if (p->buffer == NULL) { - snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s", - pcap_strerror(errno)); - goto bad; +#ifdef BIOCSETBUFMODE + if (p->zbuf1 == NULL) { +#endif + p->buffer = (u_char *)malloc(p->bufsize); + if (p->buffer == NULL) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s", + pcap_strerror(errno)); + goto bad; + } +#ifdef _AIX + /* For some strange reason this seems to prevent the EFAULT + * problems we have experienced from AIX BPF. */ + memset(p->buffer, 0x0, p->bufsize); +#endif +#ifdef BIOCSETBUFMODE } -#ifdef _AIX - /* For some strange reason this seems to prevent the EFAULT - * problems we have experienced from AIX BPF. */ - memset(p->buffer, 0x0, p->bufsize); #endif /* @@ -1025,7 +1238,24 @@ return (p); bad: + (void)close(fd); +#ifdef BIOCSETBUFMODE + if (p->zbuf1 != NULL && p->zbuf1 != MAP_FAILED) + munmap(p->zbuf1, p->zbufsize); /* mapped length, not the scratch 'v' */ + if (p->zbuf2 != NULL && p->zbuf2 != MAP_FAILED) + munmap(p->zbuf2, p->zbufsize); + /* + * If we are using zerocopy, the packet buffer will be referencing + * an address in one of the shared pages, if any. In which case + * we will not free it. + */ + if (getenv("BPF_ZERO_COPY") == NULL && p->buffer != NULL) + free(p->buffer); +#else + if (p->buffer != NULL) + free(p->buffer); +#endif if (p->dlt_list != NULL) free(p->dlt_list); free(p); --- //depot/vendor/freebsd/src/contrib/libpcap/pcap-int.h 2006/09/04 21:39:51 +++ //depot/projects/zcopybpf/src/contrib/libpcap/pcap-int.h 2007/02/06 22:01:39 @@ -148,12 +148,35 @@ struct pcap_md md; /* - * Read buffer. + * Read buffer -- for file descriptor read buffer model. */ int bufsize; u_char *buffer; u_char *bp; int cc; + int to_ms; + + /* + * XXXRW: Exactly how to handle ifdefs, etc, is not something I've + * worked out yet. 
Presumably we need to add a configure check for + * zero-copy BPF. + * + * Zero-copy read buffer -- for zero-copy BPF. 'buffer' above will + * alternative between these two actual mmap'd buffers as required. + * As there is a header on the front size of the mmap'd buffer, only + * some of the buffer is exposed to libpcap as a whole via bufsize; + * zbufsize is the true size. + */ + u_char *zbuf1, *zbuf2; + u_int zbufsize; + u_int timeout; + + /* + * If there's currently a buffer being actively processed, then it is + * referenced here; 'buffer' is also pointed at it, but offset by the + * size of the header. + */ + struct bpf_zbuf_header *bzh; /* * Place holder for pcap_next(). --- //depot/vendor/freebsd/src/lib/libpcap/Makefile 2006/04/13 14:39:22 +++ //depot/projects/zcopybpf/src/lib/libpcap/Makefile 2007/01/31 16:37:37 @@ -16,6 +16,7 @@ YFLAGS+=-p pcapyy LFLAGS+=-Ppcapyy +CFLAGS+=-I../../sys -g CFLAGS+=-DHAVE_CONFIG_H -Dyylval=pcapyylval -I${.CURDIR} -I. CFLAGS+=-D_U_="__attribute__((unused))" CFLAGS+=-DHAVE_SNPRINTF -DHAVE_VSNPRINTF --- //depot/vendor/freebsd/src/share/man/man4/bpf.4 2007/02/27 02:39:57 +++ //depot/projects/zcopybpf/src/share/man/man4/bpf.4 2007/03/04 02:08:48 @@ -1,3 +1,30 @@ +.\" Copyright (c) 2007 Seccuris Inc. +.\" All rights reserved. +.\" +.\" This sofware was developed by Robert N. M. Watson under contract to +.\" Seccuris Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" .\" Copyright (c) 1990 The Regents of the University of California. .\" All rights reserved. .\" @@ -61,18 +88,53 @@ all file descriptors listening on that interface apply their filter. Each descriptor that accepts the packet receives its own copy. .Pp -Reads from these files return the next group of packets -that have matched the filter. -To improve performance, the buffer passed to read must be -the same size as the buffers used internally by -.Nm . +.Nm +devices operate in one of two buffering modes: buffered +.Xr read 2 , +in which packet data is copied from the kernel explicitly using the +.Xr read 2 +system call, and zero-copy buffer mode, in which the user process provides +two memory regions that +.Nm +will write to directly as the packets are accepted. +The buffering mode may be set with the +.Dv BIOCSETBUFMODE +ioctl (see below), and will default to buffered +.Xr read 2 +mode +.Dv ( BPF_BUFMODE_BUFFER ) +by default. +Buffers return the next group of packets that have matched the filter. +Note that an individual packet larger than the buffer size is necessarily +truncated. 
+.Pp +In the case of buffered +.Xr read 2 , +the user process will declare a fixed buffer size that will be used both for +sizing internal buffers and for all +.Xr read 2 +operations on the file. This size is returned by the .Dv BIOCGBLEN ioctl (see below), and can be set with .Dv BIOCSBLEN . -Note that an individual packet larger than this size is necessarily -truncated. +.Pp +In zero-copy buffering, the user process registers two memory buffers with +.Nm +via the +.Dv BIOCSETZBUF +ioctl (see below). +The user process may monitor for completion (filling) of a buffer, at which +point the memory contents of the buffer will be stable until the buffer is +returned for further kernel use using the +.Dv BIOCACKZBUF +ioctl. +Buffers will be of a fixed (and equal) size, be +page-aligned, and the size must be an integer multiple of the page size. +The maximum zero-copy buffer size is returned by the +.Dv BIOCGETZMAX +ioctl (see below). .Pp The packet filter will support any link level protocol that has fixed length headers. @@ -127,7 +189,7 @@ The (third) argument to .Xr ioctl 2 should be a pointer to the type indicated. -.Bl -tag -width BIOCGRTIMEOUT +.Bl -tag -width BIOCGETBUFMODE .It Dv BIOCGBLEN .Pq Li u_int Returns the required buffer length for reads on @@ -349,6 +411,87 @@ This prevents the execution of ioctl commands which could change the underlying operating parameters of the device. +.It Dv BIOCGETBUFMODE +.It Dv BIOCSETBUFMODE +.Pq Li u_int +Get or set the current +.Nm +buffering mode; possible values are +.Dv BPF_BUFMODE_BUFFER , +buffered +.Xr read 2 +mode, and +.Dv BPF_BUFMODE_ZBUF , +zero-copy buffer mode. +.It Dv BIOCACKZBUF +.Pq Li struct bpf_zbuf +Return a completed zero-copy buffer to the kernel for reuse. 
+The following structure is used as an argument to these and other zero-copy +buffer ioctls: +.Bd -literal +struct bpf_zbuf { + void *bz_bufa; + void *bz_bufb; + size_t bz_buflen; +}; +.Ed +.Pp +Only the +.Vt bz_bufa +field will be used with this ioctl. +.It Dv BIOCGETZBUF +.It Dv BIOCSETZBUF +.Pq Li struct bpf_zbuf +Get or set the current zero-copy buffer locations; buffer locations may be +set only once zero-copy buffer mode has been selected, and prior to attaching +the +.Nm +device to an interface. +Buffers must be of identical size, page-aligned, and an integer multiple of +pages in size. +The three fields +.Vt bz_bufa , +.Vt bz_bufb , +and +.Vt bz_buflen +must be filled out. +.It Dv BIOCGETZMAX +.Pq Li size_t +Get the largest individual zero-copy buffer size allowed. +As two buffers are used in zero-copy buffer mode, the limit (in practice) is +twice the returned size. +As zero-copy buffers consume kernel address space, conservative selection of +buffer size is suggested, especially when there are multiple +.Nm +descriptors in use on 32-bit systems. +.It Dv BIOCGETZNEXT +.It Dv BIOCROTZBUF +.Pq Li struct bpf_zbuf +Get the buffer pointer and length of the next zero-copy buffer ready +for userspace use, or +.Dv NULL +if there is no pending buffer. +.Pp +.Dv BIOCGETZNEXT +queries for the next completely filled buffer ready for immediate use, +returning NULL if there are only empty or partially filled buffers available. +.Pp +.Dv BIOCROTZBUF +queries for a filled buffer, but in the event there is only a partially +filled buffer, will make that buffer available for userspace to use +immediately. +This allows consumers of zero-copy buffering to implement timeouts and +retrieve partially filled buffers. +.Dv BIOCROTZBUF +will return +.Dv NULL +only if no data is present in either of the zero-copy buffers. +.Pp +Only the +.Vt bz_bufa +and +.Vt bz_buflen +fields will be used with this ioctl. 
.El .Sh BPF HEADER The following structure is prepended to each packet returned by --- //depot/vendor/freebsd/src/sys/amd64/conf/GENERIC 2007/04/10 21:41:17 +++ //depot/projects/zcopybpf/src/sys/amd64/conf/GENERIC 2007/04/16 13:57:45 @@ -26,6 +26,20 @@ makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols +options BPF_ZEROCOPY +device hwpmc +options HWPMC_HOOKS + +options NULLFS +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options GDB # Support remote GDB. +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + options SCHED_4BSD # 4BSD scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking @@ -61,15 +75,6 @@ options ADAPTIVE_GIANT # Giant mutex is adaptive. options STOP_NMI # Stop CPUS using NMI instead of IPI -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. 
-options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed - # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel @@ -80,6 +85,7 @@ # Floppy drives device fdc + # ATA and ATAPI devices device ata device atadisk # ATA disk drives --- //depot/vendor/freebsd/src/sys/conf/files 2007/04/25 15:31:33 +++ //depot/projects/zcopybpf/src/sys/conf/files 2007/04/28 19:45:55 @@ -1523,8 +1523,10 @@ libkern/strtouq.c standard libkern/strvalid.c standard net/bpf.c standard +net/bpf_buffer.c optional bpf net/bpf_jitter.c optional bpf_jitter net/bpf_filter.c optional bpf | netgraph_bpf +net/bpf_zerocopy.c optional bpf_zerocopy net/bridgestp.c optional if_bridge net/bsd_comp.c optional ppp_bsdcomp net/ieee8023ad_lacp.c optional lagg --- //depot/vendor/freebsd/src/sys/conf/options 2007/04/14 20:22:55 +++ //depot/projects/zcopybpf/src/sys/conf/options 2007/04/16 13:57:45 @@ -491,6 +491,7 @@ # DRM options DRM_DEBUG opt_drm.h +BPF_ZEROCOPY opt_bpf.h ZERO_COPY_SOCKETS opt_zero.h TI_PRIVATE_JUMBOS opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h --- //depot/vendor/freebsd/src/sys/net/bpf.c 2007/02/26 22:26:22 +++ //depot/projects/zcopybpf/src/sys/net/bpf.c 2007/04/07 21:21:11 @@ -65,9 +65,13 @@ #include #include +#include #ifdef BPF_JITTER #include #endif +#ifdef BPF_ZEROCOPY +#include +#endif #include #include @@ -79,7 +83,7 @@ #include -static MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); +MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) @@ -97,19 +101,17 @@ static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; -static void bpf_allocbufs(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_freed(struct bpf_d *); 
-static void bpf_mcopy(const void *, void *, size_t); static int bpf_movein(struct uio *, int, int, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); -static void catchpacket(struct bpf_d *, u_char *, u_int, - u_int, void (*)(const void *, void *, size_t), +static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, + void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); @@ -124,10 +126,10 @@ SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); static int bpf_bufsize = 4096; SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW, - &bpf_bufsize, 0, "Default bpf buffer size"); + &bpf_bufsize, 0, ""); static int bpf_maxbufsize = BPF_MAXBUFSIZE; SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW, - &bpf_maxbufsize, 0, "Maximum bpf buffer size"); + &bpf_maxbufsize, 0, ""); static int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); @@ -144,7 +146,6 @@ static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, - .d_flags = D_NEEDGIANT, .d_open = bpfopen, .d_close = bpfclose, .d_read = bpfread, @@ -158,7 +159,217 @@ static struct filterops bpfread_filtops = { 1, NULL, filt_bpfdetach, filt_bpfread }; +/* + * Wrapper functions for various buffering methods. If the set of buffer + * modes expands, we will probably want to introduce a switch data structure + * similar to protosw, et. 
+ */ +static void +bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_bytes(d, buf, offset, src, len)); + +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); +#endif + + default: + panic("bpf_buf_append_bytes"); + } +} + +static void +bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); + +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); +#endif + + default: + panic("bpf_buf_append_mbuf"); + } +} + +/* + * If the buffer mechanism has a way to decide that a held buffer can be made + * free, then it is exposed via the bpf_buffree() interface. (1) is returned + * if the buffer can be discarded, (0) is returned if it cannot. 
+ */ +static int +bpf_buffree(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_buffree(d)); +#endif + } + return (0); +} + +void +bpf_bufheld(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_bufheld(d); + break; +#endif + } +} + +static void +bpf_free(struct bpf_d *d) +{ + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_free(d)); + +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_free(d)); +#endif + + default: + panic("bpf_buf_free"); + } +} + +static int +bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) +{ + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_uiomove(d, buf, len, uio)); + +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_uiomove(d, buf, len, uio)); +#endif + + default: + panic("bpf_buf_uiomove"); + } +} + +static int +bpf_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_ioctl_sblen(d, i)); +} + +static int +bpf_ioctl_ackzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); +#ifdef BPF_ZEROCOPY + return (bpf_zerocopy_ioctl_ackzbuf(td, d, bz)); +#else + panic("bpf_ioctl_ackzbuf"); +#endif +} + +static int +bpf_ioctl_getzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); +#ifdef BPF_ZEROCOPY + return (bpf_zerocopy_ioctl_getzbuf(td, d, bz)); +#else + panic("bpf_ioctl_getzbuf"); +#endif +} + +static int +bpf_ioctl_getznext(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); +#ifdef BPF_ZEROCOPY + return (bpf_zerocopy_ioctl_getznext(td, d, bz)); +#else + 
panic("bpf_ioctl_getznext"); +#endif +} + +static int +bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); +#ifdef BPF_ZEROCOPY + return (bpf_zerocopy_ioctl_getzmax(td, d, i)); +#else + panic("bpf_ioctl_getzmax"); +#endif +} + static int +bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); +#ifdef BPF_ZEROCOPY + return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); +#else + panic("bpf_ioctl_rotzbuf"); +#endif +} + +static int +bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); +#ifdef BPF_ZEROCOPY + return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); +#else + panic("bpf_ioctl_setzbuf"); +#endif +} + +/* + * General BPF functions. + */ +static int bpf_movein(struct uio *uio, int linktype, int mtu, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) { @@ -397,7 +608,14 @@ "bpf%d", dev2unit(dev)); MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO); dev->si_drv1 = d; - d->bd_bufsize = bpf_bufsize; + + /* + * XXXRW: For historical reasons, perform a one-time initialization + * call to the buffer routines, even though we're not yet committed + * to a particular buffer method. + */ + bpf_buffer_init(d); + d->bd_bufmode = BPF_BUFMODE_DEFAULT; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_pid = td->td_proc->p_pid; @@ -444,18 +662,6 @@ return (0); } - -/* - * Rotate the packet buffers in descriptor d. Move the store buffer - * into the hold slot, and the free buffer into the store slot. - * Zero the length of the new store buffer. 
- */ -#define ROTATE_BUFFERS(d) \ - (d)->bd_hbuf = (d)->bd_sbuf; \ - (d)->bd_hlen = (d)->bd_slen; \ - (d)->bd_sbuf = (d)->bd_fbuf; \ - (d)->bd_slen = 0; \ - (d)->bd_fbuf = NULL; /* * bpfread - read next chunk of packets from buffers */ @@ -474,6 +680,15 @@ return (EINVAL); BPFD_LOCK(d); + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { + /* + * XXXRW: For now, we don't implement a uiomove for the + * scatter-gather buffers associated with BPF_BUFMODE_ZBUF, + * so simply disallow read(). + */ + BPFD_UNLOCK(d); + return (EOPNOTSUPP); + } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); @@ -546,8 +761,12 @@ * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. + * + * XXXRW: More synchronization needed here: what if a second thread + * issues a read on the same fd at the same time? Don't want this + * getting invalidated. */ - error = uiomove(d->bd_hbuf, d->bd_hlen, uio); + error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); d->bd_fbuf = d->bd_hbuf; @@ -558,7 +777,6 @@ return (error); } - /* * If there are processes sleeping on this descriptor, wake them up. 
*/ @@ -594,6 +812,23 @@ } static int +bpf_ready(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + // printf("bpf_ready: hlen: %d, immediate %d, state %d, slen %d\n", + // d->bd_hlen, d->bd_immediate, d->bd_state, d->bd_slen); + + if (!bpf_buffree(d) && d->bd_hlen != 0) + return (1); + if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0) + return (1); + return (0); +} + +static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d = dev->si_drv1; @@ -602,23 +837,32 @@ struct sockaddr dst; int error, hlen; - if (d->bd_bif == NULL) + d->bd_wcount++; + if (d->bd_bif == NULL) { + d->bd_wdcount++; return (ENXIO); + } ifp = d->bd_bif->bif_ifp; - if ((ifp->if_flags & IFF_UP) == 0) + if ((ifp->if_flags & IFF_UP) == 0) { + d->bd_wdcount++; return (ENETDOWN); + } - if (uio->uio_resid == 0) + if (uio->uio_resid == 0) { + d->bd_wdcount++; return (0); + } bzero(&dst, sizeof(dst)); error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu, &m, &dst, &hlen, d->bd_wfilter); - if (error) + if (error) { + d->bd_wdcount++; return (error); - + } + d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; @@ -646,6 +890,8 @@ NET_LOCK_GIANT(); error = (*ifp->if_output)(ifp, m, &dst, NULL); + if (error) + d->bd_wdcount++; NET_UNLOCK_GIANT(); if (mc != NULL) { @@ -679,6 +925,10 @@ d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; + d->bd_wcount = 0; + d->bd_wfcount = 0; + d->bd_wdcount = 0; + d->bd_zcopy = 0; } /* @@ -703,6 +953,11 @@ * BIOCSDIRECTION Set packet direction flag * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. + * BIOCGETZBUF Query current zero-copy buffer locations. + * BIOCSETZBUF Set current zero-copy buffer locations. + * BIOCACKZBUF Acknowledge reading zero-copy buffers. + * BIOCGETZMAX Get maximum zero-copy buffer size. 
+ * BIOCGETZNEXT Get next ready zero-copy buffer location */ /* ARGSUSED */ static int @@ -711,7 +966,7 @@ { struct bpf_d *d = dev->si_drv1; int error = 0; - + /* * Refresh PID associated with this descriptor. */ @@ -740,6 +995,8 @@ case BIOCSRTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: + case BIOCACKZBUF: + case BIOCGETZBUF: break; default: return (EPERM); @@ -775,8 +1032,10 @@ if (d->bd_bif == NULL) error = EINVAL; else { + NET_LOCK_GIANT(); ifp = d->bd_bif->bif_ifp; error = (*ifp->if_ioctl)(ifp, cmd, addr); + NET_UNLOCK_GIANT(); } break; } @@ -792,17 +1051,7 @@ * Set buffer length. */ case BIOCSBLEN: - if (d->bd_bif != NULL) - error = EINVAL; - else { - u_int size = *(u_int *)addr; - - if (size > bpf_maxbufsize) - *(u_int *)addr = size = bpf_maxbufsize; - else if (size < BPF_MINBUFSIZE) - *(u_int *)addr = size = BPF_MINBUFSIZE; - d->bd_bufsize = size; - } + error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* @@ -834,9 +1083,9 @@ break; } if (d->bd_promisc == 0) { - mtx_lock(&Giant); + NET_LOCK_GIANT(); error = ifpromisc(d->bd_bif->bif_ifp, 1); - mtx_unlock(&Giant); + NET_UNLOCK_GIANT(); if (error == 0) d->bd_promisc = 1; } @@ -1039,6 +1288,62 @@ case BIOCGRSIG: *(u_int *)addr = d->bd_sig; break; + + case BIOCGETBUFMODE: + *(u_int *)addr = d->bd_bufmode; + break; + + case BIOCSETBUFMODE: + /* + * Allow the buffering mode to be changed as long as we + * haven't yet committed to a particular mode. Our + * definition of commitment, for now, is whether or not a + * buffer has been allocated or an interface attached, since + * that's the point where things get tricky. + * + * XXXRW: This will need some refinement. Is checking both + * for buffers and interface binding redundant? 
+ */ + switch (*(u_int *)addr) { + case BPF_BUFMODE_BUFFER: + break; + +#ifdef BPF_ZEROCOPY + case BPF_BUFMODE_ZBUF: + break; +#endif + + default: + return (EINVAL); + } + + BPFD_LOCK(d); + if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || + d->bd_fbuf != NULL || d->bd_bif != NULL) { + BPFD_UNLOCK(d); + return (EBUSY); + } + d->bd_bufmode = *(u_int *)addr; + BPFD_UNLOCK(d); + break; + + case BIOCACKZBUF: + return (bpf_ioctl_ackzbuf(td, d, (struct bpf_zbuf *)addr)); + + case BIOCGETZBUF: + return (bpf_ioctl_getzbuf(td, d, (struct bpf_zbuf *)addr)); + + case BIOCGETZMAX: + return (bpf_ioctl_getzmax(td, d, (size_t *)addr)); + + case BIOCGETZNEXT: + return (bpf_ioctl_getznext(td, d, (struct bpf_zbuf *)addr)); + + case BIOCSETZBUF: + return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr)); + + case BIOCROTZBUF: + return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr)); } return (error); } @@ -1139,13 +1444,33 @@ return (ENXIO); bp = theywant->if_bpf; + /* - * Allocate the packet buffers if we need to. - * If we're already attached to requested interface, - * just flush the buffer. + * Behavior here depends on the buffering model. If we're using + * kernel memory buffers, then we can allocate them here. If we're + * using zero-copy, then the user process must have registered + * buffers by the time we get here. If not, return an error. + * + * XXXRW: Could this be better abstracted? + * + * XXXRW: There are locking issues here with multi-threaded use: what + * if two threads try to set the interface at once? 
*/ - if (d->bd_sbuf == NULL) - bpf_allocbufs(d); + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + if (d->bd_sbuf == NULL) + bpf_buffer_alloc(d); + KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL")); + break; + + case BPF_BUFMODE_ZBUF: + if (d->bd_sbuf == NULL) + return (EINVAL); + break; + + default: + panic("bpf_setif: bufmode %d", d->bd_bufmode); + } if (bp != d->bd_bif) { if (d->bd_bif) /* @@ -1289,37 +1614,14 @@ #ifdef MAC if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0) #endif - catchpacket(d, pkt, pktlen, slen, bcopy, &tv); + catchpacket(d, pkt, pktlen, slen, + bpf_append_bytes, &tv); } BPFD_UNLOCK(d); } BPFIF_UNLOCK(bp); } -/* - * Copy data from an mbuf chain into a buffer. This code is derived - * from m_copydata in sys/uipc_mbuf.c. - */ -static void -bpf_mcopy(const void *src_arg, void *dst_arg, size_t len) -{ - const struct mbuf *m; - u_int count; - u_char *dst; - - m = src_arg; - dst = dst_arg; - while (len > 0) { - if (m == NULL) - panic("bpf_mcopy"); - count = min(m->m_len, len); - bcopy(mtod(m, void *), dst, count); - m = m->m_next; - dst += count; - len -= count; - } -} - #define BPF_CHECK_DIRECTION(d, m) \ if (((d)->bd_direction == BPF_D_IN && (m)->m_pkthdr.rcvif == NULL) || \ ((d)->bd_direction == BPF_D_OUT && (m)->m_pkthdr.rcvif != NULL)) @@ -1369,7 +1671,7 @@ if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, - bpf_mcopy, &tv); + bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } @@ -1424,7 +1726,7 @@ if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, - bpf_mcopy, &tv); + bpf_append_mbuf, &tv); } BPFD_UNLOCK(d); } @@ -1437,19 +1739,34 @@ * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while - * bpf_mcopy is passed in to copy mbuf chains. 
In the latter case, + * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, - void (*cpfn)(const void *, void *, size_t), struct timeval *tv) + void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), + struct timeval *tv) { - struct bpf_hdr *hp; + struct bpf_hdr hdr; int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; int do_wakeup = 0; BPFD_LOCK_ASSERT(d); + + /* + * Detect whether user space has released a buffer back to us, and if + * so, move it from being a hold buffer to a free buffer. This may + * not be the best place to do it (for example, we might only want to + * run this check if we need the space), but for now it's a reliable + * spot to do it. + */ + if (bpf_buffree(d)) { + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = NULL; + d->bd_hlen = 0; + } + /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that @@ -1484,65 +1801,52 @@ } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* - * Immediate mode is set, or the read timeout has - * already expired during a select call. A packet - * arrived, so the reader should be woken up. + * Immediate mode is set, or the read timeout has already + * expired during a select call. A packet arrived, so the + * reader should be woken up. */ do_wakeup = 1; /* - * Append the bpf header. + * Append the bpf header. Note we append the actual header size, but + * move forward the length of the header plus padding. */ - hp = (struct bpf_hdr *)(d->bd_sbuf + curlen); - hp->bh_tstamp = *tv; - hp->bh_datalen = pktlen; - hp->bh_hdrlen = hdrlen; + bzero(&hdr, sizeof(hdr)); + hdr.bh_tstamp = *tv; + hdr.bh_datalen = pktlen; + hdr.bh_hdrlen = hdrlen; + hdr.bh_caplen = totlen - hdrlen; + bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); + /* * Copy the packet data into the store buffer and update its length. 
*/ - (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen)); + (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen); d->bd_slen = curlen + totlen; + /* + * XXXCSJP we could probably save a syscall per wakeup if we check the + * d->bd_immediate flag, hold buffer status and rotate the buffers + * before the wakeup. + */ if (do_wakeup) bpf_wakeup(d); } /* - * Initialize all nonzero fields of a descriptor. - */ -static void -bpf_allocbufs(struct bpf_d *d) -{ - - KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL")); - KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL")); - KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL")); - - d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); - d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); - d->bd_slen = 0; - d->bd_hlen = 0; -} - -/* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { + /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ - if (d->bd_sbuf != NULL) { - free(d->bd_sbuf, M_BPF); - if (d->bd_hbuf != NULL) - free(d->bd_hbuf, M_BPF); - if (d->bd_fbuf != NULL) - free(d->bd_fbuf, M_BPF); - } + bpf_free(d); if (d->bd_rfilter) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER @@ -1763,6 +2067,10 @@ strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; + d->bd_wcount = bd->bd_wcount; + d->bd_wdcount = bd->bd_wdcount; + d->bd_wfcount = bd->bd_wfcount; + d->bd_zcopy = bd->bd_zcopy; } static int --- //depot/vendor/freebsd/src/sys/net/bpf.h 2007/02/26 22:26:22 +++ //depot/projects/zcopybpf/src/sys/net/bpf.h 2007/03/04 02:08:48 @@ -92,6 +92,44 @@ #define BPF_MAJOR_VERSION 1 #define BPF_MINOR_VERSION 1 +/* + * Historically, BPF has supported a single buffering model, first using mbuf + * clusters in kernel, and later using malloc(9) buffers in kernel. 
We now + * support multiple buffering modes, which may be queried and set using + * BIOCGETBUFMODE and BIOCSETBUFMODE. So as to avoid handling the complexity + * of changing modes while sniffing packets, the mode becomes fixed once an + * interface has been attached to the BPF descriptor. + */ +#define BPF_BUFMODE_BUFFER 1 /* Kernel buffers with read(). */ +#define BPF_BUFMODE_ZBUF 2 /* Zero-copy buffers. */ + +#define BPF_BUFMODE_DEFAULT BPF_BUFMODE_BUFFER /* Default. */ + +/* + * Struct used by BIOCACKZBUF, BIOCGETZNEXT, BIOCGETZBUF, BIOCSETZBUF: + * describes up to two zero-copy buffers as used by BPF. + * + * BIOCACKZBUF Acknowledge read of stored zero-copy buffer (rotate). + * BIOCGETZBUF Query current zero-copy buffer locations. + * BIOCGETZNEXT Query next stored buffer, if available. + * BIOCSETZBUF Set current zero-copy buffer locations (once only). + * + * Pointers may be set to NULL to indicate a buffer is not configured, should + * be freed, or is not being acknowledged. + */ +struct bpf_zbuf { + void *bz_bufa; /* Location of 'a' zero-copy buffer. */ + void *bz_bufb; /* Location of 'b' zero-copy buffer. */ + size_t bz_buflen; /* Size of zero-copy buffers.
*/ +}; + +/* Packet directions */ +enum bpf_direction { + BPF_D_IN, /* See incoming packets */ + BPF_D_INOUT, /* See incoming and outgoing packets */ + BPF_D_OUT /* See outgoing packets */ +}; + #define BIOCGBLEN _IOR('B',102, u_int) #define BIOCSBLEN _IOWR('B',102, u_int) #define BIOCSETF _IOW('B',103, struct bpf_program) @@ -115,18 +153,19 @@ #define BIOCGDLTLIST _IOWR('B',121, struct bpf_dltlist) #define BIOCLOCK _IO('B', 122) #define BIOCSETWF _IOW('B',123, struct bpf_program) -#define BIOCFEEDBACK _IOW('B',124, u_int) - /* Obsolete */ -#define BIOCGSEESENT BIOCGDIRECTION -#define BIOCSSEESENT BIOCSDIRECTION +#define BIOCGSEESENT BIOCGDIRECTION +#define BIOCSSEESENT BIOCSDIRECTION -/* Packet directions */ -enum bpf_direction { - BPF_D_IN, /* See incoming packets */ - BPF_D_INOUT, /* See incoming and outgoing packets */ - BPF_D_OUT /* See outgoing packets */ -}; +#define BIOCGETBUFMODE _IOR('B', 124, u_int) +#define BIOCSETBUFMODE _IOW('B', 125, u_int) +#define BIOCACKZBUF _IOW('B', 126, struct bpf_zbuf) +#define BIOCGETZBUF _IOR('B', 127, struct bpf_zbuf) +#define BIOCGETZMAX _IOR('B', 128, size_t) +#define BIOCGETZNEXT _IOR('B', 129, struct bpf_zbuf) +#define BIOCROTZBUF _IOR('B', 130, struct bpf_zbuf) +#define BIOCSETZBUF _IOW('B', 131, struct bpf_zbuf) +#define BIOCFEEDBACK _IOW('B', 132, u_int) /* * Structure prepended to each packet. @@ -149,6 +188,21 @@ #endif /* + * When using zero-copy BPF buffers, a shared memory header is present + * allowing the kernel BPF implementation and user process to synchronize + * without using system calls. This structure defines that header. + * + * The layout of this structure is critical, and must not be changed; it must + * fit in a single page on all architectures. + */ +struct bpf_zbuf_header { + volatile u_int bzh_kernel_gen; /* Kernel generation number. */ + volatile u_int bzh_kernel_len; /* Length of buffer. */ + volatile u_int bzh_user_gen; /* User generation number.
*/ + u_int _bzh_pad[5]; +}; + +/* * Data-link level type codes. */ #define DLT_NULL 0 /* BSD loopback encapsulation */ @@ -627,6 +681,29 @@ }; #ifdef _KERNEL + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_BPF); +#endif + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_bpf); +#endif + +/* + * Rotate the packet buffers in descriptor d. Move the store buffer into the + * hold slot, and the free buffer into the store slot. Zero the length of the + * new store buffer. Descriptor lock should be held. + */ +#define ROTATE_BUFFERS(d) do { \ + (d)->bd_hbuf = (d)->bd_sbuf; \ + (d)->bd_hlen = (d)->bd_slen; \ + (d)->bd_sbuf = (d)->bd_fbuf; \ + (d)->bd_slen = 0; \ + (d)->bd_fbuf = NULL; \ + bpf_bufheld(d); \ +} while (0) + /* * Descriptor associated with each attached hardware interface. */ @@ -639,6 +716,7 @@ struct mtx bif_mtx; /* mutex for interface */ }; +void bpf_bufheld(struct bpf_d *d); int bpf_validate(const struct bpf_insn *, int); void bpf_tap(struct bpf_if *, u_char *, u_int); void bpf_mtap(struct bpf_if *, struct mbuf *); --- //depot/vendor/freebsd/src/sys/net/bpf_filter.c 2006/05/28 20:01:17 +++ //depot/projects/zcopybpf/src/sys/net/bpf_filter.c 2007/01/31 06:16:43 @@ -83,14 +83,11 @@ static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err); static u_int32_t -m_xword(m, k, err) - register struct mbuf *m; - register bpf_u_int32 k; - register int *err; +m_xword(struct mbuf *m, bpf_u_int32 k, int *err) { - register size_t len; - register u_char *cp, *np; - register struct mbuf *m0; + size_t len; + u_char *cp, *np; + struct mbuf *m0; len = m->m_len; while (k >= len) { @@ -111,21 +108,18 @@ *err = 0; np = mtod(m0, u_char *); switch (len - k) { - case 1: return ((u_int32_t)cp[0] << 24) | ((u_int32_t)np[0] << 16) | ((u_int32_t)np[1] << 8) | (u_int32_t)np[2]; - case 2: return ((u_int32_t)cp[0] << 24) | ((u_int32_t)cp[1] << 16) | ((u_int32_t)np[0] << 8) | (u_int32_t)np[1]; - default: return ((u_int32_t)cp[0] << 24) | @@ -135,18 +129,15 @@ } bad: *err = 1; - return 0; + return
(0); } static u_int16_t -m_xhalf(m, k, err) - register struct mbuf *m; - register bpf_u_int32 k; - register int *err; +m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) { - register size_t len; - register u_char *cp; - register struct mbuf *m0; + size_t len; + u_char *cp; + struct mbuf *m0; len = m->m_len; while (k >= len) { @@ -159,16 +150,16 @@ cp = mtod(m, u_char *) + k; if (len - k >= 2) { *err = 0; - return EXTRACT_SHORT(cp); + return (EXTRACT_SHORT(cp)); } m0 = m->m_next; if (m0 == 0) goto bad; *err = 0; - return (cp[0] << 8) | mtod(m0, u_char *)[0]; + return ((cp[0] << 8) | mtod(m0, u_char *)[0]); bad: *err = 1; - return 0; + return (0); } #endif @@ -178,21 +169,17 @@ * buflen is the amount of data present */ u_int -bpf_filter(pc, p, wirelen, buflen) - register const struct bpf_insn *pc; - register u_char *p; - u_int wirelen; - register u_int buflen; +bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { - register u_int32_t A = 0, X = 0; - register bpf_u_int32 k; + u_int32_t A = 0, X = 0; + bpf_u_int32 k; int32_t mem[BPF_MEMWORDS]; - if (pc == 0) + if (pc == NULL) /* * No filter means accept all. 
*/ - return (u_int)-1; + return ((u_int)-1); --pc; while (1) { @@ -206,10 +193,10 @@ abort(); #endif case BPF_RET|BPF_K: - return (u_int)pc->k; + return ((u_int)pc->k); case BPF_RET|BPF_A: - return (u_int)A; + return ((u_int)A); case BPF_LD|BPF_W|BPF_ABS: k = pc->k; @@ -224,7 +211,7 @@ return 0; continue; #else - return 0; + return (0); #endif } #ifdef BPF_ALIGN @@ -256,7 +243,7 @@ k = pc->k; if (k >= buflen) { #ifdef _KERNEL - register struct mbuf *m; + struct mbuf *m; if (buflen != 0) return 0; @@ -287,13 +274,13 @@ int merr; if (buflen != 0) - return 0; + return (0); A = m_xword((struct mbuf *)p, k, &merr); if (merr != 0) - return 0; + return (0); continue; #else - return 0; + return (0); #endif } #ifdef BPF_ALIGN @@ -315,10 +302,10 @@ return 0; A = m_xhalf((struct mbuf *)p, k, &merr); if (merr != 0) - return 0; + return (0); continue; #else - return 0; + return (0); #endif } A = EXTRACT_SHORT(&p[k]); @@ -328,7 +315,7 @@ k = X + pc->k; if (pc->k >= buflen || X >= buflen - pc->k) { #ifdef _KERNEL - register struct mbuf *m; + struct mbuf *m; if (buflen != 0) return 0; @@ -337,7 +324,7 @@ A = mtod(m, u_char *)[k]; continue; #else - return 0; + return (0); #endif } A = p[k]; --- //depot/vendor/freebsd/src/sys/net/bpfdesc.h 2007/02/26 22:26:22 +++ //depot/projects/zcopybpf/src/sys/net/bpfdesc.h 2007/03/04 02:08:48 @@ -48,10 +48,11 @@ /* * Descriptor associated with each open bpf file. */ +struct zbuf; struct bpf_d { LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ /* - * Buffer slots: two malloc buffers store the incoming packets. + * Buffer slots: two memory clusters buffer the incoming packets. * The model has three slots. Sbuf is always occupied. * sbuf (store) - Receive interrupt puts packets here. 
* hbuf (hold) - When sbuf is full, put buffer here and @@ -93,6 +94,11 @@ u_long bd_fcount; /* number of packets which matched filter */ pid_t bd_pid; /* PID which created descriptor */ int bd_locked; /* true if descriptor is locked */ + u_int bd_bufmode; /* Current buffer mode. */ + u_long bd_wcount; /* number of packets written */ + u_long bd_wfcount; /* number of packets that matched write filter */ + u_long bd_wdcount; /* number of packets dropped during a write */ + u_long bd_zcopy; /* number of zero copy operations */ }; /* Values for bd_state */ @@ -107,12 +113,6 @@ NET_ASSERT_GIANT(); \ } while (0) -/* Test whether a BPF is ready for read(). */ -#define bpf_ready(bd) \ - ((bd)->bd_hlen != 0 || \ - (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \ - (bd)->bd_slen != 0)) - /* * External representation of the bpf descriptor */ @@ -133,6 +133,10 @@ pid_t bd_pid; char bd_ifname[IFNAMSIZ]; int bd_locked; + u_long bd_wcount; + u_long bd_wfcount; + u_long bd_wdcount; + u_long bd_zcopy; }; #define BPFIF_LOCK(bif) mtx_lock(&(bif)->bif_mtx)