Message ID | 1457702752-24736-1-git-send-email-venkatesh.vivekanandan@linaro.org |
---|---|
State | Superseded |
Headers | show |
On 03/11/16 16:25, venkatesh.vivekanandan@linaro.org wrote: > From: Venkatesh Vivekanandan <venkatesh.vivekanandan@linaro.org> > > example/ipfw files are the ones that are taken as-is from netmap-ipfw. > source can be found @ https://github.com/luigirizzo/netmap-ipfw.git > > Signed-off-by: Venkatesh Vivekanandan <venkatesh.vivekanandan@linaro.org> > --- > v2: remove permanent stub files and generate them while building > > example/ipfw/BSDmakefile | 8 + > example/ipfw/README | 76 + > example/ipfw/UPDATE | 44 + > example/ipfw/extra/expand_number.c | 101 + > example/ipfw/extra/glue.c | 555 +++ > example/ipfw/extra/glue.h | 488 ++ > example/ipfw/extra/humanize_number.c | 167 + > example/ipfw/extra/ipfw2_mod.c | 278 ++ > example/ipfw/extra/linux_defs.h | 144 + > example/ipfw/extra/missing.c | 732 +++ > example/ipfw/extra/missing.h | 801 ++++ > example/ipfw/extra/session.c | 644 +++ > example/ipfw/extra/sys/contrib/pf/net/pfvar.h | 27 + > example/ipfw/extra/sys/sys/kernel.h | 26 + > example/ipfw/extra/sys/sys/malloc.h | 13 + > example/ipfw/extra/sys/sys/mbuf.h | 383 ++ > example/ipfw/extra/sys/sys/module.h | 43 + > example/ipfw/extra/sys/sys/systm.h | 159 + > example/ipfw/extra/sys/sys/taskqueue.h | 51 + > example/ipfw/ipfw/altq.c | 151 + > example/ipfw/ipfw/dummynet.c | 1410 ++++++ > example/ipfw/ipfw/ipfw.8 | 3723 +++++++++++++++ > example/ipfw/ipfw/ipfw2.c | 4968 +++++++++++++++++++++ > example/ipfw/ipfw/ipfw2.h | 352 ++ > example/ipfw/ipfw/ipv6.c | 536 +++ > example/ipfw/ipfw/main.c | 628 +++ > example/ipfw/ipfw/nat.c | 1115 +++++ > example/ipfw/ipfw/tables.c | 2013 +++++++++ > example/ipfw/sys/net/pfil.h | 148 + > example/ipfw/sys/net/radix.c | 1208 +++++ > example/ipfw/sys/net/radix.h | 168 + > example/ipfw/sys/netgraph/ng_ipfw.h | 33 + > example/ipfw/sys/netinet/in_cksum.c | 146 + > example/ipfw/sys/netinet/ip_dummynet.h | 264 ++ > example/ipfw/sys/netinet/ip_fw.h | 1009 +++++ > example/ipfw/sys/netinet/tcp.h | 247 + > example/ipfw/sys/netinet/udp.h | 69 + > 
example/ipfw/sys/netpfil/ipfw/dn_heap.c | 552 +++ > example/ipfw/sys/netpfil/ipfw/dn_heap.h | 192 + > example/ipfw/sys/netpfil/ipfw/dn_sched.h | 192 + > example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c | 120 + > example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c | 229 + > example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c | 864 ++++ > example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c | 307 ++ > example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c | 373 ++ > example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c | 846 ++++ > example/ipfw/sys/netpfil/ipfw/ip_dn_io.c | 960 ++++ > example/ipfw/sys/netpfil/ipfw/ip_dn_private.h | 404 ++ > example/ipfw/sys/netpfil/ipfw/ip_dummynet.c | 2334 ++++++++++ > example/ipfw/sys/netpfil/ipfw/ip_fw2.c | 2905 ++++++++++++ > example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c | 1604 +++++++ > example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c | 537 +++ > example/ipfw/sys/netpfil/ipfw/ip_fw_log.c | 567 +++ > example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c | 587 +++ > example/ipfw/sys/netpfil/ipfw/ip_fw_private.h | 625 +++ > example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c | 3469 ++++++++++++++ > example/ipfw/sys/netpfil/ipfw/ip_fw_table.c | 3674 +++++++++++++++ > example/ipfw/sys/netpfil/ipfw/ip_fw_table.h | 246 + > example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c | 4081 +++++++++++++++++ > example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c | 812 ++++ > example/ipfw/sys/sys/fnv_hash.h | 71 + > example/ipfw/sys/sys/hash.h | 133 + > 62 files changed, 49612 insertions(+) > create mode 100644 example/ipfw/BSDmakefile > create mode 100644 example/ipfw/README > create mode 100644 example/ipfw/UPDATE > create mode 100644 example/ipfw/extra/expand_number.c > create mode 100644 example/ipfw/extra/glue.c > create mode 100644 example/ipfw/extra/glue.h > create mode 100644 example/ipfw/extra/humanize_number.c > create mode 100644 example/ipfw/extra/ipfw2_mod.c > create mode 100644 example/ipfw/extra/linux_defs.h > create mode 100644 example/ipfw/extra/missing.c > create mode 100644 
example/ipfw/extra/missing.h > create mode 100644 example/ipfw/extra/session.c > create mode 100644 example/ipfw/extra/sys/contrib/pf/net/pfvar.h > create mode 100644 example/ipfw/extra/sys/sys/kernel.h > create mode 100644 example/ipfw/extra/sys/sys/malloc.h > create mode 100644 example/ipfw/extra/sys/sys/mbuf.h > create mode 100644 example/ipfw/extra/sys/sys/module.h > create mode 100644 example/ipfw/extra/sys/sys/systm.h > create mode 100644 example/ipfw/extra/sys/sys/taskqueue.h > create mode 100644 example/ipfw/ipfw/altq.c > create mode 100644 example/ipfw/ipfw/dummynet.c > create mode 100644 example/ipfw/ipfw/ipfw.8 > create mode 100644 example/ipfw/ipfw/ipfw2.c > create mode 100644 example/ipfw/ipfw/ipfw2.h > create mode 100644 example/ipfw/ipfw/ipv6.c > create mode 100644 example/ipfw/ipfw/main.c > create mode 100644 example/ipfw/ipfw/nat.c > create mode 100644 example/ipfw/ipfw/tables.c > create mode 100644 example/ipfw/sys/net/pfil.h > create mode 100644 example/ipfw/sys/net/radix.c > create mode 100644 example/ipfw/sys/net/radix.h > create mode 100644 example/ipfw/sys/netgraph/ng_ipfw.h > create mode 100644 example/ipfw/sys/netinet/in_cksum.c > create mode 100644 example/ipfw/sys/netinet/ip_dummynet.h > create mode 100644 example/ipfw/sys/netinet/ip_fw.h > create mode 100644 example/ipfw/sys/netinet/tcp.h > create mode 100644 example/ipfw/sys/netinet/udp.h > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_heap.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_heap.h > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched.h > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c > create mode 100644 
example/ipfw/sys/netpfil/ipfw/ip_dn_io.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dn_private.h > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dummynet.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw2.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_log.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_private.h > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table.h > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c > create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c > create mode 100644 example/ipfw/sys/sys/fnv_hash.h > create mode 100644 example/ipfw/sys/sys/hash.h > > diff --git a/example/ipfw/BSDmakefile b/example/ipfw/BSDmakefile > new file mode 100644 > index 0000000..810ae2b > --- /dev/null > +++ b/example/ipfw/BSDmakefile > @@ -0,0 +1,8 @@ > +# forward to use gmake > +.PHONY: ipfw kipfw > + > +all: > + gmake > + > +$(.TARGETS) : > + gmake MAKE=gmake $(.TARGETS) > diff --git a/example/ipfw/README b/example/ipfw/README > new file mode 100644 > index 0000000..2a55ba0 > --- /dev/null > +++ b/example/ipfw/README > @@ -0,0 +1,76 @@ > +# README FILE FOR IPFW-USER ON TOP OF NETMAP > + This README file should describe ODP and things related to ODP. It could also include some comparison numbers between ODP and the original code. > +This directory contains a version of ipfw and dummynet that can > +run in userland, using NETMAP as the backend for packet I/O. > +This permits a throughput about 10 times higher than the > +corresponding in-kernel version. I have measured about 6.5 Mpps > +for plain filtering, and 2.2 Mpps going through a pipe. 
> +Some optimizations are possible when running on netmap pipes, > +or other netmap ports that support zero copy. > + > +To build the code simply run > + make NETMAP_INC=/some/where/with/netmap-release/sys > + > +pointing to the netmap 'sys' directory > +(the makefile uses gmake underneath) > + > +The base version comes from FreeBSD-HEAD -r '{2012-08-03}' > +(and subsequently updated in late 2013) > +with small modifications listed below > + > + netinet/ipfw > + ip_dn_io.c > + support for on-stack mbufs > + ip_fw2.c > + some conditional compilation for functions not > + available in userspace > + ip_fw_log.c > + revise snprintf, SNPARGS (MAC) > + > + > +sbin/ipfw and the kernel counterpart communicate throuugh a > +TCP socket (localhost:5555) carrying the raw data that would > +normally be carried on seg/getsockopt. > + > +For testing purposes, opening a telnet session to port 5556 and > +typing some bytes will start a fake 'infinite source' so you can > +check how fast your ruleset works. 
> + > + gmake > + dummynet/ipfw & # preferably in another window > + telnet localhost 5556 # type some bytes to start 'traffic' > + > + sh -c "while true; do ipfw/ipfw show; ipfw/ipfw zero; sleep 1; done" > + > +(on an i7-3400 I get about 15 Mpps) > + > +Real packet I/O is possible using netmap info.iet.unipi.it/~luigi/netmap/ > +You can use a couple of VALE switches (part of netmap) to connect > +a source and sink to the userspace firewall, as follows > + > + s f f d > + [pkt-gen]-->--[valeA]-->--[kipfw]-->--[valeB]-->--[pkt-gen] > + > +The commands to run (in separate windows) are > + > + # preliminarly, load the netmap module > + sudo kldload netmap.ko > + > + # connect the firewall to two vale switches > + ./kipfw valeA:f valeB:f & > + > + # configure ipfw/dummynet > + ipfw/ipfw show # or other > + > + # start the sink > + pkt-gen -i valeB:d -f rx > + > + # start an infinite source > + pkt-gen -i valeA:s -f tx > + > + # plain again with the firewall and enjoy > + ipfw/ipfw show # or other OK, that is a good setup. We support VALE and netmap pktio, so we might get about the same numbers. We need some 'make check' script to prove that the example still works. That can be done with pcap pktio, with netmap/VALE, or with loopback. > + > +On my i7-3400 I get about 6.5 Mpps with a single rule, and about 2.2 Mpps > +when going through a dummynet pipe. This is for a single process handling > +the traffic. 
> diff --git a/example/ipfw/UPDATE b/example/ipfw/UPDATE > new file mode 100644 > index 0000000..3da344f > --- /dev/null > +++ b/example/ipfw/UPDATE > @@ -0,0 +1,44 @@ > +--- 20141017 --- updating to FreeBSD head 273155 > + > +sys/net/pfil.h V $FreeBSD$ > +sys/net/radix.h V $FreeBSD$ > +sys/net/radix.c V merge, caddr_t -> u_char * > + > +sys/netgraph/ng_ipfw.h -- unchanged > + > +sys/netinet/in_cksum.c -- unchanged > +sys/netinet/ip_dummynet.h V add DN_IS_ECN > +sys/netinet/ip_fw.h massive changes > +sys/netinet/tcp.h V $FreeBSD$ > +sys/netinet/udp.h V $FreeBSD$ > + > +sys/netpfil/ipfw > +dn_heap.c -- unchanged > +dn_heap.h -- unchanged > +dn_sched.h $FreeBSD$ > +dn_sched_fifo.c -- unchanged > +dn_sched_prio.c -- unchanged > +dn_sched_qfq.c -- unchanged > +dn_sched_rr.c -- unchanged > +dn_sched_wf2q.c -- unchanged > +ip_dn_glue.c V $FreeBSD$ > +ip_dn_io.c V ecn, check missing ifp > +ip_dn_private.h V $FreeBSD$ > +ip_dummynet.c V $FreeBSD$, callout_reset_sbt, fs fixes, module > +ip_fw2.c XXX large > +ip_fw_dynamic.c XXX large > +ip_fw_log.c XXX IP_FW_ARG.. TARG > +ip_fw_pfil.c XXX small change > +ip_fw_private.h XXX large > +ip_fw_sockopt.c XXX huge > +ip_fw_table.c XXX huge > + > +Userspace: > +altq.c $FreeBSD$, bprintf > +dummynet.c $FreeBSD$, ecn, bprintf > +ipfw2.c $FreeBSD$, commands > +ipfw2.h as above > +ipv6.c as above > +main.c small changes > +nat.c internal > + > diff --git a/example/ipfw/extra/expand_number.c b/example/ipfw/extra/expand_number.c > new file mode 100644 > index 0000000..523fbb0 > --- /dev/null > +++ b/example/ipfw/extra/expand_number.c > @@ -0,0 +1,101 @@ > +/*- > + * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org> > + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. 
Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 211343 2010-08-15 18:32:06Z des $"); > + > +#include <sys/types.h> > +#include <ctype.h> > +#include <errno.h> > +#include <inttypes.h> > +//#include <libutil.h> > +#include <stdint.h> > + > +/* > + * Convert an expression of the following forms to a uint64_t. > + * 1) A positive decimal number. > + * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). > + * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10). > + * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20). > + * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30). > + * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40). 
> + * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50). > + * 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60). > + */ > +int > +expand_number(const char *buf, uint64_t *num) > +{ > + uint64_t number; > + unsigned shift; > + char *endptr; > + > + number = strtoumax(buf, &endptr, 0); > + > + if (endptr == buf) { > + /* No valid digits. */ > + errno = EINVAL; > + return (-1); > + } > + > + switch (tolower((unsigned char)*endptr)) { > + case 'e': > + shift = 60; > + break; > + case 'p': > + shift = 50; > + break; > + case 't': > + shift = 40; > + break; > + case 'g': > + shift = 30; > + break; > + case 'm': > + shift = 20; > + break; > + case 'k': > + shift = 10; > + break; > + case 'b': > + case '\0': /* No unit. */ > + *num = number; > + return (0); > + default: > + /* Unrecognized unit. */ > + errno = EINVAL; > + return (-1); > + } > + > + if ((number << shift) >> shift != number) { > + /* Overflow */ > + errno = ERANGE; > + return (-1); > + } > + > + *num = number << shift; > + return (0); > +} > diff --git a/example/ipfw/extra/glue.c b/example/ipfw/extra/glue.c > new file mode 100644 > index 0000000..0786453 > --- /dev/null > +++ b/example/ipfw/extra/glue.c > @@ -0,0 +1,555 @@ > +/* > + * Userland functions missing in linux > + * taken from /usr/src/lib/libc/stdtime/time32.c > + */ > + > +#include <stdlib.h> > +#include <stdio.h> > +#include <sys/types.h> > +#include <sys/socket.h> > +#include <netinet/in.h> /* sockaddr_in */ > +#include <netinet/tcp.h> /* TCP_NODELAY */ > +#include <sys/uio.h> > +#include <unistd.h> /* uint* types */ > +#include <errno.h> > +#include <string.h> /* bzero */ > +#include <arpa/inet.h> /* htonl */ > + > +#ifndef HAVE_NAT > +/* dummy nat functions */ > +void > +ipfw_show_nat(int ac, char **av) > +{ > + D("unsupported"); > +} > + > +void > +ipfw_config_nat(int ac, char **av) > +{ > + D("unsupported"); > +} > +#endif /* HAVE_NAT */ > + > +#ifdef NEED_STRTONUM > +/* missing in linux and 
windows */ > +long long int > +strtonum(const char *nptr, long long minval, long long maxval, > + const char **errstr) > +{ > + long long ret; > + int errno_c = errno; /* save actual errno */ > + > + errno = 0; > +#ifdef TCC > + ret = strtol(nptr, (char **)errstr, 0); > +#else > + ret = strtoll(nptr, (char **)errstr, 0); > +#endif > + /* We accept only a string that represent exactly a number (ie. start > + * and end with a digit). > + * FreeBSD version wants errstr==NULL if no error occurs, otherwise > + * errstr should point to an error string. > + * For our purspose, we implement only the invalid error, ranges > + * error aren't checked > + */ > + if (errno != 0 || nptr == *errstr || **errstr != '\0') > + *errstr = "invalid"; > + else { > + *errstr = NULL; > + errno = errno_c; > + } > + return ret; > +} > + > +int > +ishexnumber(int c) > +{ > + return ((c >= '0' && c <= '9') || > + (c >= 'a' && c <= 'f') || > + (c >= 'A' && c <= 'F') ); > +} > + > +#endif /* NEED_STRTONUM */ > + > +#ifdef __linux__ > + > + > +int optreset; /* missing in linux */ > + > +/* > + * not implemented in linux. 
> + * taken from /usr/src/lib/libc/string/strlcpy.c > + */ > +size_t > +strlcpy(dst, src, siz) > + char *dst; > + const char *src; > + size_t siz; > +{ > + char *d = dst; > + const char *s = src; > + size_t n = siz; > + > + /* Copy as many bytes as will fit */ > + if (n != 0 && --n != 0) { > + do { > + if ((*d++ = *s++) == 0) > + break; > + } while (--n != 0); > + } > + > + /* Not enough room in dst, add NUL and traverse rest of src */ > + if (n == 0) { > + if (siz != 0) > + *d = '\0'; /* NUL-terminate dst */ > + while (*s++) > + ; > + } > + > + return(s - src - 1); /* count does not include NUL */ > +} > + > + > +#endif /* __linux__ */ > + > + > +#if defined (EMULATE_SYSCTL) > +//XXX missing prerequisites > +#include <net/if.h> //openwrt > +#include <netinet/ip.h> //openwrt > +#include <netinet/ip_fw.h> > +#include <netinet/ip_dummynet.h> > +int do_cmd(int optname, void *optval, uintptr_t optlen); > +#endif /* EMULATE_SYSCTL */ > + > +/* > + * set or get system information > + * XXX lock acquisition/serialize calls > + * > + * we export this as sys/module/ipfw_mod/parameters/___ > + * This function get or/and set the value of the sysctl passed by > + * the name parameter. If the old value is not desired, > + * oldp and oldlenp should be set to NULL. > + * > + * XXX > + * I do not know how this works in FreeBSD in the case > + * where there are no write permission on the sysctl var. > + * We read the value and set return variables in any way > + * but returns -1 on write failures, regardless the > + * read success. > + * > + * Since there is no information on types, in the following > + * code we assume a length of 4 is a int. > + * > + * Returns 0 on success, -1 on errors. > + */ > +int > +sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, > + size_t newlen) > +{ > +#if defined (EMULATE_SYSCTL) > + /* > + * we embed the sysctl request in the usual sockopt mechanics. 
> + * the sockopt buffer il filled with a dn_id with IP_DUMMYNET3 > + * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET > + * subcommands. > + * the syntax of this function is fully compatible with > + * POSIX sysctlby name: > + * if newp and newlen are != 0 => this is a set > + * else if oldp and oldlen are != 0 => this is a get > + * to avoid too much overhead in the module, the whole > + * sysctltable is returned, and the parsing is done in userland, > + * a probe request is done to retrieve the size needed to > + * transfer the table, before the real request > + * if both old and new params = 0 => this is a print > + * this is a special request, done only by main() > + * to implement the extension './ipfw sysctl', > + * a command that bypasses the normal getopt, and that > + * is available on those platforms that use this > + * sysctl emulation. > + * in this case, a negative oldlen signals that *oldp > + * is actually a FILE* to print somewhere else than stdout > + */ > + > + int l; > + int ret; > + struct dn_id* oid; > + struct sysctlhead* entry; > + char* pstring; > + char* pdata; > + FILE* fp; > + > + if((oldlenp != NULL) && ((int)*oldlenp < 0)) > + fp = (FILE*)oldp; > + else > + fp = stdout; > + if(newp != NULL && newlen != 0) > + { > + //this is a set > + l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + strlen(name)+1 + newlen; > + oid = malloc(l); > + if (oid == NULL) > + return -1; > + oid->len = l; > + oid->type = DN_SYSCTL_SET; > + oid->id = DN_API_VERSION; > + > + entry = (struct sysctlhead*)(oid+1); > + pdata = (char*)(entry+1); > + pstring = pdata + newlen; > + > + entry->blocklen = ((sizeof(struct sysctlhead) + strlen(name)+1 + newlen) + 3) & ~3; > + entry->namelen = strlen(name)+1; > + entry->flags = 0; > + entry->datalen = newlen; > + > + bcopy(newp, pdata, newlen); > + bcopy(name, pstring, strlen(name)+1); > + > + ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l); > + if (ret != 0) > + return -1; > + } > + else > + { > + //this is a 
get or a print > + l = sizeof(struct dn_id); > + oid = malloc(l); > + if (oid == NULL) > + return -1; > + oid->len = l; > + oid->type = DN_SYSCTL_GET; > + oid->id = DN_API_VERSION; > + > + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); > + if (ret != 0) > + return -1; > + > + l=oid->id; > + free(oid); > + oid = malloc(l); > + if (oid == NULL) > + return -1; > + oid->len = l; > + oid->type = DN_SYSCTL_GET; > + oid->id = DN_API_VERSION; > + > + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); > + if (ret != 0) > + return -1; > + > + entry = (struct sysctlhead*)(oid+1); > + while(entry->blocklen != 0) > + { > + pdata = (char*)(entry+1); > + pstring = pdata+entry->datalen; > + > + //time to check if this is a get or a print > + if(name != NULL && oldp != NULL && *oldlenp > 0) > + { > + //this is a get > + if(strcmp(name,pstring) == 0) > + { > + //match found, sanity chech on len > + if(*oldlenp < entry->datalen) > + { > + printf("%s error: buffer too small\n",__FUNCTION__); > + return -1; > + } > + *oldlenp = entry->datalen; > + bcopy(pdata, oldp, *oldlenp); > + return 0; > + } > + } > + else > + { > + //this is a print > + if( name == NULL ) > + goto print; > + if ( (strncmp(pstring,name,strlen(name)) == 0) && ( pstring[strlen(name)]=='\0' || pstring[strlen(name)]=='.' 
) ) > + goto print; > + else > + goto skip; > +print: > + fprintf(fp, "%s: ",pstring); > + switch( entry->flags >> 2 ) > + { > + case SYSCTLTYPE_LONG: > + fprintf(fp, "%li ", *(long*)(pdata)); > + break; > + case SYSCTLTYPE_UINT: > + fprintf(fp, "%u ", *(unsigned int*)(pdata)); > + break; > + case SYSCTLTYPE_ULONG: > + fprintf(fp, "%lu ", *(unsigned long*)(pdata)); > + break; > + case SYSCTLTYPE_INT: > + default: > + fprintf(fp, "%i ", *(int*)(pdata)); > + } > + if( (entry->flags & 0x00000003) == CTLFLAG_RD ) > + fprintf(fp, "\t(read only)\n"); > + else > + fprintf(fp, "\n"); > +skip: ; > + } > + entry = (struct sysctlhead*)((unsigned char*)entry + entry->blocklen); > + } > + free(oid); > + return 0; > + } > + //fallback for invalid options > + return -1; > + > +#else /* __linux__ */ > + FILE *fp; > + char *basename = "/sys/module/ipfw_mod/parameters/"; > + char filename[256]; /* full filename */ > + char *varp; > + int ret = 0; /* return value */ > + int d; > + > + if (name == NULL) /* XXX set errno */ > + return -1; > + > + /* locate the filename */ > + varp = strrchr(name, '.'); > + if (varp == NULL) /* XXX set errno */ > + return -1; > + > + snprintf(filename, sizeof(filename), "%s%s", basename, varp+1); > + > + /* > + * XXX we could open the file here, in rw mode > + * but need to check if a file have write > + * permissions. 
> + */ > + > + /* check parameters */ > + if (oldp && oldlenp) { /* read mode */ > + fp = fopen(filename, "r"); > + if (fp == NULL) { > + fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename); > + return -1; > + } > + if (*oldlenp == 4) { > + if (fscanf(fp, "%d", &d) == 1) > + memcpy(oldp, &d, *oldlenp); > + else > + ret = -1; > + } > + fclose(fp); > + } > + > + if (newp && newlen) { /* write */ > + fp = fopen(filename, "w"); > + if (fp == NULL) { > + fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename); > + return -1; > + } > + if (newlen == 4) { > + if (fprintf(fp, "%d", *(int*)newp) < 1) > + ret = -1; > + } > + > + fclose(fp); > + } > + > + return ret; > +#endif /* __linux__ */ > +} > + > +/* > + * The following two functions implement getsockopt/setsockopt > + * replacements to talk over a TCP socket. > + * Because the calls are synchronous, we can run blocking code > + * and do not need to play special tricks to be selectable. > + * The wire protocol for the emulation is the following: > + * REQUEST: n32 req_size, level, optname; u8 data[req_size] > + * RESPONSE: n32 resp_size, ret_code; u8 data[resp_size] > + * data is only present if ret_code == 0 > + * > + * Return 0 if the message wan sent to the remote > + * endpoint, -1 on error. > + * > + * If the required lenght is greater then the > + * available buffer size, -1 is returned and > + * optlen is the required lenght. 
> + */ > +enum sock_type {GET_SOCKOPT, SET_SOCKOPT}; > + > +struct wire_hdr { > + uint32_t optlen; /* actual data len */ > + uint32_t level; /* or error */ > + uint32_t optname; /* or act len */ > + uint32_t dir; /* in or out */ > +}; > + > +/* do a complete write of the buffer */ > +static int > +writen(int fd, const char *buf, int len) > +{ > + int i; > + > + for (; len > 0; buf += i, len -= i) { > + i = write(fd, buf, len); > + ND("have %d wrote %d", len, i); > + if (i < 0) { > + if (errno == EAGAIN) > + continue; > + return -1; > + } > + } > + return 0; > +} > + > +/* do a complete read */ > +static int > +readn(int fd, char *buf, int len) > +{ > + int i, pos; > + > + for (pos = 0; pos < len; pos += i) { > + i = read(fd, buf + pos, len - pos); > + ND("have %d want %d got %d", pos, len, i); > + if (i < 0) { > + if (errno == EAGAIN) > + continue; > + return -1; > + } > + } > + ND("full read got %d", pos); > + return 0; > +} > + > +int > +__sockopt2(int s, int level, int optname, void *optval, socklen_t *optlen, > + enum sopt_dir dir) > +{ > + struct wire_hdr r; > + int len = optlen && optval ? 
*optlen : 0; > + int new_errno; > + > + ND("dir %d optlen %d level %d optname %d", dir, len, level, optname); > + /* send request to the server */ > + r.optlen = htonl(len); > + r.level = htonl(level); > + r.optname = htonl(optname); > + r.dir = htonl(dir); > + > + if (writen(s, (const char *) &r, sizeof(r))) > + return -1; /* error writing */ > + > + /* send data, if present */ > + if (len < 0) { > + fprintf(stderr, "%s invalid args found\n", __FUNCTION__); > + return -1; > + } else if (len > 0) { > + if (writen(s, optval, len)) > + return -1; /* error writing */ > + } > + > + /* read response size and error code */ > + if (readn(s, (char *)&r, sizeof(r))) > + return -1; /* error reading */ > + len = ntohl(r.optlen); > + ND("got header, datalen %d", len); > + if (len > 0) { > + if (readn(s, optval, len)) { > + return -1; /* error reading */ > + } > + } > + if (optlen) > + *optlen = ntohl(r.optlen); /* actual len */ > + new_errno = ntohl(r.level); > + if (new_errno) > + errno = new_errno; > + return (new_errno ? -1 : 0); > +} > + > +/* > + * getsockopt() replacement. > + */ > +int > +getsockopt2(int s, int level, int optname, void *optval, > + socklen_t *optlen) > +{ > + return __sockopt2(s, level, optname, optval, optlen, SOPT_GET); > +} > + > +/* > + * setsockopt() replacement > + */ > +int > +setsockopt2(int s, int level, int optname, void *optval, > + socklen_t optlen) > +{ > + /* optlen not changed, use the local address */ > + return __sockopt2(s, level, optname, optval, &optlen, SOPT_SET); > +} > + > +#ifdef socket > +#undef socket /* we want the real one */ > +#endif > +/* > + * This function replaces the socket() call to connect to > + * the ipfw control socket. > + * We actually ignore the paramerers if IPFW_HOST and IPFW_PORT > + * are defined. 
> + */ > +int > +do_connect(const char *addr, int port) > +{ > + int conn_fd; > + > + /* open the socket */ > +#ifdef NETLINK > + > +struct rtnl_handle rth; > + > + conn_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); > +#else > + struct sockaddr_in server; /* server address */ > + const char *s; > + > + conn_fd = socket(AF_INET, SOCK_STREAM, 0); > + if (conn_fd < 0) { > + perror("socket"); > + return -1; > + } > +#endif > +#ifndef NETLINK > + /* fill the sockaddr structure with server address */ > + bzero(&server, sizeof(server)); > + server.sin_family = AF_INET; > + > + /* override the host if set in the environment */ > + s = getenv("IPFW_HOST"); > + if (s) > + addr = s; > + inet_aton(addr, &server.sin_addr); > + s = getenv("IPFW_PORT"); > + if (s && atoi(s) > 0) > + port = atoi(s); > + server.sin_port = htons(port); > + > + /* connect to the server */ > + if (connect(conn_fd, (struct sockaddr*) &server, sizeof(server)) < 0) { > + perror("connect"); > + return -1; > + } > +#ifdef setsockopt /* we want the real one here */ > +#undef setsockopt > +#undef getsockopt > +#endif > + { > + int on = 1, ret; > + ret = setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); > + ND("set TCP_NODELAY %d returns %d", on, ret); > + } > + if (0) > + fprintf(stderr, "connected to %s:%d\n", > + inet_ntoa(server.sin_addr), ntohs(server.sin_port)); > +#endif > + return conn_fd; > +} > diff --git a/example/ipfw/extra/glue.h b/example/ipfw/extra/glue.h > new file mode 100644 > index 0000000..97b25bf > --- /dev/null > +++ b/example/ipfw/extra/glue.h > @@ -0,0 +1,488 @@ > +/* > + * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > +/* > + * $Id: glue.h 8327 2011-03-22 17:01:35Z marta $ > + * > + * glue code to adapt the FreeBSD version to linux and windows, > + * userland and kernel. > + * This is included before any other headers, so we do not have > + * a chance to override any #define that should appear in other > + * headers. > + * First handle headers for userland and kernel. Then common code > + * (including headers that require a specific order of inclusion), > + * then the user- and kernel- specific parts. 
> + */ > + > +#ifndef _GLUE_H > +#define _GLUE_H > + > +/* > + * common definitions to allow portability > + */ > +#ifndef __FBSDID > +#define __FBSDID(x) struct __hack > +#endif /* FBSDID */ > + > +#include <stdint.h> /* linux needs it in addition to sys/types.h */ > +#include <sys/types.h> /* for size_t */ > + > +#define true 1 /* stdbool */ > +#ifdef _KERNEL /* prevent a warning */ > +#undef _KERNEL > +#include <sys/ioctl.h> > +#include <sys/time.h> > +#include <errno.h> /* we want errno */ > +#define _KERNEL > +#else > +#include <sys/ioctl.h> > +#endif > + > +#include <time.h> > +#ifndef USERSPACE > +#include <netinet/ether.h> > +#endif > + > + > +/*----- */ > + > +/* ipfw2.c - from timeconv.h */ > +static __inline time_t > +_long_to_time(long tlong) > +{ > + if (sizeof(long) == sizeof(__int32_t)) > + return((time_t)(__int32_t)(tlong)); > + return((time_t)tlong); > +} > + > +#define min(a, b) ((a) < (b) ? (a) : (b) ) // radix.c > +/* > + * debugging macros from ip_dn_private.h > + */ > +#include <sys/time.h> > +#include <stdio.h> > +extern char *strrchr(const char *, int); > +static inline const char *xyz(const char *s) { > + static char buf[128]; > + struct timeval t; > + const char *ret = strrchr(s, '/'); > + if (ret) s = ret + 1; > + gettimeofday(&t, NULL); > + buf[sizeof(buf) - 1] = '\0'; > + snprintf(buf, sizeof(buf), "[%4d.%06d] %s", > + (int)(t.tv_sec % 1000), (int)(t.tv_usec), s); > + return buf; > +} > + > +#define ND(fmt, ...) do {} while (0) > +#define D1(fmt, ...) do {} while (0) > +#define D(fmt, ...) fprintf(stderr, "%s:%-10s [%d] " fmt "\n", \ > + xyz(__FILE__), __FUNCTION__, __LINE__, ## __VA_ARGS__) > + > +/* Rate limited version of "D", lps indicates how many per second */ > +#define RD(lps, format, ...) 
\ > + do { \ > + static int t0, __cnt; \ > + struct timeval __xxts; \ > + gettimeofday(&__xxts, NULL); \ > + if (t0 != __xxts.tv_sec) { \ > + t0 = __xxts.tv_sec; \ > + __cnt = 0; \ > + } \ > + if (__cnt++ < lps) { \ > + D(format, ##__VA_ARGS__); \ > + } \ > + } while (0) > + > +#define DX(lev, fmt, ...) do { \ > + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) > +/* end debugging macros */ > + > + > +/* > + * sbin/ipfw on non-freebsd platform > + */ > +#ifdef NEED_STRTONUM > +/* prototypes from libutil */ > +/* humanize_number(3) */ > +#define HN_DECIMAL 0x01 > +#define HN_NOSPACE 0x02 > +#define HN_B 0x04 > +#define HN_DIVISOR_1000 0x08 > +#define HN_IEC_PREFIXES 0x10 > + > +#define HN_GETSCALE 0x10 > +#define HN_AUTOSCALE 0x20 > + > + > +int humanize_number(char *_buf, size_t _len, int64_t _number, > + const char *_suffix, int _scale, int _flags); > +int expand_number(const char *buf, uint64_t *num); > + > + > +long long > +strtonum(const char *nptr, long long minval, long long maxval, > + const char **errstr); > +#ifndef __APPLE__ > +int ishexnumber(int c); > +#endif > +#endif /* NEED_STRTONUM */ > + > +#ifdef NEED_SYSCTLBYNAME /* and other linux calls */ > +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, > + void *newp, size_t newlen); > +#define setprogname(x) /* not present in linux */ > + > +extern int optreset; /* not present in linux */ > + > +long long int strtonum(const char *nptr, long long minval, > + long long maxval, const char **errstr); > + > + > +struct ether_addr; > +struct ether_addr * ether_aton(const char *a); > + > +#define ICMP6_MAXTYPE 201 > +#define __u6_addr in6_u > +#define in6_u __in6_u /* missing type for ipv6 (linux 2.6.28) */ > + > + > +#define __u6_addr32 u6_addr32 > +/* on freebsd sys/socket.h pf specific */ > +#define NET_RT_IFLIST 3 /* survey interface list */ > + > +#define RTM_VERSION 5 /* Up the ante and ignore older versions */ > + > +#endif // NEED_SYSCTLBYNAME > + > +#ifdef NEED_SIN_LEN > +/* > 
+ * linux at least does not have sin_len and sin6_len, so we remap > + * to some safe fields (check use of sin6_flowinfo XXX) > + */ > +#define sin_len sin_zero[0] > +#define sin6_len sin6_flowinfo > +#endif /* NEED_SIN_LEN */ > + > +#ifdef NEED_ROUNDUP2 /* in freensd is in sys/param.h */ > +/* round up to the next power of 2 (y) */ > +#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ > +#endif // NEED_ROUNDUP2 > + > +/* possibly redundant, does not harm */ > +size_t strlcpy(char * dst, const char * src, size_t siz); > + > +/* > + * Part 2: common userland and kernel definitions > + */ > + > +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ > +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ > +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ > +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ > + > +/* > + * linux: sysctl are mapped into /sys/module/ipfw_mod parameters > + * windows: they are emulated via get/setsockopt > + */ > +#define CTLFLAG_RD 1 > +#define CTLFLAG_RDTUN 1 > +#define CTLFLAG_RW 2 > +#define CTLFLAG_SECURE3 0 /* unsupported */ > +#define CTLFLAG_VNET 0 /* unsupported */ > + > +/* if needed, queue.h must be included here after list.h */ > + > +/* > + * our own struct thread > + */ > +struct thread { /* ip_fw_sockopt */ > + void *sopt_td; > + void *td_ucred; > +}; > + > +enum sopt_dir { SOPT_GET, SOPT_SET }; > + > +struct sockopt { > + enum sopt_dir sopt_dir; /* is this a get or a set? */ > + int sopt_level; /* second arg of [gs]etsockopt */ > + int sopt_name; /* third arg of [gs]etsockopt */ > + void *sopt_val; /* fourth arg of [gs]etsockopt */ > + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ > + struct thread *sopt_td; /* calling thread or null if kernel */ > +}; > + > + > +/* > + * List of values used for set/getsockopt options. > + * The base value on FreeBSD is defined as a macro, > + * if not available we will use our own enum. 
> + * The TABLE_BASE value is used in the kernel. > + */ > +#define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ > +#define IP_FW_TABLE_ADD (_IPFW_SOCKOPT_BASE + 0) > +#define IP_FW_TABLE_DEL (_IPFW_SOCKOPT_BASE + 1) > +#define IP_FW_TABLE_FLUSH (_IPFW_SOCKOPT_BASE + 2) > +#define IP_FW_TABLE_GETSIZE (_IPFW_SOCKOPT_BASE + 3) > +#define IP_FW_TABLE_LIST (_IPFW_SOCKOPT_BASE + 4) > +#define IP_FW_DYN_GET (_IPFW_SOCKOPT_BASE + 5) > + > +#define IP_FW3 (_IPFW_SOCKOPT_BASE + 8) > +#define IP_DUMMYNET3 (_IPFW_SOCKOPT_BASE + 9) > + > +#define IP_FW_ADD (_IPFW_SOCKOPT_BASE + 10) > +#define IP_FW_DEL (_IPFW_SOCKOPT_BASE + 11) > +#define IP_FW_FLUSH (_IPFW_SOCKOPT_BASE + 12) > +#define IP_FW_ZERO (_IPFW_SOCKOPT_BASE + 13) > +#define IP_FW_GET (_IPFW_SOCKOPT_BASE + 14) > +#define IP_FW_RESETLOG (_IPFW_SOCKOPT_BASE + 15) > + > +#define IP_FW_NAT_CFG (_IPFW_SOCKOPT_BASE + 16) > +#define IP_FW_NAT_DEL (_IPFW_SOCKOPT_BASE + 17) > +#define IP_FW_NAT_GET_CONFIG (_IPFW_SOCKOPT_BASE + 18) > +#define IP_FW_NAT_GET_LOG (_IPFW_SOCKOPT_BASE + 19) > + > +#define IP_DUMMYNET_CONFIGURE (_IPFW_SOCKOPT_BASE + 20) > +#define IP_DUMMYNET_DEL (_IPFW_SOCKOPT_BASE + 21) > +#define IP_DUMMYNET_FLUSH (_IPFW_SOCKOPT_BASE + 22) > + /* 63 is missing */ > +#define IP_DUMMYNET_GET (_IPFW_SOCKOPT_BASE + 24) > +#define _IPFW_SOCKOPT_END (_IPFW_SOCKOPT_BASE + 25) > + > +/* > + * Part 3: userland stuff for linux/windows > + */ > + > + > +/* > + * now remap functions for userland or linux kernel etc. > + */ > +#ifdef USERSPACE > +/* > + * definitions used when the programs communicate through userspace. > + * We need to define the socket and addresses used to talk, and > + * the userland side must also remap socket() and [gs]etsockopt() > + * to appropriate wrappers. 
> + */ > + > +#define LOCALADDR "127.0.0.1" > +#define IPFW_PORT 5555 > + > +#ifndef KERNEL_SIDE > +#ifdef _KERNEL > +#error _KERNEL defined in user space > +#endif > +int do_connect(const char *addr, int port); > +#include <sys/socket.h> /* for socklen_t */ > + > +#define socket(a, b, c) do_connect(LOCALADDR, IPFW_PORT) > +#define setsockopt setsockopt2 > +#define getsockopt getsockopt2 > +int getsockopt2(int s, int lev, int optname, void *optval, socklen_t *optlen); > +int setsockopt2(int s, int lev, int optname, void *optval, socklen_t optlen); > +#endif /* KERNEL_SIDE */ > + > +#endif /* USERSPACE */ > + > +/* > + * Part 5: windows specific stuff and sysctl emulation > + */ > + > +/******************* > +* SYSCTL emulation * > +********************/ > +#ifdef EMULATE_SYSCTL > + > +/* this needs to be here, as it is part of the user-kernel messages */ > +/* flag is set with the last 2 bits for access, as defined in glue.h > + * and the rest for type > + */ > +enum { > + SYSCTLTYPE_INT = 0, > + SYSCTLTYPE_UINT = 1, > + SYSCTLTYPE_SHORT = 2, > + SYSCTLTYPE_USHORT = 3, > + SYSCTLTYPE_LONG = 4, > + SYSCTLTYPE_ULONG = 5, > + SYSCTLTYPE_STRING = 6, > + > + /* the following are SYSCTL_PROC equivalents of the above, > + * where the SYSCTLTYPE is shifted 2 bits, > + * and SYSCTLTYPE_PROC is set > + */ > + SYSCTLTYPE_PROC = 0x100, > + CTLTYPE_INT = (0x100 | (0<<2)), > + CTLTYPE_UINT = (0x100 | (1<<2)), > + CTLTYPE_LONG = (0x100 | (4<<2)), > + CTLTYPE_ULONG = (0x100 | (5<<2)) > +}; > + > +struct sysctlhead { > + uint32_t blocklen; //total size of the entry > + uint32_t namelen; //strlen(name) + '\0' > + uint32_t flags; //type and access > + uint32_t datalen; > +}; > + > + > +#endif /* EMULATE_SYSCTL */ > +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, > + size_t newlen); > + > +#ifndef __FreeBSD__ > +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) > +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) > +#define __clear_bit(ix, pData) 
(*pData) &= ~(1<<(ix)) > + > +static inline int fls(int _n) > +{ > + unsigned int n = _n; > + int i = 0; > + for (i = 0; n > 0; n >>= 1, i++) > + ; > + return i; > +} > + > +static inline unsigned long __fls(unsigned long word) > +{ > + return fls(word) - 1; > +} > + > + > +#endif /* !FreeBSD */ > + > +#ifdef KERNEL_SIDE > +/* sys/counter.h , to be moved to a file */ > +typedef uint64_t *counter_u64_t; // XXX kernel > +static inline void counter_u64_add(counter_u64_t c, int64_t v) > +{ > + *c += v; > +} > +static inline void counter_u64_zero(counter_u64_t c) > +{ > + *c = 0; > +} > +static inline uint64_t counter_u64_fetch(counter_u64_t c) > +{ > + return *c; > +} > + > +struct rm_priotracker { > +}; > + > +#define vslock(_a, _b) (0) > +#define vsunlock(_a, _b) > + > +typedef uint64_t u_register_t; // XXX not on osx ? > + > +typedef uintptr_t eventhandler_tag; > +#define EVENTHANDLER_REGISTER(_a, _b, ...) (uintptr_t)_b; > +#define EVENTHANDLER_DEREGISTER(_a, _b, ...) (void)_b; > + > +// XXX this needs to be completed > +#define if_name(_ifp) (_ifp->if_xname) > +#define ifunit_ref(_n) NULL // XXX > +#define if_rele(_n) > + > +#define rtalloc1_fib(_a, ...) NULL > +#define rt_key(_a) NULL > +#define rt_mask(_a) NULL > +#define RTFREE_LOCKED(_a) ((void)NULL) > +struct rtentry { > +}; > +#define rt_tables_get_rnh(_a, _b) NULL > + > +#endif /* KERNEL_SIDE */ > + > +#ifdef _KERNEL > +/* XXX kernel support */ > +/* on freebsd net/if.h XXX used */ > +#ifdef linux > +#define div64(a,b) (((int64_t)a)/((int64_t)b)) > +#define LINUX_VERSION_CODE 30003 > +#define KERNEL_VERSION(a,b,c) (a*10000+b*100 + c) > +#define __printflike(a,b) > +#endif /* linux */ > + > +#endif /* _KERNEL */ > + > +#ifndef __FreeBSD__ > +#ifndef IFNAMSIZ > +#define IFNAMSIZ 16 > +#endif > +#include "missing.h" > + > +struct if_data { > + /* ... 
*/ > + u_long ifi_mtu; /* maximum transmission unit */ > +}; > + > +#endif > + > +#ifdef __APPLE__ > +#include <sys/socketvar.h> // need in kernel > + > +/* needed both in kernel and userspace */ > +struct if_data64 { // XXX Darwin version > + /* ... */ > + u_long ifi_mtu; /* maximum transmission unit */ > +}; > + > +struct net_event_data { > +}; > + > +struct in_addr; > +#endif /* __APPLE__ */ > + > +#define __PAST_END(v, idx) v[idx] > + > +/* > + * a fast copy routine > + */ > +#include <strings.h> > +// XXX only for multiples of 64 bytes, non overlapped. > +static inline void > +_pkt_copy(const void *_src, void *_dst, int l) > +{ > + const uint64_t *src = _src; > + uint64_t *dst = _dst; > +#define likely(x) __builtin_expect(!!(x), 1) > +#define unlikely(x) __builtin_expect(!!(x), 0) > + if (unlikely(l >= 1024)) { > + bcopy(src, dst, l); > + return; > + } > + for (; l > 0; l-=64) { > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + } > +} > + > +#endif /* !_GLUE_H */ > diff --git a/example/ipfw/extra/humanize_number.c b/example/ipfw/extra/humanize_number.c > new file mode 100644 > index 0000000..0b7382f > --- /dev/null > +++ b/example/ipfw/extra/humanize_number.c > @@ -0,0 +1,167 @@ > +/* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $ */ > + > +/* > + * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. > + * All rights reserved. > + * > + * This code is derived from software contributed to The NetBSD Foundation > + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, > + * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. 
/*	$NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $	*/

/*
 * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>

/*
 * __FBSDID and the HN_* constants are not defined by any header included
 * below.  The fallbacks mirror the values in glue.h so the file also
 * builds when that header has not been pulled in first.
 */
#ifndef __FBSDID
#define __FBSDID(x) struct __hack
#endif
#ifndef HN_DECIMAL
#define HN_DECIMAL		0x01
#define HN_NOSPACE		0x02
#define HN_B			0x04
#define HN_DIVISOR_1000		0x08
#define HN_IEC_PREFIXES		0x10

#define HN_GETSCALE		0x10
#define HN_AUTOSCALE		0x20
#endif

__FBSDID("$FreeBSD: head/lib/libutil/humanize_number.c 220582 2011-04-12 22:48:03Z delphij $");

#include <sys/types.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <locale.h>
//#include <libutil.h>

static const int maxscale = 7;

/*
 * Format `quotient' into buf (at most len bytes including the NUL) as a
 * short number followed by an optional separator, a scale prefix and
 * `suffix'.
 *
 * scale: a number of divisions to apply (0 .. maxscale-1), or
 *        HN_AUTOSCALE to divide until the value fits in the buffer, or
 *        HN_GETSCALE to only return the scale that autoscaling would pick.
 * flags: HN_DECIMAL, HN_NOSPACE, HN_B, HN_DIVISOR_1000, HN_IEC_PREFIXES.
 *
 * Returns what snprintf() returned, the chosen scale for HN_GETSCALE,
 * or -1 on invalid arguments or a buffer that is too small.
 */
int
humanize_number(char *buf, size_t len, int64_t quotient,
    const char *suffix, int scale, int flags)
{
	const char *prefixes, *sep;
	int	i, r, remainder, s1, s2, sign;
	int	unbounded;	/* 10^columns exceeds int64_t: nothing to scale */
	int64_t	divisor, max;
	size_t	baselen;

	assert(buf != NULL);
	assert(suffix != NULL);
	assert(scale >= 0);
	assert(scale < maxscale || (((scale & (HN_AUTOSCALE | HN_GETSCALE)) != 0)));
	assert(!((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES)));

	remainder = 0;

	/*
	 * Every prefix table is indexed in steps of 3 bytes (SCALE2PREFIX),
	 * so entry 0 must occupy exactly three bytes in all four variants.
	 */
	if (flags & HN_IEC_PREFIXES) {
		baselen = 2;
		/*
		 * Use the prefixes for power of two recommended by
		 * the International Electrotechnical Commission
		 * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...).
		 *
		 * HN_IEC_PREFIXES implies a divisor of 1024 here
		 * (use of HN_DIVISOR_1000 would have triggered
		 * an assertion earlier).
		 */
		divisor = 1024;
		if (flags & HN_B)
			prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei";
		else	/* three NULs: with only two, every prefix read as "i" */
			prefixes = "\0\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei";
	} else {
		baselen = 1;
		if (flags & HN_DIVISOR_1000)
			divisor = 1000;
		else
			divisor = 1024;

		if (flags & HN_B)
			prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E";
		else
			prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E";
	}

#define	SCALE2PREFIX(scale)	(&prefixes[(scale) * 3])

	/* same checks as the asserts above, for builds with NDEBUG */
	if (scale < 0 || (scale >= maxscale &&
	    (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0))
		return (-1);

	if (buf == NULL || suffix == NULL)
		return (-1);

	if (len > 0)
		buf[0] = '\0';
	if (quotient < 0) {
		sign = -1;
		quotient = -quotient;
		baselen += 2;		/* sign, digit */
	} else {
		sign = 1;
		baselen += 1;		/* digit */
	}
	if (flags & HN_NOSPACE)
		sep = "";
	else {
		sep = " ";
		baselen++;
	}
	baselen += strlen(suffix);

	/* Check if enough room for `x y' + suffix + `\0' */
	if (len < baselen + 1)
		return (-1);

	if (scale & (HN_AUTOSCALE | HN_GETSCALE)) {
		/*
		 * See if there are additional columns that can be used:
		 * max = 10^(spare columns).  Stop before the multiplication
		 * would overflow int64_t; from that point on every
		 * representable value fits without scaling.
		 */
		unbounded = 0;
		for (max = 1, i = (int)(len - baselen); i-- > 0;) {
			if (max > INT64_MAX / 10) {
				unbounded = 1;
				break;
			}
			max *= 10;
		}

		/*
		 * Divide the number until it fits the given column.
		 * If there will be an overflow by the rounding below,
		 * divide once more.
		 */
		for (i = 0; !unbounded &&
		    (quotient >= max || (quotient == max - 1 && remainder >= 950)) &&
		    i < maxscale; i++) {
			remainder = quotient % divisor;
			quotient /= divisor;
		}

		if (scale & HN_GETSCALE)
			return (i);
	} else {
		for (i = 0; i < scale && i < maxscale; i++) {
			remainder = quotient % divisor;
			quotient /= divisor;
		}
	}

	/* If a value <= 9.9 after rounding and ... */
	if (quotient <= 9 && remainder < 950 && i > 0 && flags & HN_DECIMAL) {
		/* baselen + \0 + .N */
		if (len < baselen + 1 + 2)
			return (-1);
		s1 = (int)quotient + ((remainder + 50) / 1000);
		s2 = ((remainder + 50) / 100) % 10;
		r = snprintf(buf, len, "%d%s%d%s%s%s",
		    sign * s1, localeconv()->decimal_point, s2,
		    sep, SCALE2PREFIX(i), suffix);
	} else
		r = snprintf(buf, len, "%" PRId64 "%s%s%s",
		    sign * (quotient + (remainder + 50) / 1000),
		    sep, SCALE2PREFIX(i), suffix);

	return (r);
}
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * $Id: ipfw2_mod.c 7787 2010-11-19 21:15:50Z marta $ > + * > + * The main interface to build ipfw+dummynet as a linux module. > + * (and possibly as a windows module as well, though that part > + * is not complete yet). > + * > + * The control interface uses the sockopt mechanism > + * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). > + * > + * The data interface uses the netfilter interface, at the moment > + * hooked to the PRE_ROUTING and POST_ROUTING hooks. > + * Unfortunately the netfilter interface is a moving target, > + * so we need a set of macros to adapt to the various cases. > + * > + * In the netfilter hook we just mark packet as 'QUEUE' and then > + * let the queue handler to do the whole work (filtering and > + * possibly emulation). > + * As we receive packets, we wrap them with an mbuf descriptor > + * so the existing ipfw+dummynet code runs unmodified. > + */ > + > +#include <sys/cdefs.h> > +#include <sys/mbuf.h> /* sizeof struct mbuf */ > +#include <sys/param.h> /* NGROUPS */ > +#include <netinet/in.h> /* in_addr */ > +#include <netinet/ip_fw.h> /* ip_fw_ctl_t, ip_fw_chk_t */ > +#include <netinet/ip_dummynet.h> /* ip_dn_ctl_t, ip_dn_io_t */ > +#include <net/pfil.h> /* PFIL_IN, PFIL_OUT */ > +#include <net/route.h> /* inet_iif */ > + > +#include <netpfil/ipfw/ip_fw_private.h> /* ip_fw_ctl_t, ip_fw_chk_t */ > + > +/* > + * Here we allocate some global variables used in the firewall. 
> + */ > +//ip_dn_ctl_t *ip_dn_ctl_ptr; > +int (*ip_dn_ctl_ptr)(struct sockopt *); > + > +ip_fw_ctl_t *ip_fw_ctl_ptr; > + > +int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); > + > +void (*bridge_dn_p)(struct mbuf *, struct ifnet *); > + > +/* Divert hooks. */ > +void (*ip_divert_ptr)(struct mbuf *m, int incoming); > + > +/* ng_ipfw hooks. */ > +ng_ipfw_input_t *ng_ipfw_input_p = NULL; > + > + > +/*--- > + * Control hooks: > + * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention. > + * then call the ipfw handler in order to manage requests. > + * In turn this is called by the linux set/get handlers. > + */ > +static int > +ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user) > +{ > + struct thread t; > + int ret = EINVAL; > + > + memset(s, 0, sizeof(*s)); > + s->sopt_name = cmd; > + s->sopt_dir = dir; > + s->sopt_valsize = len; > + s->sopt_val = user; > + > + /* sopt_td is not used but it is referenced */ > + memset(&t, 0, sizeof(t)); > + s->sopt_td = &t; > + > + if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 || > + cmd < IP_DUMMYNET_CONFIGURE)) > + ret = ip_fw_ctl_ptr(s); > + else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 || > + cmd >= IP_DUMMYNET_CONFIGURE)) > + ret = ip_dn_ctl_ptr(s); > + > + return -ret; /* errors are < 0 on linux */ > +} > + > + > + > +/* > + * setsockopt hook has no return value other than the error code. > + */ > +int > +do_ipfw_set_ctl(void *sk, int cmd, > + void __user *user, unsigned int len) > +{ > + struct sockopt s; /* pass arguments */ > + return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user); > +} > + > +/* > + * getsockopt can can return a block of data in response. 
> + */ > +int > +do_ipfw_get_ctl(void *sk, > + int cmd, void __user *user, int *len) > +{ > + struct sockopt s; /* pass arguments */ > + int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user); > + > + *len = s.sopt_valsize; /* return lenght back to the caller */ > + return ret; > +} > + > + > +/* > + * Module glue - init and exit function. > + */ > +#include <sys/module.h> > +/* descriptors for the children, until i find a way for the > + * linker to produce them > + */ > +extern moduledata_t *moddesc_ipfw; > +extern moduledata_t *moddesc_dummynet; > +extern moduledata_t *moddesc_dn_fifo; > +extern moduledata_t *moddesc_dn_wf2qp; > +extern moduledata_t *moddesc_dn_rr; > +extern moduledata_t *moddesc_dn_qfq; > +extern moduledata_t *moddesc_dn_prio; > +extern int (*sysinit_ipfw_init)(void *); > +extern int (*sysuninit_ipfw_destroy)(void *); > +extern int (*sysinit_vnet_ipfw_init)(void *); > +extern int (*sysuninit_vnet_ipfw_uninit)(void *); > + > +/*--- > + * Glue code to implement the registration of children with the parent. > + * Each child should call my_mod_register() when linking, so that > + * module_init() and module_exit() can call init_children() and > + * fini_children() to provide the necessary initialization. > + * We use the same mechanism for MODULE_ and SYSINIT_. > + * The former only get a pointer to the moduledata, > + * the latter have two function pointers (init/uninit) > + */ > +#include <sys/module.h> > +struct mod_args { > + const char *name; > + int order; > + struct moduledata *mod; > + int (*init)(void *); > + int (*uninit)(void *); > +}; > + > +static unsigned int mod_idx; > +static struct mod_args mods[10]; /* hard limit to 10 modules */ > + > +int > +my_mod_register(const char *name, int order, > + struct moduledata *mod, int (*init)(void *), int (*uninit)(void *)); > +/* > + * my_mod_register should be called automatically as the init > + * functions in the submodules. 
Unfortunately this compiler/linker > + * trick is not supported yet so we call it manually. > + */ > +int > +my_mod_register(const char *name, int order, > + struct moduledata *mod, int (*init)(void *), int (*uninit)(void *)) > +{ > + struct mod_args m; > + > + m.name = name; > + m.order = order; > + m.mod = mod; > + m.init = init; > + m.uninit = uninit; > + > + ND("called for %s", name); > + if (mod_idx < sizeof(mods) / sizeof(mods[0])) > + mods[mod_idx++] = m; > + return 0; > +} > + > +static void > +init_children(void) > +{ > + unsigned int i; > + > + /* Call the functions registered at init time. */ > + printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx); > + for (i = 0; i < mod_idx; i++) { > + struct mod_args *m = &mods[i]; > + printf("+++ start module %d %s %s at %p order 0x%x\n", > + i, m->name, m->mod ? m->mod->name : "SYSINIT", > + m->mod, m->order); > + if (m->mod && m->mod->evhand) > + m->mod->evhand(NULL, MOD_LOAD, m->mod->priv); > + else if (m->init) > + m->init(NULL); > + } > +} > + > +static void > +fini_children(void) > +{ > + int i; > + > + /* Call the functions registered at init time. */ > + for (i = mod_idx - 1; i >= 0; i--) { > + struct mod_args *m = &mods[i]; > + printf("+++ end module %d %s %s at %p order 0x%x\n", > + i, m->name, m->mod ? 
m->mod->name : "SYSINIT", > + m->mod, m->order); > + if (m->mod && m->mod->evhand) > + m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv); > + else if (m->uninit) > + m->uninit(NULL); > + } > +} > +/*--- end of module binding helper functions ---*/ > + > +int > +ipfw_module_init(void) > +{ > + int ret = 0; > + > + my_mod_register("ipfw", 1, moddesc_ipfw, NULL, NULL); > + my_mod_register("sy_ipfw", 2, NULL, > + sysinit_ipfw_init, sysuninit_ipfw_destroy); > + my_mod_register("sy_Vnet_ipfw", 3, NULL, > + sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit); > + my_mod_register("dummynet", 4, moddesc_dummynet, NULL, NULL); > + my_mod_register("dn_fifo", 5, moddesc_dn_fifo, NULL, NULL); > + my_mod_register("dn_wf2qp", 6, moddesc_dn_wf2qp, NULL, NULL); > + my_mod_register("dn_rr", 7, moddesc_dn_rr, NULL, NULL); > + my_mod_register("dn_qfq", 8, moddesc_dn_qfq, NULL, NULL); > + my_mod_register("dn_prio", 9, moddesc_dn_prio, NULL, NULL); > + init_children(); > + > +#ifdef EMULATE_SYSCTL > + keinit_GST(); > +#endif > + > + return ret; > +} > + > +/* module shutdown */ > +void > +ipfw_module_exit(void) > +{ > +#ifdef EMULATE_SYSCTL > + keexit_GST(); > +#endif > + > + fini_children(); > + > + printf("%s unloaded\n", __FUNCTION__); > +} > diff --git a/example/ipfw/extra/linux_defs.h b/example/ipfw/extra/linux_defs.h > new file mode 100644 > index 0000000..b7994cf > --- /dev/null > +++ b/example/ipfw/extra/linux_defs.h > @@ -0,0 +1,144 @@ > +#ifndef __LINUX_DEFS_ > +#define __LINUX_DEFS_ > + > +/* define, includes and functions missing in linux */ > + > +#ifdef __linux__ > +/* include and define */ > +#include <arpa/inet.h> /* inet_ntoa */ > +#include <netinet/tcp.h> > + > +#include <linux/errno.h> /* error define */ > +#include <stdint.h> /* u_int32_t */ > +#include <stdio.h> /* snprintf */ > + > +typedef struct mtx spinlock_t; > +typedef struct mtx rwlock_t; > + > +/* > + * some network structure can be defined in the bsd way > + * by using the _FAVOR_BSD definition. 
This is not true > + * for icmp structure. > + * XXX struct icmp contains bsd names in > + * /usr/include/netinet/ip_icmp.h > + */ > +#define icmp_code code > +#define icmp_type type > + > +/* linux in6_addr has no member __u6_addr > + * replace the whole structure ? > + */ > +#define __u6_addr __in6_u > +// #define __u6_addr32 u6_addr32 > + > +/* defined in linux/sctp.h with no bsd definition */ > +struct sctphdr { > + uint16_t src_port; /* source port */ > + uint16_t dest_port; /* destination port */ > + uint32_t v_tag; /* verification tag of packet */ > + uint32_t checksum; /* Adler32 C-Sum */ > + /* chunks follow... */ > +} SCTP_PACKED; > + > +/* missing definition */ > +#define TH_FIN 0x01 > +#define TH_SYN 0x02 > +#define TH_RST 0x04 > +#define TH_ACK 0x10 > + > +#define RTF_CLONING 0x100 /* generate new routes on use */ > + > +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ > +#define IPPROTO_CARP 112 /* CARP */ > +#define IPPROTO_IPV4 IPPROTO_IPIP /* for compatibility */ > + > +#define CARP_VERSION 2 > +#define CARP_ADVERTISEMENT 0x01 > + > +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ > +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) /* linux/stdlib */ > + > +#define IP_FORWARDING 0x1 /* most of ip header exists */ > + > +#define NETISR_IP 2 /* same as AF_INET */ > + > +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. 
*/ > + > +extern int securelevel; > + > +struct carp_header { > +#if BYTE_ORDER == LITTLE_ENDIAN > + u_int8_t carp_type:4, > + carp_version:4; > +#endif > +#if BYTE_ORDER == BIG_ENDIAN > + u_int8_t carp_version:4, > + carp_type:4; > +#endif > +}; > + > +struct pim { > +}; > + > +struct route { > + struct rtentry *ro_rt; > + struct sockaddr ro_dst; > +}; > + > + > +#if 0 // already in main header > +struct ifaltq { > + void *ifq_head; > +}; > + > +struct ifnet { > + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ > + struct ifaltq if_snd; /* output queue (includes altq) */ > +}; > + > +/* involves mbufs */ > +int in_cksum(struct mbuf *m, int len); > +#define divert_cookie(mtag) 0 > +#define divert_info(mtag) 0 > +#define INADDR_TO_IFP(a, b) b = NULL > +#define pf_find_mtag(a) NULL > +#define pf_get_mtag(a) NULL > +#define AF_LINK AF_ASH /* ? linux/socket.h */ > + > +struct pf_mtag { > + void *hdr; /* saved hdr pos in mbuf, for ECN */ > + sa_family_t af; /* for ECN */ > + u_int32_t qid; /* queue id */ > +}; > +#endif > + > +/* radix related */ > + > +#if 0 > +struct radix_node { > + caddr_t rn_key; /* object of search */ > + caddr_t rn_mask; /* netmask, if present */ > +}; > +#endif > + > + > +/* missing functions */ > + > +/* from bsd sys/queue.h */ > +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ > + for ((var) = TAILQ_FIRST((head)); \ > + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ > + (var) = (tvar)) > + > +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ > + for ((var) = SLIST_FIRST((head)); \ > + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ > + (var) = (tvar)) > + > +/* depending of linux version */ > +#ifndef ETHERTYPE_IPV6 > +#define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ > +#endif > + > +#endif /* __linux__ */ > +#endif /* !__LINUX_DEFS_ */ > diff --git a/example/ipfw/extra/missing.c b/example/ipfw/extra/missing.c > new file mode 100644 > index 0000000..1713bdd > --- /dev/null > +++ b/example/ipfw/extra/missing.c 
> @@ -0,0 +1,732 @@ > +/* > + * $Id$ > + * > + * Support to compile the kernel side of ipfw/dummynet in userland. > + * This file contains variables and functions that are not available in > + * userland. It is compiled in a kernel-like environment, so > + * it has _KERNEL defined, together with malloc() and free(). > + * They must be redefined here as we build the real thing. > + */ > + > +#include "glue.h" /* normally comes from the command line */ > +#include "missing.h" /* normally comes from the command line */ > +#undef _KERNEL > +#include <sys/types.h> > +#include <pthread.h> > +#include <sys/select.h> > +#include <sys/time.h> /* timersub */ > +#define _KERNEL > + > +#include <sys/types.h> > +#include <sys/taskqueue.h> > + > +#include <sys/mbuf.h> > +#undef malloc > +#undef free > + > +#include <stdlib.h> // calloc > + > +#include <netinet/in.h> /* struct sockaddr, route, sockopt... */ > +#include <netinet/in_systm.h> > + > +#if 0 > +#define IF_NAMESIZE 16 /* ip_fw.h */ > +#define IFNAMSIZ IF_NAMESIZE /* ip_fw.h */ > +#endif > + > + > +/* > + * Global bariables in the kernel > + */ > +int ticks; /* kernel ticks counter */ > +int hz = 5000; /* default clock time */ > +long tick = 0; /* XXX is this 100000/hz ? */ > +int bootverbose = 0; > +time_t time_uptime = 0; > +struct timeval boottime; > + > +int max_protohdr = 14 + 4 + 20 + 20; /* mac, vlan, ip, tcp */ > +int max_linkhdr; > +int ip_defttl; > +u_long in_ifaddrhmask; /* mask for hash table */ > +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ > + > +u_int rt_numfibs = RT_NUMFIBS; > + > +void > +module_register_init(const void *foo) > +{ > + D("start for %p", foo); > +} > + > +/* defined as assert */ > +#include <assert.h> > +void > +panic(const char *fmt, ...) > +{ > + assert(1); > +} > + > +void > +getmicrouptime(struct timeval *tv) > +{ > + gettimeofday(tv, NULL); > +} > + > +/* > + * pfil hook support. 
> + * We make pfil_head_get return a non-null pointer, which is then ignored > + * in our 'add-hook' routines. > + */ > +struct pfil_head; > +typedef int (pfil_hook_t) > + (void *, struct mbuf **, struct ifnet *, int, struct inpcb *); > + > +struct pfil_head * > +pfil_head_get(int proto, u_long flags) > +{ > + static int dummy; > + D("called"); > + return (struct pfil_head *)(void *)&dummy; > +} > + > +int > +pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) > +{ > + D("called"); > + return 0; > +} > + > +int > +pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) > +{ > + D("called"); > + return 0; > +} > + > +/* from sys/netinet/ip_output.c */ > +int > +ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, > + struct ip_moptions *imo, struct inpcb *inp) > +{ > + D("unimplemented"); > + return 0; > +} > + > +struct tags_freelist tags_freelist; > +int tags_minlen = 64; > +int tags_freelist_count = 0; > +static int tags_freelist_max = 0; > + > +struct mbuf *mbuf_freelist; > + > +void > +m_freem(struct mbuf *m) > +{ > + struct m_tag *t; > + > + /* free the m_tag chain */ > + while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) { > + ND("free tag %p", &m->m_pkthdr.tags); > + SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link); > + SLIST_INSERT_HEAD(&tags_freelist, t, m_tag_link); > + tags_freelist_count++; > + if (tags_freelist_count > tags_freelist_max) { > + static int pr=0; > + if ((pr++ % 1000) == 0) > + D("new max %d", tags_freelist_count); > + tags_freelist_max = tags_freelist_count; > + } > + } > + if (m->m_flags & M_STACK) { > + ND("free invalid mbuf %p", m); > + return; > + } > + /* free the mbuf */ > + ND("free(m = %p, M_IPFW);", m); > + m->m_next = mbuf_freelist; > + mbuf_freelist = m; > +} > + > +/* from net/netisr.c */ > +int > +netisr_dispatch(u_int proto, struct mbuf *m) > +{ > + if ((int)proto < 0) > + m_freem(m); > + else if (m->__m_callback) > + m->__m_callback(m, proto); > + else > + 
D("unimplemented proto %d mbuf %p", proto, m); > + return 0; > +} > + > +/* define empty body for kernel function */ > +int > +priv_check(struct thread *td, int priv) > +{ > + /* once connected, always allow */ > + ND("called"); > + return 0; > +} > + > +int > +securelevel_ge(struct ucred *cr, int level) > +{ > + /* we are always secure... */ > + ND("called"); > + return 0; > +} > + > +int > +sysctl_handle_int(SYSCTL_HANDLER_ARGS) > +{ > + int tmp; > + > + ND("called"); > + if (!req || !req->oldptr || req->oldlen != sizeof(int)) > + return EINVAL; > + tmp = arg1 ? *(int *)arg1 : arg2; > + bcopy(&tmp, req->oldptr, sizeof(int)); > + /* XXX check the SET routine */ > + if (req->newptr && arg1) > + bcopy(req->newptr, arg1, sizeof(int)); > + return 0; > +} > + > +int > +sysctl_handle_long(SYSCTL_HANDLER_ARGS) > +{ > + ND("called"); > + sysctl_handle_int(oidp, arg1, arg2, req); > + return 0; > +} > + > +void > +ether_demux(struct ifnet *ifp, struct mbuf *m) > +{ > + if (m->__m_callback) > + m->__m_callback(m, 0); > + else > + D("missing callback mbuf %p", m); > + return; > +} > + > +int > +ether_output_frame(struct ifnet *ifp, struct mbuf *m) > +{ > + D("incomplete"); > + return 0; > +} > + > +void > +in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) > +{ > + D("called"); > + return; > +} > + > +void > +icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu) > +{ > + D("called"); > + return; > +} > + > +void > +rtfree(struct rtentry *rt) > +{ > + D("called"); > + return; > +} > + > +u_short > +in_cksum_skip(struct mbuf *m, int len, int skip) > +{ > + D("called"); > + return 0; > +} > + > +u_short > +in_cksum_hdr(struct ip *ip) > +{ > + D("called"); > + return 0; > +} > + > + > +struct mbuf * > +ip_reass(struct mbuf *clone) > +{ > + D("called"); > + return clone; > +} > +#ifdef INP_LOCK_ASSERT > +#undef INP_LOCK_ASSERT > +#define INP_LOCK_ASSERT(a) > +#endif > + > +int > +jailed(struct ucred *cred) > +{ > + D("called"); > + return 0; > +} > + > 
+/* > +* Return 1 if an internet address is for a ``local'' host > +* (one to which we have a connection). If subnetsarelocal > +* is true, this includes other subnets of the local net. > +* Otherwise, it includes only the directly-connected (sub)nets. > +*/ > +int > +in_localaddr(struct in_addr in) > +{ > + D("called"); > + return 1; > +} > + > +#if 0 > +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) > +{ > + return 1; > +} > +#endif > + > +/* > + * Procedures for the callout interface > + * > + * callout_init() initializes a descriptor, > + * callout_reset() starts a timer > + * callout_stop() stops a timer > + * > + * Internally we hold a list of callout entries etc etc. > + */ > + > +struct callout_tailq callout_head; > + > +#include <sys/systm.h> > +void > +callout_init(struct callout *c, int mpsafe) > +{ > + D("c %p mpsafe %d", c, mpsafe); > + bzero(c, sizeof(*c)); > +} > + > +int > +callout_reset_on(struct callout *c, int due_ticks, void (*func)(void *), void *arg, int p) > +{ > + return callout_reset(c, due_ticks, func, arg); > +} > + > +int > +callout_reset(struct callout *c, int due_ticks, void (*func)(void *), void *arg) > +{ > + struct callout *cur; > + > + ND("c %p ticks %d f %p(%p)", c, due_ticks, func, arg); > + if (c->c_flags & CALLOUT_ACTIVE) { > + D(" --- callout was already active"); > + return -1; > + } > + c->c_time = ticks + due_ticks; /* XXX not the original meaning */ > + c->c_func = func; > + c->c_arg = arg; > + c->c_flags |= CALLOUT_ACTIVE; > + TAILQ_FOREACH(cur, &callout_head, c_links.tqe) { > + if ( (c->c_time - cur->c_time) < 0) > + break; > + } > + if (cur) > + TAILQ_INSERT_BEFORE(cur, c, c_links.tqe); > + else > + TAILQ_INSERT_TAIL(&callout_head, c, c_links.tqe); > + return 0; /* no error */ > +} > + > +int > +_callout_stop_safe(struct callout *c, int safe) > +{ > + D("c %p safe %d", c, safe); > + TAILQ_REMOVE(&callout_head, c, c_links.tqe); > + return 0; > +} > + > +int > +callout_drain(struct callout *c) > +{ > + _callout_stop_safe(c, 1); > 
+ return 0; > +} > + > +void > +callout_startup(void) > +{ > + D("start"); > + TAILQ_INIT( &callout_head); > +} > + > +void > +callout_run(void) > +{ > + struct callout *cur, *tmp; > + > + ND("Run pending callouts tick %d", ticks); > + TAILQ_FOREACH_SAFE(cur, &callout_head, c_links.tqe, tmp) { > + int delta = ticks - cur->c_time; > + if (delta < 0) { // early ? > + //fprintf(stderr, "c %p due at %d\n", cur, cur->c_time); > + continue; > + } > + if (delta > 100) > + RD(1,"running %p due at %d now %d", cur, cur->c_time, ticks); > + TAILQ_REMOVE(&callout_head, cur, c_links.tqe); > + cur->c_flags &= ~CALLOUT_ACTIVE; > + cur->c_func(cur->c_arg); > + } > +} > + > +/* > + * the taskqueue type is actually opaque > + */ > +struct taskqueue { > + STAILQ_ENTRY(taskqueue) tq_link; > + STAILQ_HEAD(, task) tq_queue; > + const char *tq_name; > + taskqueue_enqueue_fn tq_enqueue; > + void *tq_context; > + struct task *tq_running; > + int tq_pcount; > + int tq_spin; > + int tq_flags; > +}; > + > +#if 0 > +/* > + * instead of enqueueing, we run this immediately. > + */ > +int > +taskqueue_enqueue(struct taskqueue *queue, struct task *task) > +{ > + task->ta_func(task->ta_context, 1); > + return 0; > +} > +#endif > + > +void > +taskqueue_thread_enqueue(void *context) > +{ > + D("ctx %p", context); > +} > + > +struct taskqueue * > +taskqueue_create_fast(const char *name, int mflags, > + taskqueue_enqueue_fn enqueue, void *context) > +{ > + struct taskqueue *tq; > + > + tq = calloc(1, sizeof(*tq)); > + if (tq == NULL) > + return NULL; > + D("start %s fn %p ctx %p", name, enqueue, context); > + return tq; > +} > + > +int > +taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, > + const char *name, ...) 
> +{ > + D("tqp %p count %d (dummy)", tqp, count); > + return 0; > +} > + > +void > +taskqueue_drain(struct taskqueue *queue, struct task *task) > +{ > + D("q %p task %p", queue, task); > +} > + > +void > +taskqueue_free(struct taskqueue *queue) > +{ > + D("q %p", queue); > + free(queue); > +} > + > +void * > +kern_malloc(int sz) > +{ > + return calloc(sz, 1); /* most of the time we want zeroed memory */ > +} > + > +void > +kern_free(void *p) > +{ > + free(p); > +} > + > +#ifdef linux > +size_t > +strlcpy(char *dst, const char *src, size_t siz) > +{ > + char *d = dst; > + const char *s = src; > + size_t n = siz; > + > + /* Copy as many bytes as will fit */ > + if (n != 0 && --n != 0) { > + do { > + if ((*d++ = *s++) == 0) > + break; > + } while (--n != 0); > + } > + > + /* Not enough room in dst, add NUL and traverse rest of src */ > + if (n == 0) { > + if (siz != 0) > + *d = '\0'; /* NUL-terminate dst */ > + while (*s++) > + ; > + } > + > + return(s - src - 1); /* count does not include NUL */ > +} > +#endif // linux > + > +#ifdef EMULATE_SYSCTL > +/* > + * Support for sysctl emulation. 
> + * We transfer options as part of the IP_DUMMYNET3 sockopt emulation, > + * so we need to include ip_fw.h and ip_dummynet.h > + */ > + > +#include <netinet/ip_fw.h> /* struct ip_fw_args */ > +#include <netinet/ip_dummynet.h> /* struct dn_id */ > +static struct sysctltable GST; > + > +int > +kesysctl_emu_get(struct sockopt* sopt) > +{ > + struct dn_id* oid = sopt->sopt_val; > + struct sysctlhead* entry; > + int sizeneeded = sizeof(struct dn_id) + GST.totalsize + > + sizeof(struct sysctlhead); > + unsigned char* pstring; > + unsigned char* pdata; > + int i; > + > + if (sopt->sopt_valsize < sizeneeded) { > + // this is a probe to retrieve the space needed for > + // a dump of the sysctl table > + oid->id = sizeneeded; > + sopt->sopt_valsize = sizeof(struct dn_id); > + return 0; > + } > + > + entry = (struct sysctlhead*)(oid+1); > + /* [entry][data(datalen)][name(namelen)] */ > + ND("copying values"); > + for( i=0; i<GST.count; i++) { > + ND("entry %d %s flags 0x%x", > + i, GST.entry[i].name, GST.entry[i].head.flags); > + entry->blocklen = GST.entry[i].head.blocklen; > + entry->namelen = GST.entry[i].head.namelen; > + entry->flags = GST.entry[i].head.flags; > + entry->datalen = GST.entry[i].head.datalen; > + pdata = (unsigned char*)(entry+1); > + pstring = pdata+GST.entry[i].head.datalen; > + if (entry->flags & SYSCTLTYPE_PROC) { > + //int (*f)(SYSCTL_HANDLER_ARGS); > + sysctl_h_fn_t *f; > + int tmp = 0, ret; > + struct sysctl_req req; > + > + bzero(&req, sizeof(req)); > + req.oldlen = req.newlen = sizeof(int); > + req.oldptr = &tmp; > + f = GST.entry[i].fn; > + ND("-- %s is a proc -- at %p", GST.entry[i].name, f); > + ret = f(NULL, NULL, (int)(intptr_t)(GST.entry[i].data), &req); > + ND("-- %s returns %d", GST.entry[i].name, ret); > + bcopy(&tmp, pdata, sizeof(tmp)); > + } else { > + bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen); > + } > + bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen); > + entry = (struct sysctlhead*) > + ((unsigned 
char*)(entry) + GST.entry[i].head.blocklen); > + } > + sopt->sopt_valsize = sizeneeded; > + return 0; > +} > + > +int > +kesysctl_emu_set(void* p, int l) > +{ > + struct sysctlhead* entry; > + unsigned char* pdata; > + unsigned char* pstring; > + int i = 0; > + > + entry = (struct sysctlhead*)(((struct dn_id*)p)+1); > + pdata = (unsigned char*)(entry+1); > + pstring = pdata + entry->datalen; > + > + for (i=0; i<GST.count; i++) { > + if (strcmp(GST.entry[i].name, (char *)pstring) != 0) > + continue; > + ND("%s: match found! %s\n",__FUNCTION__,pstring); > + //sanity check on len, not really useful now since > + //we only accept int32 > + if (entry->datalen != GST.entry[i].head.datalen) { > + printf("%s: len mismatch, user %d vs kernel %d\n", > + __FUNCTION__, entry->datalen, > + GST.entry[i].head.datalen); > + return -1; > + } > + // check access (at the moment flags handles only the R/W rights > + //later on will be type + access > + if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) { > + printf("%s: the entry %s is read only\n", > + __FUNCTION__,GST.entry[i].name); > + return -1; > + } > + if (GST.entry[i].head.flags & SYSCTLTYPE_PROC) { > + int (*f)(SYSCTL_HANDLER_ARGS); > + int tmp = 0, ret; > + struct sysctl_req req; > + > + bzero(&req, sizeof(req)); > + req.oldlen = req.newlen = sizeof(int); > + req.oldptr = &tmp; > + req.newptr = pdata; > + f = GST.entry[i].fn; > + ND("-- %s is a proc -- at %p", GST.entry[i].name, f); > + ret = f(NULL, NULL, (int)(intptr_t)(GST.entry[i].data), &req); > + ND("-- %s returns %d", GST.entry[i].name, ret); > + } else { > + bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen); > + } > + return 0; > + } > + D("%s: match not found\n",__FUNCTION__); > + return 0; > +} > + > +/* convert all _ to . until the first . 
*/ > +static void > +underscoretopoint(char* s) > +{ > + for (; *s && *s != '.'; s++) > + if (*s == '_') > + *s = '.'; > +} > + > +static int > +formatnames(void) > +{ > + int i; > + int size=0; > + char* name; > + > + for (i=0; i<GST.count; i++) > + size += GST.entry[i].head.namelen; > + GST.namebuffer = malloc(size); > + if (GST.namebuffer == NULL) > + return -1; > + name = GST.namebuffer; > + for (i=0; i<GST.count; i++) { > + bcopy(GST.entry[i].name, name, GST.entry[i].head.namelen); > + underscoretopoint(name); > + GST.entry[i].name = name; > + name += GST.entry[i].head.namelen; > + } > + return 0; > +} > + > +static void > +dumpGST(void) > +{ > + int i; > + > + for (i=0; i<GST.count; i++) { > + printf("SYSCTL: entry %i\n", i); > + printf("name %s\n", GST.entry[i].name); > + printf("namelen %i\n", GST.entry[i].head.namelen); > + printf("type %i access %i\n", > + GST.entry[i].head.flags >> 2, > + GST.entry[i].head.flags & 0x00000003); > + printf("data %i\n", *(int*)(GST.entry[i].data)); > + printf("datalen %i\n", GST.entry[i].head.datalen); > + printf("blocklen %i\n", GST.entry[i].head.blocklen); > + } > +} > + > +void sysctl_addgroup_f1(void); > +void sysctl_addgroup_f2(void); > +void sysctl_addgroup_f3(void); > +void sysctl_addgroup_f4(void); > + > +void > +keinit_GST(void) > +{ > + int ret; > + > + sysctl_addgroup_f1(); > + sysctl_addgroup_f2(); > + sysctl_addgroup_f3(); > + sysctl_addgroup_f4(); > + ret = formatnames(); > + if (ret != 0) > + printf("conversion of names failed for some reason\n"); > + if (0) > + dumpGST(); // XXX debugging > + printf("*** Global Sysctl Table entries = %i, total size = %i ***\n", > + GST.count, GST.totalsize); > +} > + > +void > +keexit_GST(void) > +{ > + if (GST.namebuffer != NULL) > + free(GST.namebuffer); > + bzero(&GST, sizeof(GST)); > +} > + > +void > +sysctl_pushback(char* name, int flags, int datalen, void* data, sysctl_h_fn_t *fn) > +{ > + if (GST.count >= GST_HARD_LIMIT) { > + printf("WARNING: global sysctl table 
full, this entry will not be added," > + "please recompile the module increasing the table size\n"); > + return; > + } > + GST.entry[GST.count].head.namelen = strlen(name)+1; //add space for '\0' > + GST.entry[GST.count].name = name; > + GST.entry[GST.count].head.flags = flags; > + GST.entry[GST.count].data = data; > + GST.entry[GST.count].fn = fn; > + GST.entry[GST.count].head.datalen = datalen; > + GST.entry[GST.count].head.blocklen = > + ((sizeof(struct sysctlhead) + GST.entry[GST.count].head.namelen + > + GST.entry[GST.count].head.datalen)+3) & ~3; > + GST.totalsize += GST.entry[GST.count].head.blocklen; > + GST.count++; > +} > +#endif /* EMULATE_SYSCTL */ > + > +extern int mainloop(int argc, char *argv[]); > + > +/* > + * main program for ipfw kernel side when running an userspace emulation: > + * open a socket on which we receive requests from userland, > + * another socket for calls from the 'kernel' (simulating packet > + * arrivals etc), and then periodically run the tick handler. > + */ > +int > +main(int argc, char *argv[]) > +{ > + tick = 1000000/hz; > + D("initializing tick to %ld", tick); > + return mainloop(argc, argv); > +} > diff --git a/example/ipfw/extra/missing.h b/example/ipfw/extra/missing.h > new file mode 100644 > index 0000000..b5b65b2 > --- /dev/null > +++ b/example/ipfw/extra/missing.h > @@ -0,0 +1,801 @@ > +/* > + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * $Id: missing.h 8377 2011-04-04 16:08:27Z marta $ > + * > + * Header for kernel variables and functions that are not available in > + * userland. > + */ > + > +#ifndef _MISSING_H_ > +#define _MISSING_H_ > + > +#define KLD_MODULE /* disable kernel dependencies */ > + > +/* defined as assert */ > +void panic(const char *fmt, ...); > + > +#define KASSERT(exp,msg) do { \ > + if (__predict_false(!(exp))) \ > + panic msg; \ > +} while (0) > +/* don't bother to optimize */ > +#ifndef __predict_false > +#define __predict_false(x) (x) /* __builtin_expect((exp), 0) */ > +#endif // XXX > + > + > +#ifdef _KERNEL > +#define NEED_KERNEL > +#undef _KERNEL > +#endif > + > +#include <stdio.h> // printf > +#include <sys/socket.h> // IFNAMSIZ ? > +#include <string.h> // strncmp > +#include <stdlib.h> // bsearch > +#ifdef NEED_KERNEL > +#define _KERNEL > +#include <sys/cdefs.h> > +#include <sys/param.h> > + > +#define __user // not defined here ? 
> +#define __init > +#define __exit > + > +/* portability features, to be set before the rest: */ > +#define WITHOUT_BPF /* do not use bpf logging */ > + > +#define MALLOC_DECLARE(x) struct __hack /* nothing */ > +// XXX kernel malloc/free > +extern void *kern_malloc(int); > +extern void kern_free(void *); > +#define malloc(_size, type, flags) kern_malloc(_size) > +#define free(_var, type) kern_free(_var) > + > +/* inet_ntoa_r() differs in userspace and kernel. > + * We load netinet/in.h so we get the kernel prototype ? > + * but we also need to put #defines in the two places where > + * it is used XXX fixme > + */ > +#include <netinet/in.h> > + > +/* log() conflicts with the math function. > + * Revise, modifying the first argument. > + */ > +#define LOG_ERR 0x100 > +#define LOG_INFO 0x200 > +#ifndef LOG_SECURITY > +#define LOG_SECURITY 0x400 > +#endif > + > +#define log(_level, fmt, arg...) do { \ > + int __attribute__((unused)) _querty = _level; \ > + printf("kernel: " fmt, ##arg); } while (0) > + > +#endif /* _KERNEL */ > + > +/* > + * Kernel locking support. > + * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c > + * > + * In linux we use spinlock_bh to implement both. > + * For 'struct rwlock' we need an #ifdef to change it to spinlock_t > + */ > + > +#ifndef DEFINE_SPINLOCK /* this is for linux 2.4 */ > +#if defined(__APPLE__) > +#define DEFINE_SPINLOCK(x) struct mtx x; > +#else /* linux ? */ > +#define DEFINE_SPINLOCK(x) spinlock_t x // = SPIN_LOCK_UNLOCKED > +#endif > +#endif > + > +/* 20111031 > + * redefine mutex in terms of threads. 
> + */ > + > +#undef _KERNEL > +// #include <sys/types.h> > +#include <pthread.h> > +#ifdef NEED_KERNEL > +#define _KERNEL > +#endif > +struct mtx { > + pthread_mutex_t p0; > +}; > +struct rwlock { > + pthread_mutex_t p0; > +}; > +struct rmlock { > + pthread_mutex_t p0; > +}; > +extern pthread_mutex_t dummynet_mtx_p; > +extern pthread_mutex_t ipfw_dyn_mtx_p; > +extern pthread_mutex_t pfil_global_lock_p; > + > +#define mtx_assert(a, b) > +/* > + * the first argument to mtx_init is often a static variable, > + * so use (void)m to prevent a compiler warning > + */ > +#define mtx_init(m, a,b,c) do { \ > + (void)m; pthread_mutex_init(&((m)->p0), NULL); } while (0) > +#define MTX_SYSINIT(a, m, c, d) // pthread_mutex_init(m##_p, NULL) > +#define mtx_lock(m) pthread_mutex_lock(m.p0) > +#define mtx_unlock(m) pthread_mutex_unlock(m.p0) > +#define mtx_destroy(m) pthread_mutex_destroy(m.p0) > +#if 1 > +//------------------ > + > +#if 1 // used for IPFW_UH > +#define rw_assert(a, b) > +#define rw_destroy(_l) > +#define rw_init(_l, msg) // XXX mtx_init((_l), 0, 0, 0) > +#define rw_rlock(_l) mtx_lock(_l) > +#define rw_runlock(_l) mtx_unlock(_l) > +#define rw_wlock(_l) mtx_lock(_l) > +#define rw_wunlock(_l) mtx_unlock(_l) > +#define rw_init_flags(_l, s, v) > +#endif // XXX not used anymore > + > +#define rm_init(_l, msg) // mtx_init(...) > +#define rm_rlock(_l, _t) ((void)_t, mtx_lock(_l)) > +#define rm_runlock(_l, _t) mtx_unlock(_l) > +#define rm_wlock(_l) mtx_lock(_l) > +#define rm_wunlock(_l) mtx_unlock(_l) > +#define rm_destroy(_l) // XXX > +#define rm_assert(_l, _w) // XXX > + > + > +#endif // locking on linux ? > + > +/* end of locking support */ > + > +/* > + * Reference to an ipfw rule that can be carried outside critical sections. > + * A rule is identified by rulenum:rule_id which is ordered. > + * In version chain_id the rule can be found in slot 'slot', so > + * we don't need a lookup if chain_id == chain->id. 
> + * > + * On exit from the firewall this structure refers to the rule after > + * the matching one (slot points to the new rule; rulenum:rule_id-1 > + * is the matching rule), and additional info (e.g. info often contains > + * the insn argument or tablearg in the low 16 bits, in host format). > + * On entry, the structure is valid if slot>0, and refers to the starting > + * rules. 'info' contains the reason for reinject, e.g. divert port, > + * divert direction, and so on. > + */ > +struct ipfw_rule_ref { > + uint32_t slot; /* slot for matching rule */ > + uint32_t rulenum; /* matching rule number */ > + uint32_t rule_id; /* matching rule id */ > + uint32_t chain_id; /* ruleset id */ > + uint32_t info; /* see below */ > +}; > + > +/* ISO C restricts enumerator values to range of 'int' > + * so we need IN to have a smaller value > + */ > +enum { > + IPFW_INFO_MASK = 0x0000ffff, > + IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ > + IPFW_INFO_IN = 0x00800000, /* incoming, overloads dir */ > + IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ > + IPFW_IS_MASK = 0x30000000, /* which source ? 
*/ > + IPFW_IS_DIVERT = 0x20000000, > + IPFW_IS_DUMMYNET =0x10000000, > + IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ > +}; > + > +/* in netinet/in.h */ > +#define in_nullhost(x) ((x).s_addr == INADDR_ANY) > + > +/* ip_dummynet.c */ > +#ifndef __FreeBSD_version > +#define __FreeBSD_version 500035 > +#endif > + > +/* define some macro for ip_dummynet */ > + > +struct malloc_type { > +}; > + > +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ > + struct malloc_type type[1]; void *md_dummy_ ## type = type > + > +#define CTASSERT(x) > + > + > +/* > + * gettimeofday would be in sys/time.h but it is not > + * visible if _KERNEL is defined > + */ > +//int gettimeofday(struct timeval *, struct timezone *); > + > + > +extern int hz; > +extern long tick; /* exists in 2.4 but not in 2.6 */ > +extern int bootverbose; > +extern struct timeval boottime; > + > +/* time_uptime is a FreeBSD variable increased each second */ > +extern time_t time_uptime; > + > +extern int max_linkhdr; > +extern int ip_defttl; > +extern u_long in_ifaddrhmask; /* mask for hash table */ > +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ > + > +/*-------------------------------------------------*/ > + > +/* define, includes and functions missing in linux */ > +/* include and define */ > +#include <arpa/inet.h> /* inet_ntoa */ > + > +struct mbuf; > +// XXX #define M_MCAST 0x04 /* send/received as link-level multicast */ > + > + > +/* used by ip_dummynet.c */ > +void reinject_drop(struct mbuf* m); > + > +#include <sys/socket.h> /* for ETHERTYPE_IP */ > + > +#ifdef _KERNEL > +#define IF_NAMESIZE 16 > +#ifndef IFNAMSIZ > +#define IFNAMSIZ IF_NAMESIZE > +#endif > +//#include <net/if.h> /* IFNAMESIZ */ > +#endif > + > +/* > + * some network structure can be defined in the bsd way > + * by using the _FAVOR_BSD definition. This is not true > + * for icmp structure. 
> + * XXX struct icmp contains bsd names in > + * /usr/include/netinet/ip_icmp.h > + */ > + > +/* missing definition */ > +#define TH_FIN 0x01 > +#define TH_SYN 0x02 > +#define TH_RST 0x04 > +#define TH_ACK 0x10 > + > +/* 20131101 IPTOS from ip.h */ > +/* > + * Definitions for DiffServ Codepoints as per RFC2474 > + */ > +#define IPTOS_DSCP_CS0 0x00 > +#define IPTOS_DSCP_CS1 0x20 > +#define IPTOS_DSCP_AF11 0x28 > +#define IPTOS_DSCP_AF12 0x30 > +#define IPTOS_DSCP_AF13 0x38 > +#define IPTOS_DSCP_CS2 0x40 > +#define IPTOS_DSCP_AF21 0x48 > +#define IPTOS_DSCP_AF22 0x50 > +#define IPTOS_DSCP_AF23 0x58 > +#define IPTOS_DSCP_CS3 0x60 > +#define IPTOS_DSCP_AF31 0x68 > +#define IPTOS_DSCP_AF32 0x70 > +#define IPTOS_DSCP_AF33 0x78 > +#define IPTOS_DSCP_CS4 0x80 > +#define IPTOS_DSCP_AF41 0x88 > +#define IPTOS_DSCP_AF42 0x90 > +#define IPTOS_DSCP_AF43 0x98 > +#define IPTOS_DSCP_CS5 0xa0 > +#define IPTOS_DSCP_EF 0xb8 > +#define IPTOS_DSCP_CS6 0xc0 > +#define IPTOS_DSCP_CS7 0xe0 > + > +/* > + * ECN (Explicit Congestion Notification) codepoints in RFC3168 mapped to the > + * lower 2 bits of the TOS field. > + */ > +#define IPTOS_ECN_NOTECT 0x00 /* not-ECT */ > +#define IPTOS_ECN_ECT1 0x01 /* ECN-capable transport (1) */ > +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ > +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ > +#define IPTOS_ECN_MASK 0x03 /* ECN field mask */ > + > +/*------------------------- */ > + > +#define RTF_CLONING 0x100 /* generate new routes on use */ > + > +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ > +#define IPPROTO_CARP 112 /* CARP */ > +#define CARP_VERSION 2 > +#define CARP_ADVERTISEMENT 0x01 > +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ > +#define IP_FORWARDING 0x1 /* most of ip header exists */ > +#define NETISR_IP 2 /* same as AF_INET */ > +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. 
*/ > + > +extern int securelevel; > + > +#define if_xname name > +#define if_snd XXX > + > +// XXX we could use this to point to the incoming peer > +struct ifnet { > + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ > + uint32_t if_index; // IP_FW_3 > +}; > + > +struct ifaltq { > + > + void *ifq_head; > +}; > +int ffs(int); // XXX where > +int fls(int); // XXX where > + > +struct ip; > +/* machine/in_cksum.h */ > +int in_cksum(struct mbuf *m, int len); > +#ifndef __FreeBSD__ > +u_short in_cksum_hdr(struct ip *); > +#endif > + > + > +#define CTR3(a, ...) > +#define uma_zone_set_max(a, b) // XXX > + > +/* > + * ifnet->if_snd is used in ip_dummynet.c to take the transmission > + * clock. > + */ > +#if defined( __linux__) > +#define if_xname name > +#define if_snd XXX > + > +struct route_in6 { > +}; > + > +#elif defined( _WIN32 ) > +/* used in ip_dummynet.c */ > +struct ifnet { > + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ > +// struct ifaltq if_snd; /* output queue (includes altq) */ > +}; > + > +struct net_device { > + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ > +}; > +#elif defined(__APPLE__) > +typedef u_int32_t tcp_cc; > +#ifndef s6_addr32 // XXX > +#define s6_addr32 __u6_addr.__u6_addr32 > +#endif > +#include <netinet/tcp.h> > + > +struct route_in6 { > +}; > + > +struct icmphdr { > + u_char icmp_type; /* type of message, see below */ > + u_char icmp_code; /* type sub code */ > + u_short icmp_cksum; /* ones complement cksum of struct */ > +}; > + > +#define IPPROTO_SCTP 132 /* SCTP */ > + > +/* defined in linux/sctp.h with no bsd definition */ > +struct sctphdr { > + uint16_t src_port; /* source port */ > + uint16_t dest_port; /* destination port */ > + uint32_t v_tag; /* verification tag of packet */ > + uint32_t checksum; /* Adler32 C-Sum */ > + /* chunks follow... 
*/ > +}; > + > +struct carp_header { > +#if BYTE_ORDER == LITTLE_ENDIAN > + u_int8_t carp_type:4, > + carp_version:4; > +#endif > +#if BYTE_ORDER == BIG_ENDIAN > + u_int8_t carp_version:4, > + carp_type:4; > +#endif > +}; > + > + > +struct pim { > + int dummy; /* windows compiler does not like empty definition */ > +}; > + > +#endif > + > +/* involves mbufs */ > +//int in_cksum(struct mbuf *m, int len); > +#define divert_cookie(mtag) 0 > +#define divert_info(mtag) 0 > +#define pf_find_mtag(a) NULL > +#define pf_get_mtag(a) NULL > +#if !defined(_WIN32) && !defined(AF_LINK) > +#define AF_LINK AF_ASH /* ? our sys/socket.h */ > +#endif > + > +/* search local the ip addresses, used for the "me" keyword */ > +#define INADDR_TO_IFP(ip, b) b = NULL > + > +/* we don't pullup, either success or free and fail */ > +#define m_pullup(m, x) \ > + ((m)->m_len >= x ? (m) : (FREE_PKT(m), NULL)) > + > +struct pf_mtag { > + void *hdr; /* saved hdr pos in mbuf, for ECN */ > + sa_family_t af; /* for ECN */ > + u_int32_t qid; /* queue id */ > +}; > + > +/* missing kernel functions */ > +char *inet_ntoa(struct in_addr ina); > +long random(void); > + > +/* > + * Return the risult of a/b > + * > + * this is used in linux kernel space, > + * since the 64bit division needs to > + * be done using a macro > + */ > +//int64_t div64(int64_t a, int64_t b); > + > +/* from bsd sys/queue.h */ > +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ > + for ((var) = TAILQ_FIRST((head)); \ > + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ > + (var) = (tvar)) > + > +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ > + for ((var) = SLIST_FIRST((head)); \ > + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ > + (var) = (tvar)) > + > +/*-------------------------------------------------*/ > +#define RT_NUMFIBS 1 > +extern u_int rt_numfibs; > + > +/* involves kernel locking function */ > +#ifdef RTFREE > +#undef RTFREE > +#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n"); > 
+#endif > + > +void getmicrouptime(struct timeval *tv); > + > +/* from sys/netinet/ip_output.c */ > +struct ip_moptions; > +struct route; > +struct ip; > + > +struct inpcb; > +struct mbuf *ip_reass(struct mbuf *); > +int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, > + struct ip_moptions *imo, struct inpcb *inp); > + > +/* from net/netisr.c -- fails on FreeBSD */ > +int netisr_dispatch(u_int proto, struct mbuf *m); > + > + > +/* definition moved in missing.c */ > +int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); > +int copyout(const void *kaddr, void *uaddr, size_t len); > + > +int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); > + > +/* defined in session.c */ > +int priv_check(struct thread *td, int priv); > + > +/* struct ucred is in linux/socket.h and has pid, uid, gid. > + * We need a 'bsd_ucred' to store also the extra info > + */ > + > +struct bsd_ucred { > + uid_t uid; > + gid_t gid; > + uint32_t xid; > + uint32_t nid; > +}; > + > +#ifdef _KERNEL > + > +#if 0 // XXX > +int > +cred_check(void *insn, int proto, struct ifnet *oif, > + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, > + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, > + struct sk_buff *skb); > +#endif > + > +struct ucred; > +int securelevel_ge(struct ucred *cr, int level); > + > +/* > + * stripped down version of the sysctl api > + */ > +struct sysctl_oid; > +struct sysctl_req { > + void *oldptr; /* store here the original value */ > + int oldlen; > + void *newptr; /* NULL on reads */ > + int newlen; > +}; > + > +#ifdef _WIN32 > +#define module_param_named(_name, _var, _ty, _perm) > +#else /* !_WIN32 */ > + > +#endif /* !_WIN32 so maybe __linux__ */ > + > +#if 0 // XXX disable sysctl defined (__linux__) && !defined (EMULATE_SYSCTL) > +#define SYSCTL_DECL(_1) > +#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8) > +#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6) > +#define _SYSCTL_BASE(_name, 
_var, _ty, _perm) \ > + module_param_named(_name, *(_var), _ty, \ > + ( (_perm) == CTLFLAG_RD) ? 0444: 0644 ) > +#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b) > + > +#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc) \ > + _SYSCTL_BASE(_name, _var, int, _mode) > + > +#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc) \ > + _SYSCTL_BASE(_name, _var, long, _mode) > + > +#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc) \ > + _SYSCTL_BASE(_name, _var, ulong, _mode) > + > +#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc) \ > + _SYSCTL_BASE(_name, _var, uint, _mode) > + > +#define TUNABLE_INT(_name, _ptr) > + > +#define SYSCTL_VNET_PROC SYSCTL_PROC > +#define SYSCTL_VNET_INT SYSCTL_INT > +#define SYSCTL_VNET_UINT SYSCTL_UINT > + > +#endif > + > +#define SYSCTL_HANDLER_ARGS \ > + struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req > +typedef int (sysctl_h_fn_t)(SYSCTL_HANDLER_ARGS); > +int sysctl_handle_int(SYSCTL_HANDLER_ARGS); > +int sysctl_handle_long(SYSCTL_HANDLER_ARGS); > + > +#ifdef EMULATE_SYSCTL /* mandatory here */ > + > +#define STRINGIFY(x) #x > + > +#ifdef SYSCTL_NODE > +#undef SYSCTL_NODE > +#endif > +#define SYSCTL_NODE(a,b,c,d,e,f) int a; (void)a > +#define SYSCTL_DECL(a) > + > +#define GST_HARD_LIMIT 100 > + > +/* In the module, GST is implemented as an array of > + * sysctlentry, but while passing data to the userland > + * pointers are useless, the buffer is actually made of: > + * - sysctlhead (fixed size, containing lengths) > + * - data (typically 32 bit) > + * - name (zero-terminated and padded to mod4) > + */ > + > +struct sysctlentry { > + struct sysctlhead head; > + char* name; > + void* data; > + sysctl_h_fn_t *fn; > +}; > + > +struct sysctltable { > + int count; //number of valid tables > + int totalsize; //total size of valid entries of al the valid tables > + void* namebuffer; //a buffer for all chained names > + struct sysctlentry 
entry[GST_HARD_LIMIT]; > +}; > + > +#ifdef SYSBEGIN > +#undef SYSBEGIN > +#endif > +#define SYSBEGIN(x) void sysctl_addgroup_##x() { > +#ifdef SYSEND > +#undef SYSEND > +#endif > +#define SYSEND } > + > +/* XXX remove duplication */ > +#define SYSCTL_INT(a,b,c,d,e,f,g) \ > + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ > + (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e, NULL) > + > +#define SYSCTL_UINT(a,b,c,d,e,f,g) \ > + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ > + (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e, NULL) > + > +#define SYSCTL_LONG(a,b,c,d,e,f,g) \ > + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ > + (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e, NULL) > + > +#define SYSCTL_ULONG(a,b,c,d,e,f,g) \ > + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ > + (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e, NULL) > +#define TUNABLE_INT(a,b) > + > +#define SYSCTL_PROC(a,b,c,d,e,f,g,h,i) \ > + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ > + (d), 4 /* XXX large */, (void *)(f /* arg2 */), g) > + > +#define SYSCTL_VNET_PROC SYSCTL_PROC > +#define SYSCTL_VNET_INT SYSCTL_INT > +#define SYSCTL_VNET_UINT SYSCTL_UINT > + > +void keinit_GST(void); > +void keexit_GST(void); > +int kesysctl_emu_set(void* p, int l); > +int kesysctl_emu_get(struct sockopt* sopt); > +void sysctl_pushback(char* name, int flags, int datalen, void* data, sysctl_h_fn_t *fn); > + > +#endif /* EMULATE_SYSCTL */ > + > +struct ifnet; > +void ether_demux(struct ifnet *ifp, struct mbuf *m); > + > +int ether_output_frame(struct ifnet *ifp, struct mbuf *m); > + > +void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); > + > +void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu); > + > +#define in_localip(_x) (0) > + > +#ifndef __FreeBSD__ > +struct rtentry; > +#endif > +void rtfree(struct rtentry *rt); > + > +u_short in_cksum_skip(struct mbuf *m, int len, int skip); > + > +#ifdef INP_LOCK_ASSERT > +#undef INP_LOCK_ASSERT > +#define 
INP_LOCK_ASSERT(a) > +#endif > + > +int jailed(struct ucred *cred); > + > +/* > +* Return 1 if an internet address is for a ``local'' host > +* (one to which we have a connection). If subnetsarelocal > +* is true, this includes other subnets of the local net. > +* Otherwise, it includes only the directly-connected (sub)nets. > +*/ > +int in_localaddr(struct in_addr in); > + > +int fnmatch(const char *pattern, const char *string, int flags); > + > +/* vnet wrappers, in vnet.h and ip_var.h */ > +//int ipfw_init(void); > +//void ipfw_destroy(void); > + > +#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ > +#define MTAG_IPFW_RULE 1262273568 /* rule reference */ > +#define MTAG_IPFW_CALL 1308397630 /* call stack */ > + > +#ifdef __APPLE__ > +#define offsetof(type, field) __builtin_offsetof(type, field) > +#endif > +struct ip_fw_args; > +extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); > + > +#if 1 /* include vnet.h */ > +#define curvnet NULL > +#define CURVNET_SET(_v) > +#define CURVNET_RESTORE() > +#define VNET_ASSERT(condition) > + > +#define VNET_NAME(n) n > +#define VNET_DECLARE(t, n) extern t n > +#define VNET_DEFINE(t, n) t n > +#define _VNET_PTR(b, n) &VNET_NAME(n) > +/* > + * Virtualized global variable accessor macros. 
> + */ > +#define VNET_VNET_PTR(vnet, n) (&(n)) > +#define VNET_VNET(vnet, n) (n) > + > +#define VNET_PTR(n) (&(n)) > +#define VNET(n) (n) > + > +#define IS_DEFAULT_VNET(x) (1) // always true > +#endif > + > +VNET_DECLARE(int, ip_defttl); > +#define V_ip_defttl VNET(ip_defttl); > + > + > +// int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); > +// XXX used in netmap_io.c > +int ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); > +int ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); > + > +/* hooks for divert */ > +extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); > + > +extern int (*ip_dn_ctl_ptr)(struct sockopt *); > +typedef int ip_fw_ctl_t(struct sockopt *); > +extern ip_fw_ctl_t *ip_fw_ctl_ptr; > + > + > +/* netgraph prototypes */ > +typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); > +extern ng_ipfw_input_t *ng_ipfw_input_p; > + > +/* For kernel ipfw_ether and ipfw_bridge. 
*/ > +struct ip_fw_args; > + > +#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) > +#define V_tcbinfo VNET(tcbinfo) > +#define V_udbinfo VNET(udbinfo) > +#endif /* _KERNEL */ > + > +// sys/eventhandler.h > +#define EVENTHANDLER_DECLARE(a, b) > + > +/* application specific */ > +struct sess; > +typedef int (handler_t)(struct sess *sess, void *arg); > + > +/* > + * flags to control the callback > + * WANT_READ select on read > + * WANT_WRITE select on write > + * WANT_RUN run unconditionally > + * WANT_DELETE session is exiting > + */ > +enum flags_t { > + WANT_READ=1, WANT_WRITE=2, WANT_RUN=4, > + WANT_DELETE=0x8000 > +}; > + > +struct sess { > + struct sess *next; > + int fd; > + handler_t *func; > + void *arg; > + enum flags_t flags; > + void *private; /* pointer managed by the session code */ > +}; > +struct sess * > +new_session(int fd, handler_t *func, void *arg, enum flags_t flags); > + > + > +void netmap_add_port(const char *dev); > +#endif /* !_MISSING_H_ */ > diff --git a/example/ipfw/extra/session.c b/example/ipfw/extra/session.c > new file mode 100644 > index 0000000..333edd3 > --- /dev/null > +++ b/example/ipfw/extra/session.c > @@ -0,0 +1,644 @@ > +/* > + * Session handler to simulate soopt* and network communication > + * over a TCP socket, and also run the callbacks. 
> + */ > + > +#ifdef _KERNEL > +#undef _KERNEL > +#endif > +/* these headers need to be compiled without _KERNEL */ > +#include <sys/types.h> > +#include <sys/select.h> > +#include <sys/socket.h> > +#include <netinet/in.h> > +#include <netinet/tcp.h> // TCP_NODELAY > +#include <sys/cpuset.h> // freebsd, used in rmlock > +#include <net/pfil.h> // PFIL_IN > +#include <sys/errno.h> > +extern int errno; > + > + > +#ifdef free > +/* we are built in a pseudo-kernel env so malloc and free are redefined */ > +#undef free > +#undef malloc > +#endif /* free */ > + > +#include <stdio.h> > +#include <pthread.h> > +#include <fcntl.h> > +#include <sys/time.h> /* timersub */ > +#include <stdlib.h> > +#include <string.h> > +#include <unistd.h> /* read() */ > + > +#include <sys/mbuf.h> /* mbuf */ > +#define _KERNEL > + > +/* args for ipfw */ > +#include <netinet/ip_fw.h> > +#include <netpfil/ipfw/ip_fw_private.h> > + > +/* > + * Global variables need to be somewhere... > + */ > +void ip_dn_init(void); > +int ipfw_init(void); > +void ipfw_destroy(void); > + > +extern int (*ip_fw_ctl_ptr)(struct sockopt *); > +extern int (*ip_dn_ctl_ptr)(struct sockopt *); > +extern struct ip_fw *ip_fw_default_rule; > + > +extern int ticks; /* kernel ticks counter */ > + > +int callout_startup(void); > +int callout_run(void); > + > +/* > + * generic handler for sockopt functions > + */ > +static int > +ctl_handler(struct sockopt *sopt) > +{ > + int error = EINVAL; > + > + ND("called, level %d", sopt->sopt_level); > + if (sopt->sopt_level != IPPROTO_IP) > + return (EINVAL); > + switch (sopt->sopt_name) { > + default: > + D("command not recognised %d", sopt->sopt_name); > + break; > + case IP_FW3: // XXX untested > + case IP_FW_ADD: /* ADD actually returns the body... 
*/ > + case IP_FW_GET: > + case IP_FW_DEL: > + case IP_FW_TABLE_GETSIZE: > + case IP_FW_TABLE_LIST: > + case IP_FW_NAT_GET_CONFIG: > + case IP_FW_NAT_GET_LOG: > + case IP_FW_FLUSH: > + case IP_FW_ZERO: > + case IP_FW_RESETLOG: > + case IP_FW_TABLE_ADD: > + case IP_FW_TABLE_DEL: > + case IP_FW_TABLE_FLUSH: > + case IP_FW_NAT_CFG: > + case IP_FW_NAT_DEL: > + if (ip_fw_ctl_ptr != NULL) > + error = ip_fw_ctl_ptr(sopt); > + else { > + D("ipfw not enabled"); > + error = ENOPROTOOPT; > + } > + break; > + > + case IP_DUMMYNET_GET: > + case IP_DUMMYNET_CONFIGURE: > + case IP_DUMMYNET_DEL: > + case IP_DUMMYNET_FLUSH: > + case IP_DUMMYNET3: > + if (ip_dn_ctl_ptr != NULL) > + error = ip_dn_ctl_ptr(sopt); > + else > + error = ENOPROTOOPT; > + break ; > + } > + ND("returning error %d", error); > + return error; > +} > + > +/* > + * copy data back to userland > + */ > +int > +sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) > +{ > + size_t valsize = sopt->sopt_valsize; > + > + ND("data len %d sopt_len %d", (int)len, (int)valsize); > + if (len < valsize) > + sopt->sopt_valsize = valsize = len; > + bcopy(buf, sopt->sopt_val, valsize); > + return 0; > +} > + > +int > +copyout(const void *kaddr, void *uaddr, size_t len) > +{ > + bcopy(kaddr, uaddr, len); > + return 0; /* no fault */ > +} > + > +/* > + * copy data from userland to kernel > + */ > +int > +sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) > +{ > + size_t valsize = sopt->sopt_valsize; > + > + ND("have %d len %d minlen %d", (int)valsize, (int)len, (int)minlen); > + if (valsize < minlen) > + return EINVAL; > + if (valsize > len) > + sopt->sopt_valsize = valsize = len; > + bcopy(sopt->sopt_val, buf, valsize); > + return 0; > +} > + > +/* > + * session description for event-based programming > + */ > +/* event-based session support */ > + > +#define SOCK_QLEN 5 /* listen length for incoming connection */ > + > +static struct sess *all_sessions, *new_sessions; > + > +struct sess * > 
+new_session(int fd, handler_t *func, void *arg, enum flags_t flags) > +{ > + struct sess *desc; > + desc = calloc(1, sizeof(*desc)); > + if (desc == NULL) > + return NULL; > + desc->fd = fd; > + desc->func = func; > + desc->arg = arg; > + desc->flags = flags; > + desc->next = new_sessions; > + new_sessions = desc; > + return desc; > +} > + > +/* remove deleted sessions, merge with new ones */ > +static void > +merge_sessions(void) > +{ > + struct sess *cur, *prev, *tmp; > + > + for (prev = NULL, cur = all_sessions; cur; prev = cur, cur = tmp) { > + tmp = cur->next; > + if ( (cur->flags & WANT_DELETE) == 0) > + continue; > + if (prev) > + prev->next = cur->next; > + else > + all_sessions = cur->next; > + memset(cur, 0, sizeof(*cur)); > + free(cur); > + cur = prev; > + } > + if (prev) > + prev->next = new_sessions; > + else > + all_sessions = new_sessions; > + new_sessions = NULL; > +} > + > +/* set the fdset, return the fdmax+1 for select() */ > +int > +set_sessions(fd_set *r, fd_set *w) > +{ > + struct sess *cur; > + int fd_max = -1; > + int count = 0,ready = 0; > + > + FD_ZERO(r); > + FD_ZERO(w); > + merge_sessions(); > + for (cur = all_sessions; cur; cur = cur->next) { > + count++; > + if (cur->flags & WANT_RUN) { > + ND("WANT_RUN on session %p", cur); > + cur->flags &= ~WANT_RUN; > + cur->func(cur, cur->arg); > + } > + if (cur->flags & WANT_READ) > + FD_SET(cur->fd, r); > + if (cur->flags & WANT_WRITE) > + FD_SET(cur->fd, w); > + if (cur->flags & (WANT_WRITE|WANT_READ)) { > + ready ++; > + if (cur->fd > fd_max) > + fd_max = cur->fd; > + } > + } > + ND("%d session %d waiting", count, ready); > + return fd_max + 1; > +} > + > +int > +run_sessions(fd_set *r, fd_set *w) > +{ > + struct sess *cur; > + > + for (cur = all_sessions; cur; cur = cur->next) { > + int fd = cur->fd; > + // fprintf(stderr, "%s sess %p\n", __FUNCTION__, cur); > + if (FD_ISSET(fd, r) || FD_ISSET(fd, w)) > + cur->func(cur, cur->arg); > + } > + return 0; > +} > + > +struct sess_buf { > + int 
len; /* allocation length */ > + int used; /* bytes used */ > + int start; /* start position for next write */ > + char data[0]; > +}; > + > +struct sess_buf * > +get_buf(int size, struct sess_buf *old) > +{ > + struct sess_buf *p = old; > + > + if (!p) { > + ND("new buffer size %d", size); > + p = calloc(1, sizeof(*p) + size); > + } else if (p->len >= size) { > + return p; > + } else { > + ND("calling realloc %p %d", old, size); > + p = realloc(old, sizeof(*p) + size); > + } > + if (!p) { > + if (old) > + free(old); > + } else { > + p->len = size; > + } > + return p; > +} > + > +/* > + * do a non-blocking read into the buffer, reallocating if space > + * is needed. > + */ > +static struct sess_buf * > +get_data(int fd, struct sess_buf *buf, int want) > +{ > + int l; > + > + buf = get_buf(want, buf); > + if (buf == NULL) > + return buf; > + l = read(fd, buf->data + buf->used, want - buf->used); > + if (l > 0) > + buf->used += l; > + return buf; > +} > + > +/* > + * Handler for a request coming from the control socket. > + */ > +enum sockopt_state { > + READING = 0, WRITING = 1 > +}; > + > +struct sockopt_desc { > + int state; /* internal state */ > + struct sess_buf *rd; > + struct sess_buf *wr; > +}; > + > +/* header prepended to data in all transactions */ > +struct rx_hdr { > + uint32_t optlen; /* data len */ > + uint32_t level; /* or error ? */ > + uint32_t optname; /* or desired len ? */ > + uint32_t dir; /* in or out */ > +}; > + > +/* > + * Return the number of remaining bytes from the buffer. > + * The message is int optname; [int optlen; int data] > + * where the second part is present or not depending on the > + * message type. > + */ > +int > +get_want(struct sess_buf *rd, struct rx_hdr *r) > +{ > + struct rx_hdr _r; > + int l = sizeof(_r); > + > + if (r == NULL) > + r = &_r; > + if (!rd || rd->used < l) { > + ND("short buffer (%d), return %d to bootstrap", > + rd ? 
rd->used : -1, l); > + return l; > + } > + bcopy(rd->data, r, l); > + /* header fields are in network format, convert to host fmt */ > + r->optlen = ntohl(r->optlen); > + r->level = ntohl(r->level); > + r->optname = ntohl(r->optname); > + r->dir = ntohl(r->dir); > + l += r->optlen; > + return l; > +} > + > +/* > + * The sockopt commands are sent in network format (at least the header) > + */ > +int > +sockopt_handler(struct sess *sess, void *arg) > +{ > + struct sockopt_desc *d; > + int error = 1; > + > + ND("sess %p arg %p", sess, arg); > + if (sess->private == NULL) > + sess->private = calloc(1, sizeof(struct sockopt_desc)); > + d = sess->private; > + if (d == NULL) > + goto done; > + if (sess->flags & WANT_READ) { > + int l, want, prev; > + struct rx_hdr r; > + struct sockopt sopt; > + struct thread dummy; > + > + want = get_want(d->rd, &r); > + prev = d->rd ? d->rd->used : 0; > + ND("total message size is %d (prev %d)", want, prev); > + > + d->rd = get_data(sess->fd, d->rd, want); > + l = d->rd ? d->rd->used : 0; > + ND("read %d prev %d want %d", l, prev, want); > + if (l == prev) /* no data -> error */ > + goto done; > + want = get_want(d->rd, &r); > + ND("again, want %d l %d", want, l); > + if (l < want) /* must read more data */ > + return 0; > + sopt.sopt_dir = r.dir; > + sopt.sopt_level = r.level; > + sopt.sopt_name = r.optname; > + sopt.sopt_val = > + (l <= sizeof(r)) ? 
NULL : d->rd->data + sizeof(r); > + sopt.sopt_valsize = r.optlen; > + sopt.sopt_td = &dummy; > + ND("dir 0x%x lev %d opt %d optval %p optlen %d", > + sopt.sopt_dir, > + sopt.sopt_level, > + sopt.sopt_name, > + sopt.sopt_val, > + (int)sopt.sopt_valsize); > + > + /* now call the handler */ > + r.level = htonl(ctl_handler(&sopt)); > + ND("handler returns %d", ntohl(r.level)); > + r.optlen = htonl(0); /* default len */ > + r.dir = htonl(sopt.sopt_dir); > + /* prepare the buffer for writing */ > + if (d->wr != NULL) { /* previous write buffer */ > + free(d->wr); > + } > + d->wr = d->rd; > + d->rd = NULL; > + d->wr->used = sopt.sopt_valsize + sizeof(r); > + d->wr->start = 0; > + /* now update the header */ > + if (sopt.sopt_dir == SOPT_GET) > + r.optlen = htonl(sopt.sopt_valsize); > + > + bcopy(&r, d->wr->data, sizeof(r)); > + > + sess->flags = WANT_WRITE; > + return 0; > + } > + if (sess->flags & WANT_WRITE) { > + struct sess_buf *wr = d->wr; > + > + int l = write(sess->fd, wr->data + wr->start, > + wr->used - wr->start); > + ND("written %d bytes out of %d", l, > + wr->used - wr->start); > + if (l <= 0) { > + if (errno == EAGAIN) > + return 0; > + goto done; /* error */ > + } > + wr->start += l; > + if (wr->start < wr->used) > + return 0; > + // prepare for another rpc > + sess->flags = WANT_READ; > + return 0; > + //goto done; > + } > +done: > + ND("closing session"); > + if (d) { > + if (sess->fd >= 0) > + close(sess->fd); > + if (d->rd) > + free(d->rd); > + if (d->wr) > + free(d->wr); > + d->rd = d->wr = NULL; > + free(d); /* private data */ > + sess->flags = WANT_DELETE; > + } > + return error; > +} > + > + > +/* > + * testing code when reading fake packets from socket 5556. 
> + * Turns out that ipfw_check_hook() is a lot slower than ipfw_chk() > + * XXX new ipfw uses ipfw_check_frame or ipfw_check_packet > + */ > +int > +packet_handler(struct sess *sess, void *arg) > +{ > + char fake_buf[2048]; > + struct mbuf dm; > + int i; > + > + bzero(&dm, sizeof(dm)); > + dm.m_data = fake_buf + 14; /* skip mac hdr */ > + dm.m_len = dm.m_pkthdr.len = 128; > + fake_buf[14] = 0x45; // ip > + *(uint16_t *)(fake_buf+16) = htons(64); // bytes > + *(uint32_t *)(fake_buf+26) = htonl(0x01020304); // src > + *(uint32_t *)(fake_buf+30) = htonl(0x05060708); // dst > + { > +#if 0 > + struct ip_fw_args args; > + bzero(&args, sizeof(args)); > + args.m = &dm; > + for (i = 0; i < 1000; i++) > + ipfw_chk(&args); > +#else > + struct ifnet *ifp = NULL; > + struct inpcb *inp = NULL; > + struct mbuf *m = &dm; > + ND("sess %p arg %p", sess, arg); > + for (i = 0; i < 1000; i++) > + ipfw_check_packet(NULL, &m, ifp, PFIL_IN, inp); > +#endif > + } > + return 0; > +} > + > + > +/* > + * This task accepts a new connection and creates a new session. > + */ > +static int > +listener(struct sess *sess, void *arg) > +{ > + int fd; > + > + ND("sess %p arg %p", sess, arg); > + fd = accept(sess->fd, NULL, NULL); > + if (fd < 0) > + return -1; > + fcntl(fd, F_SETFL, O_NONBLOCK); > +#ifdef setsockopt /* make sure we don't redefine it */ > +#error cannot compile this > +#endif > + { > + int on = 1, ret; > + ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); > + ND("TCP_NODELAY returns %d", ret); > + } > + new_session(fd, sess->arg ? sockopt_handler: packet_handler, > + sess->arg, WANT_READ); > + return 0; > +} > + > +/* > + * listen on a socket, > + * return the listen fd or -1 on error. 
> + */ > +static int > +do_server(const char *addr, int port) > +{ > + int fd = -1, on; > + struct sockaddr_in server; > + > + /* open the listen socket */ > + fd = socket(AF_INET, SOCK_STREAM, 0); > + if (fd < 0) { > + perror( "socket" ); > + return -1; > + } > + > + on = 1; > +#ifdef SO_REUSEADDR > + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) > + perror("SO_REUSEADDR failed(non fatal)"); > +#endif > +#ifdef SO_REUSEPORT > + on = 1; > + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) == -1) > + perror("SO_REUSEPORT failed(non fatal)"); > +#endif > + > + /* fill the server struct */ > + bzero(&server, sizeof(server)); > + server.sin_family = AF_INET; > + inet_aton(addr, &server.sin_addr); > + server.sin_port = htons(port); > + > + /* bind the local address */ > + if (bind(fd, (struct sockaddr*) &server, sizeof(server)) < 0) { > + perror( "bind" ); > + return -1; > + } > + D("+++ listening tcp %s:%d", > + inet_ntoa(server.sin_addr), ntohs(server.sin_port)); > + > + /* listen for incoming connection */ > + if (listen(fd, SOCK_QLEN) < 0) { > + perror( "listen" ); > + return -1; > + } > + return fd; > +} > + > +extern int ipfw_module_init(void); > + > +/* > + * main program for ipfw kernel side when running an userspace emulation: > + * open a socket on which we receive requests from userland, > + * another socket for calls from the 'kernel' (simulating packet > + * arrivals etc), and then periodically run the tick handler. 
> + */ > +int > +mainloop(int argc, char *argv[]) > +{ > + int listen_fd; > + struct timeval t0; > + const char *s, *addr = LOCALADDR; > + int port = IPFW_PORT; > + int i; > + int old_ticks; > + uint64_t callouts = 0, skipped = 0; > + > + gettimeofday(&t0, NULL); > + old_ticks = ticks = 0; > + callout_startup(); > + > + ipfw_module_init(); > + > + /* override the host if set in the environment */ > + s = getenv("IPFW_HOST"); > + if (s) > + addr = s; > + s = getenv("IPFW_PORT"); > + if (s && atoi(s) > 0) > + port = atoi(s); > + /* start the server */ > + listen_fd = do_server(addr, port); > + if (listen_fd < 0) { > + printf("Error starting server\n"); > + return -1; > + } > + new_session(listen_fd, listener, (void *)1, WANT_READ); > + > +#ifdef WITH_NETMAP > + for (i = 1; i < argc; i++) { > + netmap_add_port(argv[i]); > + } > +#endif /* WITH_NETMAP */ > + > +#if 0 // test code: a telnet on 5556 becomes an infinite source > + { > + int net_fd = do_server(addr, port+1); > + if (net_fd >= 0) > + new_session(net_fd, listener, NULL, WANT_READ); > + } > +#endif > + > + for (;;) { > + struct timeval now, delta = { 0, tick} ; > + int n; > + fd_set r, w; > + > + n = set_sessions(&r, &w); > + select(n, &r, &w, NULL, &delta); > + run_sessions(&r, &w); > + gettimeofday(&now, 0); > + timersub(&now, &t0, &delta); > + /* compute absolute ticks. 
*/ > + ticks = (delta.tv_sec * hz) + (delta.tv_usec * hz) / 1000000; > + if (old_ticks != ticks) { > + callouts++; > + callout_run(); > + old_ticks = ticks; > + } else { > + skipped++; > + } > + RD(1, "callouts %lu skipped %lu", (u_long)callouts, (u_long)skipped); > + } > + ipfw_destroy(); > + return 0; > +} > diff --git a/example/ipfw/extra/sys/contrib/pf/net/pfvar.h b/example/ipfw/extra/sys/contrib/pf/net/pfvar.h > new file mode 100644 > index 0000000..257bbd6 > --- /dev/null > +++ b/example/ipfw/extra/sys/contrib/pf/net/pfvar.h > @@ -0,0 +1,27 @@ > +/* > + * replacement for FreeBSD's pfqueue.h > + */ > +#include <sys/queue.h> > + > +#define DIOCSTARTALTQ _IO ('D', 42) > +#define DIOCSTOPALTQ _IO ('D', 43) > + > +struct pf_altq { > + TAILQ_ENTRY(pf_altq) entries; > + /* ... */ > + u_int32_t qid; /* return value */ > + > +#define PF_QNAME_SIZE 64 > + char qname[PF_QNAME_SIZE]; /* queue name */ > + > +}; > + > +struct pfioc_altq { > + u_int32_t action; > + u_int32_t ticket; > + u_int32_t nr; > + struct pf_altq altq; > +}; > + > +#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) > +#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) > diff --git a/example/ipfw/extra/sys/sys/kernel.h b/example/ipfw/extra/sys/sys/kernel.h > new file mode 100644 > index 0000000..f234d4a > --- /dev/null > +++ b/example/ipfw/extra/sys/sys/kernel.h > @@ -0,0 +1,26 @@ > +/* > + * from freebsd's kernel.h > + */ > +#ifndef _SYS_KERNEL_H_ > +#define _SYS_KERNEL_H_ > + > +#define SYSINIT(a, b, c, d, e) \ > + int (*sysinit_ ## d)(void *) = (int (*)(void *))(d) > +#define VNET_SYSINIT(a, b, c, d, e) \ > + SYSINIT(a, b, c, d, e) > +#define SYSUNINIT(a, b, c, d, e) \ > + int (*sysuninit_ ## d)(void *) = (int (*)(void *))(d) > +#define VNET_SYSUNINIT(a, b, c, d, e) \ > + SYSUNINIT(a, b, c, d, e) > + > +/* > + * Some enumerated orders; "ANY" sorts last. 
> + */ > +enum sysinit_elem_order { > + SI_ORDER_FIRST = 0x0000000, /* first*/ > + SI_ORDER_SECOND = 0x0000001, /* second*/ > + SI_ORDER_THIRD = 0x0000002, /* third*/ > + SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ > + SI_ORDER_ANY = 0xfffffff /* last*/ > +}; > +#endif > diff --git a/example/ipfw/extra/sys/sys/malloc.h b/example/ipfw/extra/sys/sys/malloc.h > new file mode 100644 > index 0000000..9bc64a3 > --- /dev/null > +++ b/example/ipfw/extra/sys/sys/malloc.h > @@ -0,0 +1,13 @@ > +/* > + * $Id$ > + * replacement for sys/malloc.h to compile kernel in userspace > + */ > + > +#ifndef _SYS_MALLOC_H_ > +#define _SYS_MALLOC_H_ > + > +#define M_WAITOK 0x0000 /* can block */ > +#define M_NOWAIT 0x0001 /* do not block */ > +#define M_ZERO 0x0100 /* bzero the allocation */ > +#endif /* _SYS_MALLOC_H_ */ > + > diff --git a/example/ipfw/extra/sys/sys/mbuf.h b/example/ipfw/extra/sys/sys/mbuf.h > new file mode 100644 > index 0000000..1f3af63 > --- /dev/null > +++ b/example/ipfw/extra/sys/sys/mbuf.h > @@ -0,0 +1,383 @@ > +/* > + * Copyright (C) 2012 Luigi Rizzo, Universita` di Pisa > + * > + * BSD copyright. > + * > + * A simple compatibility interface to map mbufs onto userspace structs > + */ > + > +#ifndef _SYS_MBUF_H_ > +#define _SYS_MBUF_H_ > +#define VM_UMA_H // kill this one // maybe not needed > +#define _VM_UMA_H_ // kill this one too > + > +// #include <sys/malloc.h> /* we use free() */ > +/* hopefully queue.h is already included by someone else */ > +#include <sys/queue.h> > +#ifdef _KERNEL > + > +/* bzero not present on linux, but this should go in glue.h */ > +// #define bzero(s, n) memset(s, 0, n) > + > +/* > + * We implement a very simplified UMA allocator where the backend > + * is simply malloc, and uma_zone only stores the length of the components. 
> + */ > +typedef int uma_zone_t; /* the zone size */ > + > +#define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len) > +typedef int (*uma_init)(void *mem, int size, int flags); > +typedef void (*uma_fini)(void *mem, int size); > + > + > +#define uma_zfree(zone, item) free(item, M_IPFW) > +#define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags) > +#define uma_zdestroy(zone) do {} while (0) > + > +/*- > + * Macros for type conversion: > + * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. > + */ > +#define mtod(m, t) ((t)((m)->m_data)) > + > +#endif /* _KERNEL */ > + > +/* > + * Packet tag structure (see below for details). > + */ > +struct m_tag { > + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ > + u_int16_t m_tag_id; /* Tag ID */ > + u_int16_t m_tag_len; /* Length of data */ > + u_int32_t m_tag_cookie; /* ABI/Module ID */ > +// void (*m_tag_free)(struct m_tag *); > +}; > + > +/* > + * Auxiliary structure to store values from the sk_buf. > + * Note that we should not alter the sk_buff, and if we do > + * so make sure to keep the values in sync between the mbuf > + * and the sk_buff (especially m_len and m_pkthdr.len). > + */ > + > +struct skbuf; > + > +struct mbuf { > + struct mbuf *m_next; > + struct mbuf *m_nextpkt; > + void * m_data; /* XXX should change to caddr_t */ > + int32_t m_len; /* length in this mbuf */ > + int m_flags; > + struct { > + struct ifnet *rcvif; > + int len; /* total packet len */ > + SLIST_HEAD (packet_tags, m_tag) tags; > + } m_pkthdr; > + struct skbuf *m_skb; > + int __max_m_len; /* original value */ > + > + /* > + * in-stack mbuffers point to an external buffer, > + * the two variables below contain base and size, > + * and have M_STACK set in m_flags. 
> + * Buffers from the heap have __m_extbuf = (char *)m + MSIZE > + */ > + void *__m_extbuf; /* external buffer base */ > + int __m_extlen; /* data in ext buffer */ > + void (*__m_callback)(struct mbuf *, int); > + void *__m_peer; /* argument attached to the mbuf */ > +}; > + > +/* > + * note we also have M_FASTFWD_OURS mapped to M_PROTO1 0x10 > + */ > +#ifndef M_SKIP_FIREWALL /* XXX conflict in FreeBSD */ > +#define M_SKIP_FIREWALL 0x01 /* skip firewall processing */ > +#else > +#define M_PROTO3 0x01 // FreeBSD 10 and 11 > +#endif /* XXX conflict in FreeBSD */ > + > +#define M_BCAST 0x02 /* send/received as link-level broadcast */ > +#define M_MCAST 0x04 /* send/received as link-level multicast */ > +#define M_PROTO1 0x10 > +#define M_PROTO2 0x20 > +#define M_FASTFWD_OURS M_PROTO1 > +#define M_IP_NEXTHOP M_PROTO2 > +#define M_STACK 0x1000 /* allocated on the stack */ > + > +void m_freem(struct mbuf *m); > + > +#ifdef _KERNEL > + > +/* > + * m_dup() is used in the TEE case, currently unsupported so we > + * just return. > + */ > +static __inline struct mbuf *m_dup(struct mbuf *m, int n) > +{ > + (void)m; /* UNUSED */ > + (void)n; /* UNUSED */ > + D("unimplemented, expect panic"); > + return NULL; > +} > + > + > +static __inline void > +m_tag_prepend(struct mbuf *m, struct m_tag *t) > +{ > + ND("m %p tag %p", m, t); > + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); > +} > + > +/* > + * Unlink a tag from the list of tags associated with an mbuf. > + */ > +static __inline void > +m_tag_unlink(struct mbuf *m, struct m_tag *t) > +{ > + > + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); > +} > + > +/* > + * Return the next tag in the list of tags associated with an mbuf. 
> + */ > +static __inline struct m_tag * > +m_tag_next(struct mbuf *m, struct m_tag *t) > +{ > + D("mbuf %p tag %p", m, t); > + return (SLIST_NEXT(t, m_tag_link)); > +} > + > +extern SLIST_HEAD (tags_freelist, m_tag) tags_freelist; > +extern int tags_minlen; > +extern int tags_freelist_count; > + > +extern int max_protohdr; /* uipc_mbuf.c - max proto header */ > + > +/* > + * Create an mtag of the given type > + */ > +static __inline struct m_tag * > +m_tag_alloc(uint32_t cookie, int type, int length, int wait) > +{ > + static int maxlen = 0; > + int l = length + sizeof(struct m_tag); > + struct m_tag *m = NULL; > + > + if (l > maxlen) { > + D("new maxlen %d (%d)", l, length ); > + maxlen = l; > + } > + if (l <= tags_minlen) { > + l = tags_minlen; > + m = SLIST_FIRST(&tags_freelist); > + } > + if (m) { > + SLIST_REMOVE_HEAD(&tags_freelist, m_tag_link); > + ND("allocate from freelist"); > + tags_freelist_count--; > + } else { > + ND("size %d allocate from malloc", l); > + m = malloc(l, 0, M_NOWAIT); > + } > + if (m) { > + bzero(m, l); > + m->m_tag_id = type; > + m->m_tag_len = length; > + m->m_tag_cookie = cookie; > + ND("tag %p cookie %d type %d", m, cookie, type); > + } > + return m; > +} > + > +#define MTAG_ABI_COMPAT 0 /* compatibility ABI */ > + > +static __inline struct m_tag * > +m_tag_get(int type, int length, int wait) > +{ > + return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); > +} > + > +static __inline struct m_tag * > +m_tag_first(struct mbuf *m) > +{ > + struct m_tag *t; > + t = SLIST_FIRST(&m->m_pkthdr.tags); > + ND("mbuf %p has %p", m, t); > + return t; > +} > + > +static __inline void > +m_tag_delete(struct mbuf *m, struct m_tag *t) > +{ > + D("mbuf %p tag %p, ******* unimplemented", m, t); > +} > + > +static __inline struct m_tag * > +m_tag_locate(struct mbuf *m, u_int32_t cookie, int x, struct m_tag *t) > +{ > + struct m_tag *tag; > + > + ND("search %d %d in mbuf %p at %p", cookie, x, m, t); > + if (t) > + D("--- XXX ignore non-null t %p", 
t); > + tag = SLIST_FIRST(&m->m_pkthdr.tags); > + if (tag == NULL) > + return NULL; > + > + ND("found tag %p cookie %d type %d (want %d %d)", > + tag, tag->m_tag_cookie, tag->m_tag_id, cookie, x); > + if (tag->m_tag_cookie != cookie || tag->m_tag_id != x) { > + ND("want %d %d have %d %d, expect panic", > + cookie, x, tag->m_tag_cookie, tag->m_tag_id); > + return NULL; > + } else > + return tag; > +} > + > +static __inline struct m_tag * > +m_tag_find(struct mbuf *m, int type, struct m_tag *start) > +{ > + D("m %p", m); > + return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : > + m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); > +} > + > +#define M_SETFIB(_m, _fib) /* nothing on linux */ > + > + > +/* m_pullup is not supported, there is a macro in missing.h */ > + > +#define M_GETFIB(_m) 0 > + > +/* macro used to create a new mbuf */ > +#define MT_DATA 1 /* dynamic (data) allocation */ > +#ifndef MSIZE // defined on osx > +#define MSIZE 256 /* size of an mbuf */ > +#endif > +#define MGETHDR(_m, _how, _type) ((_m) = m_gethdr((_how), (_type))) > +#define MY_MCLBYTES 2048 /* XXX make slightly less */ > + > + > +extern struct mbuf *mbuf_freelist; > + > +/* allocate and init a new mbuf using the same structure of FreeBSD */ > +/* > + * XXX for the userspace version, we actually allocate > + * MCLBYTES right after the buffer to store a copy of the packet. 
> + */ > +static __inline struct mbuf * > +m_gethdr(int how, short type) > +{ > + struct mbuf *m; > + static const struct mbuf m0; /* zero-initialized */ > + > + if (mbuf_freelist) { > + m = mbuf_freelist; > + mbuf_freelist = m->m_next; > + *m = m0; > + } else { > + m = malloc(MY_MCLBYTES, M_IPFW, M_NOWAIT); > + } > + > + ND("new mbuf %p", m); > + if (m == NULL) { > + panic("mgethdr failed"); > + return m; > + } > + > + /* here we have MSIZE - sizeof(struct mbuf) available */ > + m->m_data = m + 1; > + m->__m_extbuf = (char *)m + MSIZE; > + m->__m_extlen = MY_MCLBYTES - MSIZE; > + > + return m; > +} > + > + > +/* > + * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be > + * allocated, how specifies whether to wait. If the allocation fails, the > + * original mbuf chain is freed and m is set to NULL. > + */ > +static inline void M_PREPEND(struct mbuf *m, int plen, int how) > +{ \ > + if (plen < 0 || plen + m->m_len > m->__max_m_len) { > + D("size too large"); > + } else { > + m->m_data -= plen; > + m->m_len += plen; > + } > +} > + > +static inline void > +m_adj(struct mbuf *mp, int req_len) > +{ > + if (req_len < 0 || req_len > mp->m_len) { > + D("no m_adj for len %d in mlen %d", req_len, mp->m_len); > + } else { > + mp->m_data += req_len; > + mp->m_len -= req_len; > + } > +} > + > +#define M_PREPEND_GOOD(m, plen, how) do { \ > + struct mbuf **_mmp = &(m); \ > + struct mbuf *_mm = *_mmp; \ > + int _mplen = (plen); \ > + int __mhow = (how); \ > + \ > + MBUF_CHECKSLEEP(how); \ > + if (M_LEADINGSPACE(_mm) >= _mplen) { \ > + _mm->m_data -= _mplen; \ > + _mm->m_len += _mplen; \ > + } else \ > + _mm = m_prepend(_mm, _mplen, __mhow); \ > + if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ > + _mm->m_pkthdr.len += _mplen; \ > + *_mmp = _mm; \ > +} while (0) > + > +/* > + * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise > + * tags are expected to ``vanish'' when they pass through a network > + * interface. 
For most interfaces this happens normally as the tags are > + * reclaimed when the mbuf is free'd. However in some special cases > + * reclaiming must be done manually. An example is packets that pass through > + * the loopback interface. Also, one must be careful to do this when > + * ``turning around'' packets (e.g., icmp_reflect). > + * > + * To mark a tag persistent bit-or this flag in when defining the tag id. > + * The tag will then be treated as described above. > + */ > +#define MTAG_PERSISTENT 0x800 > + > +#define PACKET_TAG_NONE 0 /* Nadda */ > + > +/* Packet tags for use with PACKET_ABI_COMPAT. */ > +#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ > +#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ > +#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ > +#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ > +#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ > +#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ > +#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ > +#define PACKET_TAG_GIF 8 /* GIF processing done */ > +#define PACKET_TAG_GRE 9 /* GRE processing done */ > +#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ > +#define PACKET_TAG_ENCAP 11 /* Encap. 
processing */ > +#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ > +#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ > +#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ > +#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ > +#define PACKET_TAG_DIVERT 17 /* divert info */ > +#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ > +#define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ > +#define PACKET_TAG_PF 21 /* PF + ALTQ information */ > +#define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ > +#define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ > +#define PACKET_TAG_CARP 28 /* CARP info */ > + > +#endif /* _KERNEL */ > +#endif /* !_SYS_MBUF_H_ */ > diff --git a/example/ipfw/extra/sys/sys/module.h b/example/ipfw/extra/sys/sys/module.h > new file mode 100644 > index 0000000..310e22b > --- /dev/null > +++ b/example/ipfw/extra/sys/sys/module.h > @@ -0,0 +1,43 @@ > +/* > + * trivial module support > + */ > +#ifndef _SYS_MODULE_H_ > +#define _SYS_MODULE_H_ > +typedef struct module *module_t; > +typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); > + > +typedef enum modeventtype { > + MOD_LOAD, > + MOD_UNLOAD, > + MOD_SHUTDOWN, > + MOD_QUIESCE > +} modeventtype_t; > + > +typedef struct moduledata { > + const char *name; /* module name */ > + modeventhand_t evhand; /* event handler */ > + void *priv; /* extra data */ > +} moduledata_t; > + > +/* > + * Hook the module descriptor, md, into our list of things to do. > + * We should in principle respect the order of loading. 
> + * > + * XXX use the gcc .init functions > + */ > +#define DECLARE_MODULE(a, md, c,d) \ > + moduledata_t *moddesc_##a = &md > + > +/* > + * XXX MODULE_VERSION is define in linux too > + */ > +#define MODULE_DEPEND(a,b,c,d,e) struct __module_depend > +#if 1 // !defined(__FreeBSD__) // defined( __linux__ ) || defined( _WIN32 ) > +#undef MODULE_VERSION > +#define MODULE_VERSION(a,b) struct __module_version > +#endif > + > +#define FEATURE(a, b) struct __feature > + > +#endif /* _SYS_MODULE_H_ */ > + > diff --git a/example/ipfw/extra/sys/sys/systm.h b/example/ipfw/extra/sys/sys/systm.h > new file mode 100644 > index 0000000..94036c9 > --- /dev/null > +++ b/example/ipfw/extra/sys/sys/systm.h > @@ -0,0 +1,159 @@ > +#ifndef _SYS_SYSTM_H_ > +#define _SYS_SYSTM_H_ > + > +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ > +#define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ > + > +#if defined(USERSPACE) // freebsd userspace > + > +#include <sys/queue.h> > +#ifdef __FreeBSD__ > +#include <sys/taskqueue.h> > +#endif > + > +/// SLIST_HEAD(callout_list, callout); > +struct callout; > +TAILQ_HEAD(callout_tailq, callout); > +struct callout { > + union { > + //SLIST_ENTRY(callout) sle; > + TAILQ_ENTRY(callout) tqe; > + } c_links; > + int c_time; /* ticks to the event */ > + void *c_arg; /* function argument */ > + void (*c_func)(void *); /* function to call */ > + struct lock_object *c_lock; /* lock to handle */ > + int c_flags; /* state of this entry */ > + volatile int c_cpu; /* CPU we're scheduled on */ > + > +}; > + > + > +int callout_drain(struct callout *c); > +void callout_init(struct callout *c, int safe); > +int callout_reset(struct callout *c, int ticks, void (*fn)(void *), void *arg); > +int callout_reset_on(struct callout *c, int ticks, void (*fn)(void *), void *arg, int cpu); > + > +#else /* linux or windows */ > + > +#ifndef _WIN32 /* this is the linux version */ > +/* callout support, in <sys/callout.h> on FreeBSD */ > +/* > + * callout 
support on linux module is done using timers > + */ > +#include <linux/timer.h> > +#ifdef LINUX_24 > +#include <linux/sched.h> /* jiffies definition is here in 2.4 */ > +#endif > +#define callout timer_list > +static __inline int > +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) > +{ > + co->expires = jiffies + ticks; > + co->function = (void (*)(unsigned long))fn; > + co->data = (unsigned long)arg; > + /* > + * Linux 2.6.31 and above has add_timer_on(co, cpu), > + * otherwise add_timer() always schedules a callout on the same > + * CPU used the first time, so we don't need more. > + */ > + add_timer(co); > + return 0; > +} > + > +#define callout_init(co, safe) init_timer(co) > +#define callout_drain(co) del_timer(co) > +#define callout_stop(co) del_timer(co) > + > +#else /* _WIN32 */ > +#include <ndis.h> > + > +/* This is the windows part for callout support */ > +struct callout { > + KTIMER thetimer; > + KDPC timerdpc; > + int dpcinitialized; > + LARGE_INTEGER duetime; > +}; > + > +void dummynet (void*); > +VOID dummynet_dpc( > + __in struct _KDPC *Dpc, > + __in_opt PVOID DeferredContext, > + __in_opt PVOID SystemArgument1, > + __in_opt PVOID SystemArgument2 > + ); > + > +VOID ipfw_dpc( > + __in struct _KDPC *Dpc, > + __in_opt PVOID DeferredContext, > + __in_opt PVOID SystemArgument1, > + __in_opt PVOID SystemArgument2 > + ); > + > +/* callout_reset must handle two problems: > + * - dummynet() scheduler must be run always on the same processor > + * because do_gettimeofday() is based on cpu performance counter, and > + * _occasionally_ can leap backward in time if we query another cpu. > + * typically this won't happen that much, and the cpu will almost always > + * be the same even without the affinity restriction, but better to be sure. > + * - ipfw_tick() does not have the granularity requirements of dummynet() > + * but we need to pass a pointer as argument. 
> + * > + * for these reasons, if we are called for dummynet() timer, > + * KeInitializeDpc is called only once as it should be, and the thread > + * is forced on cpu0 (which is always present), while if we're called > + * for ipfw_tick(), we re-initialize the DPC each time, using > + * parameter DeferredContext to pass the needed pointer. since this > + * timer is called only once a sec, this won't hurt that much. > + */ > +static __inline int > +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) > +{ > + if(fn == &dummynet) > + { > + if(co->dpcinitialized == 0) > + { > + KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL); > + KeSetTargetProcessorDpc(&co->timerdpc, cpu); > + co->dpcinitialized = 1; > + } > + } > + else > + { > + KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg); > + } > + co->duetime.QuadPart = (-ticks)*10000; > + KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc); > + return 0; > +} > + > +static __inline void > +callout_init(struct callout* co, int safe) > +{ > + printf("%s: initializing timer at %p\n",__FUNCTION__,co); > + KeInitializeTimer(&co->thetimer); > +} > + > +static __inline int > +callout_drain(struct callout* co) > +{ > + BOOLEAN canceled = KeCancelTimer(&co->thetimer); > + while (canceled != TRUE) > + { > + canceled = KeCancelTimer(&co->thetimer); > + } > + printf("%s: stopping timer at %p\n",__FUNCTION__,co); > + return 0; > +} > + > +static __inline int > +callout_stop(struct callout* co) > +{ > + return callout_drain(co); > +} > + > +#endif /* _WIN32 */ > +#endif /* linux or windows */ > + > +#endif /* _SYS_SYSTM_H_ */ > diff --git a/example/ipfw/extra/sys/sys/taskqueue.h b/example/ipfw/extra/sys/sys/taskqueue.h > new file mode 100644 > index 0000000..a9f79a0 > --- /dev/null > +++ b/example/ipfw/extra/sys/sys/taskqueue.h > @@ -0,0 +1,51 @@ > +#ifndef _SYS_TASKQUEUE_H_ > +#define _SYS_TASKQUEUE_H_ > + > +/* > + * Remap taskqueue to direct calls > + */ > + > +#ifdef _WIN32 > +struct task { > + 
void (*func)(void*, int); > +}; > +#define taskqueue_enqueue_fast(tq, ta) (ta)->func(NULL,1) > +#define TASK_INIT(a,b,c,d) do { \ > + (a)->func = (c); } while (0) > +#else > +struct task { > + void (*func)(void); > +}; > +#define taskqueue_enqueue_fast(tq, ta) (ta)->func() > +#define TASK_INIT(a,b,c,d) do { \ > + (a)->func = (void (*)(void))c; } while (0) > + > + > +#endif > +typedef void (*taskqueue_enqueue_fn)(void *context); > + > +// #define taskqueue_create(_a, _b, _c, _d) NULL > +struct taskqueue *taskqueue_create_fast(const char *name, int mflags, > + taskqueue_enqueue_fn enqueue, > + void *context); > +void taskqueue_thread_enqueue(void *context); > + > + > +// #define taskqueue_create_fast(_a, _b, _c, _d) NULL > +int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, > + const char *name, ...) __printflike(4, 5); > + > + > +// #define taskqueue_drain(_a, _b) /* XXX to be completed */ > +// #define taskqueue_free(_a) /* XXX to be completed */ > +void taskqueue_drain(struct taskqueue *queue, struct task *task); > +void taskqueue_free(struct taskqueue *queue); > + > + > +#define PRI_MIN (0) /* Highest priority. */ > +#define PRI_MIN_ITHD (PRI_MIN) > +#ifndef __FreeBSD__ > +#define PI_NET (PRI_MIN_ITHD + 16) > +#endif > + > +#endif /* !_SYS_TASKQUEUE_H_ */ > diff --git a/example/ipfw/ipfw/altq.c b/example/ipfw/ipfw/altq.c > new file mode 100644 > index 0000000..ba6b639 > --- /dev/null > +++ b/example/ipfw/ipfw/altq.c > @@ -0,0 +1,151 @@ > +/* > + * Copyright (c) 2002-2003 Luigi Rizzo > + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp > + * Copyright (c) 1994 Ugen J.S.Antsilevich > + * > + * Idea and grammar partially left from: > + * Copyright (c) 1993 Daniel Boulet > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. 
> + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. > + * > + * NEW command line interface for IP firewall facility > + * > + * $FreeBSD: head/sbin/ipfw/altq.c 270424 2014-08-23 17:37:18Z melifaro $ > + * > + * altq interface > + */ > + > +#include <sys/types.h> > +#include <sys/socket.h> > +#include <sys/sockio.h> > + > +#include "ipfw2.h" > + > +#include <err.h> > +#include <errno.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > +#include <unistd.h> > +#include <fcntl.h> > + > +#include <net/if.h> /* IFNAMSIZ */ > +#include <net/pfvar.h> > +#include <netinet/in.h> /* in_addr */ > +#include <netinet/ip_fw.h> > + > +/* > + * Map between current altq queue id numbers and names. > + */ > +static TAILQ_HEAD(, pf_altq) altq_entries = > + TAILQ_HEAD_INITIALIZER(altq_entries); > + > +void > +altq_set_enabled(int enabled) > +{ > + int pffd; > + > + pffd = open("/dev/pf", O_RDWR); > + if (pffd == -1) > + err(EX_UNAVAILABLE, > + "altq support opening pf(4) control device"); > + if (enabled) { > + if (ioctl(pffd, DIOCSTARTALTQ) != 0 && errno != EEXIST) > + err(EX_UNAVAILABLE, "enabling altq"); > + } else { > + if (ioctl(pffd, DIOCSTOPALTQ) != 0 && errno != ENOENT) > + err(EX_UNAVAILABLE, "disabling altq"); > + } > + close(pffd); > +} > + > +static void > +altq_fetch(void) > +{ > + struct pfioc_altq pfioc; > + struct pf_altq *altq; > + int pffd; > + unsigned int mnr; > + static int altq_fetched = 0; > + > + if (altq_fetched) > + return; > + altq_fetched = 1; > + pffd = open("/dev/pf", O_RDONLY); > + if (pffd == -1) { > + warn("altq support opening pf(4) control device"); > + return; > + } > + bzero(&pfioc, sizeof(pfioc)); > + if (ioctl(pffd, DIOCGETALTQS, &pfioc) != 0) { > + warn("altq support getting queue list"); > + close(pffd); > + return; > + } > + mnr = pfioc.nr; > + for 
(pfioc.nr = 0; pfioc.nr < mnr; pfioc.nr++) { > + if (ioctl(pffd, DIOCGETALTQ, &pfioc) != 0) { > + if (errno == EBUSY) > + break; > + warn("altq support getting queue list"); > + close(pffd); > + return; > + } > + if (pfioc.altq.qid == 0) > + continue; > + altq = safe_calloc(1, sizeof(*altq)); > + *altq = pfioc.altq; > + TAILQ_INSERT_TAIL(&altq_entries, altq, entries); > + } > + close(pffd); > +} > + > +u_int32_t > +altq_name_to_qid(const char *name) > +{ > + struct pf_altq *altq; > + > + altq_fetch(); > + TAILQ_FOREACH(altq, &altq_entries, entries) > + if (strcmp(name, altq->qname) == 0) > + break; > + if (altq == NULL) > + errx(EX_DATAERR, "altq has no queue named `%s'", name); > + return altq->qid; > +} > + > +static const char * > +altq_qid_to_name(u_int32_t qid) > +{ > + struct pf_altq *altq; > + > + altq_fetch(); > + TAILQ_FOREACH(altq, &altq_entries, entries) > + if (qid == altq->qid) > + break; > + if (altq == NULL) > + return NULL; > + return altq->qname; > +} > + > +void > +print_altq_cmd(struct buf_pr *bp, ipfw_insn_altq *altqptr) > +{ > + if (altqptr) { > + const char *qname; > + > + qname = altq_qid_to_name(altqptr->qid); > + if (qname == NULL) > + bprintf(bp, " altq ?<%u>", altqptr->qid); > + else > + bprintf(bp, " altq %s", qname); > + } > +} > diff --git a/example/ipfw/ipfw/dummynet.c b/example/ipfw/ipfw/dummynet.c > new file mode 100644 > index 0000000..1938307 > --- /dev/null > +++ b/example/ipfw/ipfw/dummynet.c > @@ -0,0 +1,1410 @@ > +/* > + * Copyright (c) 2002-2003,2010 Luigi Rizzo > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. 
> + * > + * $FreeBSD: head/sbin/ipfw/dummynet.c 270424 2014-08-23 17:37:18Z melifaro $ > + * > + * dummynet support > + */ > + > +#include <sys/types.h> > +#include <sys/socket.h> > +/* XXX there are several sysctl leftover here */ > +#include <sys/sysctl.h> > + > +#include "ipfw2.h" > + > +#include <ctype.h> > +#include <err.h> > +#include <errno.h> > +#include <libutil.h> > +#include <netdb.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > + > +#include <net/if.h> > +#include <netinet/in.h> > +#include <netinet/ip_fw.h> > +#include <netinet/ip_dummynet.h> > +#include <arpa/inet.h> /* inet_ntoa */ > + > + > +static struct _s_x dummynet_params[] = { > + { "plr", TOK_PLR }, > + { "noerror", TOK_NOERROR }, > + { "buckets", TOK_BUCKETS }, > + { "dst-ip", TOK_DSTIP }, > + { "src-ip", TOK_SRCIP }, > + { "dst-port", TOK_DSTPORT }, > + { "src-port", TOK_SRCPORT }, > + { "proto", TOK_PROTO }, > + { "weight", TOK_WEIGHT }, > + { "lmax", TOK_LMAX }, > + { "maxlen", TOK_LMAX }, > + { "all", TOK_ALL }, > + { "mask", TOK_MASK }, /* alias for both */ > + { "sched_mask", TOK_SCHED_MASK }, > + { "flow_mask", TOK_FLOW_MASK }, > + { "droptail", TOK_DROPTAIL }, > + { "ecn", TOK_ECN }, > + { "red", TOK_RED }, > + { "gred", TOK_GRED }, > + { "bw", TOK_BW }, > + { "bandwidth", TOK_BW }, > + { "delay", TOK_DELAY }, > + { "link", TOK_LINK }, > + { "pipe", TOK_PIPE }, > + { "queue", TOK_QUEUE }, > + { "flowset", TOK_FLOWSET }, > + { "sched", TOK_SCHED }, > + { "pri", TOK_PRI }, > + { "priority", TOK_PRI }, > + { "type", TOK_TYPE }, > + { "flow-id", TOK_FLOWID}, > + { "dst-ipv6", TOK_DSTIP6}, > + { "dst-ip6", TOK_DSTIP6}, > + { "src-ipv6", TOK_SRCIP6}, > + { "src-ip6", TOK_SRCIP6}, > + { "profile", TOK_PROFILE}, > + { "burst", TOK_BURST}, > + { "dummynet-params", TOK_NULL }, > + { NULL, 0 } /* terminator */ > +}; > + > +#define O_NEXT(p, len) ((void *)((char *)p + len)) > + > +static void > +oid_fill(struct dn_id *oid, int len, int type, 
uintptr_t id) > +{ > + oid->len = len; > + oid->type = type; > + oid->subtype = 0; > + oid->id = id; > +} > + > +/* make room in the buffer and move the pointer forward */ > +static void * > +o_next(struct dn_id **o, int len, int type) > +{ > + struct dn_id *ret = *o; > + oid_fill(ret, len, type, 0); > + *o = O_NEXT(*o, len); > + return ret; > +} > + > +#if 0 > +static int > +sort_q(void *arg, const void *pa, const void *pb) > +{ > + int rev = (co.do_sort < 0); > + int field = rev ? -co.do_sort : co.do_sort; > + long long res = 0; > + const struct dn_flow_queue *a = pa; > + const struct dn_flow_queue *b = pb; > + > + switch (field) { > + case 1: /* pkts */ > + res = a->len - b->len; > + break; > + case 2: /* bytes */ > + res = a->len_bytes - b->len_bytes; > + break; > + > + case 3: /* tot pkts */ > + res = a->tot_pkts - b->tot_pkts; > + break; > + > + case 4: /* tot bytes */ > + res = a->tot_bytes - b->tot_bytes; > + break; > + } > + if (res < 0) > + res = -1; > + if (res > 0) > + res = 1; > + return (int)(rev ? res : -res); > +} > +#endif > + > +/* print a mask and header for the subsequent list of flows */ > +static void > +print_mask(struct ipfw_flow_id *id) > +{ > + if (!IS_IP6_FLOW_ID(id)) { > + printf(" " > + "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", > + id->extra ? "queue," : "", > + id->proto, > + id->src_ip, id->src_port, > + id->dst_ip, id->dst_port); > + } else { > + char buf[255]; > + printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", > + id->extra ? "queue," : "", > + id->proto, id->flow_id6); > + inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); > + printf("%s/0x%04x -> ", buf, id->src_port); > + inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); > + printf("%s/0x%04x\n", buf, id->dst_port); > + } > +} > + > +static void > +print_header(struct ipfw_flow_id *id) > +{ > + if (!IS_IP6_FLOW_ID(id)) > + printf("BKT Prot ___Source IP/port____ " > + "____Dest. 
IP/port____ " > + "Tot_pkt/bytes Pkt/Byte Drp\n"); > + else > + printf("BKT ___Prot___ _flow-id_ " > + "______________Source IPv6/port_______________ " > + "_______________Dest. IPv6/port_______________ " > + "Tot_pkt/bytes Pkt/Byte Drp\n"); > +} > + > +static void > +list_flow(struct buf_pr *bp, struct dn_flow *ni) > +{ > + char buff[255]; > + struct protoent *pe = NULL; > + struct in_addr ina; > + struct ipfw_flow_id *id = &ni->fid; > + > + pe = getprotobynumber(id->proto); > + /* XXX: Should check for IPv4 flows */ > + bprintf(bp, "%3u%c", (ni->oid.id) & 0xff, > + id->extra ? '*' : ' '); > + if (!IS_IP6_FLOW_ID(id)) { > + if (pe) > + bprintf(bp, "%-4s ", pe->p_name); > + else > + bprintf(bp, "%4u ", id->proto); > + ina.s_addr = htonl(id->src_ip); > + bprintf(bp, "%15s/%-5d ", > + inet_ntoa(ina), id->src_port); > + ina.s_addr = htonl(id->dst_ip); > + bprintf(bp, "%15s/%-5d ", > + inet_ntoa(ina), id->dst_port); > + } else { > + /* Print IPv6 flows */ > + if (pe != NULL) > + bprintf(bp, "%9s ", pe->p_name); > + else > + bprintf(bp, "%9u ", id->proto); > + bprintf(bp, "%7d %39s/%-5d ", id->flow_id6, > + inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), > + id->src_port); > + bprintf(bp, " %39s/%-5d ", > + inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), > + id->dst_port); > + } > + pr_u64(bp, &ni->tot_pkts, 4); > + pr_u64(bp, &ni->tot_bytes, 8); > + bprintf(bp, "%2u %4u %3u", > + ni->length, ni->len_bytes, ni->drops); > +} > + > +static void > +print_flowset_parms(struct dn_fs *fs, char *prefix) > +{ > + int l; > + char qs[30]; > + char plr[30]; > + char red[90]; /* Display RED parameters */ > + > + l = fs->qsize; > + if (fs->flags & DN_QSIZE_BYTES) { > + if (l >= 8192) > + sprintf(qs, "%d KB", l / 1024); > + else > + sprintf(qs, "%d B", l); > + } else > + sprintf(qs, "%3d sl.", l); > + if (fs->plr) > + sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); > + else > + plr[0] = '\0'; > + > + if (fs->flags & DN_IS_RED) { /* RED parameters */ > 
+ sprintf(red, > + "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", > + (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ', > + 1.0 * fs->w_q / (double)(1 << SCALE_RED), > + fs->min_th, > + fs->max_th, > + 1.0 * fs->max_p / (double)(1 << SCALE_RED)); > + if (fs->flags & DN_IS_ECN) > + strncat(red, " (ecn)", 6); > + } else > + sprintf(red, "droptail"); > + > + if (prefix[0]) { > + printf("%s %s%s %d queues (%d buckets) %s\n", > + prefix, qs, plr, fs->oid.id, fs->buckets, red); > + prefix[0] = '\0'; > + } else { > + printf("q%05d %s%s %d flows (%d buckets) sched %d " > + "weight %d lmax %d pri %d %s\n", > + fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, > + fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); > + if (fs->flags & DN_HAVE_MASK) > + print_mask(&fs->flow_mask); > + } > +} > + > +static void > +print_extra_delay_parms(struct dn_profile *p) > +{ > + double loss; > + if (p->samples_no <= 0) > + return; > + > + loss = p->loss_level; > + loss /= p->samples_no; > + printf("\t profile: name \"%s\" loss %f samples %d\n", > + p->name, loss, p->samples_no); > +} > + > +static void > +flush_buf(char *buf) > +{ > + if (buf[0]) > + printf("%s\n", buf); > + buf[0] = '\0'; > +} > + > +/* > + * generic list routine. We expect objects in a specific order, i.e. > + * PIPES AND SCHEDULERS: > + * link; scheduler; internal flowset if any; instances > + * we can tell a pipe from the number. 
> + * > + * FLOWSETS: > + * flowset; queues; > + * link i (int queue); scheduler i; si(i) { flowsets() : queues } > + */ > +static void > +list_pipes(struct dn_id *oid, struct dn_id *end) > +{ > + char buf[160]; /* pending buffer */ > + int toPrint = 1; /* print header */ > + struct buf_pr bp; > + > + buf[0] = '\0'; > + bp_alloc(&bp, 4096); > + for (; oid != end; oid = O_NEXT(oid, oid->len)) { > + if (oid->len < sizeof(*oid)) > + errx(1, "invalid oid len %d\n", oid->len); > + > + switch (oid->type) { > + default: > + flush_buf(buf); > + printf("unrecognized object %d size %d\n", oid->type, oid->len); > + break; > + case DN_TEXT: /* list of attached flowsets */ > + { > + int i, l; > + struct { > + struct dn_id id; > + uint32_t p[0]; > + } *d = (void *)oid; > + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); > + if (l == 0) > + break; > + printf(" Children flowsets: "); > + for (i = 0; i < l; i++) > + printf("%u ", d->p[i]); > + printf("\n"); > + break; > + } > + case DN_CMD_GET: > + if (co.verbose) > + printf("answer for cmd %d, len %d\n", oid->type, oid->id); > + break; > + case DN_SCH: { > + struct dn_sch *s = (struct dn_sch *)oid; > + flush_buf(buf); > + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", > + s->sched_nr, > + s->name, s->flags, s->buckets, s->oid.id); > + if (s->flags & DN_HAVE_MASK) > + print_mask(&s->sched_mask); > + } > + break; > + > + case DN_FLOW: > + if (toPrint != 0) { > + print_header(&((struct dn_flow *)oid)->fid); > + toPrint = 0; > + } > + list_flow(&bp, (struct dn_flow *)oid); > + printf("%s\n", bp.buf); > + break; > + > + case DN_LINK: { > + struct dn_link *p = (struct dn_link *)oid; > + double b = p->bandwidth; > + char bwbuf[30]; > + char burst[5 + 7]; > + > + /* This starts a new object so flush buffer */ > + flush_buf(buf); > + /* data rate */ > + if (b == 0) > + sprintf(bwbuf, "unlimited "); > + else if (b >= 1000000) > + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); > + else if (b >= 1000) > + sprintf(bwbuf, "%7.3f 
Kbit/s", b/1000); > + else > + sprintf(bwbuf, "%7.3f bit/s ", b); > + > + if (humanize_number(burst, sizeof(burst), p->burst, > + "", HN_AUTOSCALE, 0) < 0 || co.verbose) > + sprintf(burst, "%d", (int)p->burst); > + sprintf(buf, "%05d: %s %4d ms burst %s", > + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); > + } > + break; > + > + case DN_FS: > + print_flowset_parms((struct dn_fs *)oid, buf); > + break; > + case DN_PROFILE: > + flush_buf(buf); > + print_extra_delay_parms((struct dn_profile *)oid); > + } > + flush_buf(buf); // XXX does it really go here ? > + } > + > + bp_free(&bp); > +} > + > +/* > + * Delete object number i: a link/pipe when do_pipe == 1, a flowset > + * when do_pipe == 2, a scheduler for any other do_pipe value. > + * Returns 0 on success; on failure a warning naming object i is > + * printed and 1 is returned. > + */ > +int > +ipfw_delete_pipe(int do_pipe, int i) > +{ > + struct { > + struct dn_id oid; > + uintptr_t a[1]; /* add more if we want a list */ > + } cmd; > + > + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); > + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : > + ( (do_pipe == 2) ? DN_FS : DN_SCH); > + cmd.a[0] = i; > + /* do not reuse i for the status: the warning must report the object number */ > + if (do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len) != 0) { > + warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); > + return 1; > + } > + return 0; > +} > + > +/* > + * Code to parse delay profiles. > + * > + * Some link types introduce extra delays in the transmission > + * of a packet, e.g. because of MAC level framing, contention on > + * the use of the channel, MAC level retransmissions and so on. > + * From our point of view, the channel is effectively unavailable > + * for this extra time, which is constant or variable depending > + * on the link type. Additionally, packets may be dropped after this > + * time (e.g. on a wireless link after too many retransmissions). > + * We can model the additional delay with an empirical curve > + * that represents its distribution. 
> + * > + * cumulative probability > + * 1.0 ^ > + * | > + * L +-- loss-level x > + * | ****** > + * | * > + * | ***** > + * | * > + * | ** > + * | * > + * +-------*-------------------> > + * delay > + * > + * The empirical curve may have both vertical and horizontal lines. > + * Vertical lines represent constant delay for a range of > + * probabilities; horizontal lines correspond to a discontinuity > + * in the delay distribution: the link will use the largest delay > + * for a given probability. > + * > + * To pass the curve to dummynet, we must store the parameters > + * in a file as described below, and issue the command > + * > + * ipfw pipe <n> config ... bw XXX profile <filename> ... > + * > + * The file format is the following, with whitespace acting as > + * a separator and '#' indicating the beginning of a comment: > + * > + * samples N > + * the number of samples used in the internal > + * representation (2..1024; default 100); > + * > + * loss-level L > + * The probability above which packets are lost. > + * (0.0 <= L <= 1.0, default 1.0 i.e. no loss); > + * > + * name identifier > + * Optionally, a name (listed by "ipfw pipe show") > + * to identify the distribution; > + * > + * "delay prob" | "prob delay" > + * One of these two lines is mandatory and defines > + * the format of the following lines with data points. > + * > + * XXX YYY > + * 2 or more lines representing points in the curve, > + * with either delay or probability first, according > + * to the chosen format. > + * The unit for delay is milliseconds. > + * > + * Data points do not need to be ordered or equal to the number > + * specified in the "samples" line. ipfw will sort and interpolate > + * the curve as needed. 
> + * > + * Example of a profile file: > + > + name bla_bla_bla > + samples 100 > + loss-level 0.86 > + prob delay > + 0 200 # minimum overhead is 200ms > + 0.5 200 > + 0.5 300 > + 0.8 1000 > + 0.9 1300 > + 1 1300 > + > + * Internally, we will convert the curve to a fixed number of > + * samples, and when it is time to transmit a packet we will > + * model the extra delay as extra bits in the packet. > + * > + */ > + > +#define ED_MAX_LINE_LEN 256+ED_MAX_NAME_LEN > +#define ED_TOK_SAMPLES "samples" > +#define ED_TOK_LOSS "loss-level" > +#define ED_TOK_NAME "name" > +#define ED_TOK_DELAY "delay" > +#define ED_TOK_PROB "prob" > +#define ED_TOK_BW "bw" > +#define ED_SEPARATORS " \t\n" > +#define ED_MIN_SAMPLES_NO 2 > + > +/* > + * returns 1 if s is a non-negative number, with at least one '.' > + */ > +static int > +is_valid_number(const char *s) > +{ > + int i, dots_found = 0; > + int len = strlen(s); > + > + for (i = 0; i<len; ++i) > + if (!isdigit(s[i]) && (s[i] !='.' || ++dots_found > 1)) > + return 0; > + return 1; > +} > + > +/* > + * Take as input a string describing a bandwidth value > + * and return the numeric bandwidth value. 
> + * set clocking interface or bandwidth value > + */ > +static void > +read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) > +{ > + if (*bandwidth != -1) > + warnx("duplicate token, override bandwidth value!"); > + > + if (arg[0] >= 'a' && arg[0] <= 'z') { > + if (!if_name) { > + errx(1, "no if support"); > + } > + if (namelen >= IFNAMSIZ) > + warn("interface name truncated"); > + namelen--; > + /* interface name */ > + strncpy(if_name, arg, namelen); > + if_name[namelen] = '\0'; > + *bandwidth = 0; > + } else { /* read bandwidth value */ > + int bw; > + char *end = NULL; > + > + bw = strtoul(arg, &end, 0); > + if (*end == 'K' || *end == 'k') { > + end++; > + bw *= 1000; > + } else if (*end == 'M' || *end == 'm') { > + end++; > + bw *= 1000000; > + } > + if ((*end == 'B' && > + _substrcmp2(end, "Bi", "Bit/s") != 0) || > + _substrcmp2(end, "by", "bytes") == 0) > + bw *= 8; > + > + if (bw < 0) > + errx(EX_DATAERR, "bandwidth too large"); > + > + *bandwidth = bw; > + if (if_name) > + if_name[0] = '\0'; > + } > +} > + > +struct point { > + double prob; > + double delay; > +}; > + > +static int > +compare_points(const void *vp1, const void *vp2) > +{ > + const struct point *p1 = vp1; > + const struct point *p2 = vp2; > + double res = 0; > + > + res = p1->prob - p2->prob; > + if (res == 0) > + res = p1->delay - p2->delay; > + if (res < 0) > + return -1; > + else if (res > 0) > + return 1; > + else > + return 0; > +} > + > +#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno > + > +static void > +load_extra_delays(const char *filename, struct dn_profile *p, > + struct dn_link *link) > +{ > + char line[ED_MAX_LINE_LEN]; > + FILE *f; > + int lineno = 0; > + int i; > + > + int samples = -1; > + double loss = -1.0; > + char profile_name[ED_MAX_NAME_LEN]; > + int delay_first = -1; > + int do_points = 0; > + struct point points[ED_MAX_SAMPLES_NO]; > + int points_no = 0; > + > + /* XXX link never NULL? 
*/ > + p->link_nr = link->link_nr; > + > + profile_name[0] = '\0'; > + f = fopen(filename, "r"); > + if (f == NULL) > + err(EX_UNAVAILABLE, "fopen: %s", filename); > + > + while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ > + char *s, *cur = line, *name = NULL, *arg = NULL; > + > + ++lineno; > + > + /* parse the line */ > + while (cur) { > + s = strsep(&cur, ED_SEPARATORS); > + if (s == NULL || *s == '#') > + break; > + if (*s == '\0') > + continue; > + if (arg) > + errx(ED_EFMT("too many arguments")); > + if (name == NULL) > + name = s; > + else > + arg = s; > + } > + if (name == NULL) /* empty line */ > + continue; > + if (arg == NULL) > + errx(ED_EFMT("missing arg for %s"), name); > + > + if (!strcasecmp(name, ED_TOK_SAMPLES)) { > + if (samples > 0) > + errx(ED_EFMT("duplicate ``samples'' line")); > + if (atoi(arg) <=0) > + errx(ED_EFMT("invalid number of samples")); > + samples = atoi(arg); > + if (samples>ED_MAX_SAMPLES_NO) > + errx(ED_EFMT("too many samples, maximum is %d"), > + ED_MAX_SAMPLES_NO); > + do_points = 0; > + } else if (!strcasecmp(name, ED_TOK_BW)) { > + char buf[IFNAMSIZ]; > + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); > + } else if (!strcasecmp(name, ED_TOK_LOSS)) { > + if (loss != -1.0) > + errx(ED_EFMT("duplicated token: %s"), name); > + if (!is_valid_number(arg)) > + errx(ED_EFMT("invalid %s"), arg); > + loss = atof(arg); > + if (loss > 1) > + errx(ED_EFMT("%s greater than 1.0"), name); > + do_points = 0; > + } else if (!strcasecmp(name, ED_TOK_NAME)) { > + if (profile_name[0] != '\0') > + errx(ED_EFMT("duplicated token: %s"), name); > + strncpy(profile_name, arg, sizeof(profile_name) - 1); > + profile_name[sizeof(profile_name)-1] = '\0'; > + do_points = 0; > + } else if (!strcasecmp(name, ED_TOK_DELAY)) { > + if (do_points) > + errx(ED_EFMT("duplicated token: %s"), name); > + delay_first = 1; > + do_points = 1; > + } else if (!strcasecmp(name, ED_TOK_PROB)) { > + if (do_points) > + errx(ED_EFMT("duplicated token: 
%s"), name); > + delay_first = 0; > + do_points = 1; > + } else if (do_points) { > + if (!is_valid_number(name) || !is_valid_number(arg)) > + errx(ED_EFMT("invalid point found")); > + if (delay_first) { > + points[points_no].delay = atof(name); > + points[points_no].prob = atof(arg); > + } else { > + points[points_no].delay = atof(arg); > + points[points_no].prob = atof(name); > + } > + if (points[points_no].prob > 1.0) > + errx(ED_EFMT("probability greater than 1.0")); > + ++points_no; > + } else { > + errx(ED_EFMT("unrecognised command '%s'"), name); > + } > + } > + > + fclose (f); > + > + if (samples == -1) { > + warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); > + samples = 100; > + } > + > + if (loss == -1.0) { > + warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); > + loss = 1; > + } > + > + /* make sure that there are enough points. */ > + if (points_no < ED_MIN_SAMPLES_NO) > + errx(ED_EFMT("too few samples, need at least %d"), > + ED_MIN_SAMPLES_NO); > + > + qsort(points, points_no, sizeof(struct point), compare_points); > + > + /* interpolation */ > + for (i = 0; i<points_no-1; ++i) { > + double y1 = points[i].prob * samples; > + double x1 = points[i].delay; > + double y2 = points[i+1].prob * samples; > + double x2 = points[i+1].delay; > + > + int ix = y1; > + int stop = y2; > + > + if (x1 == x2) { > + for (; ix<stop; ++ix) > + p->samples[ix] = x1; > + } else { > + double m = (y2-y1)/(x2-x1); > + double c = y1 - m*x1; > + for (; ix<stop ; ++ix) > + p->samples[ix] = (ix - c)/m; > + } > + } > + p->samples_no = samples; > + p->loss_level = loss * samples; > + strncpy(p->name, profile_name, sizeof(p->name)); > +} > + > +/* > + * configuration of pipes, schedulers, flowsets. > + * When we configure a new scheduler, an empty pipe is created, so: > + * > + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility > + * sched N+Delta type fifo sched_mask ... 
> + * pipe N+Delta <parameters> > + * flowset N+Delta pipe N+Delta (no parameters) > + * sched N type wf2q+ sched_mask ... > + * pipe N <parameters> > + * > + * do_pipe = 2 -> flowset N config > + * flowset N parameters > + * > + * do_pipe = 3 -> sched N config > + * sched N parameters (default no pipe) > + * optional Pipe N config ... > + * pipe ==> > + */ > +void > +ipfw_config_pipe(int ac, char **av) > +{ > + int i; > + u_int j; > + char *end; > + struct dn_id *buf, *base; > + struct dn_sch *sch = NULL; > + struct dn_link *p = NULL; > + struct dn_fs *fs = NULL; > + struct dn_profile *pf = NULL; > + struct ipfw_flow_id *mask = NULL; > + int lmax; > + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; > + > + /* > + * allocate space for 1 header, > + * 1 scheduler, 1 link, 1 flowset, 1 profile > + */ > + lmax = sizeof(struct dn_id); /* command header */ > + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + > + sizeof(struct dn_fs) + sizeof(struct dn_profile); > + > + av++; ac--; > + /* Pipe number */ > + if (ac && isdigit(**av)) { > + i = atoi(*av); av++; ac--; > + } else > + i = -1; > + if (i <= 0) > + errx(EX_USAGE, "need a pipe/flowset/sched number"); > + base = buf = safe_calloc(1, lmax); > + /* all commands start with a 'CONFIGURE' and a version */ > + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); > + base->id = DN_API_VERSION; > + > + switch (co.do_pipe) { > + case 1: /* "pipe N config ..." */ > + /* Allocate space for the WF2Q+ scheduler, its link > + * and the FIFO flowset. Set the number, but leave > + * the scheduler subtype and other parameters to 0 > + * so the kernel will use appropriate defaults. > + * XXX todo: add a flag to record if a parameter > + * is actually configured. > + * If we do a 'pipe config' mask -> sched_mask. > + * The FIFO scheduler and link are derived from the > + * WF2Q+ one in the kernel. 
> + */ > + sch = o_next(&buf, sizeof(*sch), DN_SCH); > + p = o_next(&buf, sizeof(*p), DN_LINK); > + fs = o_next(&buf, sizeof(*fs), DN_FS); > + > + sch->sched_nr = i; > + sch->oid.subtype = 0; /* defaults to WF2Q+ */ > + mask = &sch->sched_mask; > + flags = &sch->flags; > + buckets = &sch->buckets; > + *flags |= DN_PIPE_CMD; > + > + p->link_nr = i; > + > + /* This flowset is only for the FIFO scheduler */ > + fs->fs_nr = i + 2*DN_MAX_ID; > + fs->sched_nr = i + DN_MAX_ID; > + break; > + > + case 2: /* "queue N config ... " */ > + fs = o_next(&buf, sizeof(*fs), DN_FS); > + fs->fs_nr = i; > + mask = &fs->flow_mask; > + flags = &fs->flags; > + buckets = &fs->buckets; > + break; > + > + case 3: /* "sched N config ..." */ > + sch = o_next(&buf, sizeof(*sch), DN_SCH); > + fs = o_next(&buf, sizeof(*fs), DN_FS); > + sch->sched_nr = i; > + mask = &sch->sched_mask; > + flags = &sch->flags; > + buckets = &sch->buckets; > + /* fs is used only with !MULTIQUEUE schedulers */ > + fs->fs_nr = i + DN_MAX_ID; > + fs->sched_nr = i; > + break; > + } > + /* set to -1 those fields for which we want to reuse existing > + * values from the kernel. > + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. > + * XXX todo: support reuse of the mask. 
> + */ > + if (p) > + p->bandwidth = -1; > + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) > + fs->par[j] = -1; > + while (ac > 0) { > + double d; > + int tok = match_token(dummynet_params, *av); > + ac--; av++; > + > + switch(tok) { > + case TOK_NOERROR: > + NEED(fs, "noerror is only for pipes"); > + fs->flags |= DN_NOERROR; > + break; > + > + case TOK_PLR: > + NEED(fs, "plr is only for pipes"); > + NEED1("plr needs argument 0..1\n"); > + d = strtod(av[0], NULL); > + if (d > 1) > + d = 1; > + else if (d < 0) > + d = 0; > + fs->plr = (int)(d*0x7fffffff); > + ac--; av++; > + break; > + > + case TOK_QUEUE: > + NEED(fs, "queue is only for pipes or flowsets"); > + NEED1("queue needs queue size\n"); > + end = NULL; > + fs->qsize = strtoul(av[0], &end, 0); > + if (*end == 'K' || *end == 'k') { > + fs->flags |= DN_QSIZE_BYTES; > + fs->qsize *= 1024; > + } else if (*end == 'B' || > + _substrcmp2(end, "by", "bytes") == 0) { > + fs->flags |= DN_QSIZE_BYTES; > + } > + ac--; av++; > + break; > + > + case TOK_BUCKETS: > + NEED(fs, "buckets is only for pipes or flowsets"); > + NEED1("buckets needs argument\n"); > + *buckets = strtoul(av[0], NULL, 0); > + ac--; av++; > + break; > + > + case TOK_FLOW_MASK: > + case TOK_SCHED_MASK: > + case TOK_MASK: > + NEED(mask, "tok_mask"); > + NEED1("mask needs mask specifier\n"); > + /* > + * per-flow queue, mask is dst_ip, dst_port, > + * src_ip, src_port, proto measured in bits > + */ > + > + bzero(mask, sizeof(*mask)); > + end = NULL; > + > + while (ac >= 1) { > + uint32_t *p32 = NULL; > + uint16_t *p16 = NULL; > + uint32_t *p20 = NULL; > + struct in6_addr *pa6 = NULL; > + uint32_t a; > + > + tok = match_token(dummynet_params, *av); > + ac--; av++; > + switch(tok) { > + case TOK_ALL: > + /* > + * special case, all bits significant > + * except 'extra' (the queue number) > + */ > + mask->dst_ip = ~0; > + mask->src_ip = ~0; > + mask->dst_port = ~0; > + mask->src_port = ~0; > + mask->proto = ~0; > + n2mask(&mask->dst_ip6, 128); > + 
n2mask(&mask->src_ip6, 128); > + mask->flow_id6 = ~0; > + *flags |= DN_HAVE_MASK; > + goto end_mask; > + > + case TOK_QUEUE: > + mask->extra = ~0; > + *flags |= DN_HAVE_MASK; > + goto end_mask; > + > + case TOK_DSTIP: > + mask->addr_type = 4; > + p32 = &mask->dst_ip; > + break; > + > + case TOK_SRCIP: > + mask->addr_type = 4; > + p32 = &mask->src_ip; > + break; > + > + case TOK_DSTIP6: > + mask->addr_type = 6; > + pa6 = &mask->dst_ip6; > + break; > + > + case TOK_SRCIP6: > + mask->addr_type = 6; > + pa6 = &mask->src_ip6; > + break; > + > + case TOK_FLOWID: > + mask->addr_type = 6; > + p20 = &mask->flow_id6; > + break; > + > + case TOK_DSTPORT: > + p16 = &mask->dst_port; > + break; > + > + case TOK_SRCPORT: > + p16 = &mask->src_port; > + break; > + > + case TOK_PROTO: > + break; > + > + default: > + ac++; av--; /* backtrack */ > + goto end_mask; > + } > + if (ac < 1) > + errx(EX_USAGE, "mask: value missing"); > + if (*av[0] == '/') { > + a = strtoul(av[0]+1, &end, 0); > + if (pa6 == NULL) > + a = (a == 32) ? 
~0 : (1 << a) - 1; > + } else > + a = strtoul(av[0], &end, 0); > + if (p32 != NULL) > + *p32 = a; > + else if (p16 != NULL) { > + if (a > 0xFFFF) > + errx(EX_DATAERR, > + "port mask must be 16 bit"); > + *p16 = (uint16_t)a; > + } else if (p20 != NULL) { > + if (a > 0xfffff) > + errx(EX_DATAERR, > + "flow_id mask must be 20 bit"); > + *p20 = (uint32_t)a; > + } else if (pa6 != NULL) { > + if (a > 128) > + errx(EX_DATAERR, > + "in6addr invalid mask len"); > + else > + n2mask(pa6, a); > + } else { > + if (a > 0xFF) > + errx(EX_DATAERR, > + "proto mask must be 8 bit"); > + mask->proto = (uint8_t)a; > + } > + if (a != 0) > + *flags |= DN_HAVE_MASK; > + ac--; av++; > + } /* end while, config masks */ > +end_mask: > + break; > + > + case TOK_RED: > + case TOK_GRED: > + NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); > + fs->flags |= DN_IS_RED; > + if (tok == TOK_GRED) > + fs->flags |= DN_IS_GENTLE_RED; > + /* > + * the format for parameters is w_q/min_th/max_th/max_p > + */ > + if ((end = strsep(&av[0], "/"))) { > + double w_q = strtod(end, NULL); > + if (w_q > 1 || w_q <= 0) > + errx(EX_DATAERR, "0 < w_q <= 1"); > + fs->w_q = (int) (w_q * (1 << SCALE_RED)); > + } > + if ((end = strsep(&av[0], "/"))) { > + fs->min_th = strtoul(end, &end, 0); > + if (*end == 'K' || *end == 'k') > + fs->min_th *= 1024; > + } > + if ((end = strsep(&av[0], "/"))) { > + fs->max_th = strtoul(end, &end, 0); > + if (*end == 'K' || *end == 'k') > + fs->max_th *= 1024; > + } > + if ((end = strsep(&av[0], "/"))) { > + double max_p = strtod(end, NULL); > + if (max_p > 1 || max_p < 0) > + errx(EX_DATAERR, "0 <= max_p <= 1"); > + fs->max_p = (int)(max_p * (1 << SCALE_RED)); > + } > + ac--; av++; > + break; > + > + case TOK_ECN: > + fs->flags |= DN_IS_ECN; > + break; > + > + case TOK_DROPTAIL: > + NEED(fs, "droptail is only for flowsets"); > + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); > + break; > + > + case TOK_BW: > + NEED(p, "bw is only for links"); > + NEED1("bw needs bandwidth or interface\n"); 
> + read_bandwidth(av[0], &p->bandwidth, NULL, 0); > + ac--; av++; > + break; > + > + case TOK_DELAY: > + NEED(p, "delay is only for links"); > + NEED1("delay needs argument 0..10000ms\n"); > + p->delay = strtoul(av[0], NULL, 0); > + ac--; av++; > + break; > + > + case TOK_TYPE: { > + int l; > + NEED(sch, "type is only for schedulers"); > + NEED1("type needs a string"); > + l = strlen(av[0]); > + if (l == 0 || l > 15) > + errx(1, "type %s too long\n", av[0]); > + strcpy(sch->name, av[0]); > + sch->oid.subtype = 0; /* use string */ > + ac--; av++; > + break; > + } > + > + case TOK_WEIGHT: > + NEED(fs, "weight is only for flowsets"); > + NEED1("weight needs argument\n"); > + fs->par[0] = strtol(av[0], &end, 0); > + ac--; av++; > + break; > + > + case TOK_LMAX: > + NEED(fs, "lmax is only for flowsets"); > + NEED1("lmax needs argument\n"); > + fs->par[1] = strtol(av[0], &end, 0); > + ac--; av++; > + break; > + > + case TOK_PRI: > + NEED(fs, "priority is only for flowsets"); > + NEED1("priority needs argument\n"); > + fs->par[2] = strtol(av[0], &end, 0); > + ac--; av++; > + break; > + > + case TOK_SCHED: > + case TOK_PIPE: > + NEED(fs, "pipe/sched"); > + NEED1("pipe/link/sched needs number\n"); > + fs->sched_nr = strtoul(av[0], &end, 0); > + ac--; av++; > + break; > + > + case TOK_PROFILE: > + NEED((!pf), "profile already set"); > + NEED(p, "profile"); > + { > + NEED1("extra delay needs the file name\n"); > + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); > + load_extra_delays(av[0], pf, p); //XXX can't fail? 
> + --ac; ++av; > + } > + break; > + > + case TOK_BURST: > + NEED(p, "burst"); > + NEED1("burst needs argument\n"); > + errno = 0; > + if (expand_number(av[0], &p->burst) < 0) > + if (errno != ERANGE) > + errx(EX_DATAERR, > + "burst: invalid argument"); > + if (errno || p->burst > (1ULL << 48) - 1) > + errx(EX_DATAERR, > + "burst: out of range (0..2^48-1)"); > + ac--; av++; > + break; > + > + default: > + errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); > + } > + } > + > + /* check validity of parameters */ > + if (p) { > + if (p->delay > 10000) > + errx(EX_DATAERR, "delay must be < 10000"); > + if (p->bandwidth == -1) > + p->bandwidth = 0; > + } > + if (fs) { > + /* XXX accept a 0 scheduler to keep the default */ > + if (fs->flags & DN_QSIZE_BYTES) { > + size_t len; > + long limit; > + > + len = sizeof(limit); > + if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", > + &limit, &len, NULL, 0) == -1) > + limit = 1024*1024; > + if (fs->qsize > limit) > + errx(EX_DATAERR, "queue size must be < %ldB", limit); > + } else { > + size_t len; > + long limit; > + > + len = sizeof(limit); > + if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", > + &limit, &len, NULL, 0) == -1) > + limit = 100; > + if (fs->qsize > limit) > + errx(EX_DATAERR, "2 <= queue size <= %ld", limit); > + } > + > + if ((fs->flags & DN_IS_ECN) && !(fs->flags & DN_IS_RED)) > + errx(EX_USAGE, "enable red/gred for ECN"); > + > + if (fs->flags & DN_IS_RED) { > + size_t len; > + int lookup_depth, avg_pkt_size; > + > + if (!(fs->flags & DN_IS_ECN) && (fs->min_th >= fs->max_th)) > + errx(EX_DATAERR, "min_th %d must be < than max_th %d", > + fs->min_th, fs->max_th); > + else if ((fs->flags & DN_IS_ECN) && (fs->min_th > fs->max_th)) > + errx(EX_DATAERR, "min_th %d must be =< than max_th %d", > + fs->min_th, fs->max_th); > + > + if (fs->max_th == 0) > + errx(EX_DATAERR, "max_th must be > 0"); > + > + len = sizeof(int); > + if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", > + 
&lookup_depth, &len, NULL, 0) == -1) > + lookup_depth = 256; > + if (lookup_depth == 0) > + errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" > + " must be greater than zero"); > + > + len = sizeof(int); > + if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", > + &avg_pkt_size, &len, NULL, 0) == -1) > + avg_pkt_size = 512; > + > + if (avg_pkt_size == 0) > + errx(EX_DATAERR, > + "net.inet.ip.dummynet.red_avg_pkt_size must" > + " be greater than zero"); > + > +#if 0 /* the following computation is now done in the kernel */ > + /* > + * Ticks needed for sending a medium-sized packet. > + * Unfortunately, when we are configuring a WF2Q+ queue, we > + * do not have bandwidth information, because that is stored > + * in the parent pipe, and also we have multiple queues > + * competing for it. So we set s=0, which is not very > + * correct. But on the other hand, why do we want RED with > + * WF2Q+ ? > + */ > + if (p.bandwidth==0) /* this is a WF2Q+ queue */ > + s = 0; > + else > + s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; > + /* > + * max idle time (in ticks) before avg queue size becomes 0. > + * NOTA: (3/w_q) is approx the value x so that > + * (1-w_q)^x < 10^-3. > + */ > + w_q = ((double)fs->w_q) / (1 << SCALE_RED); > + idle = s * 3. 
/ w_q; > + fs->lookup_step = (int)idle / lookup_depth; > + if (!fs->lookup_step) > + fs->lookup_step = 1; > + weight = 1 - w_q; > + for (t = fs->lookup_step; t > 1; --t) > + weight *= 1 - w_q; > + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); > +#endif /* code moved in the kernel */ > + } > + } > + > + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); > + > + if (i) > + err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); > +} > + > +void > +dummynet_flush(void) > +{ > + struct dn_id oid; > + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); > + do_cmd(IP_DUMMYNET3, &oid, oid.len); > +} > + > +/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' > + * Returns the number of ranges, and possibly stores them > + * in the array v of size len. > + */ > +static int > +parse_range(int ac, char *av[], uint32_t *v, int len) > +{ > + int n = 0; > + char *endptr, *s; > + uint32_t base[2]; > + > + if (v == NULL || len < 2) { > + v = base; > + len = 2; > + } > + > + for (s = *av; s != NULL; av++, ac--) { > + v[0] = strtoul(s, &endptr, 10); > + v[1] = (*endptr != '-') ? v[0] : > + strtoul(endptr+1, &endptr, 10); > + if (*endptr == '\0') { /* prepare for next round */ > + s = (ac > 0) ? *(av+1) : NULL; > + } else { > + if (*endptr != ',') { > + warn("invalid number: %s", s); > + s = ++endptr; > + continue; > + } > + /* continue processing from here */ > + s = ++endptr; > + ac++; > + av--; > + } > + if (v[1] < v[0] || > + v[1] >= DN_MAX_ID-1 || > + v[1] >= DN_MAX_ID-1) { > + continue; /* invalid entry */ > + } > + n++; > + /* translate if 'pipe list' */ > + if (co.do_pipe == 1) { > + v[0] += DN_MAX_ID; > + v[1] += DN_MAX_ID; > + } > + v = (n*2 < len) ? v + 2 : base; > + } > + return n; > +} > + > +/* main entry point for dummynet list functions. co.do_pipe indicates > + * which function we want to support. > + * av may contain filtering arguments, either individual entries > + * or ranges, or lists (space or commas are valid separators). 
> + * Format for a range can be n1-n2 or n3 n4 n5 ... > + * In a range n1 must be <= n2, otherwise the range is ignored. > + * A number 'n4' is translate in a range 'n4-n4' > + * All number must be > 0 and < DN_MAX_ID-1 > + */ > +void > +dummynet_list(int ac, char *av[], int show_counters) > +{ > + struct dn_id *oid, *x = NULL; > + int ret, i; > + int n; /* # of ranges */ > + u_int buflen, l; > + u_int max_size; /* largest obj passed up */ > + > + (void)show_counters; // XXX unused, but we should use it. > + ac--; > + av++; /* skip 'list' | 'show' word */ > + > + n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ > + > + /* Allocate space to store ranges */ > + l = sizeof(*oid) + sizeof(uint32_t) * n * 2; > + oid = safe_calloc(1, l); > + oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); > + > + if (n > 0) /* store ranges in idx */ > + parse_range(ac, av, (uint32_t *)(oid + 1), n*2); > + /* > + * Compute the size of the largest object returned. If the > + * response leaves at least this much spare space in the > + * buffer, then surely the response is complete; otherwise > + * there might be a risk of truncation and we will need to > + * retry with a larger buffer. > + * XXX don't bother with smaller structs. > + */ > + max_size = sizeof(struct dn_fs); > + if (max_size < sizeof(struct dn_sch)) > + max_size = sizeof(struct dn_sch); > + if (max_size < sizeof(struct dn_flow)) > + max_size = sizeof(struct dn_flow); > + > + switch (co.do_pipe) { > + case 1: > + oid->subtype = DN_LINK; /* list pipe */ > + break; > + case 2: > + oid->subtype = DN_FS; /* list queue */ > + break; > + case 3: > + oid->subtype = DN_SCH; /* list sched */ > + break; > + } > + > + /* > + * Ask the kernel an estimate of the required space (result > + * in oid.id), unless we are requesting a subset of objects, > + * in which case the kernel does not give an exact answer. > + * In any case, space might grow in the meantime due to the > + * creation of new queues, so we must be prepared to retry. 
> + */ > + if (n > 0) { > + buflen = 4*1024; > + } else { > + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); > + if (ret != 0 || oid->id <= sizeof(*oid)) > + goto done; > + buflen = oid->id + max_size; > + oid->len = sizeof(*oid); /* restore */ > + } > + /* Try a few times, until the buffer fits */ > + for (i = 0; i < 20; i++) { > + l = buflen; > + x = safe_realloc(x, l); > + bcopy(oid, x, oid->len); > + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); > + if (ret != 0 || x->id <= sizeof(*oid)) > + goto done; /* no response */ > + if (l + max_size <= buflen) > + break; /* ok */ > + buflen *= 2; /* double for next attempt */ > + } > + list_pipes(x, O_NEXT(x, l)); > +done: > + if (x) > + free(x); > + free(oid); > +} > diff --git a/example/ipfw/ipfw/ipfw.8 b/example/ipfw/ipfw/ipfw.8 > new file mode 100644 > index 0000000..9b8946b > --- /dev/null > +++ b/example/ipfw/ipfw/ipfw.8 > @@ -0,0 +1,3723 @@ > +.\" > +.\" $FreeBSD: head/sbin/ipfw/ipfw.8 274925 2014-11-23 21:00:00Z joel $ > +.\" > +.Dd Aug 13, 2014 > +.Dt IPFW 8 > +.Os > +.Sh NAME > +.Nm ipfw > +.Nd User interface for firewall, traffic shaper, packet scheduler, > +in-kernel NAT. > +.Sh SYNOPSIS > +.Ss FIREWALL CONFIGURATION > +.Nm > +.Op Fl cq > +.Cm add > +.Ar rule > +.Nm > +.Op Fl acdefnNStT > +.Op Cm set Ar N > +.Brq Cm list | show > +.Op Ar rule | first-last ... > +.Nm > +.Op Fl f | q > +.Op Cm set Ar N > +.Cm flush > +.Nm > +.Op Fl q > +.Op Cm set Ar N > +.Brq Cm delete | zero | resetlog > +.Op Ar number ... > +.Pp > +.Nm > +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... 
> +.Nm > +.Cm set move > +.Op Cm rule > +.Ar number Cm to Ar number > +.Nm > +.Cm set swap Ar number number > +.Nm > +.Cm set show > +.Ss SYSCTL SHORTCUTS > +.Nm > +.Cm enable > +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive > +.Nm > +.Cm disable > +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive > +.Ss LOOKUP TABLES > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm create Ar create-options > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm destroy > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm modify Ar modify-options > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm swap Ar name > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm add Ar table-key Op Ar value > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm add Op Ar table-key Ar value ... > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm atomic add Op Ar table-key Ar value ... > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm delete Op Ar table-key ... > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm lookup Ar addr > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm lock > +.Nm > +.Oo Cm set Ar N Oc Cm table Ar name Cm unlock > +.Nm > +.Oo Cm set Ar N Oc Cm table > +.Brq Ar name | all > +.Cm list > +.Nm > +.Oo Cm set Ar N Oc Cm table > +.Brq Ar name | all > +.Cm info > +.Nm > +.Oo Cm set Ar N Oc Cm table > +.Brq Ar name | all > +.Cm detail > +.Nm > +.Oo Cm set Ar N Oc Cm table > +.Brq Ar name | all > +.Cm flush > +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) > +.Nm > +.Brq Cm pipe | queue | sched > +.Ar number > +.Cm config > +.Ar config-options > +.Nm > +.Op Fl s Op Ar field > +.Brq Cm pipe | queue | sched > +.Brq Cm delete | list | show > +.Op Ar number ... 
> +.Ss IN-KERNEL NAT > +.Nm > +.Op Fl q > +.Cm nat > +.Ar number > +.Cm config > +.Ar config-options > +.Pp > +.Nm > +.Op Fl cfnNqS > +.Oo > +.Fl p Ar preproc > +.Oo > +.Ar preproc-flags > +.Oc > +.Oc > +.Ar pathname > +.Ss INTERNAL DIAGNOSTICS > +.Nm > +.Cm internal iflist > +.Nm > +.Cm internal talist > +.Nm > +.Cm internal vlist > +.Sh DESCRIPTION > +The > +.Nm > +utility is the user interface for controlling the > +.Xr ipfw 4 > +firewall, the > +.Xr dummynet 4 > +traffic shaper/packet scheduler, and the > +in-kernel NAT services. > +.Pp > +A firewall configuration, or > +.Em ruleset , > +is made of a list of > +.Em rules > +numbered from 1 to 65535. > +Packets are passed to the firewall > +from a number of different places in the protocol stack > +(depending on the source and destination of the packet, > +it is possible for the firewall to be > +invoked multiple times on the same packet). > +The packet passed to the firewall is compared > +against each of the rules in the > +.Em ruleset , > +in rule-number order > +(multiple rules with the same number are permitted, in which case > +they are processed in order of insertion). > +When a match is found, the action corresponding to the > +matching rule is performed. > +.Pp > +Depending on the action and certain system settings, packets > +can be reinjected into the firewall at some rule after the > +matching one for further processing. > +.Pp > +A ruleset always includes a > +.Em default > +rule (numbered 65535) which cannot be modified or deleted, > +and matches all packets. > +The action associated with the > +.Em default > +rule can be either > +.Cm deny > +or > +.Cm allow > +depending on how the kernel is configured. 
> +.Pp > +If the ruleset includes one or more rules with the > +.Cm keep-state > +or > +.Cm limit > +option, > +the firewall will have a > +.Em stateful > +behaviour, i.e., upon a match it will create > +.Em dynamic rules , > +i.e., rules that match packets with the same 5-tuple > +(protocol, source and destination addresses and ports) > +as the packet which caused their creation. > +Dynamic rules, which have a limited lifetime, are checked > +at the first occurrence of a > +.Cm check-state , > +.Cm keep-state > +or > +.Cm limit > +rule, and are typically used to open the firewall on-demand to > +legitimate traffic only. > +See the > +.Sx STATEFUL FIREWALL > +and > +.Sx EXAMPLES > +Sections below for more information on the stateful behaviour of > +.Nm . > +.Pp > +All rules (including dynamic ones) have a few associated counters: > +a packet count, a byte count, a log count and a timestamp > +indicating the time of the last match. > +Counters can be displayed or reset with > +.Nm > +commands. > +.Pp > +Each rule belongs to one of 32 different > +.Em sets > +, and there are > +.Nm > +commands to atomically manipulate sets, such as enable, > +disable, swap sets, move all rules in a set to another > +one, delete all rules in a set. > +These can be useful to > +install temporary configurations, or to test them. > +See Section > +.Sx SETS OF RULES > +for more information on > +.Em sets . > +.Pp > +Rules can be added with the > +.Cm add > +command; deleted individually or in groups with the > +.Cm delete > +command, and globally (except those in set 31) with the > +.Cm flush > +command; displayed, optionally with the content of the > +counters, using the > +.Cm show > +and > +.Cm list > +commands. > +Finally, counters can be reset with the > +.Cm zero > +and > +.Cm resetlog > +commands. > +.Pp > +.Ss COMMAND OPTIONS > +The following general options are available when invoking > +.Nm : > +.Bl -tag -width indent > +.It Fl a > +Show counter values when listing rules. 
> +The > +.Cm show > +command implies this option. > +.It Fl b > +Only show the action and the comment, not the body of a rule. > +Implies > +.Fl c . > +.It Fl c > +When entering or showing rules, print them in compact form, > +i.e., omitting the "ip from any to any" string > +when this does not carry any additional information. > +.It Fl d > +When listing, show dynamic rules in addition to static ones. > +.It Fl e > +When listing and > +.Fl d > +is specified, also show expired dynamic rules. > +.It Fl f > +Do not ask for confirmation for commands that can cause problems > +if misused, i.e., > +.Cm flush . > +If there is no tty associated with the process, this is implied. > +.It Fl i > +When listing a table (see the > +.Sx LOOKUP TABLES > +section below for more information on lookup tables), format values > +as IP addresses. > +By default, values are shown as integers. > +.It Fl n > +Only check syntax of the command strings, without actually passing > +them to the kernel. > +.It Fl N > +Try to resolve addresses and service names in output. > +.It Fl q > +Be quiet when executing the > +.Cm add , > +.Cm nat , > +.Cm zero , > +.Cm resetlog > +or > +.Cm flush > +commands; > +(implies > +.Fl f ) . > +This is useful when updating rulesets by executing multiple > +.Nm > +commands in a script > +(e.g., > +.Ql sh\ /etc/rc.firewall ) , > +or by processing a file with many > +.Nm > +rules across a remote login session. > +It also stops a table add or delete > +from failing if the entry already exists or is not present. > +.Pp > +The reason why this option may be important is that > +for some of these actions, > +.Nm > +may print a message; if the action results in blocking the > +traffic to the remote client, > +the remote login session will be closed > +and the rest of the ruleset will not be processed. > +Access to the console would then be required to recover. > +.It Fl S > +When listing rules, show the > +.Em set > +each rule belongs to. 
> +If this flag is not specified, disabled rules will not be > +listed. > +.It Fl s Op Ar field > +When listing pipes, sort according to one of the four > +counters (total or current packets or bytes). > +.It Fl t > +When listing, show last match timestamp converted with ctime(). > +.It Fl T > +When listing, show last match timestamp as seconds from the epoch. > +This form can be more convenient for postprocessing by scripts. > +.El > +.Ss LIST OF RULES AND PREPROCESSING > +To ease configuration, rules can be put into a file which is > +processed using > +.Nm > +as shown in the last synopsis line. > +An absolute > +.Ar pathname > +must be used. > +The file will be read line by line and applied as arguments to the > +.Nm > +utility. > +.Pp > +Optionally, a preprocessor can be specified using > +.Fl p Ar preproc > +where > +.Ar pathname > +is to be piped through. > +Useful preprocessors include > +.Xr cpp 1 > +and > +.Xr m4 1 . > +If > +.Ar preproc > +does not start with a slash > +.Pq Ql / > +as its first character, the usual > +.Ev PATH > +name search is performed. > +Care should be taken with this in environments where not all > +file systems are mounted (yet) by the time > +.Nm > +is being run (e.g.\& when they are mounted over NFS). > +Once > +.Fl p > +has been specified, any additional arguments are passed on to the preprocessor > +for interpretation. > +This allows for flexible configuration files (like conditionalizing > +them on the local hostname) and the use of macros to centralize > +frequently required arguments like IP addresses. > +.Ss TRAFFIC SHAPER CONFIGURATION > +The > +.Nm > +.Cm pipe , queue > +and > +.Cm sched > +commands are used to configure the traffic shaper and packet scheduler. > +See the > +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION > +Section below for details. > +.Pp > +If the world and the kernel get out of sync the > +.Nm > +ABI may break, preventing you from being able to add any rules. 
> +This can adversely affect the booting process. > +You can use > +.Nm > +.Cm disable > +.Cm firewall > +to temporarily disable the firewall to regain access to the network, > +allowing you to fix the problem. > +.Sh PACKET FLOW > +A packet is checked against the active ruleset in multiple places > +in the protocol stack, under control of several sysctl variables. > +These places and variables are shown below, and it is important to > +have this picture in mind in order to design a correct ruleset. > +.Bd -literal -offset indent > + ^ to upper layers V > + | | > + +----------->-----------+ > + ^ V > + [ip(6)_input] [ip(6)_output] net.inet(6).ip(6).fw.enable=1 > + | | > + ^ V > + [ether_demux] [ether_output_frame] net.link.ether.ipfw=1 > + | | > + +-->--[bdg_forward]-->--+ net.link.bridge.ipfw=1 > + ^ V > + | to devices | > +.Ed > +.Pp > +The number of > +times the same packet goes through the firewall can > +vary between 0 and 4 depending on packet source and > +destination, and system configuration. > +.Pp > +Note that as packets flow through the stack, headers can be > +stripped or added to it, and so they may or may not be available > +for inspection. > +E.g., incoming packets will include the MAC header when > +.Nm > +is invoked from > +.Cm ether_demux() , > +but the same packets will have the MAC header stripped off when > +.Nm > +is invoked from > +.Cm ip_input() > +or > +.Cm ip6_input() . > +.Pp > +Also note that each packet is always checked against the complete ruleset, > +irrespective of the place where the check occurs, or the source of the packet. > +If a rule contains some match patterns or actions which are not valid > +for the place of invocation (e.g.\& trying to match a MAC header within > +.Cm ip_input > +or > +.Cm ip6_input ), > +the match pattern will not match, but a > +.Cm not > +operator in front of such patterns > +.Em will > +cause the pattern to > +.Em always > +match on those packets. 
> +It is thus the responsibility of > +the programmer, if necessary, to write a suitable ruleset to > +differentiate among the possible places. > +.Cm skipto > +rules can be useful here, as an example: > +.Bd -literal -offset indent > +# packets from ether_demux or bdg_forward > +ipfw add 10 skipto 1000 all from any to any layer2 in > +# packets from ip_input > +ipfw add 10 skipto 2000 all from any to any not layer2 in > +# packets from ip_output > +ipfw add 10 skipto 3000 all from any to any not layer2 out > +# packets from ether_output_frame > +ipfw add 10 skipto 4000 all from any to any layer2 out > +.Ed > +.Pp > +(yes, at the moment there is no way to differentiate between > +ether_demux and bdg_forward). > +.Sh SYNTAX > +In general, each keyword or argument must be provided as > +a separate command line argument, with no leading or trailing > +spaces. > +Keywords are case-sensitive, whereas arguments may > +or may not be case-sensitive depending on their nature > +(e.g.\& uid's are, hostnames are not). > +.Pp > +Some arguments (e.g., port or address lists) are comma-separated > +lists of values. > +In this case, spaces after commas ',' are allowed to make > +the line more readable. > +You can also put the entire > +command (including flags) into a single argument. 
> +E.g., the following forms are equivalent: > +.Bd -literal -offset indent > +ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8 > +ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 > +ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" > +.Ed > +.Sh RULE FORMAT > +The format of firewall rules is the following: > +.Bd -ragged -offset indent > +.Bk -words > +.Op Ar rule_number > +.Op Cm set Ar set_number > +.Op Cm prob Ar match_probability > +.Ar action > +.Op Cm log Op Cm logamount Ar number > +.Op Cm altq Ar queue > +.Oo > +.Bro Cm tag | untag > +.Brc Ar number > +.Oc > +.Ar body > +.Ek > +.Ed > +.Pp > +where the body of the rule specifies which information is used > +for filtering packets, among the following: > +.Pp > +.Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact > +.It Layer-2 header fields > +When available > +.It IPv4 and IPv6 Protocol > +TCP, UDP, ICMP, etc. > +.It Source and dest. addresses and ports > +.It Direction > +See Section > +.Sx PACKET FLOW > +.It Transmit and receive interface > +By name or address > +.It Misc. IP header fields > +Version, type of service, datagram length, identification, > +fragment flag (non-zero IP offset), > +Time To Live > +.It IP options > +.It IPv6 Extension headers > +Fragmentation, Hop-by-Hop options, > +Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options. > +.It IPv6 Flow-ID > +.It Misc. TCP header fields > +TCP flags (SYN, FIN, ACK, RST, etc.), > +sequence number, acknowledgment number, > +window > +.It TCP options > +.It ICMP types > +for ICMP packets > +.It ICMP6 types > +for ICMP6 packets > +.It User/group ID > +When the packet can be associated with a local socket. > +.It Divert status > +Whether a packet came from a divert socket (e.g., > +.Xr natd 8 ) . > +.It Fib annotation state > +Whether a packet has been tagged for using a specific FIB (routing table) > +in future forwarding decisions. 
> +.El > +.Pp > +Note that some of the above information, e.g.\& source MAC or IP addresses and > +TCP/UDP ports, can be easily spoofed, so filtering on those fields > +alone might not guarantee the desired results. > +.Bl -tag -width indent > +.It Ar rule_number > +Each rule is associated with a > +.Ar rule_number > +in the range 1..65535, with the latter reserved for the > +.Em default > +rule. > +Rules are checked sequentially by rule number. > +Multiple rules can have the same number, in which case they are > +checked (and listed) according to the order in which they have > +been added. > +If a rule is entered without specifying a number, the kernel will > +assign one in such a way that the rule becomes the last one > +before the > +.Em default > +rule. > +Automatic rule numbers are assigned by incrementing the last > +non-default rule number by the value of the sysctl variable > +.Ar net.inet.ip.fw.autoinc_step > +which defaults to 100. > +If this is not possible (e.g.\& because we would go beyond the > +maximum allowed rule number), the number of the last > +non-default value is used instead. > +.It Cm set Ar set_number > +Each rule is associated with a > +.Ar set_number > +in the range 0..31. > +Sets can be individually disabled and enabled, so this parameter > +is of fundamental importance for atomic ruleset manipulation. > +It can be also used to simplify deletion of groups of rules. > +If a rule is entered without specifying a set number, > +set 0 will be used. > +.br > +Set 31 is special in that it cannot be disabled, > +and rules in set 31 are not deleted by the > +.Nm ipfw flush > +command (but you can delete them with the > +.Nm ipfw delete set 31 > +command). > +Set 31 is also used for the > +.Em default > +rule. > +.It Cm prob Ar match_probability > +A match is only declared with the specified probability > +(floating point number between 0 and 1). 
> +This can be useful for a number of applications such as > +random packet drop or > +(in conjunction with > +.Nm dummynet ) > +to simulate the effect of multiple paths leading to out-of-order > +packet delivery. > +.Pp > +Note: this condition is checked before any other condition, including > +ones such as keep-state or check-state which might have side effects. > +.It Cm log Op Cm logamount Ar number > +Packets matching a rule with the > +.Cm log > +keyword will be made available for logging in two ways: > +if the sysctl variable > +.Va net.inet.ip.fw.verbose > +is set to 0 (default), one can use > +.Xr bpf 4 > +attached to the > +.Li ipfw0 > +pseudo interface. > +This pseudo interface can be created after a boot > +manually by using the following command: > +.Bd -literal -offset indent > +# ifconfig ipfw0 create > +.Ed > +.Pp > +Or, automatically at boot time by adding the following > +line to the > +.Xr rc.conf 5 > +file: > +.Bd -literal -offset indent > +firewall_logif="YES" > +.Ed > +.Pp > +There is no overhead if no > +.Xr bpf 4 > +is attached to the pseudo interface. > +.Pp > +If > +.Va net.inet.ip.fw.verbose > +is set to 1, packets will be logged to > +.Xr syslogd 8 > +with a > +.Dv LOG_SECURITY > +facility up to a maximum of > +.Cm logamount > +packets. > +If no > +.Cm logamount > +is specified, the limit is taken from the sysctl variable > +.Va net.inet.ip.fw.verbose_limit . > +In both cases, a value of 0 means unlimited logging. > +.Pp > +Once the limit is reached, logging can be re-enabled by > +clearing the logging counter or the packet counter for that entry, see the > +.Cm resetlog > +command. > +.Pp > +Note: logging is done after all other packet matching conditions > +have been successfully verified, and before performing the final > +action (accept, deny, etc.) on the packet. 
> +.It Cm tag Ar number > +When a packet matches a rule with the > +.Cm tag > +keyword, the numeric tag for the given > +.Ar number > +in the range 1..65534 will be attached to the packet. > +The tag acts as an internal marker (it is not sent out over > +the wire) that can be used to identify these packets later on. > +This can be used, for example, to provide trust between interfaces > +and to start doing policy-based filtering. > +A packet can have multiple tags at the same time. > +Tags are "sticky", meaning once a tag is applied to a packet by a > +matching rule it exists until explicit removal. > +Tags are kept with the packet everywhere within the kernel, but are > +lost when the packet leaves the kernel, for example, on transmitting > +the packet out to the network or sending the packet to a > +.Xr divert 4 > +socket. > +.Pp > +To check for previously applied tags, use the > +.Cm tagged > +rule option. > +To delete a previously applied tag, use the > +.Cm untag > +keyword. > +.Pp > +Note: since tags are kept with the packet everywhere in kernelspace, > +they can be set and unset anywhere in the kernel network subsystem > +(using the > +.Xr mbuf_tags 9 > +facility), not only by means of the > +.Xr ipfw 4 > +.Cm tag > +and > +.Cm untag > +keywords. > +For example, there can be a specialized > +.Xr netgraph 4 > +node doing traffic analysis and tagging for later inspection > +in the firewall. > +.It Cm untag Ar number > +When a packet matches a rule with the > +.Cm untag > +keyword, the tag with the number > +.Ar number > +is searched among the tags attached to this packet and, > +if found, removed from it. > +Other tags bound to the packet, if present, are left untouched. > +.It Cm altq Ar queue > +When a packet matches a rule with the > +.Cm altq > +keyword, the ALTQ identifier for the given > +.Ar queue > +(see > +.Xr altq 4 ) > +will be attached. > +Note that this ALTQ tag is only meaningful for packets going "out" of IPFW, > +and not being rejected or going to divert sockets. 
> +Note that if there is insufficient memory at the time the packet is > +processed, it will not be tagged, so it is wise to make your ALTQ > +"default" queue policy account for this. > +If multiple > +.Cm altq > +rules match a single packet, only the first one adds the ALTQ classification > +tag. > +In doing so, traffic may be shaped by using > +.Cm count Cm altq Ar queue > +rules for classification early in the ruleset, then later applying > +the filtering decision. > +For example, > +.Cm check-state > +and > +.Cm keep-state > +rules may come later and provide the actual filtering decisions in > +addition to the fallback ALTQ tag. > +.Pp > +You must run > +.Xr pfctl 8 > +to set up the queues before IPFW will be able to look them up by name, > +and if the ALTQ disciplines are rearranged, the rules containing the > +queue identifiers in the kernel will likely have gone stale and need > +to be reloaded. > +Stale queue identifiers will probably result in misclassification. > +.Pp > +All system ALTQ processing can be turned on or off via > +.Nm > +.Cm enable Ar altq > +and > +.Nm > +.Cm disable Ar altq . > +The usage of > +.Va net.inet.ip.fw.one_pass > +is irrelevant to ALTQ traffic shaping, as the actual rule action is always > +followed after adding an ALTQ tag. > +.El > +.Ss RULE ACTIONS > +A rule can be associated with one of the following actions, which > +will be executed when the packet matches the body of the rule. > +.Bl -tag -width indent > +.It Cm allow | accept | pass | permit > +Allow packets that match rule. > +The search terminates. > +.It Cm check-state > +Checks the packet against the dynamic ruleset. > +If a match is found, execute the action associated with > +the rule which generated this dynamic rule, otherwise > +move to the next rule. > +.br > +.Cm Check-state > +rules do not have a body. > +If no > +.Cm check-state > +rule is found, the dynamic ruleset is checked at the first > +.Cm keep-state > +or > +.Cm limit > +rule. 
> +.It Cm count > +Update counters for all packets that match rule. > +The search continues with the next rule. > +.It Cm deny | drop > +Discard packets that match this rule. > +The search terminates. > +.It Cm divert Ar port > +Divert packets that match this rule to the > +.Xr divert 4 > +socket bound to port > +.Ar port . > +The search terminates. > +.It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port > +Change the next-hop on matching packets to > +.Ar ipaddr , > +which can be an IP address or a host name. > +For IPv4, the next hop can also be supplied by the last table > +looked up for the packet by using the > +.Cm tablearg > +keyword instead of an explicit address. > +The search terminates if this rule matches. > +.Pp > +If > +.Ar ipaddr > +is a local address, then matching packets will be forwarded to > +.Ar port > +(or the port number in the packet if one is not specified in the rule) > +on the local machine. > +.br > +If > +.Ar ipaddr > +is not a local address, then the port number > +(if specified) is ignored, and the packet will be > +forwarded to the remote address, using the route as found in > +the local routing table for that IP. > +.br > +A > +.Ar fwd > +rule will not match layer-2 packets (those received > +on ether_input, ether_output, or bridged). > +.br > +The > +.Cm fwd > +action does not change the contents of the packet at all. > +In particular, the destination address remains unmodified, so > +packets forwarded to another system will usually be rejected by that system > +unless there is a matching rule on that system to capture them. > +For packets forwarded locally, > +the local address of the socket will be > +set to the original destination address of the packet. > +This makes the > +.Xr netstat 1 > +entry look rather weird but is intended for > +use with transparent proxy servers. 
> +.It Cm nat Ar nat_nr | tablearg > +Pass packet to a > +nat instance > +(for network address translation, address redirect, etc.): > +see the > +.Sx NETWORK ADDRESS TRANSLATION (NAT) > +Section for further information. > +.It Cm pipe Ar pipe_nr > +Pass packet to a > +.Nm dummynet > +.Dq pipe > +(for bandwidth limitation, delay, etc.). > +See the > +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION > +Section for further information. > +The search terminates; however, on exit from the pipe and if > +the > +.Xr sysctl 8 > +variable > +.Va net.inet.ip.fw.one_pass > +is not set, the packet is passed again to the firewall code > +starting from the next rule. > +.It Cm queue Ar queue_nr > +Pass packet to a > +.Nm dummynet > +.Dq queue > +(for bandwidth limitation using WF2Q+). > +.It Cm reject > +(Deprecated). > +Synonym for > +.Cm unreach host . > +.It Cm reset > +Discard packets that match this rule, and if the > +packet is a TCP packet, try to send a TCP reset (RST) notice. > +The search terminates. > +.It Cm reset6 > +Discard packets that match this rule, and if the > +packet is a TCP packet, try to send a TCP reset (RST) notice. > +The search terminates. > +.It Cm skipto Ar number | tablearg > +Skip all subsequent rules numbered less than > +.Ar number . > +The search continues with the first rule numbered > +.Ar number > +or higher. > +It is possible to use the > +.Cm tablearg > +keyword with a skipto for a > +.Em computed > +skipto. Skipto may work either in O(log(N)) or in O(1) depending > +on amount of memory and/or sysctl variables. > +See the > +.Sx SYSCTL VARIABLES > +section for more details. > +.It Cm call Ar number | tablearg > +The current rule number is saved in the internal stack and > +ruleset processing continues with the first rule numbered > +.Ar number > +or higher. 
> +If later a rule with the > +.Cm return > +action is encountered, the processing returns to the first rule > +with number of this > +.Cm call > +rule plus one or higher > +(the same behaviour as with packets returning from > +.Xr divert 4 > +socket after a > +.Cm divert > +action). > +This could be used to make somewhat like an assembly language > +.Dq subroutine > +calls to rules with common checks for different interfaces, etc. > +.Pp > +Rule with any number could be called, not just forward jumps as with > +.Cm skipto . > +So, to prevent endless loops in case of mistakes, both > +.Cm call > +and > +.Cm return > +actions don't do any jumps and simply go to the next rule if memory > +cannot be allocated or stack overflowed/underflowed. > +.Pp > +Internally stack for rule numbers is implemented using > +.Xr mbuf_tags 9 > +facility and currently has size of 16 entries. > +As mbuf tags are lost when packet leaves the kernel, > +.Cm divert > +should not be used in subroutines to avoid endless loops > +and other undesired effects. > +.It Cm return > +Takes rule number saved to internal stack by the last > +.Cm call > +action and returns ruleset processing to the first rule > +with number greater than number of corresponding > +.Cm call > +rule. > +See description of the > +.Cm call > +action for more details. > +.Pp > +Note that > +.Cm return > +rules usually end a > +.Dq subroutine > +and thus are unconditional, but > +.Nm > +command-line utility currently requires every action except > +.Cm check-state > +to have body. > +While it is sometimes useful to return only on some packets, > +usually you want to print just > +.Dq return > +for readability. > +A workaround for this is to use new syntax and > +.Fl c > +switch: > +.Bd -literal -offset indent > +# Add a rule without actual body > +ipfw add 2999 return via any > + > +# List rules without "from any to any" part > +ipfw -c list > +.Ed > +.Pp > +This cosmetic annoyance may be fixed in future releases. 
> +.It Cm tee Ar port > +Send a copy of packets matching this rule to the > +.Xr divert 4 > +socket bound to port > +.Ar port . > +The search continues with the next rule. > +.It Cm unreach Ar code > +Discard packets that match this rule, and try to send an ICMP > +unreachable notice with code > +.Ar code , > +where > +.Ar code > +is a number from 0 to 255, or one of these aliases: > +.Cm net , host , protocol , port , > +.Cm needfrag , srcfail , net-unknown , host-unknown , > +.Cm isolated , net-prohib , host-prohib , tosnet , > +.Cm toshost , filter-prohib , host-precedence > +or > +.Cm precedence-cutoff . > +The search terminates. > +.It Cm unreach6 Ar code > +Discard packets that match this rule, and try to send an ICMPv6 > +unreachable notice with code > +.Ar code , > +where > +.Ar code > +is a number from 0, 1, 3 or 4, or one of these aliases: > +.Cm no-route, admin-prohib, address > +or > +.Cm port . > +The search terminates. > +.It Cm netgraph Ar cookie > +Divert packet into netgraph with given > +.Ar cookie . > +The search terminates. > +If packet is later returned from netgraph it is either > +accepted or continues with the next rule, depending on > +.Va net.inet.ip.fw.one_pass > +sysctl variable. > +.It Cm ngtee Ar cookie > +A copy of packet is diverted into netgraph, original > +packet continues with the next rule. > +See > +.Xr ng_ipfw 4 > +for more information on > +.Cm netgraph > +and > +.Cm ngtee > +actions. > +.It Cm setfib Ar fibnum | tablearg > +The packet is tagged so as to use the FIB (routing table) > +.Ar fibnum > +in any subsequent forwarding decisions. > +In the current implementation, this is limited to the values 0 through 15, see > +.Xr setfib 2 . > +Processing continues at the next rule. > +It is possible to use the > +.Cm tablearg > +keyword with setfib. > +If the tablearg value is not within the compiled range of fibs, > +the packet's fib is set to 0. 
> +.It Cm setdscp Ar DSCP | number | tablearg > +Set specified DiffServ codepoint for an IPv4/IPv6 packet. > +Processing continues at the next rule. > +Supported values are: > +.Pp > +.Cm CS0 > +.Pq Dv 000000 , > +.Cm CS1 > +.Pq Dv 001000 , > +.Cm CS2 > +.Pq Dv 010000 , > +.Cm CS3 > +.Pq Dv 011000 , > +.Cm CS4 > +.Pq Dv 100000 , > +.Cm CS5 > +.Pq Dv 101000 , > +.Cm CS6 > +.Pq Dv 110000 , > +.Cm CS7 > +.Pq Dv 111000 , > +.Cm AF11 > +.Pq Dv 001010 , > +.Cm AF12 > +.Pq Dv 001100 , > +.Cm AF13 > +.Pq Dv 001110 , > +.Cm AF21 > +.Pq Dv 010010 , > +.Cm AF22 > +.Pq Dv 010100 , > +.Cm AF23 > +.Pq Dv 010110 , > +.Cm AF31 > +.Pq Dv 011010 , > +.Cm AF32 > +.Pq Dv 011100 , > +.Cm AF33 > +.Pq Dv 011110 , > +.Cm AF41 > +.Pq Dv 100010 , > +.Cm AF42 > +.Pq Dv 100100 , > +.Cm AF43 > +.Pq Dv 100110 , > +.Cm EF > +.Pq Dv 101110 , > +.Cm BE > +.Pq Dv 000000 . > +Additionally, DSCP value can be specified by number (0..63). > +It is also possible to use the > +.Cm tablearg > +keyword with setdscp. > +If the tablearg value is not within the 0..63 range, lower 6 bits of supplied > +value are used. > +.It Cm reass > +Queue and reassemble IP fragments. > +If the packet is not fragmented, counters are updated and > +processing continues with the next rule. > +If the packet is the last logical fragment, the packet is reassembled and, if > +.Va net.inet.ip.fw.one_pass > +is set to 0, processing continues with the next rule. > +Otherwise, the packet is allowed to pass and the search terminates. > +If the packet is a fragment in the middle of a logical group of fragments, > +it is consumed and > +processing stops immediately. > +.Pp > +Fragment handling can be tuned via > +.Va net.inet.ip.maxfragpackets > +and > +.Va net.inet.ip.maxfragsperpacket > +which limit, respectively, the maximum number of processable > +fragments (default: 800) and > +the maximum number of fragments per packet (default: 16). 
> +.Pp > +NOTA BENE: since fragments do not contain port numbers, > +they should be avoided with the > +.Nm reass > +rule. > +Alternatively, direction-based (like > +.Nm in > +/ > +.Nm out > +) and source-based (like > +.Nm via > +) match patterns can be used to select fragments. > +.Pp > +Usually a simple rule like: > +.Bd -literal -offset indent > +# reassemble incoming fragments > +ipfw add reass all from any to any in > +.Ed > +.Pp > +is all you need at the beginning of your ruleset. > +.El > +.Ss RULE BODY > +The body of a rule contains zero or more patterns (such as > +specific source and destination addresses or ports, > +protocol options, incoming or outgoing interfaces, etc.) > +that the packet must match in order to be recognised. > +In general, the patterns are connected by (implicit) > +.Cm and > +operators -- i.e., all must match in order for the > +rule to match. > +Individual patterns can be prefixed by the > +.Cm not > +operator to reverse the result of the match, as in > +.Pp > +.Dl "ipfw add 100 allow ip from not 1.2.3.4 to any" > +.Pp > +Additionally, sets of alternative match patterns > +.Pq Em or-blocks > +can be constructed by putting the patterns in > +lists enclosed between parentheses ( ) or braces { }, and > +using the > +.Cm or > +operator as follows: > +.Pp > +.Dl "ipfw add 100 allow ip from { x or not y or z } to any" > +.Pp > +Only one level of parentheses is allowed. > +Beware that most shells have special meanings for parentheses > +or braces, so it is advisable to put a backslash \\ in front of them > +to prevent such interpretations. > +.Pp > +The body of a rule must in general include a source and destination > +address specifier. > +The keyword > +.Ar any > +can be used in various places to specify that the content of > +a required field is irrelevant. 
> +.Pp > +The rule body has the following format: > +.Bd -ragged -offset indent > +.Op Ar proto Cm from Ar src Cm to Ar dst > +.Op Ar options > +.Ed > +.Pp > +The first part (proto from src to dst) is for backward > +compatibility with earlier versions of > +.Fx . > +In modern > +.Fx > +any match pattern (including MAC headers, IP protocols, > +addresses and ports) can be specified in the > +.Ar options > +section. > +.Pp > +Rule fields have the following meaning: > +.Bl -tag -width indent > +.It Ar proto : protocol | Cm { Ar protocol Cm or ... } > +.It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number > +An IP protocol specified by number or name > +(for a complete list see > +.Pa /etc/protocols ) , > +or one of the following keywords: > +.Bl -tag -width indent > +.It Cm ip4 | ipv4 > +Matches IPv4 packets. > +.It Cm ip6 | ipv6 > +Matches IPv6 packets. > +.It Cm ip | all > +Matches any packet. > +.El > +.Pp > +The > +.Cm ipv6 > +in > +.Cm proto > +option will be treated as inner protocol. > +And, the > +.Cm ipv4 > +is not available in > +.Cm proto > +option. > +.Pp > +The > +.Cm { Ar protocol Cm or ... } > +format (an > +.Em or-block ) > +is provided for convenience only but its use is deprecated. > +.It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports > +An address (or a list, see below) > +optionally followed by > +.Ar ports > +specifiers. > +.Pp > +The second format > +.Em ( or-block > +with multiple addresses) is provided for convenience only and > +its use is discouraged. > +.It Ar addr : Oo Cm not Oc Bro > +.Cm any | me | me6 | > +.Cm table Ns Pq Ar name Ns Op , Ns Ar value > +.Ar | addr-list | addr-set > +.Brc > +.Bl -tag -width indent > +.It Cm any > +matches any IP address. > +.It Cm me > +matches any IP address configured on an interface in the system. > +.It Cm me6 > +matches any IPv6 address configured on an interface in the system. 
> +The address list is evaluated at the time the packet is > +analysed. > +.It Cm table Ns Pq Ar name Ns Op , Ns Ar value > +Matches any IPv4 or IPv6 address for which an entry exists in the lookup table > +.Ar name . > +If an optional 32-bit unsigned > +.Ar value > +is also specified, an entry will match only if it has this value. > +See the > +.Sx LOOKUP TABLES > +section below for more information on lookup tables. > +.El > +.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list > +.It Ar ip-addr : > +A host or subnet address specified in one of the following ways: > +.Bl -tag -width indent > +.It Ar numeric-ip | hostname > +Matches a single IPv4 address, specified as dotted-quad or a hostname. > +Hostnames are resolved at the time the rule is added to the firewall list. > +.It Ar addr Ns / Ns Ar masklen > +Matches all addresses with base > +.Ar addr > +(specified as an IP address, a network number, or a hostname) > +and mask width of > +.Cm masklen > +bits. > +As an example, 1.2.3.4/25 or 1.2.3.0/25 will match > +all IP numbers from 1.2.3.0 to 1.2.3.127 . > +.It Ar addr Ns : Ns Ar mask > +Matches all addresses with base > +.Ar addr > +(specified as an IP address, a network number, or a hostname) > +and the mask of > +.Ar mask , > +specified as a dotted quad. > +As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match > +1.*.3.*. > +This form is advised only for non-contiguous > +masks. > +It is better to resort to the > +.Ar addr Ns / Ns Ar masklen > +format for contiguous masks, which is more compact and less > +error-prone. > +.El > +.It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm } > +.It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list > +Matches all addresses with base address > +.Ar addr > +(specified as an IP address, a network number, or a hostname) > +and whose last byte is in the list between braces { } . 
> +Note that there must be no spaces between braces and > +numbers (spaces after commas are allowed). > +Elements of the list can be specified as single entries > +or ranges. > +The > +.Ar masklen > +field is used to limit the size of the set of addresses, > +and can have any value between 24 and 32. > +If not specified, > +it will be assumed as 24. > +.br > +This format is particularly useful to handle sparse address sets > +within a single rule. > +Because the matching occurs using a > +bitmask, it takes constant time and dramatically reduces > +the complexity of rulesets. > +.br > +As an example, an address specified as 1.2.3.4/24{128,35-55,89} > +or 1.2.3.0/24{128,35-55,89} > +will match the following IP addresses: > +.br > +1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 . > +.It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list > +.It Ar ip6-addr : > +A host or subnet specified in one of the following ways: > +.Bl -tag -width indent > +.It Ar numeric-ip | hostname > +Matches a single IPv6 address as allowed by > +.Xr inet_pton 3 > +or a hostname. > +Hostnames are resolved at the time the rule is added to the firewall > +list. > +.It Ar addr Ns / Ns Ar masklen > +Matches all IPv6 addresses with base > +.Ar addr > +(specified as allowed by > +.Xr inet_pton 3 > +or a hostname) > +and mask width of > +.Cm masklen > +bits. > +.El > +.Pp > +No support for sets of IPv6 addresses is provided because IPv6 addresses > +are typically random past the initial prefix. > +.It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports > +For protocols which support port numbers (such as TCP and UDP), optional > +.Cm ports > +may be specified as one or more ports or port ranges, separated > +by commas but no spaces, and an optional > +.Cm not > +operator. > +The > +.Ql \&- > +notation specifies a range of ports (including boundaries). > +.Pp > +Service names (from > +.Pa /etc/services ) > +may be used instead of numeric port values. 
> +The length of the port list is limited to 30 ports or ranges, > +though one can specify larger ranges by using an > +.Em or-block > +in the > +.Cm options > +section of the rule. > +.Pp > +A backslash > +.Pq Ql \e > +can be used to escape the dash > +.Pq Ql - > +character in a service name (from a shell, the backslash must be > +typed twice to avoid the shell itself interpreting it as an escape > +character). > +.Pp > +.Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any" > +.Pp > +Fragmented packets which have a non-zero offset (i.e., not the first > +fragment) will never match a rule which has one or more port > +specifications. > +See the > +.Cm frag > +option for details on matching fragmented packets. > +.El > +.Ss RULE OPTIONS (MATCH PATTERNS) > +Additional match patterns can be used within > +rules. > +Zero or more of these so-called > +.Em options > +can be present in a rule, optionally prefixed by the > +.Cm not > +operand, and possibly grouped into > +.Em or-blocks . > +.Pp > +The following match patterns can be used (listed in alphabetical order): > +.Bl -tag -width indent > +.It Cm // this is a comment. > +Inserts the specified text as a comment in the rule. > +Everything following // is considered as a comment and stored in the rule. > +You can have comment-only rules, which are listed as having a > +.Cm count > +action followed by the comment. > +.It Cm bridged > +Alias for > +.Cm layer2 . > +.It Cm diverted > +Matches only packets generated by a divert socket. > +.It Cm diverted-loopback > +Matches only packets coming from a divert socket back into the IP stack > +input for delivery. > +.It Cm diverted-output > +Matches only packets going from a divert socket back outward to the IP > +stack output for delivery. > +.It Cm dst-ip Ar ip-address > +Matches IPv4 packets whose destination IP is one of the address(es) > +specified as argument. 
> +.It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address > +Matches IPv6 packets whose destination IP is one of the address(es) > +specified as argument. > +.It Cm dst-port Ar ports > +Matches IP packets whose destination port is one of the port(s) > +specified as argument. > +.It Cm established > +Matches TCP packets that have the RST or ACK bits set. > +.It Cm ext6hdr Ar header > +Matches IPv6 packets containing the extension header given by > +.Ar header . > +Supported headers are: > +.Pp > +Fragment > +.Pq Cm frag , > +Hop-by-Hop options > +.Pq Cm hopopt , > +any type of Routing Header > +.Pq Cm route , > +Source routing Routing Header Type 0 > +.Pq Cm rthdr0 , > +Mobile IPv6 Routing Header Type 2 > +.Pq Cm rthdr2 , > +Destination options > +.Pq Cm dstopt , > +IPsec authentication headers > +.Pq Cm ah , > +and IPsec encapsulated security payload headers > +.Pq Cm esp . > +.It Cm fib Ar fibnum > +Matches a packet that has been tagged to use > +the given FIB (routing table) number. > +.It Cm flow Ar table Ns Pq Ar name Ns Op , Ns Ar value > +Search for the flow entry in lookup table > +.Ar name . > +If not found, the match fails. > +Otherwise, the match succeeds and > +.Cm tablearg > +is set to the value extracted from the table. > +.Pp > +This option can be useful to quickly dispatch traffic based on > +certain packet fields. > +See the > +.Sx LOOKUP TABLES > +section below for more information on lookup tables. > +.It Cm flow-id Ar labels > +Matches IPv6 packets containing any of the flow labels given in > +.Ar labels . > +.Ar labels > +is a comma separated list of numeric flow labels. > +.It Cm frag > +Matches packets that are fragments and not the first > +fragment of an IP datagram. > +Note that these packets will not have > +the next protocol header (e.g.\& TCP, UDP) so options that look into > +these headers cannot match. > +.It Cm gid Ar group > +Matches all TCP or UDP packets sent by or received for a > +.Ar group . 
> +A > +.Ar group > +may be specified by name or number. > +.It Cm jail Ar prisonID > +Matches all TCP or UDP packets sent by or received for the > +jail whose prison ID is > +.Ar prisonID . > +.It Cm icmptypes Ar types > +Matches ICMP packets whose ICMP type is in the list > +.Ar types . > +The list may be specified as any combination of > +individual types (numeric) separated by commas. > +.Em Ranges are not allowed . > +The supported ICMP types are: > +.Pp > +echo reply > +.Pq Cm 0 , > +destination unreachable > +.Pq Cm 3 , > +source quench > +.Pq Cm 4 , > +redirect > +.Pq Cm 5 , > +echo request > +.Pq Cm 8 , > +router advertisement > +.Pq Cm 9 , > +router solicitation > +.Pq Cm 10 , > +time-to-live exceeded > +.Pq Cm 11 , > +IP header bad > +.Pq Cm 12 , > +timestamp request > +.Pq Cm 13 , > +timestamp reply > +.Pq Cm 14 , > +information request > +.Pq Cm 15 , > +information reply > +.Pq Cm 16 , > +address mask request > +.Pq Cm 17 > +and address mask reply > +.Pq Cm 18 . > +.It Cm icmp6types Ar types > +Matches ICMP6 packets whose ICMP6 type is in the list of > +.Ar types . > +The list may be specified as any combination of > +individual types (numeric) separated by commas. > +.Em Ranges are not allowed . > +.It Cm in | out > +Matches incoming or outgoing packets, respectively. > +.Cm in > +and > +.Cm out > +are mutually exclusive (in fact, > +.Cm out > +is implemented as > +.Cm not in Ns No ). > +.It Cm ipid Ar id-list > +Matches IPv4 packets whose > +.Cm ip_id > +field has value included in > +.Ar id-list , > +which is either a single value or a list of values or ranges > +specified in the same way as > +.Ar ports . > +.It Cm iplen Ar len-list > +Matches IP packets whose total length, including header and data, is > +in the set > +.Ar len-list , > +which is either a single value or a list of values or ranges > +specified in the same way as > +.Ar ports . 
> +.It Cm ipoptions Ar spec > +Matches packets whose IPv4 header contains the comma separated list of > +options specified in > +.Ar spec . > +The supported IP options are: > +.Pp > +.Cm ssrr > +(strict source route), > +.Cm lsrr > +(loose source route), > +.Cm rr > +(record packet route) and > +.Cm ts > +(timestamp). > +The absence of a particular option may be denoted > +with a > +.Ql \&! . > +.It Cm ipprecedence Ar precedence > +Matches IPv4 packets whose precedence field is equal to > +.Ar precedence . > +.It Cm ipsec > +Matches packets that have IPSEC history associated with them > +(i.e., the packet comes encapsulated in IPSEC, the kernel > +has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly > +decapsulate it). > +.Pp > +Note that specifying > +.Cm ipsec > +is different from specifying > +.Cm proto Ar ipsec > +as the latter will only look at the specific IP protocol field, > +irrespective of IPSEC kernel support and the validity of the IPSEC data. > +.Pp > +Further note that this flag is silently ignored in kernels without > +IPSEC support. > +It does not affect rule processing when given and the > +rules are handled as if with no > +.Cm ipsec > +flag. > +.It Cm iptos Ar spec > +Matches IPv4 packets whose > +.Cm tos > +field contains the comma separated list of > +service types specified in > +.Ar spec . > +The supported IP types of service are: > +.Pp > +.Cm lowdelay > +.Pq Dv IPTOS_LOWDELAY , > +.Cm throughput > +.Pq Dv IPTOS_THROUGHPUT , > +.Cm reliability > +.Pq Dv IPTOS_RELIABILITY , > +.Cm mincost > +.Pq Dv IPTOS_MINCOST , > +.Cm congestion > +.Pq Dv IPTOS_ECN_CE . > +The absence of a particular type may be denoted > +with a > +.Ql \&! . > +.It Cm dscp spec Ns Op , Ns Ar spec > +Matches IPv4/IPv6 packets whose > +.Cm DS > +field value is contained in > +.Ar spec > +mask. > +Multiple values can be specified via > +the comma separated list. > +Value can be one of keywords used in > +.Cm setdscp > +action or exact number. 
> +.It Cm ipttl Ar ttl-list > +Matches IPv4 packets whose time to live is included in > +.Ar ttl-list , > +which is either a single value or a list of values or ranges > +specified in the same way as > +.Ar ports . > +.It Cm ipversion Ar ver > +Matches IP packets whose IP version field is > +.Ar ver . > +.It Cm keep-state > +Upon a match, the firewall will create a dynamic rule, whose > +default behaviour is to match bidirectional traffic between > +source and destination IP/port using the same protocol. > +The rule has a limited lifetime (controlled by a set of > +.Xr sysctl 8 > +variables), and the lifetime is refreshed every time a matching > +packet is found. > +.It Cm layer2 > +Matches only layer2 packets, i.e., those passed to > +.Nm > +from ether_demux() and ether_output_frame(). > +.It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N > +The firewall will only allow > +.Ar N > +connections with the same > +set of parameters as specified in the rule. > +One or more > +of source and destination addresses and ports can be > +specified. > +Currently, > +only IPv4 flows are supported. > +.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar name > +Search an entry in lookup table > +.Ar name > +that matches the field specified as argument. > +If not found, the match fails. > +Otherwise, the match succeeds and > +.Cm tablearg > +is set to the value extracted from the table. > +.Pp > +This option can be useful to quickly dispatch traffic based on > +certain packet fields. > +See the > +.Sx LOOKUP TABLES > +section below for more information on lookup tables. > +.It Cm { MAC | mac } Ar dst-mac src-mac > +Match packets with a given > +.Ar dst-mac > +and > +.Ar src-mac > +addresses, specified as the > +.Cm any > +keyword (matching any MAC address), or six groups of hex digits > +separated by colons, > +and optionally followed by a mask indicating the significant bits. 
> +The mask may be specified using either of the following methods: > +.Bl -enum -width indent > +.It > +A slash > +.Pq / > +followed by the number of significant bits. > +For example, an address with 33 significant bits could be specified as: > +.Pp > +.Dl "MAC 10:20:30:40:50:60/33 any" > +.It > +An ampersand > +.Pq & > +followed by a bitmask specified as six groups of hex digits separated > +by colons. > +For example, an address in which the last 16 bits are significant could > +be specified as: > +.Pp > +.Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any" > +.Pp > +Note that the ampersand character has a special meaning in many shells > +and should generally be escaped. > +.El > +Note that the order of MAC addresses (destination first, > +source second) is > +the same as on the wire, but the opposite of the one used for > +IP addresses. > +.It Cm mac-type Ar mac-type > +Matches packets whose Ethernet Type field > +corresponds to one of those specified as argument. > +.Ar mac-type > +is specified in the same way as > +.Cm port numbers > +(i.e., one or more comma-separated single values or ranges). > +You can use symbolic names for known values such as > +.Em vlan , ipv4, ipv6 . > +Values can be entered as decimal or hexadecimal (if prefixed by 0x), > +and they are always printed as hexadecimal (unless the > +.Cm -N > +option is used, in which case symbolic resolution will be attempted). > +.It Cm proto Ar protocol > +Matches packets with the corresponding IP protocol. > +.It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar table Ns Po Ar name Ns Oo , Ns Ar value Oc Pc | Ar ipno | Ar any > +Matches packets received, transmitted or going through, > +respectively, the interface specified by exact name > +.Po Ar ifX Pc , > +by device name > +.Po Ar if* Pc , > +by IP address, or through some interface. > +Table > +.Ar name > +may be used to match interface by its kernel ifindex. > +See the > +.Sx LOOKUP TABLES > +section below for more information on lookup tables. 
> +.Pp > +The > +.Cm via > +keyword causes the interface to always be checked. > +If > +.Cm recv > +or > +.Cm xmit > +is used instead of > +.Cm via , > +then only the receive or transmit interface (respectively) > +is checked. > +By specifying both, it is possible to match packets based on > +both receive and transmit interface, e.g.: > +.Pp > +.Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1" > +.Pp > +The > +.Cm recv > +interface can be tested on either incoming or outgoing packets, > +while the > +.Cm xmit > +interface can only be tested on outgoing packets. > +So > +.Cm out > +is required (and > +.Cm in > +is invalid) whenever > +.Cm xmit > +is used. > +.Pp > +A packet might not have a receive or transmit interface: packets > +originating from the local host have no receive interface, > +while packets destined for the local host have no transmit > +interface. > +.It Cm setup > +Matches TCP packets that have the SYN bit set but no ACK bit. > +This is the short form of > +.Dq Li tcpflags\ syn,!ack . > +.It Cm sockarg > +Matches packets that are associated to a local socket and > +for which the SO_USER_COOKIE socket option has been set > +to a non-zero value. > +As a side effect, the value of the > +option is made available as > +.Cm tablearg > +value, which in turn can be used as > +.Cm skipto > +or > +.Cm pipe > +number. > +.It Cm src-ip Ar ip-address > +Matches IPv4 packets whose source IP is one of the address(es) > +specified as an argument. > +.It Cm src-ip6 Ar ip6-address > +Matches IPv6 packets whose source IP is one of the address(es) > +specified as an argument. > +.It Cm src-port Ar ports > +Matches IP packets whose source port is one of the port(s) > +specified as argument. > +.It Cm tagged Ar tag-list > +Matches packets whose tags are included in > +.Ar tag-list , > +which is either a single value or a list of values or ranges > +specified in the same way as > +.Ar ports . 
> +Tags can be applied to the packet using > +.Cm tag > +rule action parameter (see its description for details on tags). > +.It Cm tcpack Ar ack > +TCP packets only. > +Match if the TCP header acknowledgment number field is set to > +.Ar ack . > +.It Cm tcpdatalen Ar tcpdatalen-list > +Matches TCP packets whose length of TCP data is > +.Ar tcpdatalen-list , > +which is either a single value or a list of values or ranges > +specified in the same way as > +.Ar ports . > +.It Cm tcpflags Ar spec > +TCP packets only. > +Match if the TCP header contains the comma separated list of > +flags specified in > +.Ar spec . > +The supported TCP flags are: > +.Pp > +.Cm fin , > +.Cm syn , > +.Cm rst , > +.Cm psh , > +.Cm ack > +and > +.Cm urg . > +The absence of a particular flag may be denoted > +with a > +.Ql \&! . > +A rule which contains a > +.Cm tcpflags > +specification can never match a fragmented packet which has > +a non-zero offset. > +See the > +.Cm frag > +option for details on matching fragmented packets. > +.It Cm tcpseq Ar seq > +TCP packets only. > +Match if the TCP header sequence number field is set to > +.Ar seq . > +.It Cm tcpwin Ar tcpwin-list > +Matches TCP packets whose header window field is set to > +.Ar tcpwin-list , > +which is either a single value or a list of values or ranges > +specified in the same way as > +.Ar ports . > +.It Cm tcpoptions Ar spec > +TCP packets only. > +Match if the TCP header contains the comma separated list of > +options specified in > +.Ar spec . > +The supported TCP options are: > +.Pp > +.Cm mss > +(maximum segment size), > +.Cm window > +(tcp window advertisement), > +.Cm sack > +(selective ack), > +.Cm ts > +(rfc1323 timestamp) and > +.Cm cc > +(rfc1644 t/tcp connection count). > +The absence of a particular option may be denoted > +with a > +.Ql \&! . > +.It Cm uid Ar user > +Match all TCP or UDP packets sent by or received for a > +.Ar user . > +A > +.Ar user > +may be matched by name or identification number.
> +.It Cm verrevpath > +For incoming packets, > +a routing table lookup is done on the packet's source address. > +If the interface on which the packet entered the system matches the > +outgoing interface for the route, > +the packet matches. > +If the interfaces do not match up, > +the packet does not match. > +All outgoing packets or packets with no incoming interface match. > +.Pp > +The name and functionality of the option is intentionally similar to > +the Cisco IOS command: > +.Pp > +.Dl ip verify unicast reverse-path > +.Pp > +This option can be used to make anti-spoofing rules to reject all > +packets with source addresses not from this interface. > +See also the option > +.Cm antispoof . > +.It Cm versrcreach > +For incoming packets, > +a routing table lookup is done on the packet's source address. > +If a route to the source address exists, but not the default route > +or a blackhole/reject route, the packet matches. > +Otherwise, the packet does not match. > +All outgoing packets match. > +.Pp > +The name and functionality of the option is intentionally similar to > +the Cisco IOS command: > +.Pp > +.Dl ip verify unicast source reachable-via any > +.Pp > +This option can be used to make anti-spoofing rules to reject all > +packets whose source address is unreachable. > +.It Cm antispoof > +For incoming packets, the packet's source address is checked if it > +belongs to a directly connected network. > +If the network is directly connected, then the interface the packet > +came on in is compared to the interface the network is connected to. > +When incoming interface and directly connected interface are not the > +same, the packet does not match. > +Otherwise, the packet does match. > +All outgoing packets match. > +.Pp > +This option can be used to make anti-spoofing rules to reject all > +packets that pretend to be from a directly connected network but do > +not come in through that interface. 
> +This option is similar to but more restricted than > +.Cm verrevpath > +because it engages only on packets with source addresses of directly > +connected networks instead of all source addresses. > +.El > +.Sh LOOKUP TABLES > +Lookup tables are useful to handle large sparse sets of > +addresses or other search keys (e.g., ports, jail IDs, interface names). > +In the rest of this section we will use the term ``key''. > +Table name needs to match the following spec: > +.Ar table-name . > +Tables with the same name can be created in different > +.Ar sets . > +However, rule links to the tables in > +.Ar set 0 > +by default. > +This behavior can be controlled by > +.Va net.inet.ip.fw.tables_sets > +variable. > +See the > +.Sx SETS OF RULES > +section for more information. > +There may be up to 65535 different lookup tables. > +.Pp > +The following table types are supported: > +.Bl -tag -width indent > +.It Ar table-type : Ar addr | iface | number | flow > +.It Ar table-key : Ar addr Ns Oo / Ns Ar masklen Oc | iface-name | number | flow-spec > +.It Ar flow-spec : Ar flow-field Ns Op , Ns Ar flow-spec > +.It Ar flow-field : src-ip | proto | src-port | dst-ip | dst-port > +.It Cm addr > +matches IPv4 or IPv6 address. > +Each entry is represented by an > +.Ar addr Ns Op / Ns Ar masklen > +and will match all addresses with base > +.Ar addr > +(specified as an IPv4/IPv6 address, or a hostname) and mask width of > +.Ar masklen > +bits. > +If > +.Ar masklen > +is not specified, it defaults to 32 for IPv4 and 128 for IPv6. > +When looking up an IP address in a table, the most specific > +entry will match. > +.It Cm iface > +matches interface names. > +Each entry is represented by string treated as interface name. > +Wildcards are not supported. > +.It Cm number > +matches protocol ports, uids/gids or jail IDs. > +Each entry is represented by 32-bit unsigned integer. > +Ranges are not supported.
> +.It Cm flow > +Matches packet fields specified by > +.Ar flow > +type suboptions with table entries. > +.El > +.Pp > +Tables require explicit creation via > +.Cm create > +before use. > +.Pp > +The following creation options are supported: > +.Bl -tag -width indent > +.It Ar create-options : Ar create-option | create-options > +.It Ar create-option : Cm type Ar table-type | Cm valtype Ar value-mask | Cm algo Ar algo-desc | > +.Cm limit Ar number | Cm locked > +.It Cm type > +Table key type. > +.It Cm valtype > +Table value mask. > +.It Cm algo > +Table algorithm to use (see below). > +.It Cm limit > +Maximum number of items that may be inserted into table. > +.It Cm locked > +Restrict any table modifications. > +.El > +.Pp > +Some of these options may be modified later via > +.Cm modify > +keyword. > +The following options can be changed: > +.Bl -tag -width indent > +.It Ar modify-options : Ar modify-option | modify-options > +.It Ar modify-option : Cm limit Ar number > +.It Cm limit > +Alter maximum number of items that may be inserted into table. > +.El > +.Pp > +Additionally, table can be locked or unlocked using > +.Cm lock > +or > +.Cm unlock > +commands. > +.Pp > +Tables of the same > +.Ar type > +can be swapped with each other using > +.Cm swap Ar name > +command. > +Swap may fail if tables limits are set and data exchange > +would result in limits hit. > +Operation is performed atomically. > +.Pp > +One or more entries can be added to a table at once using > +.Cm add > +command. > +Addition of all items are performed atomically. > +By default, error in addition of one entry does not influence > +addition of other entries. However, non-zero error code is returned > +in that case. > +Special > +.Cm atomic > +keyword may be specified before > +.Cm add > +to indicate all-or-none add request. > +.Pp > +One or more entries can be removed from a table at once using > +.Cm delete > +command. 
> +By default, error in removal of one entry does not influence > +removing of other entries. However, non-zero error code is returned > +in that case. > +.Pp > +It may be possible to check what entry will be found on particular > +.Ar table-key > +using > +.Cm lookup > +.Ar table-key > +command. > +This functionality is optional and may be unsupported in some algorithms. > +.Pp > +The following operations can be performed on > +.Ar one > +or > +.Cm all > +tables: > +.Bl -tag -width indent > +.It Cm list > +List all entries. > +.It Cm flush > +Removes all entries. > +.It Cm info > +Shows generic table information. > +.It Cm detail > +Shows generic table information and algo-specific data. > +.El > +.Pp > +The following lookup algorithms are supported: > +.Bl -tag -width indent > +.It Ar algo-desc : algo-name | "algo-name algo-data" > +.It Ar algo-name: Ar addr:radix | addr:hash | iface:array | number:array | flow:hash > +.It Cm addr:radix > +Separate Radix trees for IPv4 and IPv6, the same way as the routing table (see > +.Xr route 4 ) . > +Default choice for > +.Ar addr > +type. > +.It Cm addr:hash > +Separate auto-growing hashes for IPv4 and IPv6. > +Accepts entries with the same mask length specified initially via > +.Cm "addr:hash masks=/v4,/v6" > +algorithm creation options. > +Assume /32 and /128 masks by default. > +Search removes host bits (according to mask) from supplied address and checks > +resulting key in appropriate hash. > +Mostly optimized for /64 and byte-ranged IPv6 masks. > +.It Cm iface:array > +Array storing sorted indexes for entries which are presented in the system. > +Optimized for very fast lookup. > +.It Cm number:array > +Array storing sorted u32 numbers. > +.It Cm flow:hash > +Auto-growing hash storing flow entries. > +Search calculates hash on required packet fields and searches for matching > +entries in selected bucket. 
> +.El > +.Pp > +The > +.Cm tablearg > +feature provides the ability to use a value, looked up in the table, as > +the argument for a rule action, action parameter or rule option. > +This can significantly reduce number of rules in some configurations. > +If two tables are used in a rule, the result of the second (destination) > +is used. > +.Pp > +Each record may hold one or more values according to > +.Ar value-mask . > +This mask is set on table creation via > +.Cm valtype > +option. > +The following value types are supported: > +.Bl -tag -width indent > +.It Ar value-mask : Ar value-type Ns Op , Ns Ar value-mask > +.It Ar value-type : Ar skipto | pipe | fib | nat | dscp | tag | divert | > +.Ar netgraph | limit | ipv4 > +.It Cm skipto > +rule number to jump to. > +.It Cm pipe > +Pipe number to use. > +.It Cm fib > +fib number to match/set. > +.It Cm nat > +nat number to jump to. > +.It Cm dscp > +dscp value to match/set. > +.It Cm tag > +tag number to match/set. > +.It Cm divert > +port number to divert traffic to. > +.It Cm netgraph > +hook number to move packet to. > +.It Cm limit > +maximum number of connections. > +.It Cm ipv4 > +IPv4 nexthop to fwd packets to. > +.El > +.Pp > +The > +.Cm tablearg > +argument can be used with the following actions: > +.Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib, > +action parameters: > +.Cm tag, untag, > +rule options: > +.Cm limit, tagged. > +.Pp > +When used with the > +.Cm skipto > +action, the user should be aware that the code will walk the ruleset > +up to a rule equal to, or past, the given number. > +.Pp > +See the > +.Sx EXAMPLES > +Section for example usage of tables and the tablearg keyword. > +.Sh SETS OF RULES > +Each rule or table belongs to one of 32 different > +.Em sets > +, numbered 0 to 31. > +Set 31 is reserved for the default rule. > +.Pp > +By default, rules or tables are put in set 0, unless you use the > +.Cm set N > +attribute when adding a new rule or table. 
> +Sets can be individually and atomically enabled or disabled, > +so this mechanism permits an easy way to store multiple configurations > +of the firewall and quickly (and atomically) switch between them. > +.Pp > +By default, tables from set 0 are referenced when adding rule with > +table opcodes regardless of rule set. > +This behavior can be changed by setting > +.Va net.inet.ip.fw.tables_sets > +variable to 1. > +Rule's set will then be used for table references. > +.Pp > +The command to enable/disable sets is > +.Bd -ragged -offset indent > +.Nm > +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... > +.Ed > +.Pp > +where multiple > +.Cm enable > +or > +.Cm disable > +sections can be specified. > +Command execution is atomic on all the sets specified in the command. > +By default, all sets are enabled. > +.Pp > +When you disable a set, its rules behave as if they do not exist > +in the firewall configuration, with only one exception: > +.Bd -ragged -offset indent > +dynamic rules created from a rule before it had been disabled > +will still be active until they expire. > +In order to delete > +dynamic rules you have to explicitly delete the parent rule > +which generated them. > +.Ed > +.Pp > +The set number of rules can be changed with the command > +.Bd -ragged -offset indent > +.Nm > +.Cm set move > +.Brq Cm rule Ar rule-number | old-set > +.Cm to Ar new-set > +.Ed > +.Pp > +Also, you can atomically swap two rulesets with the command > +.Bd -ragged -offset indent > +.Nm > +.Cm set swap Ar first-set second-set > +.Ed > +.Pp > +See the > +.Sx EXAMPLES > +Section on some possible uses of sets of rules. > +.Sh STATEFUL FIREWALL > +Stateful operation is a way for the firewall to dynamically > +create rules for specific flows when packets that > +match a given pattern are detected. > +Support for stateful > +operation comes through the > +.Cm check-state , keep-state > +and > +.Cm limit > +options of > +.Nm rules .
> +.Pp > +Dynamic rules are created when a packet matches a > +.Cm keep-state > +or > +.Cm limit > +rule, causing the creation of a > +.Em dynamic > +rule which will match all and only packets with > +a given > +.Em protocol > +between a > +.Em src-ip/src-port dst-ip/dst-port > +pair of addresses > +.Em ( src > +and > +.Em dst > +are used here only to denote the initial match addresses, but they > +are completely equivalent afterwards). > +Dynamic rules will be checked at the first > +.Cm check-state, keep-state > +or > +.Cm limit > +occurrence, and the action performed upon a match will be the same > +as in the parent rule. > +.Pp > +Note that no additional attributes other than protocol and IP addresses > +and ports are checked on dynamic rules. > +.Pp > +The typical use of dynamic rules is to keep a closed firewall configuration, > +but let the first TCP SYN packet from the inside network install a > +dynamic rule for the flow so that packets belonging to that session > +will be allowed through the firewall: > +.Pp > +.Dl "ipfw add check-state" > +.Dl "ipfw add allow tcp from my-subnet to any setup keep-state" > +.Dl "ipfw add deny tcp from any to any" > +.Pp > +A similar approach can be used for UDP, where an UDP packet coming > +from the inside will install a dynamic rule to let the response through > +the firewall: > +.Pp > +.Dl "ipfw add check-state" > +.Dl "ipfw add allow udp from my-subnet to any keep-state" > +.Dl "ipfw add deny udp from any to any" > +.Pp > +Dynamic rules expire after some time, which depends on the status > +of the flow and the setting of some > +.Cm sysctl > +variables. > +See Section > +.Sx SYSCTL VARIABLES > +for more details. > +For TCP sessions, dynamic rules can be instructed to periodically > +send keepalive packets to refresh the state of the rule when it is > +about to expire. > +.Pp > +See Section > +.Sx EXAMPLES > +for more examples on how to use dynamic rules. 
> +.Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION > +.Nm > +is also the user interface for the > +.Nm dummynet > +traffic shaper, packet scheduler and network emulator, a subsystem that > +can artificially queue, delay or drop packets > +emulating the behaviour of certain network links > +or queueing systems. > +.Pp > +.Nm dummynet > +operates by first using the firewall to select packets > +using any match pattern that can be used in > +.Nm > +rules. > +Matching packets are then passed to either of two > +different objects, which implement the traffic regulation: > +.Bl -hang -offset XXXX > +.It Em pipe > +A > +.Em pipe > +emulates a > +.Em link > +with given bandwidth and propagation delay, > +driven by a FIFO scheduler and a single queue with programmable > +queue size and packet loss rate. > +Packets are appended to the queue as they come out from > +.Nm ipfw , > +and then transferred in FIFO order to the link at the desired rate. > +.It Em queue > +A > +.Em queue > +is an abstraction used to implement packet scheduling > +using one of several packet scheduling algorithms. > +Packets sent to a > +.Em queue > +are first grouped into flows according to a mask on the 5-tuple. > +Flows are then passed to the scheduler associated to the > +.Em queue , > +and each flow uses scheduling parameters (weight and others) > +as configured in the > +.Em queue > +itself. > +A scheduler in turn is connected to an emulated link, > +and arbitrates the link's bandwidth among backlogged flows according to > +weights and to the features of the scheduling algorithm in use. > +.El > +.Pp > +In practice, > +.Em pipes > +can be used to set hard limits to the bandwidth that a flow can use, whereas > +.Em queues > +can be used to determine how different flows share the available bandwidth. > +.Pp > +A graphical representation of the binding of queues, > +flows, schedulers and links is below. 
> +.Bd -literal -offset indent > + (flow_mask|sched_mask) sched_mask > + +---------+ weight Wx +-------------+ > + | |->-[flow]-->--| |-+ > + -->--| QUEUE x | ... | | | > + | |->-[flow]-->--| SCHEDuler N | | > + +---------+ | | | > + ... | +--[LINK N]-->-- > + +---------+ weight Wy | | +--[LINK N]-->-- > + | |->-[flow]-->--| | | > + -->--| QUEUE y | ... | | | > + | |->-[flow]-->--| | | > + +---------+ +-------------+ | > + +-------------+ > +.Ed > +It is important to understand the role of the SCHED_MASK > +and FLOW_MASK, which are configured through the commands > +.Dl "ipfw sched N config mask SCHED_MASK ..." > +and > +.Dl "ipfw queue X config mask FLOW_MASK ..." . > +.Pp > +The SCHED_MASK is used to assign flows to one or more > +scheduler instances, one for each > +value of the packet's 5-tuple after applying SCHED_MASK. > +As an example, using ``src-ip 0xffffff00'' creates one instance > +for each /24 destination subnet. > +.Pp > +The FLOW_MASK, together with the SCHED_MASK, is used to split > +packets into flows. > +As an example, using > +``src-ip 0x000000ff'' > +together with the previous SCHED_MASK makes a flow for > +each individual source address. > +In turn, flows for each /24 > +subnet will be sent to the same scheduler instance. > +.Pp > +The above diagram holds even for the > +.Em pipe > +case, with the only restriction that a > +.Em pipe > +only supports a SCHED_MASK, and forces the use of a FIFO > +scheduler (these are for backward compatibility reasons; > +in fact, internally, a > +.Nm dummynet's > +pipe is implemented exactly as above). > +.Pp > +There are two modes of > +.Nm dummynet > +operation: > +.Dq normal > +and > +.Dq fast . > +The > +.Dq normal > +mode tries to emulate a real link: the > +.Nm dummynet > +scheduler ensures that the packet will not leave the pipe faster than it > +would on the real link with a given bandwidth. 
> +The > +.Dq fast > +mode allows certain packets to bypass the > +.Nm dummynet > +scheduler (if packet flow does not exceed pipe's bandwidth). > +This is the reason why the > +.Dq fast > +mode requires less CPU cycles per packet (on average) and packet latency > +can be significantly lower in comparison to a real link with the same > +bandwidth. > +The default mode is > +.Dq normal . > +The > +.Dq fast > +mode can be enabled by setting the > +.Va net.inet.ip.dummynet.io_fast > +.Xr sysctl 8 > +variable to a non-zero value. > +.Pp > +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION > +The > +.Em pipe , > +.Em queue > +and > +.Em scheduler > +configuration commands are the following: > +.Bd -ragged -offset indent > +.Cm pipe Ar number Cm config Ar pipe-configuration > +.Pp > +.Cm queue Ar number Cm config Ar queue-configuration > +.Pp > +.Cm sched Ar number Cm config Ar sched-configuration > +.Ed > +.Pp > +The following parameters can be configured for a pipe: > +.Pp > +.Bl -tag -width indent -compact > +.It Cm bw Ar bandwidth | device > +Bandwidth, measured in > +.Sm off > +.Op Cm K | M > +.Brq Cm bit/s | Byte/s . > +.Sm on > +.Pp > +A value of 0 (default) means unlimited bandwidth. > +The unit must immediately follow the number, as in > +.Pp > +.Dl "ipfw pipe 1 config bw 300Kbit/s" > +.Pp > +If a device name is specified instead of a numeric value, as in > +.Pp > +.Dl "ipfw pipe 1 config bw tun0" > +.Pp > +then the transmit clock is supplied by the specified device. > +At the moment only the > +.Xr tun 4 > +device supports this > +functionality, for use in conjunction with > +.Xr ppp 8 . > +.Pp > +.It Cm delay Ar ms-delay > +Propagation delay, measured in milliseconds. > +The value is rounded to the next multiple of the clock tick > +(typically 10ms, but it is a good practice to run kernels > +with > +.Dq "options HZ=1000" > +to reduce > +the granularity to 1ms or less). > +The default value is 0, meaning no delay. 
> +.Pp > +.It Cm burst Ar size > +If the data to be sent exceeds the pipe's bandwidth limit > +(and the pipe was previously idle), up to > +.Ar size > +bytes of data are allowed to bypass the > +.Nm dummynet > +scheduler, and will be sent as fast as the physical link allows. > +Any additional data will be transmitted at the rate specified > +by the > +.Nm pipe > +bandwidth. > +The burst size depends on how long the pipe has been idle; > +the effective burst size is calculated as follows: > +MAX( > +.Ar size > +, > +.Nm bw > +* pipe_idle_time). > +.Pp > +.It Cm profile Ar filename > +A file specifying the additional overhead incurred in the transmission > +of a packet on the link. > +.Pp > +Some link types introduce extra delays in the transmission > +of a packet, e.g., because of MAC level framing, contention on > +the use of the channel, MAC level retransmissions and so on. > +From our point of view, the channel is effectively unavailable > +for this extra time, which is constant or variable depending > +on the link type. > +Additionally, packets may be dropped after this > +time (e.g., on a wireless link after too many retransmissions). > +We can model the additional delay with an empirical curve > +that represents its distribution. > +.Bd -literal -offset indent > + cumulative probability > + 1.0 ^ > + | > + L +-- loss-level x > + | ****** > + | * > + | ***** > + | * > + | ** > + | * > + +-------*-------------------> > + delay > +.Ed > +The empirical curve may have both vertical and horizontal lines. > +Vertical lines represent constant delay for a range of > +probabilities. > +Horizontal lines correspond to a discontinuity in the delay > +distribution: the pipe will use the largest delay for a > +given probability. 
> +.Pp > +The file format is the following, with whitespace acting as > +a separator and '#' indicating the beginning of a comment: > +.Bl -tag -width indent > +.It Cm name Ar identifier > +optional name (listed by "ipfw pipe show") > +to identify the delay distribution; > +.It Cm bw Ar value > +the bandwidth used for the pipe. > +If not specified here, it must be present > +explicitly as a configuration parameter for the pipe; > +.It Cm loss-level Ar L > +the probability above which packets are lost. > +(0.0 <= L <= 1.0, default 1.0 i.e., no loss); > +.It Cm samples Ar N > +the number of samples used in the internal > +representation of the curve (2..1024; default 100); > +.It Cm "delay prob" | "prob delay" > +One of these two lines is mandatory and defines > +the format of the following lines with data points. > +.It Ar XXX Ar YYY > +2 or more lines representing points in the curve, > +with either delay or probability first, according > +to the chosen format. > +The unit for delay is milliseconds. > +Data points do not need to be sorted. > +Also, the number of actual lines can be different > +from the value of the "samples" parameter: > +.Nm > +utility will sort and interpolate > +the curve as needed. > +.El > +.Pp > +Example of a profile file: > +.Bd -literal -offset indent > +name bla_bla_bla > +samples 100 > +loss-level 0.86 > +prob delay > +0 200 # minimum overhead is 200ms > +0.5 200 > +0.5 300 > +0.8 1000 > +0.9 1300 > +1 1300 > +#configuration file end > +.Ed > +.El > +.Pp > +The following parameters can be configured for a queue: > +.Pp > +.Bl -tag -width indent -compact > +.It Cm pipe Ar pipe_nr > +Connects a queue to the specified pipe. > +Multiple queues (with the same or different weights) can be connected to > +the same pipe, which specifies the aggregate rate for the set of queues. > +.Pp > +.It Cm weight Ar weight > +Specifies the weight to be used for flows matching this queue. > +The weight must be in the range 1..100, and defaults to 1.
> +.El > +.Pp > +The following case-insensitive parameters can be configured for a > +scheduler: > +.Pp > +.Bl -tag -width indent -compact > +.It Cm type Ar {fifo | wf2q+ | rr | qfq} > +specifies the scheduling algorithm to use. > +.Bl -tag -width indent -compact > +.It Cm fifo > +is just a FIFO scheduler (which means that all packets > +are stored in the same queue as they arrive to the scheduler). > +FIFO has O(1) per-packet time complexity, with very low > +constants (estimate 60-80ns on a 2GHz desktop machine) > +but gives no service guarantees. > +.It Cm wf2q+ > +implements the WF2Q+ algorithm, which is a Weighted Fair Queueing > +algorithm which permits flows to share bandwidth according to > +their weights. > +Note that weights are not priorities; even a flow > +with a minuscule weight will never starve. > +WF2Q+ has O(log N) per-packet processing cost, where N is the number > +of flows, and is the default algorithm used by previous versions of > +dummynet's queues. > +.It Cm rr > +implements the Deficit Round Robin algorithm, which has O(1) processing > +costs (roughly, 100-150ns per packet) > +and permits bandwidth allocation according to weights, but > +with poor service guarantees. > +.It Cm qfq > +implements the QFQ algorithm, which is a very fast variant of > +WF2Q+, with similar service guarantees and O(1) processing > +costs (roughly, 200-250ns per packet). > +.El > +.El > +.Pp > +In addition to the type, all parameters allowed for a pipe can also > +be specified for a scheduler. > +.Pp > +Finally, the following parameters can be configured for both > +pipes and queues: > +.Pp > +.Bl -tag -width XXXX -compact > +.It Cm buckets Ar hash-table-size > +Specifies the size of the hash table used for storing the > +various queues. > +Default value is 64 controlled by the > +.Xr sysctl 8 > +variable > +.Va net.inet.ip.dummynet.hash_size , > +allowed range is 16 to 65536.
> +.Pp > +.It Cm mask Ar mask-specifier > +Packets sent to a given pipe or queue by an > +.Nm > +rule can be further classified into multiple flows, each of which is then > +sent to a different > +.Em dynamic > +pipe or queue. > +A flow identifier is constructed by masking the IP addresses, > +ports and protocol types as specified with the > +.Cm mask > +options in the configuration of the pipe or queue. > +For each different flow identifier, a new pipe or queue is created > +with the same parameters as the original object, and matching packets > +are sent to it. > +.Pp > +Thus, when > +.Em dynamic pipes > +are used, each flow will get the same bandwidth as defined by the pipe, > +whereas when > +.Em dynamic queues > +are used, each flow will share the parent's pipe bandwidth evenly > +with other flows generated by the same queue (note that other queues > +with different weights might be connected to the same pipe). > +.br > +Available mask specifiers are a combination of one or more of the following: > +.Pp > +.Cm dst-ip Ar mask , > +.Cm dst-ip6 Ar mask , > +.Cm src-ip Ar mask , > +.Cm src-ip6 Ar mask , > +.Cm dst-port Ar mask , > +.Cm src-port Ar mask , > +.Cm flow-id Ar mask , > +.Cm proto Ar mask > +or > +.Cm all , > +.Pp > +where the latter means all bits in all fields are significant. > +.Pp > +.It Cm noerror > +When a packet is dropped by a > +.Nm dummynet > +queue or pipe, the error > +is normally reported to the caller routine in the kernel, in the > +same way as it happens when a device queue fills up. > +Setting this > +option reports the packet as successfully delivered, which can be > +needed for some experimental setups where you want to simulate > +loss or congestion at a remote router. > +.Pp > +.It Cm plr Ar packet-loss-rate > +Packet loss rate. > +Argument > +.Ar packet-loss-rate > +is a floating-point number between 0 and 1, with 0 meaning no > +loss, 1 meaning 100% loss. > +The loss rate is internally represented on 31 bits. 
> +.Pp > +.It Cm queue Brq Ar slots | size Ns Cm Kbytes > +Queue size, in > +.Ar slots > +or > +.Cm KBytes . > +Default value is 50 slots, which > +is the typical queue size for Ethernet devices. > +Note that for slow speed links you should keep the queue > +size short or your traffic might be affected by a significant > +queueing delay. > +E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit > +or 20s of queue on a 30Kbit/s pipe. > +Even worse effects can result if you get packets from an > +interface with a much larger MTU, e.g.\& the loopback interface > +with its 16KB packets. > +The > +.Xr sysctl 8 > +variables > +.Em net.inet.ip.dummynet.pipe_byte_limit > +and > +.Em net.inet.ip.dummynet.pipe_slot_limit > +control the maximum lengths that can be specified. > +.Pp > +.It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p > +[ecn] > +Make use of the RED (Random Early Detection) queue management algorithm. > +.Ar w_q > +and > +.Ar max_p > +are floating > +point numbers between 0 and 1 (inclusive), while > +.Ar min_th > +and > +.Ar max_th > +are integer numbers specifying thresholds for queue management > +(thresholds are computed in bytes if the queue has been defined > +in bytes, in slots otherwise). > +The two parameters can also be of the same value if needed. The > +.Nm dummynet > +also supports the gentle RED variant (gred) and ECN (Explicit Congestion > +Notification) as optional. 
Three > +.Xr sysctl 8 > +variables can be used to control the RED behaviour: > +.Bl -tag -width indent > +.It Va net.inet.ip.dummynet.red_lookup_depth > +specifies the accuracy in computing the average queue > +when the link is idle (defaults to 256, must be greater than zero) > +.It Va net.inet.ip.dummynet.red_avg_pkt_size > +specifies the expected average packet size (defaults to 512, must be > +greater than zero) > +.It Va net.inet.ip.dummynet.red_max_pkt_size > +specifies the expected maximum packet size, only used when queue > +thresholds are in bytes (defaults to 1500, must be greater than zero). > +.El > +.El > +.Pp > +When used with IPv6 data, > +.Nm dummynet > +currently has several limitations. > +Information necessary to route link-local packets to an > +interface is not available after processing by > +.Nm dummynet > +so those packets are dropped in the output path. > +Care should be taken to ensure that link-local packets are not passed to > +.Nm dummynet . > +.Sh CHECKLIST > +Here are some important points to consider when designing your > +rules: > +.Bl -bullet > +.It > +Remember that you filter both packets going > +.Cm in > +and > +.Cm out . > +Most connections need packets going in both directions. > +.It > +Remember to test very carefully. > +It is a good idea to be near the console when doing this. > +If you cannot be near the console, > +use an auto-recovery script such as the one in > +.Pa /usr/share/examples/ipfw/change_rules.sh . > +.It > +Do not forget the loopback interface. > +.El > +.Sh FINE POINTS > +.Bl -bullet > +.It > +There are circumstances where fragmented datagrams are unconditionally > +dropped. > +TCP packets are dropped if they do not contain at least 20 bytes of > +TCP header, UDP packets are dropped if they do not contain a full 8 > +byte UDP header, and ICMP packets are dropped if they do not contain > +4 bytes of ICMP header, enough to specify the ICMP type, code, and > +checksum. 
> +These packets are simply logged as > +.Dq pullup failed > +since there may not be enough good data in the packet to produce a > +meaningful log entry. > +.It > +Another type of packet is unconditionally dropped, a TCP packet with a > +fragment offset of one. > +This is a valid packet, but it only has one use, to try > +to circumvent firewalls. > +When logging is enabled, these packets are > +reported as being dropped by rule -1. > +.It > +If you are logged in over a network, loading the > +.Xr kld 4 > +version of > +.Nm > +is probably not as straightforward as you would think. > +The following command line is recommended: > +.Bd -literal -offset indent > +kldload ipfw && \e > +ipfw add 32000 allow ip from any to any > +.Ed > +.Pp > +Along the same lines, doing an > +.Bd -literal -offset indent > +ipfw flush > +.Ed > +.Pp > +in similar surroundings is also a bad idea. > +.It > +The > +.Nm > +filter list may not be modified if the system security level > +is set to 3 or higher > +(see > +.Xr init 8 > +for information on system security levels). > +.El > +.Sh PACKET DIVERSION > +A > +.Xr divert 4 > +socket bound to the specified port will receive all packets > +diverted to that port. > +If no socket is bound to the destination port, or if the divert module is > +not loaded, or if the kernel was not compiled with divert socket support, > +the packets are dropped. > +.Sh NETWORK ADDRESS TRANSLATION (NAT) > +.Nm > +supports in-kernel NAT using the kernel version of > +.Xr libalias 3 . > +.Pp > +The nat configuration command is the following: > +.Bd -ragged -offset indent > +.Bk -words > +.Cm nat > +.Ar nat_number > +.Cm config > +.Ar nat-configuration > +.Ek > +.Ed > +.Pp > +The following parameters can be configured: > +.Bl -tag -width indent > +.It Cm ip Ar ip_address > +Define an ip address to use for aliasing. > +.It Cm if Ar nic > +Use ip address of NIC for aliasing, dynamically changing > +it if NIC's ip address changes.
> +.It Cm log > +Enable logging on this nat instance. > +.It Cm deny_in > +Deny any incoming connection from outside world. > +.It Cm same_ports > +Try to leave the alias port numbers unchanged from > +the actual local port numbers. > +.It Cm unreg_only > +Traffic on the local network not originating from > +unregistered address spaces will be ignored. > +.It Cm reset > +Reset table of the packet aliasing engine on address change. > +.It Cm reverse > +Reverse the way libalias handles aliasing. > +.It Cm proxy_only > +Obey transparent proxy rules only, packet aliasing is not performed. > +.It Cm skip_global > +Skip instance in case of global state lookup (see below). > +.El > +.Pp > +Some special values can be supplied instead of > +.Va nat_number : > +.Bl -tag -width indent > +.It Cm global > +Looks up translation state in all configured nat instances. > +If an entry is found, packet is aliased according to that entry. > +If no entry was found in any of the instances, packet is passed unchanged, > +and no new entry will be created. > +See section > +.Sx MULTIPLE INSTANCES > +in > +.Xr natd 8 > +for more information. > +.It Cm tablearg > +Uses argument supplied in lookup table. > +See > +.Sx LOOKUP TABLES > +section below for more information on lookup tables. > +.El > +.Pp > +To let the packet continue after being (de)aliased, set the sysctl variable > +.Va net.inet.ip.fw.one_pass > +to 0. > +For more information about aliasing modes, refer to > +.Xr libalias 3 . > +See Section > +.Sx EXAMPLES > +for some examples about nat usage. > +.Ss REDIRECT AND LSNAT SUPPORT IN IPFW > +Redirect and LSNAT support follow closely the syntax used in > +.Xr natd 8 . > +See Section > +.Sx EXAMPLES > +for some examples on how to do redirect and lsnat. > +.Ss SCTP NAT SUPPORT > +SCTP nat can be configured in a similar manner to TCP through the > +.Nm > +command line tool. > +The main difference is that > +.Nm sctp nat > +does not do port translation.
> +Since the local and global side ports will be the same, > +there is no need to specify both. > +Ports are redirected as follows: > +.Bd -ragged -offset indent > +.Bk -words > +.Cm nat > +.Ar nat_number > +.Cm config if > +.Ar nic > +.Cm redirect_port sctp > +.Ar ip_address [,addr_list] {[port | port-port] [,ports]} > +.Ek > +.Ed > +.Pp > +Most > +.Nm sctp nat > +configuration can be done in real-time through the > +.Xr sysctl 8 > +interface. > +All may be changed dynamically, though the hash_table size will only > +change for new > +.Nm nat > +instances. > +See > +.Sx SYSCTL VARIABLES > +for more info. > +.Sh LOADER TUNABLES > +Tunables can be set in > +.Xr loader 8 > +prompt, > +.Xr loader.conf 5 > +or > +.Xr kenv 1 > +before ipfw module gets loaded. > +.Bl -tag -width indent > +.It Va net.inet.ip.fw.default_to_accept: No 0 > +Defines ipfw last rule behavior. > +This value overrides > +.Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)" > +from kernel configuration file. > +.It Va net.inet.ip.fw.tables_max: No 128 > +Defines number of tables available in ipfw. > +Number cannot exceed 65534. > +.El > +.Sh SYSCTL VARIABLES > +A set of > +.Xr sysctl 8 > +variables controls the behaviour of the firewall and > +associated modules > +.Pq Nm dummynet , bridge , sctp nat . > +These are shown below together with their default value > +(but always check with the > +.Xr sysctl 8 > +command what value is actually in use) and meaning: > +.Bl -tag -width indent > +.It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0 > +Defines how the > +.Nm nat > +responds to receipt of global OOTB ASCONF-AddIP: > +.Bl -tag -width indent > +.It Cm 0 > +No response (unless a partially matching association exists - > +ports and vtags match but global address does not) > +.It Cm 1 > +.Nm nat > +will accept and process all OOTB global AddIP messages. > +.El > +.Pp > +Option 1 should never be selected as this forms a security risk. 
> +An attacker can > +establish multiple fake associations by sending AddIP messages. > +.It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5 > +Defines the maximum number of chunks in an SCTP packet that will be > +parsed for a > +packet that matches an existing association. > +This value is enforced to be greater than or equal to > +.Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit . > +A high value is > +a DoS risk yet setting too low a value may result in > +important control chunks in > +the packet not being located and parsed. > +.It Va net.inet.ip.alias.sctp.error_on_ootb: No 1 > +Defines when the > +.Nm nat > +responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets. > +An OOTB packet is a packet that arrives with no existing association > +registered in the > +.Nm nat > +and is not an INIT or ASCONF-AddIP packet: > +.Bl -tag -width indent > +.It Cm 0 > +ErrorM is never sent in response to OOTB packets. > +.It Cm 1 > +ErrorM is only sent to OOTB packets received on the local side. > +.It Cm 2 > +ErrorM is sent to the local side and on the global side ONLY if there is a > +partial match (ports and vtags match but the source global IP does not). > +This value is only useful if the > +.Nm nat > +is tracking global IP addresses. > +.It Cm 3 > +ErrorM is sent in response to all OOTB packets on both > +the local and global side > +(DoS risk). > +.El > +.Pp > +At the moment the default is 0, since the ErrorM packet is not yet > +supported by most SCTP stacks. > +When it is supported, and if not tracking > +global addresses, we recommend setting this value to 1 to allow > +multi-homed local hosts to function with the > +.Nm nat . > +To track global addresses, we recommend setting this value to 2 to > +allow global hosts to be informed when they need to (re)send an > +ASCONF-AddIP. > +Value 3 should never be chosen (except for debugging) as the > +.Nm nat > +will respond to all OOTB global packets (a DoS risk).
> +.It Va net.inet.ip.alias.sctp.hashtable_size: No 2003 > +Size of hash tables used for > +.Nm nat > +lookups (100 < prime_number < 1000001). > +This value sets the > +.Nm hash table > +size for any future created > +.Nm nat > +instance and therefore must be set prior to creating a > +.Nm nat > +instance. > +The table sizes may be changed to suit specific needs. > +If there will be few > +concurrent associations, and memory is scarce, you may make these smaller. > +If there will be many thousands (or millions) of concurrent associations, you > +should make these larger. > +A prime number is best for the table size. > +The sysctl > +update function will adjust your input value to the next highest prime number. > +.It Va net.inet.ip.alias.sctp.holddown_time: No 0 > +Hold association in table for this many seconds after receiving a > +SHUTDOWN-COMPLETE. > +This allows endpoints to correct shutdown gracefully if a > +shutdown_complete is lost and retransmissions are required. > +.It Va net.inet.ip.alias.sctp.init_timer: No 15 > +Timeout value while waiting for (INIT-ACK|AddIP-ACK). > +This value cannot be 0. > +.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2 > +Defines the maximum number of chunks in an SCTP packet that will be parsed when > +no existing association exists that matches that packet. > +Ideally this packet > +will only be an INIT or ASCONF-AddIP packet. > +A higher value may become a DoS > +risk as malformed packets can consume processing resources. > +.It Va net.inet.ip.alias.sctp.param_proc_limit: No 25 > +Defines the maximum number of parameters within a chunk that will be > +parsed in a > +packet. > +As for other similar sysctl variables, larger values pose a DoS risk. > +.It Va net.inet.ip.alias.sctp.log_level: No 0 > +Level of detail in the system log messages (0 \- minimal, 1 \- event, > +2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug). > +May be a good > +option in high loss environments.
> +.It Va net.inet.ip.alias.sctp.shutdown_time: No 15 > +Timeout value while waiting for SHUTDOWN-COMPLETE. > +This value cannot be 0. > +.It Va net.inet.ip.alias.sctp.track_global_addresses: No 0 > +Enables/disables global IP address tracking within the > +.Nm nat > +and places an > +upper limit on the number of addresses tracked for each association: > +.Bl -tag -width indent > +.It Cm 0 > +Global tracking is disabled > +.It Cm >1 > +Enables tracking, the maximum number of addresses tracked for each > +association is limited to this value > +.El > +.Pp > +This variable is fully dynamic, the new value will be adopted for all newly > +arriving associations, existing associations are treated > +as they were previously. > +Global tracking will decrease the number of collisions within the > +.Nm nat > +at a cost > +of increased processing load, memory usage, complexity, and possible > +.Nm nat > +state > +problems in complex networks with multiple > +.Nm nats . > +We recommend not tracking > +global IP addresses, this will still result in a fully functional > +.Nm nat . > +.It Va net.inet.ip.alias.sctp.up_timer: No 300 > +Timeout value to keep an association up with no traffic. > +This value cannot be 0. > +.It Va net.inet.ip.dummynet.expire : No 1 > +Lazily delete dynamic pipes/queue once they have no pending traffic. > +You can disable this by setting the variable to 0, in which case > +the pipes/queues will only be deleted when the threshold is reached. > +.It Va net.inet.ip.dummynet.hash_size : No 64 > +Default size of the hash table used for dynamic pipes/queues. > +This value is used when no > +.Cm buckets > +option is specified when configuring a pipe/queue. > +.It Va net.inet.ip.dummynet.io_fast : No 0 > +If set to a non-zero value, > +the > +.Dq fast > +mode of > +.Nm dummynet > +operation (see above) is enabled. > +.It Va net.inet.ip.dummynet.io_pkt > +Number of packets passed to > +.Nm dummynet . 
> +.It Va net.inet.ip.dummynet.io_pkt_drop > +Number of packets dropped by > +.Nm dummynet . > +.It Va net.inet.ip.dummynet.io_pkt_fast > +Number of packets bypassed by the > +.Nm dummynet > +scheduler. > +.It Va net.inet.ip.dummynet.max_chain_len : No 16 > +Target value for the maximum number of pipes/queues in a hash bucket. > +The product > +.Cm max_chain_len*hash_size > +is used to determine the threshold over which empty pipes/queues > +will be expired even when > +.Cm net.inet.ip.dummynet.expire=0 . > +.It Va net.inet.ip.dummynet.red_lookup_depth : No 256 > +.It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512 > +.It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500 > +Parameters used in the computations of the drop probability > +for the RED algorithm. > +.It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576 > +.It Va net.inet.ip.dummynet.pipe_slot_limit : No 100 > +The maximum queue size that can be specified in bytes or packets. > +These limits prevent accidental exhaustion of resources such as mbufs. > +If you raise these limits, > +you should make sure the system is configured so that sufficient resources > +are available. > +.It Va net.inet.ip.fw.autoinc_step : No 100 > +Delta between rule numbers when auto-generating them. > +The value must be in the range 1..1000. > +.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets > +The current number of buckets in the hash table for dynamic rules > +(readonly). > +.It Va net.inet.ip.fw.debug : No 1 > +Controls debugging messages produced by > +.Nm . > +.It Va net.inet.ip.fw.default_rule : No 65535 > +The default rule number (read-only). > +By the design of > +.Nm , the default rule is the last one, so its number > +can also serve as the highest number allowed for a rule. > +.It Va net.inet.ip.fw.dyn_buckets : No 256 > +The number of buckets in the hash table for dynamic rules. > +Must be a power of 2, up to 65536. 
> +It only takes effect when all dynamic rules have expired, so you > +are advised to use a > +.Cm flush > +command to make sure that the hash table is resized. > +.It Va net.inet.ip.fw.dyn_count : No 3 > +Current number of dynamic rules > +(read-only). > +.It Va net.inet.ip.fw.dyn_keepalive : No 1 > +Enables generation of keepalive packets for > +.Cm keep-state > +rules on TCP sessions. > +A keepalive is generated to both > +sides of the connection every 5 seconds for the last 20 > +seconds of the lifetime of the rule. > +.It Va net.inet.ip.fw.dyn_max : No 8192 > +Maximum number of dynamic rules. > +When you hit this limit, no more dynamic rules can be > +installed until old ones expire. > +.It Va net.inet.ip.fw.dyn_ack_lifetime : No 300 > +.It Va net.inet.ip.fw.dyn_syn_lifetime : No 20 > +.It Va net.inet.ip.fw.dyn_fin_lifetime : No 1 > +.It Va net.inet.ip.fw.dyn_rst_lifetime : No 1 > +.It Va net.inet.ip.fw.dyn_udp_lifetime : No 5 > +.It Va net.inet.ip.fw.dyn_short_lifetime : No 30 > +These variables control the lifetime, in seconds, of dynamic > +rules. > +Upon the initial SYN exchange the lifetime is kept short, > +then increased after both SYN have been seen, then decreased > +again during the final FIN exchange or when a RST is received. > +Both > +.Em dyn_fin_lifetime > +and > +.Em dyn_rst_lifetime > +must be strictly lower than 5 seconds, the period of > +repetition of keepalives. > +The firewall enforces that. > +.It Va net.inet.ip.fw.dyn_keep_states: No 0 > +Keep dynamic states on rule/set deletion. > +States are relinked to default rule (65535). > +This can be handy for ruleset reload. > +Turned off by default. > +.It Va net.inet.ip.fw.enable : No 1 > +Enables the firewall. > +Setting this variable to 0 lets you run your machine without > +firewall even if compiled in. > +.It Va net.inet6.ip6.fw.enable : No 1 > +provides the same functionality as above for the IPv6 case.
> +.It Va net.inet.ip.fw.one_pass : No 1 > +When set, the packet exiting from the > +.Nm dummynet > +pipe or from > +.Xr ng_ipfw 4 > +node is not passed through the firewall again. > +Otherwise, after an action, the packet is > +reinjected into the firewall at the next rule. > +.It Va net.inet.ip.fw.tables_max : No 128 > +Maximum number of tables. > +.It Va net.inet.ip.fw.verbose : No 1 > +Enables verbose messages. > +.It Va net.inet.ip.fw.verbose_limit : No 0 > +Limits the number of messages produced by a verbose firewall. > +.It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1 > +If enabled packets with unknown IPv6 Extension Headers will be denied. > +.It Va net.link.ether.ipfw : No 0 > +Controls whether layer-2 packets are passed to > +.Nm . > +Default is no. > +.It Va net.link.bridge.ipfw : No 0 > +Controls whether bridged packets are passed to > +.Nm . > +Default is no. > +.El > +.Sh INTERNAL DIAGNOSTICS > +There are some commands that may be useful to understand current state > +of certain subsystems inside kernel module. > +These commands provide debugging output which may change without notice. > +.Pp > +Currently the following commands are available as > +.Cm internal > +sub-options: > +.Bl -tag -width indent > +.It Cm iflist > +Lists all interfaces which are currently tracked by > +.Nm > +with their in-kernel status. > +.It Cm talist > +List all table lookup algorithms currently available. > +.El > +.Sh EXAMPLES > +There are far too many possible uses of > +.Nm > +so this Section will only give a small set of examples.
> +.Pp > +.Ss BASIC PACKET FILTERING > +This command adds an entry which denies all tcp packets from > +.Em cracker.evil.org > +to the telnet port of > +.Em wolf.tambov.su > +from being forwarded by the host: > +.Pp > +.Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet" > +.Pp > +This one disallows any connection from the entire cracker's > +network to my host: > +.Pp > +.Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org" > +.Pp > +A first and efficient way to limit access (not using dynamic rules) > +is the use of the following rules: > +.Pp > +.Dl "ipfw add allow tcp from any to any established" > +.Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup" > +.Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup" > +.Dl "..." > +.Dl "ipfw add deny tcp from any to any" > +.Pp > +The first rule will be a quick match for normal TCP packets, > +but it will not match the initial SYN packet, which will be > +matched by the > +.Cm setup > +rules only for selected source/destination pairs. > +All other SYN packets will be rejected by the final > +.Cm deny > +rule. > +.Pp > +If you administer one or more subnets, you can take advantage > +of the address sets and or-blocks and write extremely > +compact rulesets which selectively enable services to blocks > +of clients, as below: > +.Pp > +.Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q" > +.Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q" > +.Dl "" > +.Dl "ipfw add allow ip from ${goodguys} to any" > +.Dl "ipfw add deny ip from ${badguys} to any" > +.Dl "... normal policies ..." > +.Pp > +The > +.Cm verrevpath > +option could be used to do automated anti-spoofing by adding the > +following to the top of a ruleset: > +.Pp > +.Dl "ipfw add deny ip from any to any not verrevpath in" > +.Pp > +This rule drops all incoming packets that appear to be coming to the > +system on the wrong interface. 
> +For example, a packet with a source > +address belonging to a host on a protected internal network would be > +dropped if it tried to enter the system from an external interface. > +.Pp > +The > +.Cm antispoof > +option could be used to do similar but more restricted anti-spoofing > +by adding the following to the top of a ruleset: > +.Pp > +.Dl "ipfw add deny ip from any to any not antispoof in" > +.Pp > +This rule drops all incoming packets that appear to be coming from another > +directly connected system but on the wrong interface. > +For example, a packet with a source address of > +.Li 192.168.0.0/24 , > +configured on > +.Li fxp0 , > +but coming in on > +.Li fxp1 > +would be dropped. > +.Pp > +The > +.Cm setdscp > +option could be used to (re)mark user traffic, > +by adding the following to the appropriate place in ruleset: > +.Pp > +.Dl "ipfw add setdscp be ip from any to any dscp af11,af21" > +.Ss DYNAMIC RULES > +In order to protect a site from flood attacks involving fake > +TCP packets, it is safer to use dynamic rules: > +.Pp > +.Dl "ipfw add check-state" > +.Dl "ipfw add deny tcp from any to any established" > +.Dl "ipfw add allow tcp from my-net to any setup keep-state" > +.Pp > +This will let the firewall install dynamic rules only for > +those connections which start with a regular SYN packet coming > +from the inside of our network. > +Dynamic rules are checked when encountering the first > +occurrence of a > +.Cm check-state , > +.Cm keep-state > +or > +.Cm limit > +rule. > +A > +.Cm check-state > +rule should usually be placed near the beginning of the > +ruleset to minimize the amount of work scanning the ruleset. > +Your mileage may vary.
> +.Pp > +To limit the number of connections a user can open > +you can use the following type of rules: > +.Pp > +.Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10" > +.Dl "ipfw add allow tcp from any to me setup limit src-addr 4" > +.Pp > +The former (assuming it runs on a gateway) will allow each host > +on a /24 network to open at most 10 TCP connections. > +The latter can be placed on a server to make sure that a single > +client does not use more than 4 simultaneous connections. > +.Pp > +.Em BEWARE : > +stateful rules can be subject to denial-of-service attacks > +by a SYN-flood which opens a huge number of dynamic rules. > +The effects of such attacks can be partially limited by > +acting on a set of > +.Xr sysctl 8 > +variables which control the operation of the firewall. > +.Pp > +Here is a good usage of the > +.Cm list > +command to see accounting records and timestamp information: > +.Pp > +.Dl ipfw -at list > +.Pp > +or in short form without timestamps: > +.Pp > +.Dl ipfw -a list > +.Pp > +which is equivalent to: > +.Pp > +.Dl ipfw show > +.Pp > +Next rule diverts all incoming packets from 192.168.2.0/24 > +to divert port 5000: > +.Pp > +.Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in > +.Ss TRAFFIC SHAPING > +The following rules show some of the applications of > +.Nm > +and > +.Nm dummynet > +for simulations and the like. 
> +.Pp > +This rule drops random incoming packets with a probability > +of 5%: > +.Pp > +.Dl "ipfw add prob 0.05 deny ip from any to any in" > +.Pp > +A similar effect can be achieved making use of > +.Nm dummynet > +pipes: > +.Pp > +.Dl "ipfw add pipe 10 ip from any to any" > +.Dl "ipfw pipe 10 config plr 0.05" > +.Pp > +We can use pipes to artificially limit bandwidth, e.g.\& on a > +machine acting as a router, if we want to limit traffic from > +local clients on 192.168.2.0/24 we do: > +.Pp > +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" > +.Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes" > +.Pp > +note that we use the > +.Cm out > +modifier so that the rule is not used twice. > +Remember in fact that > +.Nm > +rules are checked both on incoming and outgoing packets. > +.Pp > +Should we want to simulate a bidirectional link with bandwidth > +limitations, the correct way is the following: > +.Pp > +.Dl "ipfw add pipe 1 ip from any to any out" > +.Dl "ipfw add pipe 2 ip from any to any in" > +.Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes" > +.Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes" > +.Pp > +The above can be very useful, e.g.\& if you want to see how > +your fancy Web page will look for a residential user who > +is connected only through a slow link. > +You should not use only one pipe for both directions, unless > +you want to simulate a half-duplex medium (e.g.\& AppleTalk, > +Ethernet, IRDA). > +It is not necessary that both pipes have the same configuration, > +so we can also simulate asymmetric links. > +.Pp > +Should we want to verify network performance with the RED queue > +management algorithm: > +.Pp > +.Dl "ipfw add pipe 1 ip from any to any" > +.Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1" > +.Pp > +Another typical application of the traffic shaper is to > +introduce some delay in the communication. 
> +This can significantly affect applications which do a lot of Remote > +Procedure Calls, and where the round-trip-time of the > +connection often becomes a limiting factor much more than > +bandwidth: > +.Pp > +.Dl "ipfw add pipe 1 ip from any to any out" > +.Dl "ipfw add pipe 2 ip from any to any in" > +.Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s" > +.Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s" > +.Pp > +Per-flow queueing can be useful for a variety of purposes. > +A very simple one is counting traffic: > +.Pp > +.Dl "ipfw add pipe 1 tcp from any to any" > +.Dl "ipfw add pipe 1 udp from any to any" > +.Dl "ipfw add pipe 1 ip from any to any" > +.Dl "ipfw pipe 1 config mask all" > +.Pp > +The above set of rules will create queues (and collect > +statistics) for all traffic. > +Because the pipes have no limitations, the only effect is > +collecting statistics. > +Note that we need 3 rules, not just the last one, because > +when > +.Nm > +tries to match IP packets it will not consider ports, so we > +would not see connections on separate ports as different > +ones. > +.Pp > +A more sophisticated example is limiting the outbound traffic > +on a net with per-host limits, rather than per-network limits: > +.Pp > +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" > +.Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in" > +.Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" > +.Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" > +.Ss LOOKUP TABLES > +In the following example, we need to create several traffic bandwidth > +classes and we need different hosts/networks to fall into different classes. > +We create one pipe for each class and configure them accordingly. > +Then we create a single table and fill it with IP subnets and addresses. > +For each subnet/host we set the argument equal to the number of the pipe > +that it should use. 
> +Then we classify traffic using a single rule: > +.Pp > +.Dl "ipfw pipe 1 config bw 1000Kbyte/s" > +.Dl "ipfw pipe 4 config bw 4000Kbyte/s" > +.Dl "..." > +.Dl "ipfw table T1 create type addr" > +.Dl "ipfw table T1 add 192.168.2.0/24 1" > +.Dl "ipfw table T1 add 192.168.0.0/27 4" > +.Dl "ipfw table T1 add 192.168.0.2 1" > +.Dl "..." > +.Dl "ipfw add pipe tablearg ip from 'table(T1)' to any" > +.Pp > +Using the > +.Cm fwd > +action, the table entries may include hostnames and IP addresses. > +.Pp > +.Dl "ipfw table T2 create type addr ftype ip" > +.Dl "ipfw table T2 add 192.168.2.0/24 10.23.2.1" > +.Dl "ipfw table T21 add 192.168.0.0/27 router1.dmz" > +.Dl "..." > +.Dl "ipfw add 100 fwd tablearg ip from any to table(1)" > +.Pp > +In the following example a per-interface firewall is created: > +.Pp > +.Dl "ipfw table IN create type iface valtype skipto,fib" > +.Dl "ipfw table IN add vlan20 12000,12" > +.Dl "ipfw table IN add vlan30 13000,13" > +.Dl "ipfw table OUT create type iface valtype skipto" > +.Dl "ipfw table OUT add vlan20 22000" > +.Dl "ipfw table OUT add vlan30 23000" > +.Dl ".." > +.Dl "ipfw add 100 ipfw setfib tablearg ip from any to any recv 'table(IN)' in" > +.Dl "ipfw add 200 ipfw skipto tablearg ip from any to any recv 'table(IN)' in" > +.Dl "ipfw add 300 ipfw skipto tablearg ip from any to any xmit 'table(OUT)' out" > +.Pp > +The following example illustrates the usage of flow tables: > +.Pp > +.Dl "ipfw table fl create type flow:flow:src-ip,proto,dst-ip,dst-port" > +.Dl "ipfw table fl add 2a02:6b8:77::88,tcp,2a02:6b8:77::99,80 11" > +.Dl "ipfw table fl add 10.0.0.1,udp,10.0.0.2,53 12" > +.Dl ".." > +.Dl "ipfw add 100 allow ip from any to any flow 'table(fl,11)' recv ix0" > +.Ss SETS OF RULES > +To add a set of rules atomically, e.g.\& set 18: > +.Pp > +.Dl "ipfw set disable 18" > +.Dl "ipfw add NN set 18 ...
# repeat as needed" > +.Dl "ipfw set enable 18" > +.Pp > +To delete a set of rules atomically the command is simply: > +.Pp > +.Dl "ipfw delete set 18" > +.Pp > +To test a ruleset and disable it and regain control if something goes wrong: > +.Pp > +.Dl "ipfw set disable 18" > +.Dl "ipfw add NN set 18 ... # repeat as needed" > +.Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18" > +.Pp > +Here if everything goes well, you press control-C before the "sleep" > +terminates, and your ruleset will be left active. > +Otherwise, e.g.\& if > +you cannot access your box, the ruleset will be disabled after > +the sleep terminates thus restoring the previous situation. > +.Pp > +To show rules of the specific set: > +.Pp > +.Dl "ipfw set 18 show" > +.Pp > +To show rules of the disabled set: > +.Pp > +.Dl "ipfw -S set 18 show" > +.Pp > +To clear a specific rule counters of the specific set: > +.Pp > +.Dl "ipfw set 18 zero NN" > +.Pp > +To delete a specific rule of the specific set: > +.Pp > +.Dl "ipfw set 18 delete NN" > +.Ss NAT, REDIRECT AND LSNAT > +First redirect all the traffic to nat instance 123: > +.Pp > +.Dl "ipfw add nat 123 all from any to any" > +.Pp > +Then to configure nat instance 123 to alias all the outgoing traffic with ip > +192.168.0.123, blocking all incoming connections, trying to keep > +same ports on both sides, clearing aliasing table on address change > +and keeping a log of traffic/link statistics: > +.Pp > +.Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports" > +.Pp > +Or to change address of instance 123, aliasing table will be cleared (see > +reset option): > +.Pp > +.Dl "ipfw nat 123 config ip 10.0.0.1" > +.Pp > +To see configuration of nat instance 123: > +.Pp > +.Dl "ipfw nat 123 show config" > +.Pp > +To show logs of all the instances in range 111-999: > +.Pp > +.Dl "ipfw nat 111-999 show" > +.Pp > +To see configurations of all instances: > +.Pp > +.Dl "ipfw nat show config" > +.Pp > +Or a redirect rule with 
mixed modes could look like: > +.Pp > +.Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66" > +.Dl " redirect_port tcp 192.168.0.1:80 500" > +.Dl " redirect_proto udp 192.168.1.43 192.168.1.1" > +.Dl " redirect_addr 192.168.0.10,192.168.0.11" > +.Dl " 10.0.0.100 # LSNAT" > +.Dl " redirect_port tcp 192.168.0.1:80,192.168.0.10:22" > +.Dl " 500 # LSNAT" > +.Pp > +or it could be split into: > +.Pp > +.Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66" > +.Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500" > +.Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1" > +.Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12" > +.Dl " 10.0.0.100" > +.Dl "ipfw nat 5 config redirect_port tcp" > +.Dl " 192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500" > +.Sh SEE ALSO > +.Xr cpp 1 , > +.Xr m4 1 , > +.Xr altq 4 , > +.Xr divert 4 , > +.Xr dummynet 4 , > +.Xr if_bridge 4 , > +.Xr ip 4 , > +.Xr ipfirewall 4 , > +.Xr ng_ipfw 4 , > +.Xr protocols 5 , > +.Xr services 5 , > +.Xr init 8 , > +.Xr kldload 8 , > +.Xr reboot 8 , > +.Xr sysctl 8 , > +.Xr syslogd 8 > +.Sh HISTORY > +The > +.Nm > +utility first appeared in > +.Fx 2.0 . > +.Nm dummynet > +was introduced in > +.Fx 2.2.8 . > +Stateful extensions were introduced in > +.Fx 4.0 . > +.Nm ipfw2 > +was introduced in Summer 2002. > +.Sh AUTHORS > +.An Ugen J. S. Antsilevich , > +.An Poul-Henning Kamp , > +.An Alex Nash , > +.An Archie Cobbs , > +.An Luigi Rizzo . > +.Pp > +.An -nosplit > +API based upon code written by > +.An Daniel Boulet > +for BSDI. > +.Pp > +Dummynet has been introduced by Luigi Rizzo in 1997-1998. > +.Pp > +Some early work (1999-2000) on the > +.Nm dummynet > +traffic shaper was supported by Akamba Corp. > +.Pp > +The ipfw core (ipfw2) has been completely redesigned and > +reimplemented by Luigi Rizzo in summer 2002. > +Further > +actions and > +options have been added by various developers over the years.
> +.Pp > +.An -nosplit > +In-kernel NAT support was written by > +.An Paolo Pisati Aq Mt piso@FreeBSD.org > +as part of a Summer of Code 2005 project. > +.Pp > +SCTP > +.Nm nat > +support has been developed by > +.An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au . > +The primary developers and maintainers are David Hayes and Jason But. > +For further information visit: > +.Aq http://www.caia.swin.edu.au/urp/SONATA > +.Pp > +Delay profiles have been developed by Alessandro Cerri and > +Luigi Rizzo, supported by the > +European Commission within Projects Onelab and Onelab2. > +.Sh BUGS > +The syntax has grown over the years and sometimes it might be confusing. > +Unfortunately, backward compatibility prevents cleaning up mistakes > +made in the definition of the syntax. > +.Pp > +.Em !!! WARNING !!! > +.Pp > +Misconfiguring the firewall can put your computer in an unusable state, > +possibly shutting down network services and requiring console access to > +regain control of it. > +.Pp > +Incoming packet fragments diverted by > +.Cm divert > +are reassembled before delivery to the socket. > +The action used on those packets is the one from the > +rule which matches the first fragment of the packet. > +.Pp > +Packets diverted to userland, and then reinserted by a userland process > +may lose various packet attributes. > +The packet source interface name > +will be preserved if it is shorter than 8 bytes and the userland process > +saves and reuses the sockaddr_in > +(as does > +.Xr natd 8 ) ; > +otherwise, it may be lost. > +If a packet is reinserted in this manner, later rules may be incorrectly > +applied, making the order of > +.Cm divert > +rules in the rule sequence very important. > +.Pp > +Dummynet drops all packets with IPv6 link-local addresses. > +.Pp > +Rules using > +.Cm uid > +or > +.Cm gid > +may not behave as expected.
> +In particular, incoming SYN packets may > +have no uid or gid associated with them since they do not yet belong > +to a TCP connection, and the uid/gid associated with a packet may not > +be as expected if the associated process calls > +.Xr setuid 2 > +or similar system calls. > +.Pp > +Rule syntax is subject to the command line environment and some patterns > +may need to be escaped with the backslash character > +or quoted appropriately. > +.Pp > +Due to the architecture of > +.Xr libalias 3 , > +ipfw nat is not compatible with the TCP segmentation offloading (TSO). > +Thus, to reliably nat your network traffic, please disable TSO > +on your NICs using > +.Xr ifconfig 8 . > +.Pp > +ICMP error messages are not implicitly matched by dynamic rules > +for the respective conversations. > +To avoid failures of network error detection and path MTU discovery, > +ICMP error messages may need to be allowed explicitly through static > +rules. > +.Pp > +Rules using > +.Cm call > +and > +.Cm return > +actions may lead to confusing behaviour if ruleset has mistakes, > +and/or interaction with other subsystems (netgraph, dummynet, etc.) is used. > +One possible case for this is packet leaving > +.Nm > +in subroutine on the input pass, while later on output encountering unpaired > +.Cm return > +first. > +As the call stack is kept intact after input pass, packet will suddenly > +return to the rule number used on input pass, not on output one. > +Order of processing should be checked carefully to avoid such mistakes. 
> diff --git a/example/ipfw/ipfw/ipfw2.c b/example/ipfw/ipfw/ipfw2.c > new file mode 100644 > index 0000000..b8ef6ee > --- /dev/null > +++ b/example/ipfw/ipfw/ipfw2.c > @@ -0,0 +1,4968 @@ > +/* > + * Copyright (c) 2002-2003 Luigi Rizzo > + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp > + * Copyright (c) 1994 Ugen J.S.Antsilevich > + * > + * Idea and grammar partially left from: > + * Copyright (c) 1993 Daniel Boulet > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. > + * > + * NEW command line interface for IP firewall facility > + * > + * $FreeBSD: head/sbin/ipfw/ipfw2.c 273253 2014-10-18 15:18:31Z melifaro $ > + */ > + > +#include <sys/types.h> > +#include <sys/param.h> > +#include <sys/socket.h> > +#include <sys/sockio.h> > +#include <sys/sysctl.h> > + > +#include "ipfw2.h" > + > +#include <ctype.h> > +#include <err.h> > +#include <errno.h> > +#include <grp.h> > +#include <netdb.h> > +#include <pwd.h> > +#include <stdio.h> > +#include <stdarg.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > +#include <time.h> /* ctime */ > +#include <timeconv.h> /* _long_to_time */ > +#include <unistd.h> > +#include <fcntl.h> > +#include <stddef.h> /* offsetof */ > + > +#include <net/ethernet.h> > +#include <net/if.h> /* only IFNAMSIZ */ > +#include <netinet/in.h> > +#include <netinet/in_systm.h> /* only n_short, n_long */ > +#include <netinet/ip.h> > +#include <netinet/ip_icmp.h> > +#include <netinet/ip_fw.h> > +#include <netinet/tcp.h> > +#include <arpa/inet.h> > + > +struct cmdline_opts co; /* global options */ > + > +struct format_opts { > + int bcwidth; 
> + int pcwidth; > + int show_counters; > + uint32_t set_mask; /* enabled sets mask */ > + uint32_t flags; /* request flags */ > + uint32_t first; /* first rule to request */ > + uint32_t last; /* last rule to request */ > + uint32_t dcnt; /* number of dynamic states */ > + ipfw_obj_ctlv *tstate; /* table state data */ > +}; > + > +int resvd_set_number = RESVD_SET; > + > +int ipfw_socket = -1; > + > +#define CHECK_LENGTH(v, len) do { \ > + if ((v) < (len)) \ > + errx(EX_DATAERR, "Rule too long"); \ > + } while (0) > +/* > + * Check if we have enough space in cmd buffer. Note that since > + * first 8? u32 words are reserved by reserved header, full cmd > + * buffer can't be used, so we need to protect from buffer overrun > + * only. At the beginning, cblen is less than actual buffer size by > + * size of ipfw_insn_u32 instruction + 1 u32 word. This eliminates need > + * for checking small instructions fitting in given range. > + * We also (ab)use the fact that ipfw_insn is always the first field > + * for any custom instruction.
> + */ > +#define CHECK_CMDLEN CHECK_LENGTH(cblen, F_LEN((ipfw_insn *)cmd)) > + > +#define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ > + if (!av[0]) \ > + errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ > + if (_substrcmp(*av, "tablearg") == 0) { \ > + arg = IP_FW_TARG; \ > + break; \ > + } \ > + \ > + { \ > + long _xval; \ > + char *end; \ > + \ > + _xval = strtol(*av, &end, 10); \ > + \ > + if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ > + errx(EX_DATAERR, "%s: invalid argument: %s", \ > + match_value(s_x, tok), *av); \ > + \ > + if (errno == ERANGE || _xval < min || _xval > max) \ > + errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ > + match_value(s_x, tok), min, max, *av); \ > + \ > + if (_xval == IP_FW_TARG) \ > + errx(EX_DATAERR, "%s: illegal argument value: %s", \ > + match_value(s_x, tok), *av); \ > + arg = _xval; \ > + } \ > +} while (0) > + > +static struct _s_x f_tcpflags[] = { > + { "syn", TH_SYN }, > + { "fin", TH_FIN }, > + { "ack", TH_ACK }, > + { "psh", TH_PUSH }, > + { "rst", TH_RST }, > + { "urg", TH_URG }, > + { "tcp flag", 0 }, > + { NULL, 0 } > +}; > + > +static struct _s_x f_tcpopts[] = { > + { "mss", IP_FW_TCPOPT_MSS }, > + { "maxseg", IP_FW_TCPOPT_MSS }, > + { "window", IP_FW_TCPOPT_WINDOW }, > + { "sack", IP_FW_TCPOPT_SACK }, > + { "ts", IP_FW_TCPOPT_TS }, > + { "timestamp", IP_FW_TCPOPT_TS }, > + { "cc", IP_FW_TCPOPT_CC }, > + { "tcp option", 0 }, > + { NULL, 0 } > +}; > + > +/* > + * IP options span the range 0 to 255 so we need to remap them > + * (though in fact only the low 5 bits are significant). 
> + */ > +static struct _s_x f_ipopts[] = { > + { "ssrr", IP_FW_IPOPT_SSRR}, > + { "lsrr", IP_FW_IPOPT_LSRR}, > + { "rr", IP_FW_IPOPT_RR}, > + { "ts", IP_FW_IPOPT_TS}, > + { "ip option", 0 }, > + { NULL, 0 } > +}; > + > +static struct _s_x f_iptos[] = { > + { "lowdelay", IPTOS_LOWDELAY}, > + { "throughput", IPTOS_THROUGHPUT}, > + { "reliability", IPTOS_RELIABILITY}, > + { "mincost", IPTOS_MINCOST}, > + { "congestion", IPTOS_ECN_CE}, > + { "ecntransport", IPTOS_ECN_ECT0}, > + { "ip tos option", 0}, > + { NULL, 0 } > +}; > + > +struct _s_x f_ipdscp[] = { > + { "af11", IPTOS_DSCP_AF11 >> 2 }, /* 001010 */ > + { "af12", IPTOS_DSCP_AF12 >> 2 }, /* 001100 */ > + { "af13", IPTOS_DSCP_AF13 >> 2 }, /* 001110 */ > + { "af21", IPTOS_DSCP_AF21 >> 2 }, /* 010010 */ > + { "af22", IPTOS_DSCP_AF22 >> 2 }, /* 010100 */ > + { "af23", IPTOS_DSCP_AF23 >> 2 }, /* 010110 */ > + { "af31", IPTOS_DSCP_AF31 >> 2 }, /* 011010 */ > + { "af32", IPTOS_DSCP_AF32 >> 2 }, /* 011100 */ > + { "af33", IPTOS_DSCP_AF33 >> 2 }, /* 011110 */ > + { "af41", IPTOS_DSCP_AF41 >> 2 }, /* 100010 */ > + { "af42", IPTOS_DSCP_AF42 >> 2 }, /* 100100 */ > + { "af43", IPTOS_DSCP_AF43 >> 2 }, /* 100110 */ > + { "be", IPTOS_DSCP_CS0 >> 2 }, /* 000000 */ > + { "ef", IPTOS_DSCP_EF >> 2 }, /* 101110 */ > + { "cs0", IPTOS_DSCP_CS0 >> 2 }, /* 000000 */ > + { "cs1", IPTOS_DSCP_CS1 >> 2 }, /* 001000 */ > + { "cs2", IPTOS_DSCP_CS2 >> 2 }, /* 010000 */ > + { "cs3", IPTOS_DSCP_CS3 >> 2 }, /* 011000 */ > + { "cs4", IPTOS_DSCP_CS4 >> 2 }, /* 100000 */ > + { "cs5", IPTOS_DSCP_CS5 >> 2 }, /* 101000 */ > + { "cs6", IPTOS_DSCP_CS6 >> 2 }, /* 110000 */ > + { "cs7", IPTOS_DSCP_CS7 >> 2 }, /* 111000 */ > + { NULL, 0 } > +}; > + > +static struct _s_x limit_masks[] = { > + {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, > + {"src-addr", DYN_SRC_ADDR}, > + {"src-port", DYN_SRC_PORT}, > + {"dst-addr", DYN_DST_ADDR}, > + {"dst-port", DYN_DST_PORT}, > + {NULL, 0} > +}; > + > +/* > + * we use IPPROTO_ETHERTYPE as a fake protocol
id to call the print routines > + * This is only used in this code. > + */ > +#define IPPROTO_ETHERTYPE 0x1000 > +static struct _s_x ether_types[] = { > + /* > + * Note, we cannot use "-:&/" in the names because they are field > + * separators in the type specifications. Also, we use s = NULL as > + * end-delimiter, because a type of 0 can be legal. > + */ > + { "ip", 0x0800 }, > + { "ipv4", 0x0800 }, > + { "ipv6", 0x86dd }, > + { "arp", 0x0806 }, > + { "rarp", 0x8035 }, > + { "vlan", 0x8100 }, > + { "loop", 0x9000 }, > + { "trail", 0x1000 }, > + { "at", 0x809b }, > + { "atalk", 0x809b }, > + { "aarp", 0x80f3 }, > + { "pppoe_disc", 0x8863 }, > + { "pppoe_sess", 0x8864 }, > + { "ipx_8022", 0x00E0 }, > + { "ipx_8023", 0x0000 }, > + { "ipx_ii", 0x8137 }, > + { "ipx_snap", 0x8137 }, > + { "ipx", 0x8137 }, > + { "ns", 0x0600 }, > + { NULL, 0 } > +}; > + > + > +static struct _s_x rule_actions[] = { > + { "accept", TOK_ACCEPT }, > + { "pass", TOK_ACCEPT }, > + { "allow", TOK_ACCEPT }, > + { "permit", TOK_ACCEPT }, > + { "count", TOK_COUNT }, > + { "pipe", TOK_PIPE }, > + { "queue", TOK_QUEUE }, > + { "divert", TOK_DIVERT }, > + { "tee", TOK_TEE }, > + { "netgraph", TOK_NETGRAPH }, > + { "ngtee", TOK_NGTEE }, > + { "fwd", TOK_FORWARD }, > + { "forward", TOK_FORWARD }, > + { "skipto", TOK_SKIPTO }, > + { "deny", TOK_DENY }, > + { "drop", TOK_DENY }, > + { "reject", TOK_REJECT }, > + { "reset6", TOK_RESET6 }, > + { "reset", TOK_RESET }, > + { "unreach6", TOK_UNREACH6 }, > + { "unreach", TOK_UNREACH }, > + { "check-state", TOK_CHECKSTATE }, > + { "//", TOK_COMMENT }, > + { "nat", TOK_NAT }, > + { "reass", TOK_REASS }, > + { "setfib", TOK_SETFIB }, > + { "setdscp", TOK_SETDSCP }, > + { "call", TOK_CALL }, > + { "return", TOK_RETURN }, > + { NULL, 0 } /* terminator */ > +}; > + > +static struct _s_x rule_action_params[] = { > + { "altq", TOK_ALTQ }, > + { "log", TOK_LOG }, > + { "tag", TOK_TAG }, > + { "untag", TOK_UNTAG }, > + { NULL, 0 } /* terminator */ > +}; > + > +/* > + * 
The 'lookup' instruction accepts one of the following arguments. > + * -1 is a terminator for the list. > + * Arguments are passed as v[1] in O_DST_LOOKUP options. > + */ > +static int lookup_key[] = { > + TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, > + TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; > + > +static struct _s_x rule_options[] = { > + { "tagged", TOK_TAGGED }, > + { "uid", TOK_UID }, > + { "gid", TOK_GID }, > + { "jail", TOK_JAIL }, > + { "in", TOK_IN }, > + { "limit", TOK_LIMIT }, > + { "keep-state", TOK_KEEPSTATE }, > + { "bridged", TOK_LAYER2 }, > + { "layer2", TOK_LAYER2 }, > + { "out", TOK_OUT }, > + { "diverted", TOK_DIVERTED }, > + { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, > + { "diverted-output", TOK_DIVERTEDOUTPUT }, > + { "xmit", TOK_XMIT }, > + { "recv", TOK_RECV }, > + { "via", TOK_VIA }, > + { "fragment", TOK_FRAG }, > + { "frag", TOK_FRAG }, > + { "fib", TOK_FIB }, > + { "ipoptions", TOK_IPOPTS }, > + { "ipopts", TOK_IPOPTS }, > + { "iplen", TOK_IPLEN }, > + { "ipid", TOK_IPID }, > + { "ipprecedence", TOK_IPPRECEDENCE }, > + { "dscp", TOK_DSCP }, > + { "iptos", TOK_IPTOS }, > + { "ipttl", TOK_IPTTL }, > + { "ipversion", TOK_IPVER }, > + { "ipver", TOK_IPVER }, > + { "estab", TOK_ESTAB }, > + { "established", TOK_ESTAB }, > + { "setup", TOK_SETUP }, > + { "sockarg", TOK_SOCKARG }, > + { "tcpdatalen", TOK_TCPDATALEN }, > + { "tcpflags", TOK_TCPFLAGS }, > + { "tcpflgs", TOK_TCPFLAGS }, > + { "tcpoptions", TOK_TCPOPTS }, > + { "tcpopts", TOK_TCPOPTS }, > + { "tcpseq", TOK_TCPSEQ }, > + { "tcpack", TOK_TCPACK }, > + { "tcpwin", TOK_TCPWIN }, > + { "icmptype", TOK_ICMPTYPES }, > + { "icmptypes", TOK_ICMPTYPES }, > + { "dst-ip", TOK_DSTIP }, > + { "src-ip", TOK_SRCIP }, > + { "dst-port", TOK_DSTPORT }, > + { "src-port", TOK_SRCPORT }, > + { "proto", TOK_PROTO }, > + { "MAC", TOK_MAC }, > + { "mac", TOK_MAC }, > + { "mac-type", TOK_MACTYPE }, > + { "verrevpath", TOK_VERREVPATH }, > + { "versrcreach", TOK_VERSRCREACH }, > + { "antispoof", 
TOK_ANTISPOOF }, > + { "ipsec", TOK_IPSEC }, > + { "icmp6type", TOK_ICMP6TYPES }, > + { "icmp6types", TOK_ICMP6TYPES }, > + { "ext6hdr", TOK_EXT6HDR}, > + { "flow-id", TOK_FLOWID}, > + { "ipv6", TOK_IPV6}, > + { "ip6", TOK_IPV6}, > + { "ipv4", TOK_IPV4}, > + { "ip4", TOK_IPV4}, > + { "dst-ipv6", TOK_DSTIP6}, > + { "dst-ip6", TOK_DSTIP6}, > + { "src-ipv6", TOK_SRCIP6}, > + { "src-ip6", TOK_SRCIP6}, > + { "lookup", TOK_LOOKUP}, > + { "flow", TOK_FLOW}, > + { "//", TOK_COMMENT }, > + > + { "not", TOK_NOT }, /* pseudo option */ > + { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ > + { "or", TOK_OR }, /* pseudo option */ > + { "|", /* escape */ TOK_OR }, /* pseudo option */ > + { "{", TOK_STARTBRACE }, /* pseudo option */ > + { "(", TOK_STARTBRACE }, /* pseudo option */ > + { "}", TOK_ENDBRACE }, /* pseudo option */ > + { ")", TOK_ENDBRACE }, /* pseudo option */ > + { NULL, 0 } /* terminator */ > +}; > + > +void bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg); > +static int ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, > + ipfw_cfg_lheader **pcfg, size_t *psize); > +static int ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, > + ipfw_cfg_lheader *cfg, size_t sz, int ac, char **av); > +static void ipfw_list_tifaces(void); > + > +/* > + * Simple string buffer API. > + * Used to simplify buffer passing between function and for > + * transparent overrun handling. > + */ > + > +/* > + * Allocates new buffer of given size @sz. > + * > + * Returns 0 on success. > + */ > +int > +bp_alloc(struct buf_pr *b, size_t size) > +{ > + memset(b, 0, sizeof(struct buf_pr)); > + > + if ((b->buf = calloc(1, size)) == NULL) > + return (ENOMEM); > + > + b->ptr = b->buf; > + b->size = size; > + b->avail = b->size; > + > + return (0); > +} > + > +void > +bp_free(struct buf_pr *b) > +{ > + > + free(b->buf); > +} > + > +/* > + * Flushes buffer so new writer start from beginning. 
> + */ > +void > +bp_flush(struct buf_pr *b) > +{ > + > + b->ptr = b->buf; > + b->avail = b->size; > +} > + > +/* > + * Print message specified by @format and args. > + * Automatically manage buffer space and transparently handle > + * buffer overruns. > + * > + * Returns number of bytes that should have been printed. > + */ > +int > +bprintf(struct buf_pr *b, char *format, ...) > +{ > + va_list args; > + int i; > + > + va_start(args, format); > + > + i = vsnprintf(b->ptr, b->avail, format, args); > + va_end(args); > + > + if (i > b->avail || i < 0) { > + /* Overflow or print error */ > + b->avail = 0; > + } else { > + b->ptr += i; > + b->avail -= i; > + } > + > + b->needed += i; > + > + return (i); > +} > + > +/* > + * Special values printer for tablearg-aware opcodes. > + */ > +void > +bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg) > +{ > + > + if (str != NULL) > + bprintf(bp, "%s", str); > + if (arg == IP_FW_TARG) > + bprintf(bp, "tablearg"); > + else > + bprintf(bp, "%u", arg); > +} > + > +/* > + * Helper routine to print a possibly unaligned uint64_t on > + * various platform. If width > 0, print the value with > + * the desired width, followed by a space; > + * otherwise, return the required width. > + */ > +int > +pr_u64(struct buf_pr *b, uint64_t *pd, int width) > +{ > +#ifdef TCC > +#define U64_FMT "I64" > +#else > +#define U64_FMT "llu" > +#endif > + uint64_t u; > + unsigned long long d; > + > + bcopy (pd, &u, sizeof(u)); > + d = u; > + return (width > 0) ? 
> + bprintf(b, "%*" U64_FMT " ", width, d) : > + snprintf(NULL, 0, "%" U64_FMT, d) ; > +#undef U64_FMT > +} > + > + > +void * > +safe_calloc(size_t number, size_t size) > +{ > + void *ret = calloc(number, size); > + > + if (ret == NULL) > + err(EX_OSERR, "calloc"); > + return ret; > +} > + > +void * > +safe_realloc(void *ptr, size_t size) > +{ > + void *ret = realloc(ptr, size); > + > + if (ret == NULL) > + err(EX_OSERR, "realloc"); > + return ret; > +} > + > +/* > + * Compare things like interface or table names. > + */ > +int > +stringnum_cmp(const char *a, const char *b) > +{ > + int la, lb; > + > + la = strlen(a); > + lb = strlen(b); > + > + if (la > lb) > + return (1); > + else if (la < lb) > + return (-01); > + > + return (strcmp(a, b)); > +} > + > + > +/* > + * conditionally runs the command. > + * Selected options or negative -> getsockopt > + */ > +int > +do_cmd(int optname, void *optval, uintptr_t optlen) > +{ > + int i; > + > + if (co.test_only) > + return 0; > + > + if (ipfw_socket == -1) > + ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); > + if (ipfw_socket < 0) > + err(EX_UNAVAILABLE, "socket"); > + > + if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || > + optname == IP_FW_ADD || optname == IP_FW3 || > + optname == IP_FW_NAT_GET_CONFIG || > + optname < 0 || > + optname == IP_FW_NAT_GET_LOG) { > + if (optname < 0) > + optname = -optname; > + i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, > + (socklen_t *)optlen); > + } else { > + i = setsockopt(ipfw_socket, IPPROTO_IP, optname, optval, optlen); > + } > + return i; > +} > + > +/* > + * do_set3 - pass ipfw control cmd to kernel > + * @optname: option name > + * @optval: pointer to option data > + * @optlen: option length > + * > + * Assumes op3 header is already embedded. > + * Calls setsockopt() with IP_FW3 as kernel-visible opcode. > + * Returns 0 on success or errno otherwise. 
> + */ > +int > +do_set3(int optname, ip_fw3_opheader *op3, uintptr_t optlen) > +{ > + > + if (co.test_only) > + return (0); > + > + if (ipfw_socket == -1) > + ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); > + if (ipfw_socket < 0) > + err(EX_UNAVAILABLE, "socket"); > + > + op3->opcode = optname; > + > + return (setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, optlen)); > +} > + > +/* > + * do_get3 - pass ipfw control cmd to kernel > + * @optname: option name > + * @op3: pointer to option data > + * @optlen: pointer to option length > + * > + * Assumes op3 header is already embedded. > + * Calls getsockopt() with IP_FW3 as kernel-visible opcode. > + * Returns the getsockopt() result: 0 on success, -1 with errno set otherwise. > + */ > +int > +do_get3(int optname, ip_fw3_opheader *op3, size_t *optlen) > +{ > + int error; > + > + if (co.test_only) > + return (0); > + > + if (ipfw_socket == -1) > + ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); > + if (ipfw_socket < 0) > + err(EX_UNAVAILABLE, "socket"); > + > + op3->opcode = optname; > + > + error = getsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, > + (socklen_t *)optlen); > + > + return (error); > +} > + > +/** > + * match_token takes a table and a string, returns the value associated > + * with the string (-1 in case of failure). > + */ > +int > +match_token(struct _s_x *table, char *string) > +{ > + struct _s_x *pt; > + uint i = strlen(string); > + > + for (pt = table ; i && pt->s != NULL ; pt++) > + if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) > + return pt->x; > + return (-1); > +} > + > +/** > + * match_token_relaxed takes a table and a string, returns the value > + * associated with the single table entry that @string is a prefix of. > + * > + * Returns: > + * value from @table for matched records > + * -1 for non-matched records > + * -2 if more than one record matches @string.
> + */ > +int > +match_token_relaxed(struct _s_x *table, char *string) > +{ > + struct _s_x *pt, *m = NULL; > + int i, c; > + > + i = strlen(string); > + c = 0; > + > + for (pt = table ; i != 0 && pt->s != NULL ; pt++) { > + if (strncmp(pt->s, string, i) != 0) > + continue; > + m = pt; > + c++; > + } > + > + if (c == 1) > + return (m->x); > + > + return (c > 0 ? -2: -1); > +} > + > +/** > + * match_value takes a table and a value, returns the string associated > + * with the value (NULL in case of failure). > + */ > +char const * > +match_value(struct _s_x *p, int value) > +{ > + for (; p->s != NULL; p++) > + if (p->x == value) > + return p->s; > + return NULL; > +} > + > +size_t > +concat_tokens(char *buf, size_t bufsize, struct _s_x *table, char *delimiter) > +{ > + struct _s_x *pt; > + int l; > + size_t sz; > + > + for (sz = 0, pt = table ; pt->s != NULL; pt++) { > + l = snprintf(buf + sz, bufsize - sz, "%s%s", > + (sz == 0) ? "" : delimiter, pt->s); > + sz += l; > + bufsize += l; > + if (sz > bufsize) > + return (bufsize); > + } > + > + return (sz); > +} > + > +/* > + * helper function to process a set of flags and set bits in the > + * appropriate masks. 
> + */ > +int > +fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, > + uint32_t *clear) > +{ > + char *q; /* points to the separator */ > + int val; > + uint32_t *which; /* mask we are working on */ > + > + while (p && *p) { > + if (*p == '!') { > + p++; > + which = clear; > + } else > + which = set; > + q = strchr(p, ','); > + if (q) > + *q++ = '\0'; > + val = match_token(flags, p); > + if (val <= 0) { > + if (e != NULL) > + *e = p; > + return (-1); > + } > + *which |= (uint32_t)val; > + p = q; > + } > + return (0); > +} > + > +void > +print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set) > +{ > + char const *comma = ""; > + int i, l; > + > + for (i = 0; list[i].x != 0; i++) { > + if ((set & list[i].x) == 0) > + continue; > + > + set &= ~list[i].x; > + l = snprintf(buf, sz, "%s%s", comma, list[i].s); > + if (l >= sz) > + return; > + comma = ","; > + buf += l; > + sz -=l; > + } > +} > + > +/* > + * _substrcmp takes two strings and returns 1 if they do not match, > + * and 0 if they match exactly or the first string is a sub-string > + * of the second. A warning is printed to stderr in the case that the > + * first string is a sub-string of the second. > + * > + * This function will be removed in the future through the usual > + * deprecation process. > + */ > +int > +_substrcmp(const char *str1, const char* str2) > +{ > + > + if (strncmp(str1, str2, strlen(str1)) != 0) > + return 1; > + > + if (strlen(str1) != strlen(str2)) > + warnx("DEPRECATED: '%s' matched '%s' as a sub-string", > + str1, str2); > + return 0; > +} > + > +/* > + * _substrcmp2 takes three strings and returns 1 if the first two do not match, > + * and 0 if they match exactly or the second string is a sub-string > + * of the first. A warning is printed to stderr in the case that the > + * first string does not match the third. 
> + * > + * This function exists to warn about the bizarre construction > + * strncmp(str, "by", 2) which is used to allow people to use a shortcut > + * for "bytes". The problem is that in addition to accepting "by", > + * "byt", "byte", and "bytes", it also accepts "by_rabid_dogs" and any > + * other string beginning with "by". > + * > + * This function will be removed in the future through the usual > + * deprecation process. > + */ > +int > +_substrcmp2(const char *str1, const char* str2, const char* str3) > +{ > + > + if (strncmp(str1, str2, strlen(str2)) != 0) > + return 1; > + > + if (strcmp(str1, str3) != 0) > + warnx("DEPRECATED: '%s' matched '%s'", > + str1, str3); > + return 0; > +} > + > +/* > + * prints one port, symbolic or numeric > + */ > +static void > +print_port(struct buf_pr *bp, int proto, uint16_t port) > +{ > + > + if (proto == IPPROTO_ETHERTYPE) { > + char const *s; > + > + if (co.do_resolv && (s = match_value(ether_types, port)) ) > + bprintf(bp, "%s", s); > + else > + bprintf(bp, "0x%04x", port); > + } else { > + struct servent *se = NULL; > + if (co.do_resolv) { > + struct protoent *pe = getprotobynumber(proto); > + > + se = getservbyport(htons(port), pe ? pe->p_name : NULL); > + } > + if (se) > + bprintf(bp, "%s", se->s_name); > + else > + bprintf(bp, "%d", port); > + } > +} > + > +static struct _s_x _port_name[] = { > + {"dst-port", O_IP_DSTPORT}, > + {"src-port", O_IP_SRCPORT}, > + {"ipid", O_IPID}, > + {"iplen", O_IPLEN}, > + {"ipttl", O_IPTTL}, > + {"mac-type", O_MAC_TYPE}, > + {"tcpdatalen", O_TCPDATALEN}, > + {"tcpwin", O_TCPWIN}, > + {"tagged", O_TAGGED}, > + {NULL, 0} > +}; > + > +/* > + * Print the values in a list of 16-bit items of the types above. > + * XXX todo: add support for mask.
> + */ > +static void > +print_newports(struct buf_pr *bp, ipfw_insn_u16 *cmd, int proto, int opcode) > +{ > + uint16_t *p = cmd->ports; > + int i; > + char const *sep; > + > + if (opcode != 0) { > + sep = match_value(_port_name, opcode); > + if (sep == NULL) > + sep = "???"; > + bprintf(bp, " %s", sep); > + } > + sep = " "; > + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { > + bprintf(bp, "%s", sep); > + print_port(bp, proto, p[0]); > + if (p[0] != p[1]) { > + bprintf(bp, "-"); > + print_port(bp, proto, p[1]); > + } > + sep = ","; > + } > +} > + > +/* > + * Like strtol, but also translates service names into port numbers > + * for some protocols. > + * In particular: > + * proto == -1 disables the protocol check; > + * proto == IPPROTO_ETHERTYPE looks up an internal table > + * proto == <some value in /etc/protocols> matches the values there. > + * Returns *end == s in case the parameter is not found. > + */ > +static int > +strtoport(char *s, char **end, int base, int proto) > +{ > + char *p, *buf; > + char *s1; > + int i; > + > + *end = s; /* default - not found */ > + if (*s == '\0') > + return 0; /* not found */ > + > + if (isdigit(*s)) > + return strtol(s, end, base); > + > + /* > + * find separator. '\\' escapes the next char. > + */ > + for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) > + if (*s1 == '\\' && s1[1] != '\0') > + s1++; > + > + buf = safe_calloc(s1 - s + 1, 1); > + > + /* > + * copy into a buffer skipping backslashes > + */ > + for (p = s, i = 0; p != s1 ; p++) > + if (*p != '\\') > + buf[i++] = *p; > + buf[i++] = '\0'; > + > + if (proto == IPPROTO_ETHERTYPE) { > + i = match_token(ether_types, buf); > + free(buf); > + if (i != -1) { /* found */ > + *end = s1; > + return i; > + } > + } else { > + struct protoent *pe = NULL; > + struct servent *se; > + > + if (proto != 0) > + pe = getprotobynumber(proto); > + setservent(1); > + se = getservbyname(buf, pe ? 
pe->p_name : NULL); > + free(buf); > + if (se != NULL) { > + *end = s1; > + return ntohs(se->s_port); > + } > + } > + return 0; /* not found */ > +} > + > +/* > + * Fill the body of the command with the list of port ranges. > + */ > +static int > +fill_newports(ipfw_insn_u16 *cmd, char *av, int proto, int cblen) > +{ > + uint16_t a, b, *p = cmd->ports; > + int i = 0; > + char *s = av; > + > + while (*s) { > + a = strtoport(av, &s, 0, proto); > + if (s == av) /* empty or invalid argument */ > + return (0); > + > + CHECK_LENGTH(cblen, i + 2); > + > + switch (*s) { > + case '-': /* a range */ > + av = s + 1; > + b = strtoport(av, &s, 0, proto); > + /* Reject expressions like '1-abc' or '1-2-3'. */ > + if (s == av || (*s != ',' && *s != '\0')) > + return (0); > + p[0] = a; > + p[1] = b; > + break; > + case ',': /* comma separated list */ > + case '\0': > + p[0] = p[1] = a; > + break; > + default: > + warnx("port list: invalid separator <%c> in <%s>", > + *s, av); > + return (0); > + } > + > + i++; > + p += 2; > + av = s + 1; > + } > + if (i > 0) { > + if (i + 1 > F_LEN_MASK) > + errx(EX_DATAERR, "too many ports/ranges\n"); > + cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ > + } > + return (i); > +} > + > +/* > + * Fill the body of the command with the list of DiffServ codepoints. 
> + */ > +static void > +fill_dscp(ipfw_insn *cmd, char *av, int cblen) > +{ > + uint32_t *low, *high; > + char *s = av, *a; > + int code; > + > + cmd->opcode = O_DSCP; > + cmd->len |= F_INSN_SIZE(ipfw_insn_u32) + 1; > + > + CHECK_CMDLEN; > + > + low = (uint32_t *)(cmd + 1); > + high = low + 1; > + > + *low = 0; > + *high = 0; > + > + while (s != NULL) { > + a = strchr(s, ','); > + > + if (a != NULL) > + *a++ = '\0'; > + > + if (isalpha(*s)) { > + if ((code = match_token(f_ipdscp, s)) == -1) > + errx(EX_DATAERR, "Unknown DSCP code"); > + } else { > + code = strtoul(s, NULL, 10); > + if (code < 0 || code > 63) > + errx(EX_DATAERR, "Invalid DSCP value"); > + } > + > + if (code > 32) > + *high |= 1 << (code - 32); > + else > + *low |= 1 << code; > + > + s = a; > + } > +} > + > +static struct _s_x icmpcodes[] = { > + { "net", ICMP_UNREACH_NET }, > + { "host", ICMP_UNREACH_HOST }, > + { "protocol", ICMP_UNREACH_PROTOCOL }, > + { "port", ICMP_UNREACH_PORT }, > + { "needfrag", ICMP_UNREACH_NEEDFRAG }, > + { "srcfail", ICMP_UNREACH_SRCFAIL }, > + { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, > + { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, > + { "isolated", ICMP_UNREACH_ISOLATED }, > + { "net-prohib", ICMP_UNREACH_NET_PROHIB }, > + { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, > + { "tosnet", ICMP_UNREACH_TOSNET }, > + { "toshost", ICMP_UNREACH_TOSHOST }, > + { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, > + { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, > + { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, > + { NULL, 0 } > +}; > + > +static void > +fill_reject_code(u_short *codep, char *str) > +{ > + int val; > + char *s; > + > + val = strtoul(str, &s, 0); > + if (s == str || *s != '\0' || val >= 0x100) > + val = match_token(icmpcodes, str); > + if (val < 0) > + errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); > + *codep = val; > + return; > +} > + > +static void > +print_reject_code(struct buf_pr *bp, uint16_t code) > +{ > + char const *s; 
> + > + if ((s = match_value(icmpcodes, code)) != NULL) > + bprintf(bp, "unreach %s", s); > + else > + bprintf(bp, "unreach %u", code); > +} > + > +/* > + * Returns the number of bits set (from left) in a contiguous bitmask, > + * or -1 if the mask is not contiguous. > + * XXX this needs a proper fix. > + * This effectively works on masks in big-endian (network) format. > + * when compiled on little endian architectures. > + * > + * First bit is bit 7 of the first byte -- note, for MAC addresses, > + * the first bit on the wire is bit 0 of the first byte. > + * len is the max length in bits. > + */ > +int > +contigmask(uint8_t *p, int len) > +{ > + int i, n; > + > + for (i=0; i<len ; i++) > + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ > + break; > + for (n=i+1; n < len; n++) > + if ( (p[n/8] & (1 << (7 - (n%8)))) != 0) > + return -1; /* mask not contiguous */ > + return i; > +} > + > +/* > + * print flags set/clear in the two bitmasks passed as parameters. > + * There is a specialized check for f_tcpflags. > + */ > +static void > +print_flags(struct buf_pr *bp, char const *name, ipfw_insn *cmd, > + struct _s_x *list) > +{ > + char const *comma = ""; > + int i; > + uint8_t set = cmd->arg1 & 0xff; > + uint8_t clear = (cmd->arg1 >> 8) & 0xff; > + > + if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { > + bprintf(bp, " setup"); > + return; > + } > + > + bprintf(bp, " %s ", name); > + for (i=0; list[i].x != 0; i++) { > + if (set & list[i].x) { > + set &= ~list[i].x; > + bprintf(bp, "%s%s", comma, list[i].s); > + comma = ","; > + } > + if (clear & list[i].x) { > + clear &= ~list[i].x; > + bprintf(bp, "%s!%s", comma, list[i].s); > + comma = ","; > + } > + } > +} > + > + > +/* > + * Print the ip address contained in a command. 
> + */ > +static void > +print_ip(struct buf_pr *bp, struct format_opts *fo, ipfw_insn_ip *cmd, > + char const *s) > +{ > + struct hostent *he = NULL; > + struct in_addr *ia; > + uint32_t len = F_LEN((ipfw_insn *)cmd); > + uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; > + char *t; > + > + if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { > + uint32_t d = a[1]; > + const char *arg = "<invalid>"; > + > + if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) > + arg = match_value(rule_options, lookup_key[d]); > + t = table_search_ctlv(fo->tstate, ((ipfw_insn *)cmd)->arg1); > + bprintf(bp, "%s lookup %s %s", cmd->o.len & F_NOT ? " not": "", > + arg, t); > + return; > + } > + bprintf(bp, "%s%s ", cmd->o.len & F_NOT ? " not": "", s); > + > + if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { > + bprintf(bp, "me"); > + return; > + } > + if (cmd->o.opcode == O_IP_SRC_LOOKUP || > + cmd->o.opcode == O_IP_DST_LOOKUP) { > + t = table_search_ctlv(fo->tstate, ((ipfw_insn *)cmd)->arg1); > + bprintf(bp, "table(%s", t); > + if (len == F_INSN_SIZE(ipfw_insn_u32)) > + bprintf(bp, ",%u", *a); > + bprintf(bp, ")"); > + return; > + } > + if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { > + uint32_t x, *map = (uint32_t *)&(cmd->mask); > + int i, j; > + char comma = '{'; > + > + x = cmd->o.arg1 - 1; > + x = htonl( ~x ); > + cmd->addr.s_addr = htonl(cmd->addr.s_addr); > + bprintf(bp, "%s/%d", inet_ntoa(cmd->addr), > + contigmask((uint8_t *)&x, 32)); > + x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); > + x &= 0xff; /* base */ > + /* > + * Print bits and ranges. > + * Locate first bit set (i), then locate first bit unset (j). > + * If we have 3+ consecutive bits set, then print them as a > + * range, otherwise only print the initial bit and rescan. 
> + */ > + for (i=0; i < cmd->o.arg1; i++) > + if (map[i/32] & (1<<(i & 31))) { > + for (j=i+1; j < cmd->o.arg1; j++) > + if (!(map[ j/32] & (1<<(j & 31)))) > + break; > + bprintf(bp, "%c%d", comma, i+x); > + if (j>i+2) { /* range has at least 3 elements */ > + bprintf(bp, "-%d", j-1+x); > + i = j-1; > + } > + comma = ','; > + } > + bprintf(bp, "}"); > + return; > + } > + /* > + * len == 2 indicates a single IP, whereas lists of 1 or more > + * addr/mask pairs have len = (2n+1). We convert len to n so we > + * use that to count the number of entries. > + */ > + for (len = len / 2; len > 0; len--, a += 2) { > + int mb = /* mask length */ > + (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? > + 32 : contigmask((uint8_t *)&(a[1]), 32); > + if (mb == 32 && co.do_resolv) > + he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); > + if (he != NULL) /* resolved to name */ > + bprintf(bp, "%s", he->h_name); > + else if (mb == 0) /* any */ > + bprintf(bp, "any"); > + else { /* numeric IP followed by some kind of mask */ > + ia = (struct in_addr *)&a[0]; > + bprintf(bp, "%s", inet_ntoa(*ia)); > + if (mb < 0) > + bprintf(bp, ":%s", inet_ntoa(*ia ) ); > + else if (mb < 32) > + bprintf(bp, "/%d", mb); > + } > + if (len > 1) > + bprintf(bp, ","); > + } > +} > + > +/* > + * prints a MAC address/mask pair > + */ > +static void > +print_mac(struct buf_pr *bp, uint8_t *addr, uint8_t *mask) > +{ > + int l = contigmask(mask, 48); > + > + if (l == 0) > + bprintf(bp, " any"); > + else { > + bprintf(bp, " %02x:%02x:%02x:%02x:%02x:%02x", > + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); > + if (l == -1) > + bprintf(bp, "&%02x:%02x:%02x:%02x:%02x:%02x", > + mask[0], mask[1], mask[2], > + mask[3], mask[4], mask[5]); > + else if (l < 48) > + bprintf(bp, "/%d", l); > + } > +} > + > +static void > +fill_icmptypes(ipfw_insn_u32 *cmd, char *av) > +{ > + uint8_t type; > + > + cmd->d[0] = 0; > + while (*av) { > + if (*av == ',') > + av++; > + > + type = strtoul(av, &av, 
0); > + > + if (*av != ',' && *av != '\0') > + errx(EX_DATAERR, "invalid ICMP type"); > + > + if (type > 31) > + errx(EX_DATAERR, "ICMP type out of range"); > + > + cmd->d[0] |= 1 << type; > + } > + cmd->o.opcode = O_ICMPTYPE; > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); > +} > + > +static void > +print_icmptypes(struct buf_pr *bp, ipfw_insn_u32 *cmd) > +{ > + int i; > + char sep= ' '; > + > + bprintf(bp, " icmptypes"); > + for (i = 0; i < 32; i++) { > + if ( (cmd->d[0] & (1 << (i))) == 0) > + continue; > + bprintf(bp, "%c%d", sep, i); > + sep = ','; > + } > +} > + > +static void > +print_dscp(struct buf_pr *bp, ipfw_insn_u32 *cmd) > +{ > + int i, c; > + uint32_t *v; > + char sep= ' '; > + const char *code; > + > + bprintf(bp, " dscp"); > + i = 0; > + c = 0; > + v = cmd->d; > + while (i < 64) { > + if (*v & (1 << i)) { > + if ((code = match_value(f_ipdscp, i)) != NULL) > + bprintf(bp, "%c%s", sep, code); > + else > + bprintf(bp, "%c%d", sep, i); > + sep = ','; > + } > + > + if ((++i % 32) == 0) > + v++; > + } > +} > + > +/* > + * show_ipfw() prints the body of an ipfw rule. > + * Because the standard rule has at least proto src_ip dst_ip, we use > + * a helper function to produce these entries if not provided explicitly. > + * The first argument is the list of fields we have, the second is > + * the list of fields we want to be printed. > + * > + * Special cases if we have provided a MAC header: > + * + if the rule does not contain IP addresses/ports, do not print them; > + * + if the rule does not contain an IP proto, print "all" instead of "ip"; > + * > + * Once we have 'have_options', IP header fields are printed as options. 
> + */ > +#define HAVE_PROTO 0x0001 > +#define HAVE_SRCIP 0x0002 > +#define HAVE_DSTIP 0x0004 > +#define HAVE_PROTO4 0x0008 > +#define HAVE_PROTO6 0x0010 > +#define HAVE_IP 0x0100 > +#define HAVE_OPTIONS 0x8000 > + > +static void > +show_prerequisites(struct buf_pr *bp, int *flags, int want, int cmd) > +{ > + (void)cmd; /* UNUSED */ > + if (co.comment_only) > + return; > + if ( (*flags & HAVE_IP) == HAVE_IP) > + *flags |= HAVE_OPTIONS; > + > + if ( !(*flags & HAVE_OPTIONS)) { > + if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { > + if ( (*flags & HAVE_PROTO4)) > + bprintf(bp, " ip4"); > + else if ( (*flags & HAVE_PROTO6)) > + bprintf(bp, " ip6"); > + else > + bprintf(bp, " ip"); > + } > + if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) > + bprintf(bp, " from any"); > + if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) > + bprintf(bp, " to any"); > + } > + *flags |= want; > +} > + > +static void > +show_static_rule(struct cmdline_opts *co, struct format_opts *fo, > + struct buf_pr *bp, struct ip_fw_rule *rule, struct ip_fw_bcounter *cntr) > +{ > + static int twidth = 0; > + int l; > + ipfw_insn *cmd, *tagptr = NULL; > + const char *comment = NULL; /* ptr to comment if we have one */ > + int proto = 0; /* default */ > + int flags = 0; /* prerequisites */ > + ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ > + ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */ > + int or_block = 0; /* we are in an or block */ > + uint32_t uval; > + > + if ((fo->set_mask & (1 << rule->set)) == 0) { > + /* disabled mask */ > + if (!co->show_sets) > + return; > + else > + bprintf(bp, "# DISABLED "); > + } > + bprintf(bp, "%05u ", rule->rulenum); > + > + /* Print counters if enabled */ > + if (fo->pcwidth > 0 || fo->bcwidth > 0) { > + pr_u64(bp, &cntr->pcnt, fo->pcwidth); > + pr_u64(bp, &cntr->bcnt, fo->bcwidth); > + } > + > + if (co->do_time == 2) > + bprintf(bp, "%10u ", cntr->timestamp); > + else if (co->do_time == 1) { > + char timestr[30]; > + 
time_t t = (time_t)0; > + > + if (twidth == 0) { > + strcpy(timestr, ctime(&t)); > + *strchr(timestr, '\n') = '\0'; > + twidth = strlen(timestr); > + } > + if (cntr->timestamp > 0) { > + t = _long_to_time(cntr->timestamp); > + > + strcpy(timestr, ctime(&t)); > + *strchr(timestr, '\n') = '\0'; > + bprintf(bp, "%s ", timestr); > + } else { > + bprintf(bp, "%*s", twidth, " "); > + } > + } > + > + if (co->show_sets) > + bprintf(bp, "set %d ", rule->set); > + > + /* > + * print the optional "match probability" > + */ > + if (rule->cmd_len > 0) { > + cmd = rule->cmd ; > + if (cmd->opcode == O_PROB) { > + ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; > + double d = 1.0 * p->d[0]; > + > + d = (d / 0x7fffffff); > + bprintf(bp, "prob %f ", d); > + } > + } > + > + /* > + * first print actions > + */ > + for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); > + l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { > + switch(cmd->opcode) { > + case O_CHECK_STATE: > + bprintf(bp, "check-state"); > + /* avoid printing anything else */ > + flags = HAVE_PROTO | HAVE_SRCIP | > + HAVE_DSTIP | HAVE_IP; > + break; > + > + case O_ACCEPT: > + bprintf(bp, "allow"); > + break; > + > + case O_COUNT: > + bprintf(bp, "count"); > + break; > + > + case O_DENY: > + bprintf(bp, "deny"); > + break; > + > + case O_REJECT: > + if (cmd->arg1 == ICMP_REJECT_RST) > + bprintf(bp, "reset"); > + else if (cmd->arg1 == ICMP_UNREACH_HOST) > + bprintf(bp, "reject"); > + else > + print_reject_code(bp, cmd->arg1); > + break; > + > + case O_UNREACH6: > + if (cmd->arg1 == ICMP6_UNREACH_RST) > + bprintf(bp, "reset6"); > + else > + print_unreach6_code(cmd->arg1); > + break; > + > + case O_SKIPTO: > + bprint_uint_arg(bp, "skipto ", cmd->arg1); > + break; > + > + case O_PIPE: > + bprint_uint_arg(bp, "pipe ", cmd->arg1); > + break; > + > + case O_QUEUE: > + bprint_uint_arg(bp, "queue ", cmd->arg1); > + break; > + > + case O_DIVERT: > + bprint_uint_arg(bp, "divert ", cmd->arg1); > + break; > + > + case O_TEE: > + 
bprint_uint_arg(bp, "tee ", cmd->arg1); > + break; > + > + case O_NETGRAPH: > + bprint_uint_arg(bp, "netgraph ", cmd->arg1); > + break; > + > + case O_NGTEE: > + bprint_uint_arg(bp, "ngtee ", cmd->arg1); > + break; > + > + case O_FORWARD_IP: > + { > + ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; > + > + if (s->sa.sin_addr.s_addr == INADDR_ANY) { > + bprintf(bp, "fwd tablearg"); > + } else { > + bprintf(bp, "fwd %s",inet_ntoa(s->sa.sin_addr)); > + } > + if (s->sa.sin_port) > + bprintf(bp, ",%d", s->sa.sin_port); > + } > + break; > + > + case O_FORWARD_IP6: > + { > + char buf[4 + INET6_ADDRSTRLEN + 1]; > + ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd; > + > + bprintf(bp, "fwd %s", inet_ntop(AF_INET6, > + &s->sa.sin6_addr, buf, sizeof(buf))); > + if (s->sa.sin6_port) > + bprintf(bp, ",%d", s->sa.sin6_port); > + } > + break; > + > + case O_LOG: /* O_LOG is printed last */ > + logptr = (ipfw_insn_log *)cmd; > + break; > + > + case O_ALTQ: /* O_ALTQ is printed after O_LOG */ > + altqptr = (ipfw_insn_altq *)cmd; > + break; > + > + case O_TAG: > + tagptr = cmd; > + break; > + > + case O_NAT: > + if (cmd->arg1 != 0) > + bprint_uint_arg(bp, "nat ", cmd->arg1); > + else > + bprintf(bp, "nat global"); > + break; > + > + case O_SETFIB: > + bprint_uint_arg(bp, "setfib ", cmd->arg1 & 0x7FFF); > + break; > + > + case O_SETDSCP: > + { > + const char *code; > + > + if (cmd->arg1 == IP_FW_TARG) { > + bprint_uint_arg(bp, "setdscp ", cmd->arg1); > + break; > + } > + uval = cmd->arg1 & 0x3F; > + if ((code = match_value(f_ipdscp, uval)) != NULL) > + bprintf(bp, "setdscp %s", code); > + else > + bprint_uint_arg(bp, "setdscp ", uval); > + } > + break; > + > + case O_REASS: > + bprintf(bp, "reass"); > + break; > + > + case O_CALLRETURN: > + if (cmd->len & F_NOT) > + bprintf(bp, "return"); > + else > + bprint_uint_arg(bp, "call ", cmd->arg1); > + break; > + > + default: > + bprintf(bp, "** unrecognized action %d len %d ", > + cmd->opcode, cmd->len); > + } > + } > + if (logptr) { > + if (logptr->max_log 
> 0) > + bprintf(bp, " log logamount %d", logptr->max_log); > + else > + bprintf(bp, " log"); > + } > +#ifndef NO_ALTQ > + if (altqptr) { > + print_altq_cmd(bp, altqptr); > + } > +#endif > + if (tagptr) { > + if (tagptr->len & F_NOT) > + bprint_uint_arg(bp, " untag ", tagptr->arg1); > + else > + bprint_uint_arg(bp, " tag ", tagptr->arg1); > + } > + > + /* > + * then print the body. > + */ > + for (l = rule->act_ofs, cmd = rule->cmd; > + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { > + if ((cmd->len & F_OR) || (cmd->len & F_NOT)) > + continue; > + if (cmd->opcode == O_IP4) { > + flags |= HAVE_PROTO4; > + break; > + } else if (cmd->opcode == O_IP6) { > + flags |= HAVE_PROTO6; > + break; > + } > + } > + if (rule->flags & IPFW_RULE_NOOPT) { /* empty rules before options */ > + if (!co->do_compact) { > + show_prerequisites(bp, &flags, HAVE_PROTO, 0); > + bprintf(bp, " from any to any"); > + } > + flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | > + HAVE_SRCIP | HAVE_DSTIP; > + } > + > + if (co->comment_only) > + comment = "..."; > + > + for (l = rule->act_ofs, cmd = rule->cmd; > + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { > + /* useful alias */ > + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; > + > + if (co->comment_only) { > + if (cmd->opcode != O_NOP) > + continue; > + bprintf(bp, " // %s\n", (char *)(cmd + 1)); > + return; > + } > + > + show_prerequisites(bp, &flags, 0, cmd->opcode); > + > + switch(cmd->opcode) { > + case O_PROB: > + break; /* done already */ > + > + case O_PROBE_STATE: > + break; /* no need to print anything here */ > + > + case O_IP_SRC: > + case O_IP_SRC_LOOKUP: > + case O_IP_SRC_MASK: > + case O_IP_SRC_ME: > + case O_IP_SRC_SET: > + show_prerequisites(bp, &flags, HAVE_PROTO, 0); > + if (!(flags & HAVE_SRCIP)) > + bprintf(bp, " from"); > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + print_ip(bp, fo, (ipfw_insn_ip *)cmd, > + (flags & HAVE_OPTIONS) ? 
" src-ip" : ""); > + flags |= HAVE_SRCIP; > + break; > + > + case O_IP_DST: > + case O_IP_DST_LOOKUP: > + case O_IP_DST_MASK: > + case O_IP_DST_ME: > + case O_IP_DST_SET: > + show_prerequisites(bp, &flags, HAVE_PROTO|HAVE_SRCIP, 0); > + if (!(flags & HAVE_DSTIP)) > + bprintf(bp, " to"); > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + print_ip(bp, fo, (ipfw_insn_ip *)cmd, > + (flags & HAVE_OPTIONS) ? " dst-ip" : ""); > + flags |= HAVE_DSTIP; > + break; > + > + case O_IP6_SRC: > + case O_IP6_SRC_MASK: > + case O_IP6_SRC_ME: > + show_prerequisites(bp, &flags, HAVE_PROTO, 0); > + if (!(flags & HAVE_SRCIP)) > + bprintf(bp, " from"); > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + print_ip6(bp, (ipfw_insn_ip6 *)cmd, > + (flags & HAVE_OPTIONS) ? " src-ip6" : ""); > + flags |= HAVE_SRCIP | HAVE_PROTO; > + break; > + > + case O_IP6_DST: > + case O_IP6_DST_MASK: > + case O_IP6_DST_ME: > + show_prerequisites(bp, &flags, HAVE_PROTO|HAVE_SRCIP, 0); > + if (!(flags & HAVE_DSTIP)) > + bprintf(bp, " to"); > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + print_ip6(bp, (ipfw_insn_ip6 *)cmd, > + (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); > + flags |= HAVE_DSTIP; > + break; > + > + case O_FLOW6ID: > + print_flow6id(bp, (ipfw_insn_u32 *) cmd ); > + flags |= HAVE_OPTIONS; > + break; > + > + case O_IP_DSTPORT: > + show_prerequisites(bp, &flags, > + HAVE_PROTO | HAVE_SRCIP | > + HAVE_DSTIP | HAVE_IP, 0); > + case O_IP_SRCPORT: > + if (flags & HAVE_DSTIP) > + flags |= HAVE_IP; > + show_prerequisites(bp, &flags, > + HAVE_PROTO | HAVE_SRCIP, 0); > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + if (cmd->len & F_NOT) > + bprintf(bp, " not"); > + print_newports(bp, (ipfw_insn_u16 *)cmd, proto, > + (flags & HAVE_OPTIONS) ? 
cmd->opcode : 0); > + break; > + > + case O_PROTO: { > + struct protoent *pe = NULL; > + > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + if (cmd->len & F_NOT) > + bprintf(bp, " not"); > + proto = cmd->arg1; > + pe = getprotobynumber(cmd->arg1); > + if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && > + !(flags & HAVE_PROTO)) > + show_prerequisites(bp, &flags, > + HAVE_PROTO | HAVE_IP | HAVE_SRCIP | > + HAVE_DSTIP | HAVE_OPTIONS, 0); > + if (flags & HAVE_OPTIONS) > + bprintf(bp, " proto"); > + if (pe) > + bprintf(bp, " %s", pe->p_name); > + else > + bprintf(bp, " %u", cmd->arg1); > + } > + flags |= HAVE_PROTO; > + break; > + > + default: /*options ... */ > + if (!(cmd->len & (F_OR|F_NOT))) > + if (((cmd->opcode == O_IP6) && > + (flags & HAVE_PROTO6)) || > + ((cmd->opcode == O_IP4) && > + (flags & HAVE_PROTO4))) > + break; > + show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | > + HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); > + if ((cmd->len & F_OR) && !or_block) > + bprintf(bp, " {"); > + if (cmd->len & F_NOT && cmd->opcode != O_IN) > + bprintf(bp, " not"); > + switch(cmd->opcode) { > + case O_MACADDR2: { > + ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; > + > + bprintf(bp, " MAC"); > + print_mac(bp, m->addr, m->mask); > + print_mac(bp, m->addr + 6, m->mask + 6); > + } > + break; > + > + case O_MAC_TYPE: > + print_newports(bp, (ipfw_insn_u16 *)cmd, > + IPPROTO_ETHERTYPE, cmd->opcode); > + break; > + > + > + case O_FRAG: > + bprintf(bp, " frag"); > + break; > + > + case O_FIB: > + bprintf(bp, " fib %u", cmd->arg1 ); > + break; > + case O_SOCKARG: > + bprintf(bp, " sockarg"); > + break; > + > + case O_IN: > + bprintf(bp, cmd->len & F_NOT ? 
" out" : " in"); > + break; > + > + case O_DIVERTED: > + switch (cmd->arg1) { > + case 3: > + bprintf(bp, " diverted"); > + break; > + case 1: > + bprintf(bp, " diverted-loopback"); > + break; > + case 2: > + bprintf(bp, " diverted-output"); > + break; > + default: > + bprintf(bp, " diverted-?<%u>", cmd->arg1); > + break; > + } > + break; > + > + case O_LAYER2: > + bprintf(bp, " layer2"); > + break; > + case O_XMIT: > + case O_RECV: > + case O_VIA: > + { > + char const *s, *t; > + ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; > + > + if (cmd->opcode == O_XMIT) > + s = "xmit"; > + else if (cmd->opcode == O_RECV) > + s = "recv"; > + else /* if (cmd->opcode == O_VIA) */ > + s = "via"; > + if (cmdif->name[0] == '\0') > + bprintf(bp, " %s %s", s, > + inet_ntoa(cmdif->p.ip)); > + else if (cmdif->name[0] == '\1') { > + /* interface table */ > + t = table_search_ctlv(fo->tstate, > + cmdif->p.kidx); > + bprintf(bp, " %s table(%s)", s, t); > + } else > + bprintf(bp, " %s %s", s, cmdif->name); > + > + break; > + } > + case O_IP_FLOW_LOOKUP: > + { > + char *t; > + > + t = table_search_ctlv(fo->tstate, cmd->arg1); > + bprintf(bp, " flow table(%s", t); > + if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) > + bprintf(bp, ",%u", > + ((ipfw_insn_u32 *)cmd)->d[0]); > + bprintf(bp, ")"); > + break; > + } > + case O_IPID: > + if (F_LEN(cmd) == 1) > + bprintf(bp, " ipid %u", cmd->arg1 ); > + else > + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, > + O_IPID); > + break; > + > + case O_IPTTL: > + if (F_LEN(cmd) == 1) > + bprintf(bp, " ipttl %u", cmd->arg1 ); > + else > + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, > + O_IPTTL); > + break; > + > + case O_IPVER: > + bprintf(bp, " ipver %u", cmd->arg1 ); > + break; > + > + case O_IPPRECEDENCE: > + bprintf(bp, " ipprecedence %u", cmd->arg1 >> 5); > + break; > + > + case O_DSCP: > + print_dscp(bp, (ipfw_insn_u32 *)cmd); > + break; > + > + case O_IPLEN: > + if (F_LEN(cmd) == 1) > + bprintf(bp, " iplen %u", cmd->arg1 ); > + else > + print_newports(bp, 
(ipfw_insn_u16 *)cmd, 0, > + O_IPLEN); > + break; > + > + case O_IPOPT: > + print_flags(bp, "ipoptions", cmd, f_ipopts); > + break; > + > + case O_IPTOS: > + print_flags(bp, "iptos", cmd, f_iptos); > + break; > + > + case O_ICMPTYPE: > + print_icmptypes(bp, (ipfw_insn_u32 *)cmd); > + break; > + > + case O_ESTAB: > + bprintf(bp, " established"); > + break; > + > + case O_TCPDATALEN: > + if (F_LEN(cmd) == 1) > + bprintf(bp, " tcpdatalen %u", cmd->arg1 ); > + else > + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, > + O_TCPDATALEN); > + break; > + > + case O_TCPFLAGS: > + print_flags(bp, "tcpflags", cmd, f_tcpflags); > + break; > + > + case O_TCPOPTS: > + print_flags(bp, "tcpoptions", cmd, f_tcpopts); > + break; > + > + case O_TCPWIN: > + if (F_LEN(cmd) == 1) > + bprintf(bp, " tcpwin %u", cmd->arg1); > + else > + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, > + O_TCPWIN); > + break; > + > + case O_TCPACK: > + bprintf(bp, " tcpack %d", ntohl(cmd32->d[0])); > + break; > + > + case O_TCPSEQ: > + bprintf(bp, " tcpseq %d", ntohl(cmd32->d[0])); > + break; > + > + case O_UID: > + { > + struct passwd *pwd = getpwuid(cmd32->d[0]); > + > + if (pwd) > + bprintf(bp, " uid %s", pwd->pw_name); > + else > + bprintf(bp, " uid %u", cmd32->d[0]); > + } > + break; > + > + case O_GID: > + { > + struct group *grp = getgrgid(cmd32->d[0]); > + > + if (grp) > + bprintf(bp, " gid %s", grp->gr_name); > + else > + bprintf(bp, " gid %u", cmd32->d[0]); > + } > + break; > + > + case O_JAIL: > + bprintf(bp, " jail %d", cmd32->d[0]); > + break; > + > + case O_VERREVPATH: > + bprintf(bp, " verrevpath"); > + break; > + > + case O_VERSRCREACH: > + bprintf(bp, " versrcreach"); > + break; > + > + case O_ANTISPOOF: > + bprintf(bp, " antispoof"); > + break; > + > + case O_IPSEC: > + bprintf(bp, " ipsec"); > + break; > + > + case O_NOP: > + comment = (char *)(cmd + 1); > + break; > + > + case O_KEEP_STATE: > + bprintf(bp, " keep-state"); > + break; > + > + case O_LIMIT: { > + struct _s_x *p = limit_masks; > + 
ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; > + uint8_t x = c->limit_mask; > + char const *comma = " "; > + > + bprintf(bp, " limit"); > + for (; p->x != 0 ; p++) > + if ((x & p->x) == p->x) { > + x &= ~p->x; > + bprintf(bp, "%s%s", comma,p->s); > + comma = ","; > + } > + bprint_uint_arg(bp, " ", c->conn_limit); > + break; > + } > + > + case O_IP6: > + bprintf(bp, " ip6"); > + break; > + > + case O_IP4: > + bprintf(bp, " ip4"); > + break; > + > + case O_ICMP6TYPE: > + print_icmp6types(bp, (ipfw_insn_u32 *)cmd); > + break; > + > + case O_EXT_HDR: > + print_ext6hdr(bp, (ipfw_insn *)cmd); > + break; > + > + case O_TAGGED: > + if (F_LEN(cmd) == 1) > + bprint_uint_arg(bp, " tagged ", > + cmd->arg1); > + else > + print_newports(bp, (ipfw_insn_u16 *)cmd, > + 0, O_TAGGED); > + break; > + > + default: > + bprintf(bp, " [opcode %d len %d]", > + cmd->opcode, cmd->len); > + } > + } > + if (cmd->len & F_OR) { > + bprintf(bp, " or"); > + or_block = 1; > + } else if (or_block) { > + bprintf(bp, " }"); > + or_block = 0; > + } > + } > + show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP > + | HAVE_IP, 0); > + if (comment) > + bprintf(bp, " // %s", comment); > + bprintf(bp, "\n"); > +} > + > +static void > +show_dyn_state(struct cmdline_opts *co, struct format_opts *fo, > + struct buf_pr *bp, ipfw_dyn_rule *d) > +{ > + struct protoent *pe; > + struct in_addr a; > + uint16_t rulenum; > + char buf[INET6_ADDRSTRLEN]; > + > + if (!co->do_expired) { > + if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) > + return; > + } > + bcopy(&d->rule, &rulenum, sizeof(rulenum)); > + bprintf(bp, "%05d", rulenum); > + if (fo->pcwidth > 0 || fo->bcwidth > 0) { > + bprintf(bp, " "); > + pr_u64(bp, &d->pcnt, fo->pcwidth); > + pr_u64(bp, &d->bcnt, fo->bcwidth); > + bprintf(bp, "(%ds)", d->expire); > + } > + switch (d->dyn_type) { > + case O_LIMIT_PARENT: > + bprintf(bp, " PARENT %d", d->count); > + break; > + case O_LIMIT: > + bprintf(bp, " LIMIT"); > + break; > + case O_KEEP_STATE: /* 
bidir, no mask */ > + bprintf(bp, " STATE"); > + break; > + } > + > + if ((pe = getprotobynumber(d->id.proto)) != NULL) > + bprintf(bp, " %s", pe->p_name); > + else > + bprintf(bp, " proto %u", d->id.proto); > + > + if (d->id.addr_type == 4) { > + a.s_addr = htonl(d->id.src_ip); > + bprintf(bp, " %s %d", inet_ntoa(a), d->id.src_port); > + > + a.s_addr = htonl(d->id.dst_ip); > + bprintf(bp, " <-> %s %d", inet_ntoa(a), d->id.dst_port); > + } else if (d->id.addr_type == 6) { > + bprintf(bp, " %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, > + sizeof(buf)), d->id.src_port); > + bprintf(bp, " <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, > + buf, sizeof(buf)), d->id.dst_port); > + } else > + bprintf(bp, " UNKNOWN <-> UNKNOWN\n"); > +} > + > +static int > +do_range_cmd(int cmd, ipfw_range_tlv *rt) > +{ > + ipfw_range_header rh; > + size_t sz; > + > + memset(&rh, 0, sizeof(rh)); > + memcpy(&rh.range, rt, sizeof(*rt)); > + rh.range.head.length = sizeof(*rt); > + rh.range.head.type = IPFW_TLV_RANGE; > + sz = sizeof(rh); > + > + if (do_get3(cmd, &rh.opheader, &sz) != 0) > + return (-1); > + /* Save number of matched objects */ > + rt->new_set = rh.range.new_set; > + return (0); > +} > + > +/* > + * This one handles all set-related commands > + * ipfw set { show | enable | disable } > + * ipfw set swap X Y > + * ipfw set move X to Y > + * ipfw set move rule X to Y > + */ > +void > +ipfw_sets_handler(char *av[]) > +{ > + uint32_t masks[2]; > + int i; > + uint8_t cmd, rulenum; > + ipfw_range_tlv rt; > + char *msg; > + size_t size; > + > + av++; > + memset(&rt, 0, sizeof(rt)); > + > + if (av[0] == NULL) > + errx(EX_USAGE, "set needs command"); > + if (_substrcmp(*av, "show") == 0) { > + struct format_opts fo; > + ipfw_cfg_lheader *cfg; > + > + memset(&fo, 0, sizeof(fo)); > + if (ipfw_get_config(&co, &fo, &cfg, &size) != 0) > + err(EX_OSERR, "requesting config failed"); > + > + for (i = 0, msg = "disable"; i < RESVD_SET; i++) > + if ((cfg->set_mask & (1<<i)) == 0) { > + 
printf("%s %d", msg, i); > + msg = ""; > + } > + msg = (cfg->set_mask != (uint32_t)-1) ? " enable" : "enable"; > + for (i = 0; i < RESVD_SET; i++) > + if ((cfg->set_mask & (1<<i)) != 0) { > + printf("%s %d", msg, i); > + msg = ""; > + } > + printf("\n"); > + free(cfg); > + } else if (_substrcmp(*av, "swap") == 0) { > + av++; > + if ( av[0] == NULL || av[1] == NULL ) > + errx(EX_USAGE, "set swap needs 2 set numbers\n"); > + rt.set = atoi(av[0]); > + rt.new_set = atoi(av[1]); > + if (!isdigit(*(av[0])) || rt.set > RESVD_SET) > + errx(EX_DATAERR, "invalid set number %s\n", av[0]); > + if (!isdigit(*(av[1])) || rt.new_set > RESVD_SET) > + errx(EX_DATAERR, "invalid set number %s\n", av[1]); > + i = do_range_cmd(IP_FW_SET_SWAP, &rt); > + } else if (_substrcmp(*av, "move") == 0) { > + av++; > + if (av[0] && _substrcmp(*av, "rule") == 0) { > + rt.flags = IPFW_RCFLAG_RANGE; /* move rules to new set */ > + cmd = IP_FW_XMOVE; > + av++; > + } else > + cmd = IP_FW_SET_MOVE; /* Move set to new one */ > + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || > + av[3] != NULL || _substrcmp(av[1], "to") != 0) > + errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); > + rulenum = atoi(av[0]); > + rt.new_set = atoi(av[2]); > + if (cmd == IP_FW_XMOVE) { > + rt.start_rule = rulenum; > + rt.end_rule = rulenum; > + } else > + rt.set = rulenum; > + rt.new_set = atoi(av[2]); > + if (!isdigit(*(av[0])) || (cmd == 3 && rt.set > RESVD_SET) || > + (cmd == 2 && rt.start_rule == IPFW_DEFAULT_RULE) ) > + errx(EX_DATAERR, "invalid source number %s\n", av[0]); > + if (!isdigit(*(av[2])) || rt.new_set > RESVD_SET) > + errx(EX_DATAERR, "invalid dest. set %s\n", av[1]); > + i = do_range_cmd(cmd, &rt); > + } else if (_substrcmp(*av, "disable") == 0 || > + _substrcmp(*av, "enable") == 0 ) { > + int which = _substrcmp(*av, "enable") == 0 ? 
1 : 0; > + > + av++; > + masks[0] = masks[1] = 0; > + > + while (av[0]) { > + if (isdigit(**av)) { > + i = atoi(*av); > + if (i < 0 || i > RESVD_SET) > + errx(EX_DATAERR, > + "invalid set number %d\n", i); > + masks[which] |= (1<<i); > + } else if (_substrcmp(*av, "disable") == 0) > + which = 0; > + else if (_substrcmp(*av, "enable") == 0) > + which = 1; > + else > + errx(EX_DATAERR, > + "invalid set command %s\n", *av); > + av++; > + } > + if ( (masks[0] & masks[1]) != 0 ) > + errx(EX_DATAERR, > + "cannot enable and disable the same set\n"); > + > + rt.set = masks[0]; > + rt.new_set = masks[1]; > + i = do_range_cmd(IP_FW_SET_ENABLE, &rt); > + if (i) > + warn("set enable/disable: setsockopt(IP_FW_SET_ENABLE)"); > + } else > + errx(EX_USAGE, "invalid set command %s\n", *av); > +} > + > +void > +ipfw_sysctl_handler(char *av[], int which) > +{ > + av++; > + > + if (av[0] == NULL) { > + warnx("missing keyword to enable/disable\n"); > + } else if (_substrcmp(*av, "firewall") == 0) { > + sysctlbyname("net.inet.ip.fw.enable", NULL, 0, > + &which, sizeof(which)); > + sysctlbyname("net.inet6.ip6.fw.enable", NULL, 0, > + &which, sizeof(which)); > + } else if (_substrcmp(*av, "one_pass") == 0) { > + sysctlbyname("net.inet.ip.fw.one_pass", NULL, 0, > + &which, sizeof(which)); > + } else if (_substrcmp(*av, "debug") == 0) { > + sysctlbyname("net.inet.ip.fw.debug", NULL, 0, > + &which, sizeof(which)); > + } else if (_substrcmp(*av, "verbose") == 0) { > + sysctlbyname("net.inet.ip.fw.verbose", NULL, 0, > + &which, sizeof(which)); > + } else if (_substrcmp(*av, "dyn_keepalive") == 0) { > + sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0, > + &which, sizeof(which)); > +#ifndef NO_ALTQ > + } else if (_substrcmp(*av, "altq") == 0) { > + altq_set_enabled(which); > +#endif > + } else { > + warnx("unrecognize enable/disable keyword: %s\n", *av); > + } > +} > + > +typedef void state_cb(struct cmdline_opts *co, struct format_opts *fo, > + void *arg, void *state); > + > +static void > 
+prepare_format_dyn(struct cmdline_opts *co, struct format_opts *fo, > + void *arg, void *_state) > +{ > + ipfw_dyn_rule *d; > + int width; > + uint8_t set; > + > + d = (ipfw_dyn_rule *)_state; > + /* Count _ALL_ states */ > + fo->dcnt++; > + > + if (fo->show_counters == 0) > + return; > + > + if (co->use_set) { > + /* skip states from another set */ > + bcopy((char *)&d->rule + sizeof(uint16_t), &set, > + sizeof(uint8_t)); > + if (set != co->use_set - 1) > + return; > + } > + > + width = pr_u64(NULL, &d->pcnt, 0); > + if (width > fo->pcwidth) > + fo->pcwidth = width; > + > + width = pr_u64(NULL, &d->bcnt, 0); > + if (width > fo->bcwidth) > + fo->bcwidth = width; > +} > + > +static int > +foreach_state(struct cmdline_opts *co, struct format_opts *fo, > + caddr_t base, size_t sz, state_cb dyn_bc, void *dyn_arg) > +{ > + int ttype; > + state_cb *fptr; > + void *farg; > + ipfw_obj_tlv *tlv; > + ipfw_obj_ctlv *ctlv; > + > + fptr = NULL; > + ttype = 0; > + > + while (sz > 0) { > + ctlv = (ipfw_obj_ctlv *)base; > + switch (ctlv->head.type) { > + case IPFW_TLV_DYNSTATE_LIST: > + base += sizeof(*ctlv); > + sz -= sizeof(*ctlv); > + ttype = IPFW_TLV_DYN_ENT; > + fptr = dyn_bc; > + farg = dyn_arg; > + break; > + default: > + return (sz); > + } > + > + while (sz > 0) { > + tlv = (ipfw_obj_tlv *)base; > + if (tlv->type != ttype) > + break; > + > + fptr(co, fo, farg, tlv + 1); > + sz -= tlv->length; > + base += tlv->length; > + } > + } > + > + return (sz); > +} > + > +static void > +prepare_format_opts(struct cmdline_opts *co, struct format_opts *fo, > + ipfw_obj_tlv *rtlv, int rcnt, caddr_t dynbase, size_t dynsz) > +{ > + int bcwidth, pcwidth, width; > + int n; > + struct ip_fw_bcounter *cntr; > + struct ip_fw_rule *r; > + > + bcwidth = 0; > + pcwidth = 0; > + if (fo->show_counters != 0) { > + for (n = 0; n < rcnt; n++, > + rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { > + cntr = (struct ip_fw_bcounter *)(rtlv + 1); > + r = (struct ip_fw_rule *)((caddr_t)cntr + 
cntr->size); > + /* skip rules from another set */ > + if (co->use_set && r->set != co->use_set - 1) > + continue; > + > + /* packet counter */ > + width = pr_u64(NULL, &cntr->pcnt, 0); > + if (width > pcwidth) > + pcwidth = width; > + > + /* byte counter */ > + width = pr_u64(NULL, &cntr->bcnt, 0); > + if (width > bcwidth) > + bcwidth = width; > + } > + } > + fo->bcwidth = bcwidth; > + fo->pcwidth = pcwidth; > + > + fo->dcnt = 0; > + if (co->do_dynamic && dynsz > 0) > + foreach_state(co, fo, dynbase, dynsz, prepare_format_dyn, NULL); > +} > + > +static int > +list_static_range(struct cmdline_opts *co, struct format_opts *fo, > + struct buf_pr *bp, ipfw_obj_tlv *rtlv, int rcnt) > +{ > + int n, seen; > + struct ip_fw_rule *r; > + struct ip_fw_bcounter *cntr; > + int c = 0; > + > + for (n = seen = 0; n < rcnt; n++, > + rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { > + > + if (fo->show_counters != 0) { > + cntr = (struct ip_fw_bcounter *)(rtlv + 1); > + r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); > + } else { > + cntr = NULL; > + r = (struct ip_fw_rule *)(rtlv + 1); > + } > + if (r->rulenum > fo->last) > + break; > + if (co->use_set && r->set != co->use_set - 1) > + continue; > + if (r->rulenum >= fo->first && r->rulenum <= fo->last) { > + show_static_rule(co, fo, bp, r, cntr); > + printf("%s", bp->buf); > + c += rtlv->length; > + bp_flush(bp); > + seen++; > + } > + } > + > + return (seen); > +} > + > +static void > +list_dyn_state(struct cmdline_opts *co, struct format_opts *fo, > + void *_arg, void *_state) > +{ > + uint16_t rulenum; > + uint8_t set; > + ipfw_dyn_rule *d; > + struct buf_pr *bp; > + > + d = (ipfw_dyn_rule *)_state; > + bp = (struct buf_pr *)_arg; > + > + bcopy(&d->rule, &rulenum, sizeof(rulenum)); > + if (rulenum > fo->last) > + return; > + if (co->use_set) { > + bcopy((char *)&d->rule + sizeof(uint16_t), > + &set, sizeof(uint8_t)); > + if (set != co->use_set - 1) > + return; > + } > + if (rulenum >= fo->first) { > + 
show_dyn_state(co, fo, bp, d); > + printf("%s\n", bp->buf); > + bp_flush(bp); > + } > +} > + > +static int > +list_dyn_range(struct cmdline_opts *co, struct format_opts *fo, > + struct buf_pr *bp, caddr_t base, size_t sz) > +{ > + > + sz = foreach_state(co, fo, base, sz, list_dyn_state, bp); > + return (sz); > +} > + > +void > +ipfw_list(int ac, char *av[], int show_counters) > +{ > + ipfw_cfg_lheader *cfg; > + struct format_opts sfo; > + size_t sz; > + int error; > + int lac; > + char **lav; > + uint32_t rnum; > + char *endptr; > + > + if (co.test_only) { > + fprintf(stderr, "Testing only, list disabled\n"); > + return; > + } > + if (co.do_pipe) { > + dummynet_list(ac, av, show_counters); > + return; > + } > + > + ac--; > + av++; > + memset(&sfo, 0, sizeof(sfo)); > + > + /* Determine rule range to request */ > + if (ac > 0) { > + for (lac = ac, lav = av; lac != 0; lac--) { > + rnum = strtoul(*lav++, &endptr, 10); > + if (sfo.first == 0 || rnum < sfo.first) > + sfo.first = rnum; > + > + if (*endptr == '-') > + rnum = strtoul(endptr + 1, &endptr, 10); > + if (sfo.last == 0 || rnum > sfo.last) > + sfo.last = rnum; > + } > + } > + > + /* get configuraion from kernel */ > + cfg = NULL; > + sfo.show_counters = show_counters; > + sfo.flags = IPFW_CFG_GET_STATIC; > + if (co.do_dynamic != 0) > + sfo.flags |= IPFW_CFG_GET_STATES; > + if (sfo.show_counters != 0) > + sfo.flags |= IPFW_CFG_GET_COUNTERS; > + if (ipfw_get_config(&co, &sfo, &cfg, &sz) != 0) > + err(EX_OSERR, "retrieving config failed"); > + > + error = ipfw_show_config(&co, &sfo, cfg, sz, ac, av); > + > + free(cfg); > + > + if (error != EX_OK) > + exit(error); > +} > + > +static int > +ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, > + ipfw_cfg_lheader *cfg, size_t sz, int ac, char *av[]) > +{ > + caddr_t dynbase; > + size_t dynsz; > + int rcnt; > + int exitval = EX_OK; > + int lac; > + char **lav; > + char *endptr; > + size_t readsz; > + struct buf_pr bp; > + ipfw_obj_ctlv *ctlv, *tstate; > + 
ipfw_obj_tlv *rbase; > + > + /* > + * Handle tablenames TLV first, if any > + */ > + tstate = NULL; > + rbase = NULL; > + dynbase = NULL; > + dynsz = 0; > + readsz = sizeof(*cfg); > + rcnt = 0; > + > + fo->set_mask = cfg->set_mask; > + > + ctlv = (ipfw_obj_ctlv *)(cfg + 1); > + > + if (cfg->flags & IPFW_CFG_GET_STATIC) { > + /* We've requested static rules */ > + if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { > + fo->tstate = ctlv; > + readsz += ctlv->head.length; > + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + > + ctlv->head.length); > + } > + > + if (ctlv->head.type == IPFW_TLV_RULE_LIST) { > + rbase = (ipfw_obj_tlv *)(ctlv + 1); > + rcnt = ctlv->count; > + readsz += ctlv->head.length; > + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + > + ctlv->head.length); > + } > + } > + > + if ((cfg->flags & IPFW_CFG_GET_STATES) && (readsz != sz)) { > + /* We may have some dynamic states */ > + dynsz = sz - readsz; > + /* Skip empty header */ > + if (dynsz != sizeof(ipfw_obj_ctlv)) > + dynbase = (caddr_t)ctlv; > + else > + dynsz = 0; > + } > + > + prepare_format_opts(co, fo, rbase, rcnt, dynbase, dynsz); > + bp_alloc(&bp, 4096); > + > + /* if no rule numbers were specified, list all rules */ > + if (ac == 0) { > + fo->first = 0; > + fo->last = IPFW_DEFAULT_RULE; > + list_static_range(co, fo, &bp, rbase, rcnt); > + > + if (co->do_dynamic && dynsz > 0) { > + printf("## Dynamic rules (%d %zu):\n", fo->dcnt, dynsz); > + list_dyn_range(co, fo, &bp, dynbase, dynsz); > + } > + > + bp_free(&bp); > + return (EX_OK); > + } > + > + /* display specific rules requested on command line */ > + for (lac = ac, lav = av; lac != 0; lac--) { > + /* convert command line rule # */ > + fo->last = fo->first = strtoul(*lav++, &endptr, 10); > + if (*endptr == '-') > + fo->last = strtoul(endptr + 1, &endptr, 10); > + if (*endptr) { > + exitval = EX_USAGE; > + warnx("invalid rule number: %s", *(lav - 1)); > + continue; > + } > + > + if (list_static_range(co, fo, &bp, rbase, rcnt) == 0) { > + /* give precedence 
to other error(s) */ > + if (exitval == EX_OK) > + exitval = EX_UNAVAILABLE; > + if (fo->first == fo->last) > + warnx("rule %u does not exist", fo->first); > + else > + warnx("no rules in range %u-%u", > + fo->first, fo->last); > + } > + } > + > + if (co->do_dynamic && dynsz > 0) { > + printf("## Dynamic rules:\n"); > + for (lac = ac, lav = av; lac != 0; lac--) { > + fo->last = fo->first = strtoul(*lav++, &endptr, 10); > + if (*endptr == '-') > + fo->last = strtoul(endptr+1, &endptr, 10); > + if (*endptr) > + /* already warned */ > + continue; > + list_dyn_range(co, fo, &bp, dynbase, dynsz); > + } > + } > + > + bp_free(&bp); > + return (exitval); > +} > + > + > +/* > + * Retrieves current ipfw configuration of given type > + * and stores its pointer to @pcfg. > + * > + * Caller is responsible for freeing @pcfg. > + * > + * Returns 0 on success. > + */ > + > +static int > +ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, > + ipfw_cfg_lheader **pcfg, size_t *psize) > +{ > + ipfw_cfg_lheader *cfg; > + size_t sz; > + int i; > + > + > + if (co->test_only != 0) { > + fprintf(stderr, "Testing only, list disabled\n"); > + return (0); > + } > + > + /* Start with some data size */ > + sz = 4096; > + cfg = NULL; > + > + for (i = 0; i < 16; i++) { > + if (cfg != NULL) > + free(cfg); > + if ((cfg = calloc(1, sz)) == NULL) > + return (ENOMEM); > + > + cfg->flags = fo->flags; > + cfg->start_rule = fo->first; > + cfg->end_rule = fo->last; > + > + if (do_get3(IP_FW_XGET, &cfg->opheader, &sz) != 0) { > + if (errno != ENOMEM) { > + free(cfg); > + return (errno); > + } > + > + /* Buffer size is not enough. 
Try to increase */ > + sz = sz * 2; > + if (sz < cfg->size) > + sz = cfg->size; > + continue; > + } > + > + *pcfg = cfg; > + *psize = sz; > + return (0); > + } > + > + free(cfg); > + return (ENOMEM); > +} > + > +static int > +lookup_host (char *host, struct in_addr *ipaddr) > +{ > + struct hostent *he; > + > + if (!inet_aton(host, ipaddr)) { > + if ((he = gethostbyname(host)) == NULL) > + return(-1); > + *ipaddr = *(struct in_addr *)he->h_addr_list[0]; > + } > + return(0); > +} > + > +struct tidx { > + ipfw_obj_ntlv *idx; > + uint32_t count; > + uint32_t size; > + uint16_t counter; > + uint8_t set; > +}; > + > +static uint16_t > +pack_table(struct tidx *tstate, char *name) > +{ > + int i; > + ipfw_obj_ntlv *ntlv; > + > + if (table_check_name(name) != 0) > + return (0); > + > + for (i = 0; i < tstate->count; i++) { > + if (strcmp(tstate->idx[i].name, name) != 0) > + continue; > + if (tstate->idx[i].set != tstate->set) > + continue; > + > + return (tstate->idx[i].idx); > + } > + > + if (tstate->count + 1 > tstate->size) { > + tstate->size += 4; > + tstate->idx = realloc(tstate->idx, tstate->size * > + sizeof(ipfw_obj_ntlv)); > + if (tstate->idx == NULL) > + return (0); > + } > + > + ntlv = &tstate->idx[i]; > + memset(ntlv, 0, sizeof(ipfw_obj_ntlv)); > + strlcpy(ntlv->name, name, sizeof(ntlv->name)); > + ntlv->head.type = IPFW_TLV_TBL_NAME; > + ntlv->head.length = sizeof(ipfw_obj_ntlv); > + ntlv->set = tstate->set; > + ntlv->idx = ++tstate->counter; > + tstate->count++; > + > + return (ntlv->idx); > +} > + > +static void > +fill_table(ipfw_insn *cmd, char *av, uint8_t opcode, struct tidx *tstate) > +{ > + uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; > + uint16_t uidx; > + char *p; > + > + if ((p = strchr(av + 6, ')')) == NULL) > + errx(EX_DATAERR, "forgotten parenthesis: '%s'", av); > + *p = '\0'; > + p = strchr(av + 6, ','); > + if (p) > + *p++ = '\0'; > + > + if ((uidx = pack_table(tstate, av + 6)) == 0) > + errx(EX_DATAERR, "Invalid table name: %s", av + 6); > + > + 
cmd->opcode = opcode; > + cmd->arg1 = uidx; > + if (p) { > + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); > + d[0] = strtoul(p, NULL, 0); > + } else > + cmd->len |= F_INSN_SIZE(ipfw_insn); > +} > + > + > +/* > + * fills the addr and mask fields in the instruction as appropriate from av. > + * Update length as appropriate. > + * The following formats are allowed: > + * me returns O_IP_*_ME > + * 1.2.3.4 single IP address > + * 1.2.3.4:5.6.7.8 address:mask > + * 1.2.3.4/24 address/mask > + * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet > + * We can have multiple comma-separated address/mask entries. > + */ > +static void > +fill_ip(ipfw_insn_ip *cmd, char *av, int cblen, struct tidx *tstate) > +{ > + int len = 0; > + uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; > + > + cmd->o.len &= ~F_LEN_MASK; /* zero len */ > + > + if (_substrcmp(av, "any") == 0) > + return; > + > + if (_substrcmp(av, "me") == 0) { > + cmd->o.len |= F_INSN_SIZE(ipfw_insn); > + return; > + } > + > + if (strncmp(av, "table(", 6) == 0) { > + fill_table(&cmd->o, av, O_IP_DST_LOOKUP, tstate); > + return; > + } > + > + while (av) { > + /* > + * After the address we can have '/' or ':' indicating a mask, > + * ',' indicating another address follows, '{' indicating a > + * set of addresses of unspecified size. 
> + */ > + char *t = NULL, *p = strpbrk(av, "/:,{"); > + int masklen; > + char md, nd = '\0'; > + > + CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn) + 2 + len); > + > + if (p) { > + md = *p; > + *p++ = '\0'; > + if ((t = strpbrk(p, ",{")) != NULL) { > + nd = *t; > + *t = '\0'; > + } > + } else > + md = '\0'; > + > + if (lookup_host(av, (struct in_addr *)&d[0]) != 0) > + errx(EX_NOHOST, "hostname ``%s'' unknown", av); > + switch (md) { > + case ':': > + if (!inet_aton(p, (struct in_addr *)&d[1])) > + errx(EX_DATAERR, "bad netmask ``%s''", p); > + break; > + case '/': > + masklen = atoi(p); > + if (masklen == 0) > + d[1] = htonl(0); /* mask */ > + else if (masklen > 32) > + errx(EX_DATAERR, "bad width ``%s''", p); > + else > + d[1] = htonl(~0 << (32 - masklen)); > + break; > + case '{': /* no mask, assume /24 and put back the '{' */ > + d[1] = htonl(~0 << (32 - 24)); > + *(--p) = md; > + break; > + > + case ',': /* single address plus continuation */ > + *(--p) = md; > + /* FALLTHROUGH */ > + case 0: /* initialization value */ > + default: > + d[1] = htonl(~0); /* force /32 */ > + break; > + } > + d[0] &= d[1]; /* mask base address with mask */ > + if (t) > + *t = nd; > + /* find next separator */ > + if (p) > + p = strpbrk(p, ",{"); > + if (p && *p == '{') { > + /* > + * We have a set of addresses. They are stored as follows: > + * arg1 is the set size (powers of 2, 2..256) > + * addr is the base address IN HOST FORMAT > + * mask.. is an array of arg1 bits (rounded up to > + * the next multiple of 32) with bits set > + * for each host in the map. 
> + */ > + uint32_t *map = (uint32_t *)&cmd->mask; > + int low, high; > + int i = contigmask((uint8_t *)&(d[1]), 32); > + > + if (len > 0) > + errx(EX_DATAERR, "address set cannot be in a list"); > + if (i < 24 || i > 31) > + errx(EX_DATAERR, "invalid set with mask %d\n", i); > + cmd->o.arg1 = 1<<(32-i); /* map length */ > + d[0] = ntohl(d[0]); /* base addr in host format */ > + cmd->o.opcode = O_IP_DST_SET; /* default */ > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; > + for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) > + map[i] = 0; /* clear map */ > + > + av = p + 1; > + low = d[0] & 0xff; > + high = low + cmd->o.arg1 - 1; > + /* > + * Here, i stores the previous value when we specify a range > + * of addresses within a mask, e.g. 45-63. i = -1 means we > + * have no previous value. > + */ > + i = -1; /* previous value in a range */ > + while (isdigit(*av)) { > + char *s; > + int a = strtol(av, &s, 0); > + > + if (s == av) { /* no parameter */ > + if (*av != '}') > + errx(EX_DATAERR, "set not closed\n"); > + if (i != -1) > + errx(EX_DATAERR, "incomplete range %d-", i); > + break; > + } > + if (a < low || a > high) > + errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", > + a, low, high); > + a -= low; > + if (i == -1) /* no previous in range */ > + i = a; > + else { /* check that range is valid */ > + if (i > a) > + errx(EX_DATAERR, "invalid range %d-%d", > + i+low, a+low); > + if (*s == '-') > + errx(EX_DATAERR, "double '-' in range"); > + } > + for (; i <= a; i++) > + map[i/32] |= 1<<(i & 31); > + i = -1; > + if (*s == '-') > + i = a; > + else if (*s == '}') > + break; > + av = s+1; > + } > + return; > + } > + av = p; > + if (av) /* then *av must be a ',' */ > + av++; > + > + /* Check this entry */ > + if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ > + /* > + * 'any' turns the entire list into a NOP. > + * 'not any' never matches, so it is removed from the > + * list unless it is the only item, in which case we > + * report an error. 
> + */ > + if (cmd->o.len & F_NOT) { /* "not any" never matches */ > + if (av == NULL && len == 0) /* only this entry */ > + errx(EX_DATAERR, "not any never matches"); > + } > + /* else do nothing and skip this entry */ > + return; > + } > + /* A single IP can be stored in an optimized format */ > + if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); > + return; > + } > + len += 2; /* two words... */ > + d += 2; > + } /* end while */ > + if (len + 1 > F_LEN_MASK) > + errx(EX_DATAERR, "address list too long"); > + cmd->o.len |= len+1; > +} > + > + > +/* n2mask sets n bits of the mask */ > +void > +n2mask(struct in6_addr *mask, int n) > +{ > + static int minimask[9] = > + { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; > + u_char *p; > + > + memset(mask, 0, sizeof(struct in6_addr)); > + p = (u_char *) mask; > + for (; n > 0; p++, n -= 8) { > + if (n >= 8) > + *p = 0xff; > + else > + *p = minimask[n]; > + } > + return; > +} > + > +static void > +fill_flags_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, > + struct _s_x *flags, char *p) > +{ > + char *e; > + uint32_t set = 0, clear = 0; > + > + if (fill_flags(flags, p, &e, &set, &clear) != 0) > + errx(EX_DATAERR, "invalid flag %s", e); > + > + cmd->opcode = opcode; > + cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; > + cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); > +} > + > + > +void > +ipfw_delete(char *av[]) > +{ > + int i; > + int exitval = EX_OK; > + int do_set = 0; > + ipfw_range_tlv rt; > + > + av++; > + NEED1("missing rule specification"); > + memset(&rt, 0, sizeof(rt)); > + if ( *av && _substrcmp(*av, "set") == 0) { > + /* Do not allow using the following syntax: > + * ipfw set N delete set M > + */ > + if (co.use_set) > + errx(EX_DATAERR, "invalid syntax"); > + do_set = 1; /* delete set */ > + av++; > + } > + > + /* Rule number */ > + while (*av && isdigit(**av)) { > + i = atoi(*av); av++; > + if (co.do_nat) { > + exitval = do_cmd(IP_FW_NAT_DEL, &i, 
sizeof i); > + if (exitval) { > + exitval = EX_UNAVAILABLE; > + warn("rule %u not available", i); > + } > + } else if (co.do_pipe) { > + exitval = ipfw_delete_pipe(co.do_pipe, i); > + } else { > + if (do_set != 0) { > + rt.set = i & 31; > + rt.flags = IPFW_RCFLAG_SET; > + } else { > + rt.start_rule = i & 0xffff; > + rt.end_rule = i & 0xffff; > + if (rt.start_rule == 0 && rt.end_rule == 0) > + rt.flags |= IPFW_RCFLAG_ALL; > + else > + rt.flags |= IPFW_RCFLAG_RANGE; > + if (co.use_set != 0) { > + rt.set = co.use_set - 1; > + rt.flags |= IPFW_RCFLAG_SET; > + } > + } > + i = do_range_cmd(IP_FW_XDEL, &rt); > + if (i != 0) { > + exitval = EX_UNAVAILABLE; > + warn("rule %u: setsockopt(IP_FW_XDEL)", > + rt.start_rule); > + } else if (rt.new_set == 0) { > + exitval = EX_UNAVAILABLE; > + if (rt.start_rule != rt.end_rule) > + warnx("no rules rules in %u-%u range", > + rt.start_rule, rt.end_rule); > + else > + warnx("rule %u not found", > + rt.start_rule); > + } > + } > + } > + if (exitval != EX_OK) > + exit(exitval); > +} > + > + > +/* > + * fill the interface structure. We do not check the name as we can > + * create interfaces dynamically, so checking them at insert time > + * makes relatively little sense. > + * Interface names containing '*', '?', or '[' are assumed to be shell > + * patterns which match interfaces. 
> + */ > +static void > +fill_iface(ipfw_insn_if *cmd, char *arg, int cblen, struct tidx *tstate) > +{ > + char *p; > + uint16_t uidx; > + > + cmd->name[0] = '\0'; > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); > + > + CHECK_CMDLEN; > + > + /* Parse the interface or address */ > + if (strcmp(arg, "any") == 0) > + cmd->o.len = 0; /* effectively ignore this command */ > + else if (strncmp(arg, "table(", 6) == 0) { > + if ((p = strchr(arg + 6, ')')) == NULL) > + errx(EX_DATAERR, "forgotten parenthesis: '%s'", arg); > + *p = '\0'; > + p = strchr(arg + 6, ','); > + if (p) > + *p++ = '\0'; > + if ((uidx = pack_table(tstate, arg + 6)) == 0) > + errx(EX_DATAERR, "Invalid table name: %s", arg + 6); > + > + cmd->name[0] = '\1'; /* Special value indicating table */ > + cmd->p.kidx = uidx; > + } else if (!isdigit(*arg)) { > + strlcpy(cmd->name, arg, sizeof(cmd->name)); > + cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0; > + } else if (!inet_aton(arg, &cmd->p.ip)) > + errx(EX_DATAERR, "bad ip address ``%s''", arg); > +} > + > +static void > +get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) > +{ > + int i; > + size_t l; > + char *ap, *ptr, *optr; > + struct ether_addr *mac; > + const char *macset = "0123456789abcdefABCDEF:"; > + > + if (strcmp(p, "any") == 0) { > + for (i = 0; i < ETHER_ADDR_LEN; i++) > + addr[i] = mask[i] = 0; > + return; > + } > + > + optr = ptr = strdup(p); > + if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { > + l = strlen(ap); > + if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) > + errx(EX_DATAERR, "Incorrect MAC address"); > + bcopy(mac, addr, ETHER_ADDR_LEN); > + } else > + errx(EX_DATAERR, "Incorrect MAC address"); > + > + if (ptr != NULL) { /* we have mask? 
*/ > + if (p[ptr - optr - 1] == '/') { /* mask len */ > + long ml = strtol(ptr, &ap, 10); > + if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) > + errx(EX_DATAERR, "Incorrect mask length"); > + for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) > + mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); > + } else { /* mask */ > + l = strlen(ptr); > + if (strspn(ptr, macset) != l || > + (mac = ether_aton(ptr)) == NULL) > + errx(EX_DATAERR, "Incorrect mask"); > + bcopy(mac, mask, ETHER_ADDR_LEN); > + } > + } else { /* default mask: ff:ff:ff:ff:ff:ff */ > + for (i = 0; i < ETHER_ADDR_LEN; i++) > + mask[i] = 0xff; > + } > + for (i = 0; i < ETHER_ADDR_LEN; i++) > + addr[i] &= mask[i]; > + > + free(optr); > +} > + > +/* > + * helper function, updates the pointer to cmd with the length > + * of the current command, and also cleans up the first word of > + * the new command in case it has been clobbered before. > + */ > +static ipfw_insn * > +next_cmd(ipfw_insn *cmd, int *len) > +{ > + *len -= F_LEN(cmd); > + CHECK_LENGTH(*len, 0); > + cmd += F_LEN(cmd); > + bzero(cmd, sizeof(*cmd)); > + return cmd; > +} > + > +/* > + * Takes arguments and copies them into a comment > + */ > +static void > +fill_comment(ipfw_insn *cmd, char **av, int cblen) > +{ > + int i, l; > + char *p = (char *)(cmd + 1); > + > + cmd->opcode = O_NOP; > + cmd->len = (cmd->len & (F_NOT | F_OR)); > + > + /* Compute length of comment string. */ > + for (i = 0, l = 0; av[i] != NULL; i++) > + l += strlen(av[i]) + 1; > + if (l == 0) > + return; > + if (l > 84) > + errx(EX_DATAERR, > + "comment too long (max 80 chars)"); > + l = 1 + (l+3)/4; > + cmd->len = (cmd->len & (F_NOT | F_OR)) | l; > + CHECK_CMDLEN; > + > + for (i = 0; av[i] != NULL; i++) { > + strcpy(p, av[i]); > + p += strlen(av[i]); > + *p++ = ' '; > + } > + *(--p) = '\0'; > +} > + > +/* > + * A function to fill simple commands of size 1. > + * Existing flags are preserved. 
> + */ > +static void > +fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) > +{ > + cmd->opcode = opcode; > + cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; > + cmd->arg1 = arg; > +} > + > +/* > + * Fetch and add the MAC address and type, with masks. This generates one or > + * two microinstructions, and returns the pointer to the last one. > + */ > +static ipfw_insn * > +add_mac(ipfw_insn *cmd, char *av[], int cblen) > +{ > + ipfw_insn_mac *mac; > + > + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) > + errx(EX_DATAERR, "MAC dst src"); > + > + cmd->opcode = O_MACADDR2; > + cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); > + CHECK_CMDLEN; > + > + mac = (ipfw_insn_mac *)cmd; > + get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ > + get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), > + &(mac->mask[ETHER_ADDR_LEN])); /* src */ > + return cmd; > +} > + > +static ipfw_insn * > +add_mactype(ipfw_insn *cmd, char *av, int cblen) > +{ > + if (!av) > + errx(EX_DATAERR, "missing MAC type"); > + if (strcmp(av, "any") != 0) { /* we have a non-null type */ > + fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE, > + cblen); > + cmd->opcode = O_MAC_TYPE; > + return cmd; > + } else > + return NULL; > +} > + > +static ipfw_insn * > +add_proto0(ipfw_insn *cmd, char *av, u_char *protop) > +{ > + struct protoent *pe; > + char *ep; > + int proto; > + > + proto = strtol(av, &ep, 10); > + if (*ep != '\0' || proto <= 0) { > + if ((pe = getprotobyname(av)) == NULL) > + return NULL; > + proto = pe->p_proto; > + } > + > + fill_cmd(cmd, O_PROTO, 0, proto); > + *protop = proto; > + return cmd; > +} > + > +static ipfw_insn * > +add_proto(ipfw_insn *cmd, char *av, u_char *protop) > +{ > + u_char proto = IPPROTO_IP; > + > + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) > + ; /* do not set O_IP4 nor O_IP6 */ > + else if (strcmp(av, "ip4") == 0) > + /* explicit "just IPv4" rule */ > + fill_cmd(cmd, O_IP4, 0, 0); > + 
else if (strcmp(av, "ip6") == 0) { > + /* explicit "just IPv6" rule */ > + proto = IPPROTO_IPV6; > + fill_cmd(cmd, O_IP6, 0, 0); > + } else > + return add_proto0(cmd, av, protop); > + > + *protop = proto; > + return cmd; > +} > + > +static ipfw_insn * > +add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) > +{ > + u_char proto = IPPROTO_IP; > + > + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) > + ; /* do not set O_IP4 nor O_IP6 */ > + else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) > + /* explicit "just IPv4" rule */ > + fill_cmd(cmd, O_IP4, 0, 0); > + else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { > + /* explicit "just IPv6" rule */ > + proto = IPPROTO_IPV6; > + fill_cmd(cmd, O_IP6, 0, 0); > + } else > + return add_proto0(cmd, av, protop); > + > + *protop = proto; > + return cmd; > +} > + > +static ipfw_insn * > +add_srcip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) > +{ > + fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); > + if (cmd->opcode == O_IP_DST_SET) /* set */ > + cmd->opcode = O_IP_SRC_SET; > + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ > + cmd->opcode = O_IP_SRC_LOOKUP; > + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ > + cmd->opcode = O_IP_SRC_ME; > + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ > + cmd->opcode = O_IP_SRC; > + else /* addr/mask */ > + cmd->opcode = O_IP_SRC_MASK; > + return cmd; > +} > + > +static ipfw_insn * > +add_dstip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) > +{ > + fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); > + if (cmd->opcode == O_IP_DST_SET) /* set */ > + ; > + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ > + ; > + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ > + cmd->opcode = O_IP_DST_ME; > + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ > + cmd->opcode = O_IP_DST; > + else /* addr/mask */ > + cmd->opcode = O_IP_DST_MASK; > + return cmd; > +} > + > +static struct _s_x 
f_reserved_keywords[] = { > + { "altq", TOK_OR }, > + { "//", TOK_OR }, > + { "diverted", TOK_OR }, > + { "dst-port", TOK_OR }, > + { "src-port", TOK_OR }, > + { "established", TOK_OR }, > + { "keep-state", TOK_OR }, > + { "frag", TOK_OR }, > + { "icmptypes", TOK_OR }, > + { "in", TOK_OR }, > + { "out", TOK_OR }, > + { "ip6", TOK_OR }, > + { "any", TOK_OR }, > + { "to", TOK_OR }, > + { "via", TOK_OR }, > + { "{", TOK_OR }, > + { NULL, 0 } /* terminator */ > +}; > + > +static ipfw_insn * > +add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode, int cblen) > +{ > + > + if (match_token(f_reserved_keywords, av) != -1) > + return (NULL); > + > + if (fill_newports((ipfw_insn_u16 *)cmd, av, proto, cblen)) { > + /* XXX todo: check that we have a protocol with ports */ > + cmd->opcode = opcode; > + return cmd; > + } > + return NULL; > +} > + > +static ipfw_insn * > +add_src(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) > +{ > + struct in6_addr a; > + char *host, *ch, buf[INET6_ADDRSTRLEN]; > + ipfw_insn *ret = NULL; > + int len; > + > + /* Copy first address in set if needed */ > + if ((ch = strpbrk(av, "/,")) != NULL) { > + len = ch - av; > + strlcpy(buf, av, sizeof(buf)); > + if (len < sizeof(buf)) > + buf[len] = '\0'; > + host = buf; > + } else > + host = av; > + > + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || > + inet_pton(AF_INET6, host, &a) == 1) > + ret = add_srcip6(cmd, av, cblen); > + /* XXX: should check for IPv4, not !IPv6 */ > + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || > + inet_pton(AF_INET6, host, &a) != 1)) > + ret = add_srcip(cmd, av, cblen, tstate); > + if (ret == NULL && strcmp(av, "any") != 0) > + ret = cmd; > + > + return ret; > +} > + > +static ipfw_insn * > +add_dst(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) > +{ > + struct in6_addr a; > + char *host, *ch, buf[INET6_ADDRSTRLEN]; > + ipfw_insn *ret = NULL; > + int len; > + > + /* Copy first address in set 
if needed */ > + if ((ch = strpbrk(av, "/,")) != NULL) { > + len = ch - av; > + strlcpy(buf, av, sizeof(buf)); > + if (len < sizeof(buf)) > + buf[len] = '\0'; > + host = buf; > + } else > + host = av; > + > + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || > + inet_pton(AF_INET6, host, &a) == 1) > + ret = add_dstip6(cmd, av, cblen); > + /* XXX: should check for IPv4, not !IPv6 */ > + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || > + inet_pton(AF_INET6, host, &a) != 1)) > + ret = add_dstip(cmd, av, cblen, tstate); > + if (ret == NULL && strcmp(av, "any") != 0) > + ret = cmd; > + > + return ret; > +} > + > +/* > + * Parse arguments and assemble the microinstructions which make up a rule. > + * Rules are added into the 'rulebuf' and then copied in the correct order > + * into the actual rule. > + * > + * The syntax for a rule starts with the action, followed by > + * optional action parameters, and the various match patterns. > + * In the assembled microcode, the first opcode must be an O_PROBE_STATE > + * (generated if the rule includes a keep-state option), then the > + * various match patterns, log/altq actions, and the actual action. > + * > + */ > +void > +compile_rule(char *av[], uint32_t *rbuf, int *rbufsize, struct tidx *tstate) > +{ > + /* > + * rules are added into the 'rulebuf' and then copied in > + * the correct order into the actual rule. > + * Some things that need to go out of order (prob, action etc.) > + * go into actbuf[]. > + */ > + static uint32_t actbuf[255], cmdbuf[255]; > + int rblen, ablen, cblen; > + > + ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; > + ipfw_insn *first_cmd; /* first match pattern */ > + > + struct ip_fw_rule *rule; > + > + /* > + * various flags used to record that we entered some fields. 
> + */ > + ipfw_insn *have_state = NULL; /* check-state or keep-state */ > + ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; > + size_t len; > + > + int i; > + > + int open_par = 0; /* open parenthesis ( */ > + > + /* proto is here because it is used to fetch ports */ > + u_char proto = IPPROTO_IP; /* default protocol */ > + > + double match_prob = 1; /* match probability, default is always match */ > + > + bzero(actbuf, sizeof(actbuf)); /* actions go here */ > + bzero(cmdbuf, sizeof(cmdbuf)); > + bzero(rbuf, *rbufsize); > + > + rule = (struct ip_fw_rule *)rbuf; > + cmd = (ipfw_insn *)cmdbuf; > + action = (ipfw_insn *)actbuf; > + > + rblen = *rbufsize / sizeof(uint32_t); > + rblen -= sizeof(struct ip_fw_rule) / sizeof(uint32_t); > + ablen = sizeof(actbuf) / sizeof(actbuf[0]); > + cblen = sizeof(cmdbuf) / sizeof(cmdbuf[0]); > + cblen -= F_INSN_SIZE(ipfw_insn_u32) + 1; > + > +#define CHECK_RBUFLEN(len) { CHECK_LENGTH(rblen, len); rblen -= len; } > +#define CHECK_ACTLEN CHECK_LENGTH(ablen, action->len) > + > + av++; > + > + /* [rule N] -- Rule number optional */ > + if (av[0] && isdigit(**av)) { > + rule->rulenum = atoi(*av); > + av++; > + } > + > + /* [set N] -- set number (0..RESVD_SET), optional */ > + if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { > + int set = strtoul(av[1], NULL, 10); > + if (set < 0 || set > RESVD_SET) > + errx(EX_DATAERR, "illegal set %s", av[1]); > + rule->set = set; > + tstate->set = set; > + av += 2; > + } > + > + /* [prob D] -- match probability, optional */ > + if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { > + match_prob = strtod(av[1], NULL); > + > + if (match_prob <= 0 || match_prob > 1) > + errx(EX_DATAERR, "illegal match prob. 
%s", av[1]); > + av += 2; > + } > + > + /* action -- mandatory */ > + NEED1("missing action"); > + i = match_token(rule_actions, *av); > + av++; > + action->len = 1; /* default */ > + CHECK_ACTLEN; > + switch(i) { > + case TOK_CHECKSTATE: > + have_state = action; > + action->opcode = O_CHECK_STATE; > + break; > + > + case TOK_ACCEPT: > + action->opcode = O_ACCEPT; > + break; > + > + case TOK_DENY: > + action->opcode = O_DENY; > + action->arg1 = 0; > + break; > + > + case TOK_REJECT: > + action->opcode = O_REJECT; > + action->arg1 = ICMP_UNREACH_HOST; > + break; > + > + case TOK_RESET: > + action->opcode = O_REJECT; > + action->arg1 = ICMP_REJECT_RST; > + break; > + > + case TOK_RESET6: > + action->opcode = O_UNREACH6; > + action->arg1 = ICMP6_UNREACH_RST; > + break; > + > + case TOK_UNREACH: > + action->opcode = O_REJECT; > + NEED1("missing reject code"); > + fill_reject_code(&action->arg1, *av); > + av++; > + break; > + > + case TOK_UNREACH6: > + action->opcode = O_UNREACH6; > + NEED1("missing unreach code"); > + fill_unreach6_code(&action->arg1, *av); > + av++; > + break; > + > + case TOK_COUNT: > + action->opcode = O_COUNT; > + break; > + > + case TOK_NAT: > + action->opcode = O_NAT; > + action->len = F_INSN_SIZE(ipfw_insn_nat); > + CHECK_ACTLEN; > + if (_substrcmp(*av, "global") == 0) { > + action->arg1 = 0; > + av++; > + break; > + } else > + goto chkarg; > + > + case TOK_QUEUE: > + action->opcode = O_QUEUE; > + goto chkarg; > + case TOK_PIPE: > + action->opcode = O_PIPE; > + goto chkarg; > + case TOK_SKIPTO: > + action->opcode = O_SKIPTO; > + goto chkarg; > + case TOK_NETGRAPH: > + action->opcode = O_NETGRAPH; > + goto chkarg; > + case TOK_NGTEE: > + action->opcode = O_NGTEE; > + goto chkarg; > + case TOK_DIVERT: > + action->opcode = O_DIVERT; > + goto chkarg; > + case TOK_TEE: > + action->opcode = O_TEE; > + goto chkarg; > + case TOK_CALL: > + action->opcode = O_CALLRETURN; > +chkarg: > + if (!av[0]) > + errx(EX_USAGE, "missing argument for %s", *(av - 1)); 
> + if (isdigit(**av)) { > + action->arg1 = strtoul(*av, NULL, 10); > + if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG) > + errx(EX_DATAERR, "illegal argument for %s", > + *(av - 1)); > + } else if (_substrcmp(*av, "tablearg") == 0) { > + action->arg1 = IP_FW_TARG; > + } else if (i == TOK_DIVERT || i == TOK_TEE) { > + struct servent *s; > + setservent(1); > + s = getservbyname(av[0], "divert"); > + if (s != NULL) > + action->arg1 = ntohs(s->s_port); > + else > + errx(EX_DATAERR, "illegal divert/tee port"); > + } else > + errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); > + av++; > + break; > + > + case TOK_FORWARD: { > + /* > + * Locate the address-port separator (':' or ','). > + * Could be one of the following: > + * hostname:port > + * IPv4 a.b.c.d,port > + * IPv4 a.b.c.d:port > + * IPv6 w:x:y::z,port > + * The ':' can only be used with hostname and IPv4 address. > + * XXX-BZ Should we also support [w:x:y::z]:port? > + */ > + struct sockaddr_storage result; > + struct addrinfo *res; > + char *s, *end; > + int family; > + u_short port_number; > + > + NEED1("missing forward address[:port]"); > + > + /* > + * locate the address-port separator (':' or ',') > + */ > + s = strchr(*av, ','); > + if (s == NULL) { > + /* Distinguish between IPv4:port and IPv6 cases. */ > + s = strchr(*av, ':'); > + if (s && strchr(s+1, ':')) > + s = NULL; /* no port */ > + } > + > + port_number = 0; > + if (s != NULL) { > + /* Terminate host portion and set s to start of port. */ > + *(s++) = '\0'; > + i = strtoport(s, &end, 0 /* base */, 0 /* proto */); > + if (s == end) > + errx(EX_DATAERR, > + "illegal forwarding port ``%s''", s); > + port_number = (u_short)i; > + } > + > + if (_substrcmp(*av, "tablearg") == 0) { > + family = PF_INET; > + ((struct sockaddr_in*)&result)->sin_addr.s_addr = > + INADDR_ANY; > + } else { > + /* > + * Resolve the host name or address to a family and a > + * network representation of the address. 
> + */ > + if (getaddrinfo(*av, NULL, NULL, &res)) > + errx(EX_DATAERR, NULL); > + /* Just use the first host in the answer. */ > + family = res->ai_family; > + memcpy(&result, res->ai_addr, res->ai_addrlen); > + freeaddrinfo(res); > + } > + > + if (family == PF_INET) { > + ipfw_insn_sa *p = (ipfw_insn_sa *)action; > + > + action->opcode = O_FORWARD_IP; > + action->len = F_INSN_SIZE(ipfw_insn_sa); > + CHECK_ACTLEN; > + > + /* > + * In the kernel we assume AF_INET and use only > + * sin_port and sin_addr. Remember to set sin_len as > + * the routing code seems to use it too. > + */ > + p->sa.sin_len = sizeof(struct sockaddr_in); > + p->sa.sin_family = AF_INET; > + p->sa.sin_port = port_number; > + p->sa.sin_addr.s_addr = > + ((struct sockaddr_in *)&result)->sin_addr.s_addr; > + } else if (family == PF_INET6) { > + ipfw_insn_sa6 *p = (ipfw_insn_sa6 *)action; > + > + action->opcode = O_FORWARD_IP6; > + action->len = F_INSN_SIZE(ipfw_insn_sa6); > + CHECK_ACTLEN; > + > + p->sa.sin6_len = sizeof(struct sockaddr_in6); > + p->sa.sin6_family = AF_INET6; > + p->sa.sin6_port = port_number; > + p->sa.sin6_flowinfo = 0; > + p->sa.sin6_scope_id = 0; > + /* No table support for v6 yet. */ > + bcopy(&((struct sockaddr_in6*)&result)->sin6_addr, > + &p->sa.sin6_addr, sizeof(p->sa.sin6_addr)); > + } else { > + errx(EX_DATAERR, "Invalid address family in forward action"); > + } > + av++; > + break; > + } > + case TOK_COMMENT: > + /* pretend it is a 'count' rule followed by the comment */ > + action->opcode = O_COUNT; > + av--; /* go back... 
*/ > + break; > + > + case TOK_SETFIB: > + { > + int numfibs; > + size_t intsize = sizeof(int); > + > + action->opcode = O_SETFIB; > + NEED1("missing fib number"); > + if (_substrcmp(*av, "tablearg") == 0) { > + action->arg1 = IP_FW_TARG; > + } else { > + action->arg1 = strtoul(*av, NULL, 10); > + if (sysctlbyname("net.fibs", &numfibs, &intsize, > + NULL, 0) == -1) > + errx(EX_DATAERR, "fibs not suported.\n"); > + if (action->arg1 >= numfibs) /* Temporary */ > + errx(EX_DATAERR, "fib too large.\n"); > + /* Add high-order bit to fib to make room for tablearg*/ > + action->arg1 |= 0x8000; > + } > + av++; > + break; > + } > + > + case TOK_SETDSCP: > + { > + int code; > + > + action->opcode = O_SETDSCP; > + NEED1("missing DSCP code"); > + if (_substrcmp(*av, "tablearg") == 0) { > + action->arg1 = IP_FW_TARG; > + } else if (isalpha(*av[0])) { > + if ((code = match_token(f_ipdscp, *av)) == -1) > + errx(EX_DATAERR, "Unknown DSCP code"); > + action->arg1 = code; > + } else > + action->arg1 = strtoul(*av, NULL, 10); > + /* Add high-order bit to DSCP to make room for tablearg */ > + if (action->arg1 != IP_FW_TARG) > + action->arg1 |= 0x8000; > + av++; > + break; > + } > + > + case TOK_REASS: > + action->opcode = O_REASS; > + break; > + > + case TOK_RETURN: > + fill_cmd(action, O_CALLRETURN, F_NOT, 0); > + break; > + > + default: > + errx(EX_DATAERR, "invalid action %s\n", av[-1]); > + } > + action = next_cmd(action, &ablen); > + > + /* > + * [altq queuename] -- altq tag, optional > + * [log [logamount N]] -- log, optional > + * > + * If they exist, it go first in the cmdbuf, but then it is > + * skipped in the copy section to the end of the buffer. 
> + */ > + while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { > + av++; > + switch (i) { > + case TOK_LOG: > + { > + ipfw_insn_log *c = (ipfw_insn_log *)cmd; > + int l; > + > + if (have_log) > + errx(EX_DATAERR, > + "log cannot be specified more than once"); > + have_log = (ipfw_insn *)c; > + cmd->len = F_INSN_SIZE(ipfw_insn_log); > + CHECK_CMDLEN; > + cmd->opcode = O_LOG; > + if (av[0] && _substrcmp(*av, "logamount") == 0) { > + av++; > + NEED1("logamount requires argument"); > + l = atoi(*av); > + if (l < 0) > + errx(EX_DATAERR, > + "logamount must be positive"); > + c->max_log = l; > + av++; > + } else { > + len = sizeof(c->max_log); > + if (sysctlbyname("net.inet.ip.fw.verbose_limit", > + &c->max_log, &len, NULL, 0) == -1) { > + if (co.test_only) { > + c->max_log = 0; > + break; > + } > + errx(1, "sysctlbyname(\"%s\")", > + "net.inet.ip.fw.verbose_limit"); > + } > + } > + } > + break; > + > +#ifndef NO_ALTQ > + case TOK_ALTQ: > + { > + ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; > + > + NEED1("missing altq queue name"); > + if (have_altq) > + errx(EX_DATAERR, > + "altq cannot be specified more than once"); > + have_altq = (ipfw_insn *)a; > + cmd->len = F_INSN_SIZE(ipfw_insn_altq); > + CHECK_CMDLEN; > + cmd->opcode = O_ALTQ; > + a->qid = altq_name_to_qid(*av); > + av++; > + } > + break; > +#endif > + > + case TOK_TAG: > + case TOK_UNTAG: { > + uint16_t tag; > + > + if (have_tag) > + errx(EX_USAGE, "tag and untag cannot be " > + "specified more than once"); > + GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, > + rule_action_params); > + have_tag = cmd; > + fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 
0: F_NOT, tag); > + av++; > + break; > + } > + > + default: > + abort(); > + } > + cmd = next_cmd(cmd, &cblen); > + } > + > + if (have_state) /* must be a check-state, we are done */ > + goto done; > + > +#define OR_START(target) \ > + if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ > + if (open_par) \ > + errx(EX_USAGE, "nested \"(\" not allowed\n"); \ > + prev = NULL; \ > + open_par = 1; \ > + if ( (av[0])[1] == '\0') { \ > + av++; \ > + } else \ > + (*av)++; \ > + } \ > + target: \ > + > + > +#define CLOSE_PAR \ > + if (open_par) { \ > + if (av[0] && ( \ > + strcmp(*av, ")") == 0 || \ > + strcmp(*av, "}") == 0)) { \ > + prev = NULL; \ > + open_par = 0; \ > + av++; \ > + } else \ > + errx(EX_USAGE, "missing \")\"\n"); \ > + } > + > +#define NOT_BLOCK \ > + if (av[0] && _substrcmp(*av, "not") == 0) { \ > + if (cmd->len & F_NOT) \ > + errx(EX_USAGE, "double \"not\" not allowed\n"); \ > + cmd->len |= F_NOT; \ > + av++; \ > + } > + > +#define OR_BLOCK(target) \ > + if (av[0] && _substrcmp(*av, "or") == 0) { \ > + if (prev == NULL || open_par == 0) \ > + errx(EX_DATAERR, "invalid OR block"); \ > + prev->len |= F_OR; \ > + av++; \ > + goto target; \ > + } \ > + CLOSE_PAR; > + > + first_cmd = cmd; > + > +#if 0 > + /* > + * MAC addresses, optional. > + * If we have this, we skip the part "proto from src to dst" > + * and jump straight to the option parsing. 
> + */ > + NOT_BLOCK; > + NEED1("missing protocol"); > + if (_substrcmp(*av, "MAC") == 0 || > + _substrcmp(*av, "mac") == 0) { > + av++; /* the "MAC" keyword */ > + add_mac(cmd, av); /* exits in case of errors */ > + cmd = next_cmd(cmd); > + av += 2; /* dst-mac and src-mac */ > + NOT_BLOCK; > + NEED1("missing mac type"); > + if (add_mactype(cmd, av[0])) > + cmd = next_cmd(cmd); > + av++; /* any or mac-type */ > + goto read_options; > + } > +#endif > + > + /* > + * protocol, mandatory > + */ > + OR_START(get_proto); > + NOT_BLOCK; > + NEED1("missing protocol"); > + if (add_proto_compat(cmd, *av, &proto)) { > + av++; > + if (F_LEN(cmd) != 0) { > + prev = cmd; > + cmd = next_cmd(cmd, &cblen); > + } > + } else if (first_cmd != cmd) { > + errx(EX_DATAERR, "invalid protocol ``%s''", *av); > + } else > + goto read_options; > + OR_BLOCK(get_proto); > + > + /* > + * "from", mandatory > + */ > + if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) > + errx(EX_USAGE, "missing ``from''"); > + av++; > + > + /* > + * source IP, mandatory > + */ > + OR_START(source_ip); > + NOT_BLOCK; /* optional "not" */ > + NEED1("missing source address"); > + if (add_src(cmd, *av, proto, cblen, tstate)) { > + av++; > + if (F_LEN(cmd) != 0) { /* ! 
any */ > + prev = cmd; > + cmd = next_cmd(cmd, &cblen); > + } > + } else > + errx(EX_USAGE, "bad source address %s", *av); > + OR_BLOCK(source_ip); > + > + /* > + * source ports, optional > + */ > + NOT_BLOCK; /* optional "not" */ > + if ( av[0] != NULL ) { > + if (_substrcmp(*av, "any") == 0 || > + add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { > + av++; > + if (F_LEN(cmd) != 0) > + cmd = next_cmd(cmd, &cblen); > + } > + } > + > + /* > + * "to", mandatory > + */ > + if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) > + errx(EX_USAGE, "missing ``to''"); > + av++; > + > + /* > + * destination, mandatory > + */ > + OR_START(dest_ip); > + NOT_BLOCK; /* optional "not" */ > + NEED1("missing dst address"); > + if (add_dst(cmd, *av, proto, cblen, tstate)) { > + av++; > + if (F_LEN(cmd) != 0) { /* ! any */ > + prev = cmd; > + cmd = next_cmd(cmd, &cblen); > + } > + } else > + errx( EX_USAGE, "bad destination address %s", *av); > + OR_BLOCK(dest_ip); > + > + /* > + * dest. ports, optional > + */ > + NOT_BLOCK; /* optional "not" */ > + if (av[0]) { > + if (_substrcmp(*av, "any") == 0 || > + add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { > + av++; > + if (F_LEN(cmd) != 0) > + cmd = next_cmd(cmd, &cblen); > + } > + } > + > +read_options: > + if (av[0] && first_cmd == cmd) { > + /* > + * nothing specified so far, store in the rule to ease > + * printout later. 
> + */ > + rule->flags |= IPFW_RULE_NOOPT; > + } > + prev = NULL; > + while ( av[0] != NULL ) { > + char *s; > + ipfw_insn_u32 *cmd32; /* alias for cmd */ > + > + s = *av; > + cmd32 = (ipfw_insn_u32 *)cmd; > + > + if (*s == '!') { /* alternate syntax for NOT */ > + if (cmd->len & F_NOT) > + errx(EX_USAGE, "double \"not\" not allowed\n"); > + cmd->len = F_NOT; > + s++; > + } > + i = match_token(rule_options, s); > + av++; > + switch(i) { > + case TOK_NOT: > + if (cmd->len & F_NOT) > + errx(EX_USAGE, "double \"not\" not allowed\n"); > + cmd->len = F_NOT; > + break; > + > + case TOK_OR: > + if (open_par == 0 || prev == NULL) > + errx(EX_USAGE, "invalid \"or\" block\n"); > + prev->len |= F_OR; > + break; > + > + case TOK_STARTBRACE: > + if (open_par) > + errx(EX_USAGE, "+nested \"(\" not allowed\n"); > + open_par = 1; > + break; > + > + case TOK_ENDBRACE: > + if (!open_par) > + errx(EX_USAGE, "+missing \")\"\n"); > + open_par = 0; > + prev = NULL; > + break; > + > + case TOK_IN: > + fill_cmd(cmd, O_IN, 0, 0); > + break; > + > + case TOK_OUT: > + cmd->len ^= F_NOT; /* toggle F_NOT */ > + fill_cmd(cmd, O_IN, 0, 0); > + break; > + > + case TOK_DIVERTED: > + fill_cmd(cmd, O_DIVERTED, 0, 3); > + break; > + > + case TOK_DIVERTEDLOOPBACK: > + fill_cmd(cmd, O_DIVERTED, 0, 1); > + break; > + > + case TOK_DIVERTEDOUTPUT: > + fill_cmd(cmd, O_DIVERTED, 0, 2); > + break; > + > + case TOK_FRAG: > + fill_cmd(cmd, O_FRAG, 0, 0); > + break; > + > + case TOK_LAYER2: > + fill_cmd(cmd, O_LAYER2, 0, 0); > + break; > + > + case TOK_XMIT: > + case TOK_RECV: > + case TOK_VIA: > + NEED1("recv, xmit, via require interface name" > + " or address"); > + fill_iface((ipfw_insn_if *)cmd, av[0], cblen, tstate); > + av++; > + if (F_LEN(cmd) == 0) /* not a valid address */ > + break; > + if (i == TOK_XMIT) > + cmd->opcode = O_XMIT; > + else if (i == TOK_RECV) > + cmd->opcode = O_RECV; > + else if (i == TOK_VIA) > + cmd->opcode = O_VIA; > + break; > + > + case TOK_ICMPTYPES: > + NEED1("icmptypes 
requires list of types"); > + fill_icmptypes((ipfw_insn_u32 *)cmd, *av); > + av++; > + break; > + > + case TOK_ICMP6TYPES: > + NEED1("icmptypes requires list of types"); > + fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av, cblen); > + av++; > + break; > + > + case TOK_IPTTL: > + NEED1("ipttl requires TTL"); > + if (strpbrk(*av, "-,")) { > + if (!add_ports(cmd, *av, 0, O_IPTTL, cblen)) > + errx(EX_DATAERR, "invalid ipttl %s", *av); > + } else > + fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_IPID: > + NEED1("ipid requires id"); > + if (strpbrk(*av, "-,")) { > + if (!add_ports(cmd, *av, 0, O_IPID, cblen)) > + errx(EX_DATAERR, "invalid ipid %s", *av); > + } else > + fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_IPLEN: > + NEED1("iplen requires length"); > + if (strpbrk(*av, "-,")) { > + if (!add_ports(cmd, *av, 0, O_IPLEN, cblen)) > + errx(EX_DATAERR, "invalid ip len %s", *av); > + } else > + fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_IPVER: > + NEED1("ipver requires version"); > + fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_IPPRECEDENCE: > + NEED1("ipprecedence requires value"); > + fill_cmd(cmd, O_IPPRECEDENCE, 0, > + (strtoul(*av, NULL, 0) & 7) << 5); > + av++; > + break; > + > + case TOK_DSCP: > + NEED1("missing DSCP code"); > + fill_dscp(cmd, *av, cblen); > + av++; > + break; > + > + case TOK_IPOPTS: > + NEED1("missing argument for ipoptions"); > + fill_flags_cmd(cmd, O_IPOPT, f_ipopts, *av); > + av++; > + break; > + > + case TOK_IPTOS: > + NEED1("missing argument for iptos"); > + fill_flags_cmd(cmd, O_IPTOS, f_iptos, *av); > + av++; > + break; > + > + case TOK_UID: > + NEED1("uid requires argument"); > + { > + char *end; > + uid_t uid; > + struct passwd *pwd; > + > + cmd->opcode = O_UID; > + uid = strtoul(*av, &end, 0); > + pwd = (*end == '\0') ? 
getpwuid(uid) : getpwnam(*av); > + if (pwd == NULL) > + errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); > + cmd32->d[0] = pwd->pw_uid; > + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); > + av++; > + } > + break; > + > + case TOK_GID: > + NEED1("gid requires argument"); > + { > + char *end; > + gid_t gid; > + struct group *grp; > + > + cmd->opcode = O_GID; > + gid = strtoul(*av, &end, 0); > + grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av); > + if (grp == NULL) > + errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); > + cmd32->d[0] = grp->gr_gid; > + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); > + av++; > + } > + break; > + > + case TOK_JAIL: > + NEED1("jail requires argument"); > + { > + char *end; > + int jid; > + > + cmd->opcode = O_JAIL; > + jid = (int)strtol(*av, &end, 0); > + if (jid < 0 || *end != '\0') > + errx(EX_DATAERR, "jail requires prison ID"); > + cmd32->d[0] = (uint32_t)jid; > + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); > + av++; > + } > + break; > + > + case TOK_ESTAB: > + fill_cmd(cmd, O_ESTAB, 0, 0); > + break; > + > + case TOK_SETUP: > + fill_cmd(cmd, O_TCPFLAGS, 0, > + (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); > + break; > + > + case TOK_TCPDATALEN: > + NEED1("tcpdatalen requires length"); > + if (strpbrk(*av, "-,")) { > + if (!add_ports(cmd, *av, 0, O_TCPDATALEN, cblen)) > + errx(EX_DATAERR, "invalid tcpdata len %s", *av); > + } else > + fill_cmd(cmd, O_TCPDATALEN, 0, > + strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_TCPOPTS: > + NEED1("missing argument for tcpoptions"); > + fill_flags_cmd(cmd, O_TCPOPTS, f_tcpopts, *av); > + av++; > + break; > + > + case TOK_TCPSEQ: > + case TOK_TCPACK: > + NEED1("tcpseq/tcpack requires argument"); > + cmd->len = F_INSN_SIZE(ipfw_insn_u32); > + cmd->opcode = (i == TOK_TCPSEQ) ? 
O_TCPSEQ : O_TCPACK; > + cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_TCPWIN: > + NEED1("tcpwin requires length"); > + if (strpbrk(*av, "-,")) { > + if (!add_ports(cmd, *av, 0, O_TCPWIN, cblen)) > + errx(EX_DATAERR, "invalid tcpwin len %s", *av); > + } else > + fill_cmd(cmd, O_TCPWIN, 0, > + strtoul(*av, NULL, 0)); > + av++; > + break; > + > + case TOK_TCPFLAGS: > + NEED1("missing argument for tcpflags"); > + cmd->opcode = O_TCPFLAGS; > + fill_flags_cmd(cmd, O_TCPFLAGS, f_tcpflags, *av); > + av++; > + break; > + > + case TOK_KEEPSTATE: > + if (open_par) > + errx(EX_USAGE, "keep-state cannot be part " > + "of an or block"); > + if (have_state) > + errx(EX_USAGE, "only one of keep-state " > + "and limit is allowed"); > + have_state = cmd; > + fill_cmd(cmd, O_KEEP_STATE, 0, 0); > + break; > + > + case TOK_LIMIT: { > + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; > + int val; > + > + if (open_par) > + errx(EX_USAGE, > + "limit cannot be part of an or block"); > + if (have_state) > + errx(EX_USAGE, "only one of keep-state and " > + "limit is allowed"); > + have_state = cmd; > + > + cmd->len = F_INSN_SIZE(ipfw_insn_limit); > + CHECK_CMDLEN; > + cmd->opcode = O_LIMIT; > + c->limit_mask = c->conn_limit = 0; > + > + while ( av[0] != NULL ) { > + if ((val = match_token(limit_masks, *av)) <= 0) > + break; > + c->limit_mask |= val; > + av++; > + } > + > + if (c->limit_mask == 0) > + errx(EX_USAGE, "limit: missing limit mask"); > + > + GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, > + TOK_LIMIT, rule_options); > + > + av++; > + break; > + } > + > + case TOK_PROTO: > + NEED1("missing protocol"); > + if (add_proto(cmd, *av, &proto)) { > + av++; > + } else > + errx(EX_DATAERR, "invalid protocol ``%s''", > + *av); > + break; > + > + case TOK_SRCIP: > + NEED1("missing source IP"); > + if (add_srcip(cmd, *av, cblen, tstate)) { > + av++; > + } > + break; > + > + case TOK_DSTIP: > + NEED1("missing destination IP"); > + if (add_dstip(cmd, 
*av, cblen, tstate)) { > + av++; > + } > + break; > + > + case TOK_SRCIP6: > + NEED1("missing source IP6"); > + if (add_srcip6(cmd, *av, cblen)) { > + av++; > + } > + break; > + > + case TOK_DSTIP6: > + NEED1("missing destination IP6"); > + if (add_dstip6(cmd, *av, cblen)) { > + av++; > + } > + break; > + > + case TOK_SRCPORT: > + NEED1("missing source port"); > + if (_substrcmp(*av, "any") == 0 || > + add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { > + av++; > + } else > + errx(EX_DATAERR, "invalid source port %s", *av); > + break; > + > + case TOK_DSTPORT: > + NEED1("missing destination port"); > + if (_substrcmp(*av, "any") == 0 || > + add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { > + av++; > + } else > + errx(EX_DATAERR, "invalid destination port %s", > + *av); > + break; > + > + case TOK_MAC: > + if (add_mac(cmd, av, cblen)) > + av += 2; > + break; > + > + case TOK_MACTYPE: > + NEED1("missing mac type"); > + if (!add_mactype(cmd, *av, cblen)) > + errx(EX_DATAERR, "invalid mac type %s", *av); > + av++; > + break; > + > + case TOK_VERREVPATH: > + fill_cmd(cmd, O_VERREVPATH, 0, 0); > + break; > + > + case TOK_VERSRCREACH: > + fill_cmd(cmd, O_VERSRCREACH, 0, 0); > + break; > + > + case TOK_ANTISPOOF: > + fill_cmd(cmd, O_ANTISPOOF, 0, 0); > + break; > + > + case TOK_IPSEC: > + fill_cmd(cmd, O_IPSEC, 0, 0); > + break; > + > + case TOK_IPV6: > + fill_cmd(cmd, O_IP6, 0, 0); > + break; > + > + case TOK_IPV4: > + fill_cmd(cmd, O_IP4, 0, 0); > + break; > + > + case TOK_EXT6HDR: > + fill_ext6hdr( cmd, *av ); > + av++; > + break; > + > + case TOK_FLOWID: > + if (proto != IPPROTO_IPV6 ) > + errx( EX_USAGE, "flow-id filter is active " > + "only for ipv6 protocol\n"); > + fill_flow6( (ipfw_insn_u32 *) cmd, *av, cblen); > + av++; > + break; > + > + case TOK_COMMENT: > + fill_comment(cmd, av, cblen); > + av[0]=NULL; > + break; > + > + case TOK_TAGGED: > + if (av[0] && strpbrk(*av, "-,")) { > + if (!add_ports(cmd, *av, 0, O_TAGGED, cblen)) > + errx(EX_DATAERR, "tagged: 
invalid tag" > + " list: %s", *av); > + } > + else { > + uint16_t tag; > + > + GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, > + TOK_TAGGED, rule_options); > + fill_cmd(cmd, O_TAGGED, 0, tag); > + } > + av++; > + break; > + > + case TOK_FIB: > + NEED1("fib requires fib number"); > + fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); > + av++; > + break; > + case TOK_SOCKARG: > + fill_cmd(cmd, O_SOCKARG, 0, 0); > + break; > + > + case TOK_LOOKUP: { > + ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; > + int j; > + > + if (!av[0] || !av[1]) > + errx(EX_USAGE, "format: lookup argument tablenum"); > + cmd->opcode = O_IP_DST_LOOKUP; > + cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; > + i = match_token(rule_options, *av); > + for (j = 0; lookup_key[j] >= 0 ; j++) { > + if (i == lookup_key[j]) > + break; > + } > + if (lookup_key[j] <= 0) > + errx(EX_USAGE, "format: cannot lookup on %s", *av); > + __PAST_END(c->d, 1) = j; // i converted to option > + av++; > + > + if ((j = pack_table(tstate, *av)) == 0) > + errx(EX_DATAERR, "Invalid table name: %s", *av); > + > + cmd->arg1 = j; > + av++; > + } > + break; > + case TOK_FLOW: > + NEED1("missing table name"); > + if (strncmp(*av, "table(", 6) != 0) > + errx(EX_DATAERR, > + "enclose table name into \"table()\""); > + fill_table(cmd, *av, O_IP_FLOW_LOOKUP, tstate); > + av++; > + break; > + > + default: > + errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); > + } > + if (F_LEN(cmd) > 0) { /* prepare to advance */ > + prev = cmd; > + cmd = next_cmd(cmd, &cblen); > + } > + } > + > +done: > + /* > + * Now copy stuff into the rule. > + * If we have a keep-state option, the first instruction > + * must be a PROBE_STATE (which is generated here). > + * If we have a LOG option, it was stored as the first command, > + * and now must be moved to the top of the action part. > + */ > + dst = (ipfw_insn *)rule->cmd; > + > + /* > + * First thing to write into the command stream is the match probability. 
> + */ > + if (match_prob != 1) { /* 1 means always match */ > + dst->opcode = O_PROB; > + dst->len = 2; > + *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); > + dst += dst->len; > + } > + > + /* > + * generate O_PROBE_STATE if necessary > + */ > + if (have_state && have_state->opcode != O_CHECK_STATE) { > + fill_cmd(dst, O_PROBE_STATE, 0, 0); > + dst = next_cmd(dst, &rblen); > + } > + > + /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */ > + for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { > + i = F_LEN(src); > + CHECK_RBUFLEN(i); > + > + switch (src->opcode) { > + case O_LOG: > + case O_KEEP_STATE: > + case O_LIMIT: > + case O_ALTQ: > + case O_TAG: > + break; > + default: > + bcopy(src, dst, i * sizeof(uint32_t)); > + dst += i; > + } > + } > + > + /* > + * put back the have_state command as last opcode > + */ > + if (have_state && have_state->opcode != O_CHECK_STATE) { > + i = F_LEN(have_state); > + CHECK_RBUFLEN(i); > + bcopy(have_state, dst, i * sizeof(uint32_t)); > + dst += i; > + } > + /* > + * start action section > + */ > + rule->act_ofs = dst - rule->cmd; > + > + /* put back O_LOG, O_ALTQ, O_TAG if necessary */ > + if (have_log) { > + i = F_LEN(have_log); > + CHECK_RBUFLEN(i); > + bcopy(have_log, dst, i * sizeof(uint32_t)); > + dst += i; > + } > + if (have_altq) { > + i = F_LEN(have_altq); > + CHECK_RBUFLEN(i); > + bcopy(have_altq, dst, i * sizeof(uint32_t)); > + dst += i; > + } > + if (have_tag) { > + i = F_LEN(have_tag); > + CHECK_RBUFLEN(i); > + bcopy(have_tag, dst, i * sizeof(uint32_t)); > + dst += i; > + } > + > + /* > + * copy all other actions > + */ > + for (src = (ipfw_insn *)actbuf; src != action; src += i) { > + i = F_LEN(src); > + CHECK_RBUFLEN(i); > + bcopy(src, dst, i * sizeof(uint32_t)); > + dst += i; > + } > + > + rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd); > + *rbufsize = (char *)dst - (char *)rule; > +} > + > +/* > + * Adds one or more rules to ipfw chain. 
> + * Data layout: > + * Request: > + * [ > + * ip_fw3_opheader > + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) > + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] (*2) (*3) > + * ] > + * Reply: > + * [ > + * ip_fw3_opheader > + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) > + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] > + * ] > + * > + * Rules in reply are modified to store their actual ruleset number. > + * > + * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending > + * accoring to their idx field and there has to be no duplicates. > + * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. > + * (*3) Each ip_fw structure needs to be aligned to u64 boundary. > + */ > +void > +ipfw_add(char *av[]) > +{ > + uint32_t rulebuf[1024]; > + int rbufsize, default_off, tlen, rlen; > + size_t sz; > + struct tidx ts; > + struct ip_fw_rule *rule; > + caddr_t tbuf; > + ip_fw3_opheader *op3; > + ipfw_obj_ctlv *ctlv, *tstate; > + > + rbufsize = sizeof(rulebuf); > + memset(rulebuf, 0, rbufsize); > + memset(&ts, 0, sizeof(ts)); > + > + /* Optimize case with no tables */ > + default_off = sizeof(ipfw_obj_ctlv) + sizeof(ip_fw3_opheader); > + op3 = (ip_fw3_opheader *)rulebuf; > + ctlv = (ipfw_obj_ctlv *)(op3 + 1); > + rule = (struct ip_fw_rule *)(ctlv + 1); > + rbufsize -= default_off; > + > + compile_rule(av, (uint32_t *)rule, &rbufsize, &ts); > + /* Align rule size to u64 boundary */ > + rlen = roundup2(rbufsize, sizeof(uint64_t)); > + > + tbuf = NULL; > + sz = 0; > + tstate = NULL; > + if (ts.count != 0) { > + /* Some tables. 
We have to alloc more data */ > + tlen = ts.count * sizeof(ipfw_obj_ntlv); > + sz = default_off + sizeof(ipfw_obj_ctlv) + tlen + rlen; > + > + if ((tbuf = calloc(1, sz)) == NULL) > + err(EX_UNAVAILABLE, "malloc() failed for IP_FW_ADD"); > + op3 = (ip_fw3_opheader *)tbuf; > + /* Tables first */ > + ctlv = (ipfw_obj_ctlv *)(op3 + 1); > + ctlv->head.type = IPFW_TLV_TBLNAME_LIST; > + ctlv->head.length = sizeof(ipfw_obj_ctlv) + tlen; > + ctlv->count = ts.count; > + ctlv->objsize = sizeof(ipfw_obj_ntlv); > + memcpy(ctlv + 1, ts.idx, tlen); > + table_sort_ctlv(ctlv); > + tstate = ctlv; > + /* Rule next */ > + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); > + ctlv->head.type = IPFW_TLV_RULE_LIST; > + ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; > + ctlv->count = 1; > + memcpy(ctlv + 1, rule, rbufsize); > + } else { > + /* Simply add header */ > + sz = rlen + default_off; > + memset(ctlv, 0, sizeof(*ctlv)); > + ctlv->head.type = IPFW_TLV_RULE_LIST; > + ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; > + ctlv->count = 1; > + } > + > + if (do_get3(IP_FW_XADD, op3, &sz) != 0) > + err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_XADD"); > + > + if (!co.do_quiet) { > + struct format_opts sfo; > + struct buf_pr bp; > + memset(&sfo, 0, sizeof(sfo)); > + sfo.tstate = tstate; > + sfo.set_mask = (uint32_t)(-1); > + bp_alloc(&bp, 4096); > + show_static_rule(&co, &sfo, &bp, rule, NULL); > + printf("%s", bp.buf); > + bp_free(&bp); > + } > + > + if (tbuf != NULL) > + free(tbuf); > + > + if (ts.idx != NULL) > + free(ts.idx); > +} > + > +/* > + * clear the counters or the log counters. > + * optname has the following values: > + * 0 (zero both counters and logging) > + * 1 (zero logging only) > + */ > +void > +ipfw_zero(int ac, char *av[], int optname) > +{ > + ipfw_range_tlv rt; > + uint32_t arg; > + int failed = EX_OK; > + char const *errstr; > + char const *name = optname ? "RESETLOG" : "ZERO"; > + > + optname = optname ? 
IP_FW_XRESETLOG : IP_FW_XZERO; > + memset(&rt, 0, sizeof(rt)); > + > + av++; ac--; > + > + if (ac == 0) { > + /* clear all entries */ > + rt.flags = IPFW_RCFLAG_ALL; > + if (do_range_cmd(optname, &rt) < 0) > + err(EX_UNAVAILABLE, "setsockopt(IP_FW_X%s)", name); > + if (!co.do_quiet) > + printf("%s.\n", optname == IP_FW_XZERO ? > + "Accounting cleared":"Logging counts reset"); > + > + return; > + } > + > + while (ac) { > + /* Rule number */ > + if (isdigit(**av)) { > + arg = strtonum(*av, 0, 0xffff, &errstr); > + if (errstr) > + errx(EX_DATAERR, > + "invalid rule number %s\n", *av); > + rt.start_rule = arg; > + rt.end_rule = arg; > + rt.flags |= IPFW_RCFLAG_RANGE; > + if (co.use_set != 0) { > + rt.set = co.use_set - 1; > + rt.flags |= IPFW_RCFLAG_SET; > + } > + if (do_range_cmd(optname, &rt) != 0) { > + warn("rule %u: setsockopt(IP_FW_X%s)", > + arg, name); > + failed = EX_UNAVAILABLE; > + } else if (rt.new_set == 0) { > + printf("Entry %d not found\n", arg); > + failed = EX_UNAVAILABLE; > + } else if (!co.do_quiet) > + printf("Entry %d %s.\n", arg, > + optname == IP_FW_XZERO ? > + "cleared" : "logging count reset"); > + } else { > + errx(EX_USAGE, "invalid rule number ``%s''", *av); > + } > + av++; ac--; > + } > + if (failed != EX_OK) > + exit(failed); > +} > + > +void > +ipfw_flush(int force) > +{ > + ipfw_range_tlv rt; > + > + if (!force && !co.do_quiet) { /* need to ask user */ > + int c; > + > + printf("Are you sure? 
[yn] "); > + fflush(stdout); > + do { > + c = toupper(getc(stdin)); > + while (c != '\n' && getc(stdin) != '\n') > + if (feof(stdin)) > + return; /* and do not flush */ > + } while (c != 'Y' && c != 'N'); > + printf("\n"); > + if (c == 'N') /* user said no */ > + return; > + } > + if (co.do_pipe) { > + dummynet_flush(); > + return; > + } > + /* `ipfw set N flush` - is the same that `ipfw delete set N` */ > + memset(&rt, 0, sizeof(rt)); > + if (co.use_set != 0) { > + rt.set = co.use_set - 1; > + rt.flags = IPFW_RCFLAG_SET; > + } else > + rt.flags = IPFW_RCFLAG_ALL; > + if (do_range_cmd(IP_FW_XDEL, &rt) != 0) > + err(EX_UNAVAILABLE, "setsockopt(IP_FW_XDEL)"); > + if (!co.do_quiet) > + printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules"); > +} > + > +static struct _s_x intcmds[] = { > + { "talist", TOK_TALIST }, > + { "iflist", TOK_IFLIST }, > + { "vlist", TOK_VLIST }, > + { NULL, 0 } > +}; > + > +void > +ipfw_internal_handler(int ac, char *av[]) > +{ > + int tcmd; > + > + ac--; av++; > + NEED1("internal cmd required"); > + > + if ((tcmd = match_token(intcmds, *av)) == -1) > + errx(EX_USAGE, "invalid internal sub-cmd: %s", *av); > + > + switch (tcmd) { > + case TOK_IFLIST: > + ipfw_list_tifaces(); > + break; > + case TOK_TALIST: > + ipfw_list_ta(ac, av); > + break; > + case TOK_VLIST: > + ipfw_list_values(ac, av); > + break; > + } > +} > + > +static int > +ipfw_get_tracked_ifaces(ipfw_obj_lheader **polh) > +{ > + ipfw_obj_lheader req, *olh; > + size_t sz; > + > + memset(&req, 0, sizeof(req)); > + sz = sizeof(req); > + > + if (do_get3(IP_FW_XIFLIST, &req.opheader, &sz) != 0) { > + if (errno != ENOMEM) > + return (errno); > + } > + > + sz = req.size; > + if ((olh = calloc(1, sz)) == NULL) > + return (ENOMEM); > + > + olh->size = sz; > + if (do_get3(IP_FW_XIFLIST, &olh->opheader, &sz) != 0) { > + free(olh); > + return (errno); > + } > + > + *polh = olh; > + return (0); > +} > + > +static int > +ifinfo_cmp(const void *a, const void *b) > +{ > + ipfw_iface_info *ia, 
*ib; > + > + ia = (ipfw_iface_info *)a; > + ib = (ipfw_iface_info *)b; > + > + return (stringnum_cmp(ia->ifname, ib->ifname)); > +} > + > +/* > + * Retrieves the list of tracked interfaces from the kernel, > + * sorts it by interface name and prints one line per interface. > + * Exits via err() if the list cannot be retrieved. > + */ > +static void > +ipfw_list_tifaces() > +{ > + ipfw_obj_lheader *olh = NULL; > + ipfw_iface_info *info; > + int i, error; > + > + if ((error = ipfw_get_tracked_ifaces(&olh)) != 0) > + err(EX_OSERR, "Unable to request ipfw tracked interface list"); > + > + > + qsort(olh + 1, olh->count, olh->objsize, ifinfo_cmp); > + > + info = (ipfw_iface_info *)(olh + 1); > + for (i = 0; i < olh->count; i++) { > + if (info->flags & IPFW_IFFLAG_RESOLVED) > + printf("%s ifindex: %d refcount: %u changes: %u\n", > + info->ifname, info->ifindex, info->refcnt, > + info->gencnt); > + else > + printf("%s ifindex: unresolved refcount: %u changes: %u\n", > + info->ifname, info->refcnt, info->gencnt); > + info = (ipfw_iface_info *)((caddr_t)info + olh->objsize); > + } > + > + free(olh); > +} > + > + > + > + > diff --git a/example/ipfw/ipfw/ipfw2.h b/example/ipfw/ipfw/ipfw2.h > new file mode 100644 > index 0000000..8770534 > --- /dev/null > +++ b/example/ipfw/ipfw/ipfw2.h > @@ -0,0 +1,352 @@ > +/* > + * Copyright (c) 2002-2003 Luigi Rizzo > + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp > + * Copyright (c) 1994 Ugen J.S.Antsilevich > + * > + * Idea and grammar partially left from: > + * Copyright (c) 1993 Daniel Boulet > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind.
> + * > + * NEW command line interface for IP firewall facility > + * > + * $FreeBSD: head/sbin/ipfw/ipfw2.h 272840 2014-10-09 19:32:35Z melifaro $ > + */ > + > +/* > + * Options that can be set on the command line. > + * When reading commands from a file, a subset of the options can also > + * be applied globally by specifying them before the file name. > + * After that, each line can contain its own option that changes > + * the global value. > + * XXX The context is not restored after each line. > + */ > + > +struct cmdline_opts { > + /* boolean options: */ > + int do_value_as_ip; /* show table value as IP */ > + int do_resolv; /* try to resolve all ip to names */ > + int do_time; /* Show time stamps */ > + int do_quiet; /* Be quiet in add and flush */ > + int do_pipe; /* this cmd refers to a pipe/queue/sched */ > + int do_nat; /* this cmd refers to a nat config */ > + int do_dynamic; /* display dynamic rules */ > + int do_expired; /* display expired dynamic rules */ > + int do_compact; /* show rules in compact mode */ > + int do_force; /* do not ask for confirmation */ > + int show_sets; /* display the set each rule belongs to */ > + int test_only; /* only check syntax */ > + int comment_only; /* only print action and comment */ > + int verbose; /* be verbose on some commands */ > + > + /* The options below can have multiple values. */ > + > + int do_sort; /* field to sort results (0 = no) */ > + /* valid fields are 1 and above */ > + > + int use_set; /* work with specified set number */ > + /* 0 means all sets, otherwise apply to set use_set - 1 */ > + > +}; > + > +extern struct cmdline_opts co; > + > +/* > + * _s_x is a structure that stores a string <-> token pairs, used in > + * various places in the parser. Entries are stored in arrays, > + * with an entry with s=NULL as terminator. > + * The search routines are match_token() and match_value(). > + * Often, an element with x=0 contains an error string. 
> + * > + */ > +struct _s_x { > + char const *s; > + int x; > +}; > + > +extern struct _s_x f_ipdscp[]; > + > +enum tokens { > + TOK_NULL=0, > + > + TOK_OR, > + TOK_NOT, > + TOK_STARTBRACE, > + TOK_ENDBRACE, > + > + TOK_ACCEPT, > + TOK_COUNT, > + TOK_PIPE, > + TOK_LINK, > + TOK_QUEUE, > + TOK_FLOWSET, > + TOK_SCHED, > + TOK_DIVERT, > + TOK_TEE, > + TOK_NETGRAPH, > + TOK_NGTEE, > + TOK_FORWARD, > + TOK_SKIPTO, > + TOK_DENY, > + TOK_REJECT, > + TOK_RESET, > + TOK_UNREACH, > + TOK_CHECKSTATE, > + TOK_NAT, > + TOK_REASS, > + TOK_CALL, > + TOK_RETURN, > + > + TOK_ALTQ, > + TOK_LOG, > + TOK_TAG, > + TOK_UNTAG, > + > + TOK_TAGGED, > + TOK_UID, > + TOK_GID, > + TOK_JAIL, > + TOK_IN, > + TOK_LIMIT, > + TOK_KEEPSTATE, > + TOK_LAYER2, > + TOK_OUT, > + TOK_DIVERTED, > + TOK_DIVERTEDLOOPBACK, > + TOK_DIVERTEDOUTPUT, > + TOK_XMIT, > + TOK_RECV, > + TOK_VIA, > + TOK_FRAG, > + TOK_IPOPTS, > + TOK_IPLEN, > + TOK_IPID, > + TOK_IPPRECEDENCE, > + TOK_DSCP, > + TOK_IPTOS, > + TOK_IPTTL, > + TOK_IPVER, > + TOK_ESTAB, > + TOK_SETUP, > + TOK_TCPDATALEN, > + TOK_TCPFLAGS, > + TOK_TCPOPTS, > + TOK_TCPSEQ, > + TOK_TCPACK, > + TOK_TCPWIN, > + TOK_ICMPTYPES, > + TOK_MAC, > + TOK_MACTYPE, > + TOK_VERREVPATH, > + TOK_VERSRCREACH, > + TOK_ANTISPOOF, > + TOK_IPSEC, > + TOK_COMMENT, > + > + TOK_PLR, > + TOK_NOERROR, > + TOK_BUCKETS, > + TOK_DSTIP, > + TOK_SRCIP, > + TOK_DSTPORT, > + TOK_SRCPORT, > + TOK_ALL, > + TOK_MASK, > + TOK_FLOW_MASK, > + TOK_SCHED_MASK, > + TOK_BW, > + TOK_DELAY, > + TOK_PROFILE, > + TOK_BURST, > + TOK_RED, > + TOK_GRED, > + TOK_ECN, > + TOK_DROPTAIL, > + TOK_PROTO, > + /* dummynet tokens */ > + TOK_WEIGHT, > + TOK_LMAX, > + TOK_PRI, > + TOK_TYPE, > + TOK_SLOTSIZE, > + > + TOK_IP, > + TOK_IF, > + TOK_ALOG, > + TOK_DENY_INC, > + TOK_SAME_PORTS, > + TOK_UNREG_ONLY, > + TOK_SKIP_GLOBAL, > + TOK_RESET_ADDR, > + TOK_ALIAS_REV, > + TOK_PROXY_ONLY, > + TOK_REDIR_ADDR, > + TOK_REDIR_PORT, > + TOK_REDIR_PROTO, > + > + TOK_IPV6, > + TOK_FLOWID, > + TOK_ICMP6TYPES, > + TOK_EXT6HDR, > + 
TOK_DSTIP6, > + TOK_SRCIP6, > + > + TOK_IPV4, > + TOK_UNREACH6, > + TOK_RESET6, > + > + TOK_FIB, > + TOK_SETFIB, > + TOK_LOOKUP, > + TOK_SOCKARG, > + TOK_SETDSCP, > + TOK_FLOW, > + TOK_IFLIST, > + /* Table tokens */ > + TOK_CREATE, > + TOK_DESTROY, > + TOK_LIST, > + TOK_INFO, > + TOK_DETAIL, > + TOK_MODIFY, > + TOK_FLUSH, > + TOK_SWAP, > + TOK_ADD, > + TOK_DEL, > + TOK_VALTYPE, > + TOK_ALGO, > + TOK_TALIST, > + TOK_ATOMIC, > + TOK_LOCK, > + TOK_UNLOCK, > + TOK_VLIST, > +}; > + > +/* > + * the following macro returns an error message if we run out of > + * arguments. > + */ > +#define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} > +#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} > + > +struct buf_pr { > + char *buf; /* allocated buffer */ > + char *ptr; /* current pointer */ > + size_t size; /* total buffer size */ > + size_t avail; /* available storage */ > + size_t needed; /* length needed */ > +}; > + > +int pr_u64(struct buf_pr *bp, uint64_t *pd, int width); > +int bp_alloc(struct buf_pr *b, size_t size); > +void bp_free(struct buf_pr *b); > +int bprintf(struct buf_pr *b, char *format, ...); > + > + > +/* memory allocation support */ > +void *safe_calloc(size_t number, size_t size); > +void *safe_realloc(void *ptr, size_t size); > + > +/* string comparison functions used for historical compatibility */ > +int _substrcmp(const char *str1, const char* str2); > +int _substrcmp2(const char *str1, const char* str2, const char* str3); > +int stringnum_cmp(const char *a, const char *b); > + > +/* utility functions */ > +int match_token(struct _s_x *table, char *string); > +int match_token_relaxed(struct _s_x *table, char *string); > +char const *match_value(struct _s_x *p, int value); > +size_t concat_tokens(char *buf, size_t bufsize, struct _s_x *table, > + char *delimiter); > +int fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, > + uint32_t *clear); > +void print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set); > + > 
+struct _ip_fw3_opheader; > +int do_cmd(int optname, void *optval, uintptr_t optlen); > +int do_set3(int optname, struct _ip_fw3_opheader *op3, uintptr_t optlen); > +int do_get3(int optname, struct _ip_fw3_opheader *op3, size_t *optlen); > + > +struct in6_addr; > +void n2mask(struct in6_addr *mask, int n); > +int contigmask(uint8_t *p, int len); > + > +/* > + * Forward declarations to avoid including way too many headers. > + * C does not allow duplicated typedefs, so we use the base struct > + * that the typedef points to. > + * Should the typedefs use a different type, the compiler will > + * still detect the change when compiling the body of the > + * functions involved, so we do not lose error checking. > + */ > +struct _ipfw_insn; > +struct _ipfw_insn_altq; > +struct _ipfw_insn_u32; > +struct _ipfw_insn_ip6; > +struct _ipfw_insn_icmp6; > + > +/* > + * The reserved set number. This is a constant in ip_fw.h > + * but we store it in a variable so other files do not depend > + * on that header just for one constant.
> + */ > +extern int resvd_set_number; > + > +/* first-level command handlers */ > +void ipfw_add(char *av[]); > +void ipfw_show_nat(int ac, char **av); > +void ipfw_config_pipe(int ac, char **av); > +void ipfw_config_nat(int ac, char **av); > +void ipfw_sets_handler(char *av[]); > +void ipfw_table_handler(int ac, char *av[]); > +void ipfw_sysctl_handler(char *av[], int which); > +void ipfw_delete(char *av[]); > +void ipfw_flush(int force); > +void ipfw_zero(int ac, char *av[], int optname); > +void ipfw_list(int ac, char *av[], int show_counters); > +void ipfw_internal_handler(int ac, char *av[]); > + > +#ifdef PF > +/* altq.c */ > +void altq_set_enabled(int enabled); > +u_int32_t altq_name_to_qid(const char *name); > +void print_altq_cmd(struct buf_pr *bp, struct _ipfw_insn_altq *altqptr); > +#else > +#define NO_ALTQ > +#endif > + > +/* dummynet.c */ > +void dummynet_list(int ac, char *av[], int show_counters); > +void dummynet_flush(void); > +int ipfw_delete_pipe(int pipe_or_queue, int n); > + > +/* ipv6.c */ > +void print_unreach6_code(uint16_t code); > +void print_ip6(struct buf_pr *bp, struct _ipfw_insn_ip6 *cmd, char const *s); > +void print_flow6id(struct buf_pr *bp, struct _ipfw_insn_u32 *cmd); > +void print_icmp6types(struct buf_pr *bp, struct _ipfw_insn_u32 *cmd); > +void print_ext6hdr(struct buf_pr *bp, struct _ipfw_insn *cmd ); > + > +struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av, int cblen); > +struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av, int cblen); > + > +void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av, int cblen); > +void fill_unreach6_code(u_short *codep, char *str); > +void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av, int cblen); > +int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); > + > +/* tables.c */ > +struct _ipfw_obj_ctlv; > +char *table_search_ctlv(struct _ipfw_obj_ctlv *ctlv, uint16_t idx); > +void table_sort_ctlv(struct _ipfw_obj_ctlv *ctlv); > +int table_check_name(char *tablename); 
> +void ipfw_list_ta(int ac, char *av[]); > +void ipfw_list_values(int ac, char *av[]); > + > diff --git a/example/ipfw/ipfw/ipv6.c b/example/ipfw/ipfw/ipv6.c > new file mode 100644 > index 0000000..0871a88 > --- /dev/null > +++ b/example/ipfw/ipfw/ipv6.c > @@ -0,0 +1,536 @@ > +/* > + * Copyright (c) 2002-2003 Luigi Rizzo > + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp > + * Copyright (c) 1994 Ugen J.S.Antsilevich > + * > + * Idea and grammar partially left from: > + * Copyright (c) 1993 Daniel Boulet > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. > + * > + * NEW command line interface for IP firewall facility > + * > + * $FreeBSD: head/sbin/ipfw/ipv6.c 270424 2014-08-23 17:37:18Z melifaro $ > + * > + * ipv6 support > + */ > + > +#include <sys/types.h> > +#include <sys/socket.h> > + > +#include "ipfw2.h" > + > +#include <err.h> > +#include <netdb.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > + > +#include <net/if.h> > +#include <netinet/in.h> > +#include <netinet/in_systm.h> > +#include <netinet/ip.h> > +#include <netinet/icmp6.h> > +#include <netinet/ip_fw.h> > +#include <arpa/inet.h> > + > +#define CHECK_LENGTH(v, len) do { \ > + if ((v) < (len)) \ > + errx(EX_DATAERR, "Rule too long"); \ > + } while (0) > + > +static struct _s_x icmp6codes[] = { > + { "no-route", ICMP6_DST_UNREACH_NOROUTE }, > + { "admin-prohib", ICMP6_DST_UNREACH_ADMIN }, > + { "address", ICMP6_DST_UNREACH_ADDR }, > + { "port", ICMP6_DST_UNREACH_NOPORT }, > + { NULL, 0 } > +}; > + > +void > +fill_unreach6_code(u_short *codep, char *str) > +{ > + 
int val; > + char *s; > + > + val = strtoul(str, &s, 0); > + if (s == str || *s != '\0' || val >= 0x100) > + val = match_token(icmp6codes, str); > + if (val < 0) > + errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str); > + *codep = val; > + return; > +} > + > +void > +print_unreach6_code(uint16_t code) > +{ > + char const *s = match_value(icmp6codes, code); > + > + if (s != NULL) > + printf("unreach6 %s", s); > + else > + printf("unreach6 %u", code); > +} > + > +/* > + * Print the ip address contained in a command. > + */ > +void > +print_ip6(struct buf_pr *bp, ipfw_insn_ip6 *cmd, char const *s) > +{ > + struct hostent *he = NULL; > + int len = F_LEN((ipfw_insn *) cmd) - 1; > + struct in6_addr *a = &(cmd->addr6); > + char trad[255]; > + > + bprintf(bp, "%s%s ", cmd->o.len & F_NOT ? " not": "", s); > + > + if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) { > + bprintf(bp, "me6"); > + return; > + } > + if (cmd->o.opcode == O_IP6) { > + bprintf(bp, " ip6"); > + return; > + } > + > + /* > + * len == 4 indicates a single IP, whereas lists of 1 or more > + * addr/mask pairs have len = (2n+1). We convert len to n so we > + * use that to count the number of entries. > + */ > + > + for (len = len / 4; len > 0; len -= 2, a += 2) { > + int mb = /* mask length */ > + (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ? > + 128 : contigmask((uint8_t *)&(a[1]), 128); > + > + if (mb == 128 && co.do_resolv) > + he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6); > + if (he != NULL) /* resolved to name */ > + bprintf(bp, "%s", he->h_name); > + else if (mb == 0) /* any */ > + bprintf(bp, "any"); > + else { /* numeric IP followed by some kind of mask */ > + if (inet_ntop(AF_INET6, a, trad, sizeof( trad ) ) == NULL) > + bprintf(bp, "Error ntop in print_ip6\n"); > + bprintf(bp, "%s", trad ); > + if (mb < 0) /* XXX not really legal... 
*/ > + bprintf(bp, ":%s", > + inet_ntop(AF_INET6, &a[1], trad, sizeof(trad))); > + else if (mb < 128) > + bprintf(bp, "/%d", mb); > + } > + if (len > 2) > + bprintf(bp, ","); > + } > +} > + > +void > +fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av, int cblen) > +{ > + uint8_t type; > + > + CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn_icmp6)); > + > + bzero(cmd, sizeof(*cmd)); > + while (*av) { > + if (*av == ',') > + av++; > + type = strtoul(av, &av, 0); > + if (*av != ',' && *av != '\0') > + errx(EX_DATAERR, "invalid ICMP6 type"); > + /* > + * XXX: shouldn't this be 0xFF? I can't see any reason why > + * we shouldn't be able to filter all possiable values > + * regardless of the ability of the rest of the kernel to do > + * anything useful with them. > + */ > + if (type > ICMP6_MAXTYPE) > + errx(EX_DATAERR, "ICMP6 type out of range"); > + cmd->d[type / 32] |= ( 1 << (type % 32)); > + } > + cmd->o.opcode = O_ICMP6TYPE; > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6); > +} > + > + > +void > +print_icmp6types(struct buf_pr *bp, ipfw_insn_u32 *cmd) > +{ > + int i, j; > + char sep= ' '; > + > + bprintf(bp, " ip6 icmp6types"); > + for (i = 0; i < 7; i++) > + for (j=0; j < 32; ++j) { > + if ( (cmd->d[i] & (1 << (j))) == 0) > + continue; > + bprintf(bp, "%c%d", sep, (i*32 + j)); > + sep = ','; > + } > +} > + > +void > +print_flow6id(struct buf_pr *bp, ipfw_insn_u32 *cmd) > +{ > + uint16_t i, limit = cmd->o.arg1; > + char sep = ','; > + > + bprintf(bp, " flow-id "); > + for( i=0; i < limit; ++i) { > + if (i == limit - 1) > + sep = ' '; > + bprintf(bp, "%d%c", cmd->d[i], sep); > + } > +} > + > +/* structure and define for the extension header in ipv6 */ > +static struct _s_x ext6hdrcodes[] = { > + { "frag", EXT_FRAGMENT }, > + { "hopopt", EXT_HOPOPTS }, > + { "route", EXT_ROUTING }, > + { "dstopt", EXT_DSTOPTS }, > + { "ah", EXT_AH }, > + { "esp", EXT_ESP }, > + { "rthdr0", EXT_RTHDR0 }, > + { "rthdr2", EXT_RTHDR2 }, > + { NULL, 0 } > +}; > + > +/* fills command for the 
extension header filtering */ > +int > +fill_ext6hdr( ipfw_insn *cmd, char *av) > +{ > + int tok; > + char *s = av; > + > + cmd->arg1 = 0; > + > + while(s) { > + av = strsep( &s, ",") ; > + tok = match_token(ext6hdrcodes, av); > + switch (tok) { > + case EXT_FRAGMENT: > + cmd->arg1 |= EXT_FRAGMENT; > + break; > + > + case EXT_HOPOPTS: > + cmd->arg1 |= EXT_HOPOPTS; > + break; > + > + case EXT_ROUTING: > + cmd->arg1 |= EXT_ROUTING; > + break; > + > + case EXT_DSTOPTS: > + cmd->arg1 |= EXT_DSTOPTS; > + break; > + > + case EXT_AH: > + cmd->arg1 |= EXT_AH; > + break; > + > + case EXT_ESP: > + cmd->arg1 |= EXT_ESP; > + break; > + > + case EXT_RTHDR0: > + cmd->arg1 |= EXT_RTHDR0; > + break; > + > + case EXT_RTHDR2: > + cmd->arg1 |= EXT_RTHDR2; > + break; > + > + default: > + errx( EX_DATAERR, "invalid option for ipv6 exten header" ); > + break; > + } > + } > + if (cmd->arg1 == 0 ) > + return 0; > + cmd->opcode = O_EXT_HDR; > + cmd->len |= F_INSN_SIZE( ipfw_insn ); > + return 1; > +} > + > +void > +print_ext6hdr(struct buf_pr *bp, ipfw_insn *cmd ) > +{ > + char sep = ' '; > + > + bprintf(bp, " extension header:"); > + if (cmd->arg1 & EXT_FRAGMENT ) { > + bprintf(bp, "%cfragmentation", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_HOPOPTS ) { > + bprintf(bp, "%chop options", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_ROUTING ) { > + bprintf(bp, "%crouting options", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_RTHDR0 ) { > + bprintf(bp, "%crthdr0", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_RTHDR2 ) { > + bprintf(bp, "%crthdr2", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_DSTOPTS ) { > + bprintf(bp, "%cdestination options", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_AH ) { > + bprintf(bp, "%cauthentication header", sep); > + sep = ','; > + } > + if (cmd->arg1 & EXT_ESP ) { > + bprintf(bp, "%cencapsulated security payload", sep); > + } > +} > + > +/* Try to find ipv6 address by hostname */ > +static int > +lookup_host6 (char *host, 
struct in6_addr *ip6addr) > +{ > + struct hostent *he; > + > + if (!inet_pton(AF_INET6, host, ip6addr)) { > + if ((he = gethostbyname2(host, AF_INET6)) == NULL) > + return(-1); > + memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr)); > + } > + return(0); > +} > + > + > +/* > + * fill the addr and mask fields in the instruction as appropriate from av. > + * Update length as appropriate. > + * The following formats are allowed: > + * any matches any IP6. Actually returns an empty instruction. > + * me returns O_IP6_*_ME > + * > + * 03f1::234:123:0342 single IP6 addres > + * 03f1::234:123:0342/24 address/mask > + * 03f1::234:123:0342/24,03f1::234:123:0343/ List of address > + * > + * Set of address (as in ipv6) not supported because ipv6 address > + * are typically random past the initial prefix. > + * Return 1 on success, 0 on failure. > + */ > +static int > +fill_ip6(ipfw_insn_ip6 *cmd, char *av, int cblen) > +{ > + int len = 0; > + struct in6_addr *d = &(cmd->addr6); > + /* > + * Needed for multiple address. 
> + * Note d[1] points to struct in6_add r mask6 of cmd > + */ > + > + cmd->o.len &= ~F_LEN_MASK; /* zero len */ > + > + if (strcmp(av, "any") == 0) > + return (1); > + > + > + if (strcmp(av, "me") == 0) { /* Set the data for "me" opt*/ > + cmd->o.len |= F_INSN_SIZE(ipfw_insn); > + return (1); > + } > + > + if (strcmp(av, "me6") == 0) { /* Set the data for "me" opt*/ > + cmd->o.len |= F_INSN_SIZE(ipfw_insn); > + return (1); > + } > + > + if (strncmp(av, "table(", 6) == 0) { > + char *p = strchr(av + 6, ','); > + uint32_t *dm = ((ipfw_insn_u32 *)cmd)->d; > + > + if (p) > + *p++ = '\0'; > + cmd->o.opcode = O_IP_DST_LOOKUP; > + cmd->o.arg1 = strtoul(av + 6, NULL, 0); > + if (p) { > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); > + dm[0] = strtoul(p, NULL, 0); > + } else > + cmd->o.len |= F_INSN_SIZE(ipfw_insn); > + return (1); > + } > + > + av = strdup(av); > + while (av) { > + /* > + * After the address we can have '/' indicating a mask, > + * or ',' indicating another address follows. > + */ > + > + char *p; > + int masklen; > + char md = '\0'; > + > + CHECK_LENGTH(cblen, 1 + len + 2 * F_INSN_SIZE(struct in6_addr)); > + > + if ((p = strpbrk(av, "/,")) ) { > + md = *p; /* save the separator */ > + *p = '\0'; /* terminate address string */ > + p++; /* and skip past it */ > + } > + /* now p points to NULL, mask or next entry */ > + > + /* lookup stores address in *d as a side effect */ > + if (lookup_host6(av, d) != 0) { > + /* XXX: failed. Free memory and go */ > + errx(EX_DATAERR, "bad address \"%s\"", av); > + } > + /* next, look at the mask, if any */ > + masklen = (md == '/') ? 
atoi(p) : 128; > + if (masklen > 128 || masklen < 0) > + errx(EX_DATAERR, "bad width \"%s\''", p); > + else > + n2mask(&d[1], masklen); > + > + APPLY_MASK(d, &d[1]) /* mask base address with mask */ > + > + /* find next separator */ > + > + if (md == '/') { /* find separator past the mask */ > + p = strpbrk(p, ","); > + if (p != NULL) > + p++; > + } > + av = p; > + > + /* Check this entry */ > + if (masklen == 0) { > + /* > + * 'any' turns the entire list into a NOP. > + * 'not any' never matches, so it is removed from the > + * list unless it is the only item, in which case we > + * report an error. > + */ > + if (cmd->o.len & F_NOT && av == NULL && len == 0) > + errx(EX_DATAERR, "not any never matches"); > + continue; > + } > + > + /* > + * A single IP can be stored alone > + */ > + if (masklen == 128 && av == NULL && len == 0) { > + len = F_INSN_SIZE(struct in6_addr); > + break; > + } > + > + /* Update length and pointer to arguments */ > + len += F_INSN_SIZE(struct in6_addr)*2; > + d += 2; > + } /* end while */ > + > + /* > + * Total length of the command, remember that 1 is the size of > + * the base command. 
> + */ > + if (len + 1 > F_LEN_MASK) > + errx(EX_DATAERR, "address list too long"); > + cmd->o.len |= len+1; > + free(av); > + return (1); > +} > + > +/* > + * fills command for ipv6 flow-id filtering > + * note that the 20 bit flow number is stored in a array of u_int32_t > + * it's supported lists of flow-id, so in the o.arg1 we store how many > + * additional flow-id we want to filter, the basic is 1 > + */ > +void > +fill_flow6( ipfw_insn_u32 *cmd, char *av, int cblen) > +{ > + u_int32_t type; /* Current flow number */ > + u_int16_t nflow = 0; /* Current flow index */ > + char *s = av; > + cmd->d[0] = 0; /* Initializing the base number*/ > + > + while (s) { > + CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn_u32) + nflow + 1); > + > + av = strsep( &s, ",") ; > + type = strtoul(av, &av, 0); > + if (*av != ',' && *av != '\0') > + errx(EX_DATAERR, "invalid ipv6 flow number %s", av); > + if (type > 0xfffff) > + errx(EX_DATAERR, "flow number out of range %s", av); > + cmd->d[nflow] |= type; > + nflow++; > + } > + if( nflow > 0 ) { > + cmd->o.opcode = O_FLOW6ID; > + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow; > + cmd->o.arg1 = nflow; > + } > + else { > + errx(EX_DATAERR, "invalid ipv6 flow number %s", av); > + } > +} > + > +ipfw_insn * > +add_srcip6(ipfw_insn *cmd, char *av, int cblen) > +{ > + > + fill_ip6((ipfw_insn_ip6 *)cmd, av, cblen); > + if (cmd->opcode == O_IP_DST_SET) /* set */ > + cmd->opcode = O_IP_SRC_SET; > + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ > + cmd->opcode = O_IP_SRC_LOOKUP; > + else if (F_LEN(cmd) == 0) { /* any */ > + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ > + cmd->opcode = O_IP6_SRC_ME; > + } else if (F_LEN(cmd) == > + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { > + /* single IP, no mask*/ > + cmd->opcode = O_IP6_SRC; > + } else { /* addr/mask opt */ > + cmd->opcode = O_IP6_SRC_MASK; > + } > + return cmd; > +} > + > +ipfw_insn * > +add_dstip6(ipfw_insn *cmd, char *av, int cblen) > +{ > + > + 
fill_ip6((ipfw_insn_ip6 *)cmd, av, cblen); > + if (cmd->opcode == O_IP_DST_SET) /* set */ > + ; > + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ > + ; > + else if (F_LEN(cmd) == 0) { /* any */ > + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ > + cmd->opcode = O_IP6_DST_ME; > + } else if (F_LEN(cmd) == > + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { > + /* single IP, no mask*/ > + cmd->opcode = O_IP6_DST; > + } else { /* addr/mask opt */ > + cmd->opcode = O_IP6_DST_MASK; > + } > + return cmd; > +} > diff --git a/example/ipfw/ipfw/main.c b/example/ipfw/ipfw/main.c > new file mode 100644 > index 0000000..a8f5fed > --- /dev/null > +++ b/example/ipfw/ipfw/main.c > @@ -0,0 +1,628 @@ > +/* > + * Copyright (c) 2002-2003,2010 Luigi Rizzo > + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp > + * Copyright (c) 1994 Ugen J.S.Antsilevich > + * > + * Idea and grammar partially left from: > + * Copyright (c) 1993 Daniel Boulet > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. 
> + * > + * Command line interface for IP firewall facility > + * > + * $FreeBSD: head/sbin/ipfw/main.c 272840 2014-10-09 19:32:35Z melifaro $ > + */ > + > +#include <sys/wait.h> > +#include <ctype.h> > +#include <err.h> > +#include <errno.h> > +#include <signal.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > +#include <unistd.h> > + > +#include "ipfw2.h" > + > +static void > +help(void) > +{ > + fprintf(stderr, > +"ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n" > +"\tipfw [-abcdefhnNqStTv] <command>\n\n" > +"where <command> is one of the following:\n\n" > +"add [num] [set N] [prob x] RULE-BODY\n" > +"{pipe|queue} N config PIPE-BODY\n" > +"[pipe|queue] {zero|delete|show} [N{,N}]\n" > +"nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n" > +" reverse|proxy_only|redirect_addr linkspec|\n" > +" redirect_port linkspec|redirect_proto linkspec}\n" > +"set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n" > +"set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n" > +"table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n" > +"table all {flush | list}\n" > +"\n" > +"RULE-BODY: check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n" > +"ACTION: check-state | allow | count | deny | unreach{,6} CODE |\n" > +" skipto N | {divert|tee} PORT | forward ADDR |\n" > +" pipe N | queue N | nat N | setfib FIB | reass\n" > +"PARAMS: [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n" > +"ADDR: [ MAC dst src ether_type ] \n" > +" [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n" > +" [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n" > +"IPADDR: [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n" > +"IP6ADDR: [not] { any | me | me6 | ip6/bits | IP6LIST }\n" > +"IP6LIST: { ip6 | ip6/bits }[,IP6LIST]\n" > +"IPLIST: { ip | ip/bits | ip:mask }[,IPLIST]\n" > +"OPTION_LIST: OPTION [OPTION_LIST]\n" > +"OPTION: bridged | diverted | 
diverted-loopback | diverted-output |\n" > +" {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n" > +" {dst-port|src-port} LIST |\n" > +" estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n" > +" iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n" > +" ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n" > +" icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n" > +" mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n" > +" setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n" > +" tcpdatalen LIST | verrevpath | versrcreach | antispoof\n" > +); > + > + exit(0); > +} > + > +/* > + * Called with the arguments, including program name because getopt > + * wants it to be present. > + * Returns 0 if successful, 1 if empty command, errx() in case of errors. > + * First thing we do is process parameters creating an argv[] array > + * which includes the program name and a NULL entry at the end. > + * If we are called with a single string, we split it on whitespace. > + * Also, arguments with a trailing ',' are joined to the next one. > + * The pointers (av[]) and data are in a single chunk of memory. > + * av[0] points to the original program name, all other entries > + * point into the allocated chunk. > + */ > +static int > +ipfw_main(int oldac, char **oldav) > +{ > + int ch, ac; > + const char *errstr; > + char **av, **save_av; > + int do_acct = 0; /* Show packet/byte count */ > + int try_next = 0; /* set if pipe cmd not found */ > + int av_size; /* compute the av size */ > + char *av_p; /* used to build the av list */ > + > +#define WHITESP " \t\f\v\n\r" > + if (oldac < 2) > + return 1; /* need at least one argument */ > + > + if (oldac == 2) { > + /* > + * If we are called with one argument, try to split it into > + * words for subsequent parsing. Spaces after a ',' are > + * removed by copying the string in-place. 
> + */ > + char *arg = oldav[1]; /* The string is the first arg. */ > + int l = strlen(arg); > + int copy = 0; /* 1 if we need to copy, 0 otherwise */ > + int i, j; > + > + for (i = j = 0; i < l; i++) { > + if (arg[i] == '#') /* comment marker */ > + break; > + if (copy) { > + arg[j++] = arg[i]; > + copy = !strchr("," WHITESP, arg[i]); > + } else { > + copy = !strchr(WHITESP, arg[i]); > + if (copy) > + arg[j++] = arg[i]; > + } > + } > + if (!copy && j > 0) /* last char was a 'blank', remove it */ > + j--; > + l = j; /* the new argument length */ > + arg[j++] = '\0'; > + if (l == 0) /* empty string! */ > + return 1; > + > + /* > + * First, count number of arguments. Because of the previous > + * processing, this is just the number of blanks plus 1. > + */ > + for (i = 0, ac = 1; i < l; i++) > + if (strchr(WHITESP, arg[i]) != NULL) > + ac++; > + > + /* > + * Allocate the argument list structure as a single block > + * of memory, containing pointers and the argument > + * strings. We include one entry for the program name > + * because getopt expects it, and a NULL at the end > + * to simplify further parsing. > + */ > + ac++; /* add 1 for the program name */ > + av_size = (ac+1) * sizeof(char *) + l + 1; > + av = safe_calloc(av_size, 1); > + > + /* > + * Init the argument pointer to the end of the array > + * and copy arguments from arg[] to av[]. For each one, > + * j is the initial character, i is the one past the end. > + */ > + av_p = (char *)&av[ac+1]; > + for (ac = 1, i = j = 0; i < l; i++) { > + if (strchr(WHITESP, arg[i]) != NULL || i == l-1) { > + if (i == l-1) > + i++; > + bcopy(arg+j, av_p, i-j); > + av[ac] = av_p; > + av_p += i-j; /* the length of the string */ > + *av_p++ = '\0'; > + ac++; > + j = i + 1; > + } > + } > + } else { > + /* > + * If an argument ends with ',' join with the next one. 
> + */ > + int first, i, l=0; > + > + /* > + * Allocate the argument list structure as a single block > + * of memory, containing both pointers and the argument > + * strings. We include some space for the program name > + * because getopt expects it. > + * We add an extra pointer to the end of the array, > + * to make simpler further parsing. > + */ > + for (i=0; i<oldac; i++) > + l += strlen(oldav[i]); > + > + av_size = (oldac+1) * sizeof(char *) + l + oldac; > + av = safe_calloc(av_size, 1); > + > + /* > + * Init the argument pointer to the end of the array > + * and copy arguments from arg[] to av[] > + */ > + av_p = (char *)&av[oldac+1]; > + for (first = i = ac = 1, l = 0; i < oldac; i++) { > + char *arg = oldav[i]; > + int k = strlen(arg); > + > + l += k; > + if (arg[k-1] != ',' || i == oldac-1) { > + /* Time to copy. */ > + av[ac] = av_p; > + for (l=0; first <= i; first++) { > + strcat(av_p, oldav[first]); > + av_p += strlen(oldav[first]); > + } > + *av_p++ = '\0'; > + ac++; > + l = 0; > + first = i+1; > + } > + } > + } > + > + /* > + * set the progname pointer to the original string > + * and terminate the array with null > + */ > + av[0] = oldav[0]; > + av[ac] = NULL; > + > + /* Set the force flag for non-interactive processes */ > + if (!co.do_force) > + co.do_force = !isatty(STDIN_FILENO); > + > +#ifdef EMULATE_SYSCTL /* sysctl emulation */ > + if ( ac >= 2 && !strcmp(av[1], "sysctl")) { > + char *s; > + int i; > + > + if (ac != 3) { > + printf( "sysctl emulation usage:\n" > + " ipfw sysctl name[=value]\n" > + " ipfw sysctl -a\n"); > + return 0; > + } > + s = strchr(av[2], '='); > + if (s == NULL) { > + s = !strcmp(av[2], "-a") ? 
NULL : av[2]; > + sysctlbyname(s, NULL, NULL, NULL, 0); > + } else { /* ipfw sysctl x.y.z=value */ > + /* assume an INT value, will extend later */ > + if (s[1] == '\0') { > + printf("ipfw sysctl: missing value\n\n"); > + return 0; > + } > + *s = '\0'; > + i = strtol(s+1, NULL, 0); > + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); > + } > + return 0; > + } > +#endif > + > + /* Save arguments for final freeing of memory. */ > + save_av = av; > + > + optind = optreset = 1; /* restart getopt() */ > + while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1) > + switch (ch) { > + case 'a': > + do_acct = 1; > + break; > + > + case 'b': > + co.comment_only = 1; > + co.do_compact = 1; > + break; > + > + case 'c': > + co.do_compact = 1; > + break; > + > + case 'd': > + co.do_dynamic = 1; > + break; > + > + case 'e': > + co.do_expired = 1; > + break; > + > + case 'f': > + co.do_force = 1; > + break; > + > + case 'h': /* help */ > + free(save_av); > + help(); > + break; /* NOTREACHED */ > + > + case 'i': > + co.do_value_as_ip = 1; > + break; > + > + case 'n': > + co.test_only = 1; > + break; > + > + case 'N': > + co.do_resolv = 1; > + break; > + > + case 'p': > + errx(EX_USAGE, "An absolute pathname must be used " > + "with -p option."); > + /* NOTREACHED */ > + > + case 'q': > + co.do_quiet = 1; > + break; > + > + case 's': /* sort */ > + co.do_sort = atoi(optarg); > + break; > + > + case 'S': > + co.show_sets = 1; > + break; > + > + case 't': > + co.do_time = 1; > + break; > + > + case 'T': > + co.do_time = 2; /* numeric timestamp */ > + break; > + > + case 'v': /* verbose */ > + co.verbose = 1; > + break; > + > + default: > + free(save_av); > + return 1; > + } > + > + ac -= optind; > + av += optind; > + NEED1("bad arguments, for usage summary ``ipfw''"); > + > + /* > + * An undocumented behaviour of ipfw1 was to allow rule numbers first, > + * e.g. "100 add allow ..." instead of "add 100 allow ...". 
> + * In case, swap first and second argument to get the normal form. > + */ > + if (ac > 1 && isdigit(*av[0])) { > + char *p = av[0]; > + > + av[0] = av[1]; > + av[1] = p; > + } > + > + /* > + * Optional: pipe, queue or nat. > + */ > + co.do_nat = 0; > + co.do_pipe = 0; > + co.use_set = 0; > + if (!strncmp(*av, "nat", strlen(*av))) > + co.do_nat = 1; > + else if (!strncmp(*av, "pipe", strlen(*av))) > + co.do_pipe = 1; > + else if (_substrcmp(*av, "queue") == 0) > + co.do_pipe = 2; > + else if (_substrcmp(*av, "flowset") == 0) > + co.do_pipe = 2; > + else if (_substrcmp(*av, "sched") == 0) > + co.do_pipe = 3; > + else if (!strncmp(*av, "set", strlen(*av))) { > + if (ac > 1 && isdigit(av[1][0])) { > + co.use_set = strtonum(av[1], 0, resvd_set_number, > + &errstr); > + if (errstr) > + errx(EX_DATAERR, > + "invalid set number %s\n", av[1]); > + ac -= 2; av += 2; co.use_set++; > + } > + } > + > + if (co.do_pipe || co.do_nat) { > + ac--; > + av++; > + } > + NEED1("missing command"); > + > + /* > + * For pipes, queues and nats we normally say 'nat|pipe NN config' > + * but the code is easier to parse as 'nat|pipe config NN' > + * so we swap the two arguments. 
> + */ > + if ((co.do_pipe || co.do_nat) && ac > 1 && isdigit(*av[0])) { > + char *p = av[0]; > + > + av[0] = av[1]; > + av[1] = p; > + } > + > + if (co.use_set == 0) { > + if (_substrcmp(*av, "add") == 0) > + ipfw_add(av); > + else if (co.do_nat && _substrcmp(*av, "show") == 0) > + ipfw_show_nat(ac, av); > + else if (co.do_pipe && _substrcmp(*av, "config") == 0) > + ipfw_config_pipe(ac, av); > + else if (co.do_nat && _substrcmp(*av, "config") == 0) > + ipfw_config_nat(ac, av); > + else if (_substrcmp(*av, "set") == 0) > + ipfw_sets_handler(av); > + else if (_substrcmp(*av, "table") == 0) > + ipfw_table_handler(ac, av); > + else if (_substrcmp(*av, "enable") == 0) > + ipfw_sysctl_handler(av, 1); > + else if (_substrcmp(*av, "disable") == 0) > + ipfw_sysctl_handler(av, 0); > + else > + try_next = 1; > + } > + > + if (co.use_set || try_next) { > + if (_substrcmp(*av, "delete") == 0) > + ipfw_delete(av); > + else if (_substrcmp(*av, "flush") == 0) > + ipfw_flush(co.do_force); > + else if (_substrcmp(*av, "zero") == 0) > + ipfw_zero(ac, av, 0 /* IP_FW_ZERO */); > + else if (_substrcmp(*av, "resetlog") == 0) > + ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */); > + else if (_substrcmp(*av, "print") == 0 || > + _substrcmp(*av, "list") == 0) > + ipfw_list(ac, av, do_acct); > + else if (_substrcmp(*av, "show") == 0) > + ipfw_list(ac, av, 1 /* show counters */); > + else if (_substrcmp(*av, "table") == 0) > + ipfw_table_handler(ac, av); > + else if (_substrcmp(*av, "internal") == 0) > + ipfw_internal_handler(ac, av); > + else > + errx(EX_USAGE, "bad command `%s'", *av); > + } > + > + /* Free memory allocated in the argument parsing. 
*/ > + free(save_av); > + return 0; > +} > + > + > +static void > +ipfw_readfile(int ac, char *av[]) > +{ > +#define MAX_ARGS 32 > + char buf[4096]; > + char *progname = av[0]; /* original program name */ > + const char *cmd = NULL; /* preprocessor name, if any */ > + const char *filename = av[ac-1]; /* file to read */ > + int c, lineno=0; > + FILE *f = NULL; > + pid_t preproc = 0; > + > + while ((c = getopt(ac, av, "cfNnp:qS")) != -1) { > + switch(c) { > + case 'c': > + co.do_compact = 1; > + break; > + > + case 'f': > + co.do_force = 1; > + break; > + > + case 'N': > + co.do_resolv = 1; > + break; > + > + case 'n': > + co.test_only = 1; > + break; > + > + case 'p': > + /* > + * ipfw -p cmd [args] filename > + * > + * We are done with getopt(). All arguments > + * except the filename go to the preprocessor, > + * so we need to do the following: > + * - check that a filename is actually present; > + * - advance av by optind-1 to skip arguments > + * already processed; > + * - decrease ac by optind, to remove the args > + * already processed and the final filename; > + * - set the last entry in av[] to NULL so > + * popen() can detect the end of the array; > + * - set optind=ac to let getopt() terminate. 
> + */ > + if (optind == ac) > + errx(EX_USAGE, "no filename argument"); > + cmd = optarg; > + av[ac-1] = NULL; > + av += optind - 1; > + ac -= optind; > + optind = ac; > + break; > + > + case 'q': > + co.do_quiet = 1; > + break; > + > + case 'S': > + co.show_sets = 1; > + break; > + > + default: > + errx(EX_USAGE, "bad arguments, for usage" > + " summary ``ipfw''"); > + } > + > + } > + > + if (cmd == NULL && ac != optind + 1) > + errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]); > + > + if ((f = fopen(filename, "r")) == NULL) > + err(EX_UNAVAILABLE, "fopen: %s", filename); > + > + if (cmd != NULL) { /* pipe through preprocessor */ > + int pipedes[2]; > + > + if (pipe(pipedes) == -1) > + err(EX_OSERR, "cannot create pipe"); > + > + preproc = fork(); > + if (preproc == -1) > + err(EX_OSERR, "cannot fork"); > + > + if (preproc == 0) { > + /* > + * Child, will run the preprocessor with the > + * file on stdin and the pipe on stdout. > + */ > + if (dup2(fileno(f), 0) == -1 > + || dup2(pipedes[1], 1) == -1) > + err(EX_OSERR, "dup2()"); > + fclose(f); > + close(pipedes[1]); > + close(pipedes[0]); > + execvp(cmd, av); > + err(EX_OSERR, "execvp(%s) failed", cmd); > + } else { /* parent, will reopen f as the pipe */ > + fclose(f); > + close(pipedes[1]); > + if ((f = fdopen(pipedes[0], "r")) == NULL) { > + int savederrno = errno; > + > + (void)kill(preproc, SIGTERM); > + errno = savederrno; > + err(EX_OSERR, "fdopen()"); > + } > + } > + } > + > + while (fgets(buf, sizeof(buf), f)) { /* read commands */ > + char linename[20]; > + char *args[2]; > + > + lineno++; > + snprintf(linename, sizeof(linename), "Line %d", lineno); > + setprogname(linename); /* XXX */ > + args[0] = progname; > + args[1] = buf; > + ipfw_main(2, args); > + } > + fclose(f); > + if (cmd != NULL) { > + int status; > + > + if (waitpid(preproc, &status, 0) == -1) > + errx(EX_OSERR, "waitpid()"); > + if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK) > + errx(EX_UNAVAILABLE, > + "preprocessor 
exited with status %d", > + WEXITSTATUS(status)); > + else if (WIFSIGNALED(status)) > + errx(EX_UNAVAILABLE, > + "preprocessor exited with signal %d", > + WTERMSIG(status)); > + } > +} > + > +int > +main(int ac, char *av[]) > +{ > +#if defined(_WIN32) && defined(TCC) > + { > + WSADATA wsaData; > + int ret=0; > + unsigned short wVersionRequested = MAKEWORD(2, 2); > + ret = WSAStartup(wVersionRequested, &wsaData); > + if (ret != 0) { > + /* Tell the user that we could not find a usable */ > + /* Winsock DLL. */ > + printf("WSAStartup failed with error: %d\n", ret); > + return 1; > + } > + } > +#endif > + /* > + * If the last argument is an absolute pathname, interpret it > + * as a file to be preprocessed. > + */ > + > + if (ac > 1 && av[ac - 1][0] == '/') { > + if (access(av[ac - 1], R_OK) == 0) > + ipfw_readfile(ac, av); > + else > + err(EX_USAGE, "pathname: %s", av[ac - 1]); > + } else { > + if (ipfw_main(ac, av)) { > + errx(EX_USAGE, > + "usage: ipfw [options]\n" > + "do \"ipfw -h\" or \"man ipfw\" for details"); > + } > + } > + return EX_OK; > +} > diff --git a/example/ipfw/ipfw/nat.c b/example/ipfw/ipfw/nat.c > new file mode 100644 > index 0000000..dc2364f > --- /dev/null > +++ b/example/ipfw/ipfw/nat.c > @@ -0,0 +1,1115 @@ > +/* > + * Copyright (c) 2002-2003 Luigi Rizzo > + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp > + * Copyright (c) 1994 Ugen J.S.Antsilevich > + * > + * Idea and grammar partially left from: > + * Copyright (c) 1993 Daniel Boulet > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. 
> + * > + * NEW command line interface for IP firewall facility > + * > + * $FreeBSD: head/sbin/ipfw/nat.c 272840 2014-10-09 19:32:35Z melifaro $ > + * > + * In-kernel nat support > + */ > + > +#include <sys/types.h> > +#include <sys/socket.h> > +#include <sys/sysctl.h> > + > +#include "ipfw2.h" > + > +#include <ctype.h> > +#include <err.h> > +#include <errno.h> > +#include <netdb.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > + > +#include <net/if.h> > +#include <net/if_dl.h> > +#include <net/route.h> /* def. of struct route */ > +#include <netinet/in.h> > +#include <netinet/ip_fw.h> > +#include <arpa/inet.h> > +#include <alias.h> > + > +typedef int (nat_cb_t)(struct nat44_cfg_nat *cfg, void *arg); > +static void nat_show_cfg(struct nat44_cfg_nat *n, void *arg); > +static void nat_show_log(struct nat44_cfg_nat *n, void *arg); > +static int nat_show_data(struct nat44_cfg_nat *cfg, void *arg); > +static int natname_cmp(const void *a, const void *b); > +static int nat_foreach(nat_cb_t *f, void *arg, int sort); > +static int nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh); > + > +static struct _s_x nat_params[] = { > + { "ip", TOK_IP }, > + { "if", TOK_IF }, > + { "log", TOK_ALOG }, > + { "deny_in", TOK_DENY_INC }, > + { "same_ports", TOK_SAME_PORTS }, > + { "unreg_only", TOK_UNREG_ONLY }, > + { "skip_global", TOK_SKIP_GLOBAL }, > + { "reset", TOK_RESET_ADDR }, > + { "reverse", TOK_ALIAS_REV }, > + { "proxy_only", TOK_PROXY_ONLY }, > + { "redirect_addr", TOK_REDIR_ADDR }, > + { "redirect_port", TOK_REDIR_PORT }, > + { "redirect_proto", TOK_REDIR_PROTO }, > + { NULL, 0 } /* terminator */ > +}; > + > + > +/* > + * Search for interface with name "ifn", and fill n accordingly: > + * > + * n->ip ip address of interface "ifn" > + * n->if_name copy of interface name "ifn" > + */ > +static void > +set_addr_dynamic(const char *ifn, struct nat44_cfg_nat *n) > +{ > + size_t needed; > + int mib[6]; > + char *buf, 
*lim, *next; > + struct if_msghdr *ifm; > + struct ifa_msghdr *ifam; > + struct sockaddr_dl *sdl; > + struct sockaddr_in *sin; > + int ifIndex, ifMTU; > + > + mib[0] = CTL_NET; > + mib[1] = PF_ROUTE; > + mib[2] = 0; > + mib[3] = AF_INET; > + mib[4] = NET_RT_IFLIST; > + mib[5] = 0; > +/* > + * Get interface data. > + */ > + if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) > + err(1, "iflist-sysctl-estimate"); > + buf = safe_calloc(1, needed); > + if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) > + err(1, "iflist-sysctl-get"); > + lim = buf + needed; > +/* > + * Loop through interfaces until one with > + * given name is found. This is done to > + * find correct interface index for routing > + * message processing. > + */ > + ifIndex = 0; > + next = buf; > + while (next < lim) { > + ifm = (struct if_msghdr *)next; > + next += ifm->ifm_msglen; > + if (ifm->ifm_version != RTM_VERSION) { > + if (co.verbose) > + warnx("routing message version %d " > + "not understood", ifm->ifm_version); > + continue; > + } > + if (ifm->ifm_type == RTM_IFINFO) { > + sdl = (struct sockaddr_dl *)(ifm + 1); > + if (strlen(ifn) == sdl->sdl_nlen && > + strncmp(ifn, sdl->sdl_data, sdl->sdl_nlen) == 0) { > + ifIndex = ifm->ifm_index; > + ifMTU = ifm->ifm_data.ifi_mtu; > + break; > + } > + } > + } > + if (!ifIndex) > + errx(1, "unknown interface name %s", ifn); > +/* > + * Get interface address. 
> + */ > + sin = NULL; > + while (next < lim) { > + ifam = (struct ifa_msghdr *)next; > + next += ifam->ifam_msglen; > + if (ifam->ifam_version != RTM_VERSION) { > + if (co.verbose) > + warnx("routing message version %d " > + "not understood", ifam->ifam_version); > + continue; > + } > + if (ifam->ifam_type != RTM_NEWADDR) > + break; > + if (ifam->ifam_addrs & RTA_IFA) { > + int i; > + char *cp = (char *)(ifam + 1); > + > + for (i = 1; i < RTA_IFA; i <<= 1) { > + if (ifam->ifam_addrs & i) > + cp += SA_SIZE((struct sockaddr *)cp); > + } > + if (((struct sockaddr *)cp)->sa_family == AF_INET) { > + sin = (struct sockaddr_in *)cp; > + break; > + } > + } > + } > + if (sin == NULL) > + errx(1, "%s: cannot get interface address", ifn); > + > + n->ip = sin->sin_addr; > + strncpy(n->if_name, ifn, IF_NAMESIZE); > + > + free(buf); > +} > + > +/* > + * XXX - The following functions, macros and definitions come from natd.c: > + * it would be better to move them outside natd.c, in a file > + * (redirect_support.[ch]?) shared by ipfw and natd, but for now i can live > + * with it. > + */ > + > +/* > + * Definition of a port range, and macros to deal with values. > + * FORMAT: HI 16-bits == first port in range, 0 == all ports. > + * LO 16-bits == number of ports in range > + * NOTES: - Port values are not stored in network byte order. > + */ > + > +#define port_range u_long > + > +#define GETLOPORT(x) ((x) >> 0x10) > +#define GETNUMPORTS(x) ((x) & 0x0000ffff) > +#define GETHIPORT(x) (GETLOPORT((x)) + GETNUMPORTS((x))) > + > +/* Set y to be the low-port value in port_range variable x. */ > +#define SETLOPORT(x,y) ((x) = ((x) & 0x0000ffff) | ((y) << 0x10)) > + > +/* Set y to be the number of ports in port_range variable x. 
*/ > +#define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y)) > + > +static void > +StrToAddr (const char* str, struct in_addr* addr) > +{ > + struct hostent* hp; > + > + if (inet_aton (str, addr)) > + return; > + > + hp = gethostbyname (str); > + if (!hp) > + errx (1, "unknown host %s", str); > + > + memcpy (addr, hp->h_addr, sizeof (struct in_addr)); > +} > + > +static int > +StrToPortRange (const char* str, const char* proto, port_range *portRange) > +{ > + char* sep; > + struct servent* sp; > + char* end; > + u_short loPort; > + u_short hiPort; > + > + /* First see if this is a service, return corresponding port if so. */ > + sp = getservbyname (str,proto); > + if (sp) { > + SETLOPORT(*portRange, ntohs(sp->s_port)); > + SETNUMPORTS(*portRange, 1); > + return 0; > + } > + > + /* Not a service, see if it's a single port or port range. */ > + sep = strchr (str, '-'); > + if (sep == NULL) { > + SETLOPORT(*portRange, strtol(str, &end, 10)); > + if (end != str) { > + /* Single port. */ > + SETNUMPORTS(*portRange, 1); > + return 0; > + } > + > + /* Error in port range field. */ > + errx (EX_DATAERR, "%s/%s: unknown service", str, proto); > + } > + > + /* Port range, get the values and sanity check. */ > + sscanf (str, "%hu-%hu", &loPort, &hiPort); > + SETLOPORT(*portRange, loPort); > + SETNUMPORTS(*portRange, 0); /* Error by default */ > + if (loPort <= hiPort) > + SETNUMPORTS(*portRange, hiPort - loPort + 1); > + > + if (GETNUMPORTS(*portRange) == 0) > + errx (EX_DATAERR, "invalid port range %s", str); > + > + return 0; > +} > + > +static int > +StrToProto (const char* str) > +{ > + if (!strcmp (str, "tcp")) > + return IPPROTO_TCP; > + > + if (!strcmp (str, "udp")) > + return IPPROTO_UDP; > + > + if (!strcmp (str, "sctp")) > + return IPPROTO_SCTP; > + errx (EX_DATAERR, "unknown protocol %s. 
Expected sctp, tcp or udp", str); > +} > + > +static int > +StrToAddrAndPortRange (const char* str, struct in_addr* addr, char* proto, > + port_range *portRange) > +{ > + char* ptr; > + > + ptr = strchr (str, ':'); > + if (!ptr) > + errx (EX_DATAERR, "%s is missing port number", str); > + > + *ptr = '\0'; > + ++ptr; > + > + StrToAddr (str, addr); > + return StrToPortRange (ptr, proto, portRange); > +} > + > +/* End of stuff taken from natd.c. */ > + > +/* > + * The next 3 functions add support for the addr, port and proto redirect and > + * their logic is loosely based on SetupAddressRedirect(), SetupPortRedirect() > + * and SetupProtoRedirect() from natd.c. > + * > + * Every setup_* function fills at least one redirect entry > + * (struct nat44_cfg_redir) and zero or more server pool entry > + * (struct nat44_cfg_spool) in buf. > + * > + * The format of data in buf is: > + * > + * nat44_cfg_nat nat44_cfg_redir nat44_cfg_spool ...... nat44_cfg_spool > + * > + * ------------------------------------- ------------ > + * | | .....X ..... | | | | ..... > + * ------------------------------------- ...... ------------ > + * ^ > + * spool_cnt n=0 ...... n=(X-1) > + * > + * len points to the amount of available space in buf > + * space counts the memory consumed by every function > + * > + * XXX - Every function get all the argv params so it > + * has to check, in optional parameters, that the next > + * args is a valid option for the redir entry and not > + * another token. Only redir_port and redir_proto are > + * affected by this. 
> + */ > + > +static int > +estimate_redir_addr(int *ac, char ***av) > +{ > + size_t space = sizeof(struct nat44_cfg_redir); > + char *sep = **av; > + u_int c = 0; > + > + (void)ac; /* UNUSED */ > + while ((sep = strchr(sep, ',')) != NULL) { > + c++; > + sep++; > + } > + > + if (c > 0) > + c++; > + > + space += c * sizeof(struct nat44_cfg_spool); > + > + return (space); > +} > + > +static int > +setup_redir_addr(char *buf, int *ac, char ***av) > +{ > + struct nat44_cfg_redir *r; > + char *sep; > + size_t space; > + > + r = (struct nat44_cfg_redir *)buf; > + r->mode = REDIR_ADDR; > + /* Skip nat44_cfg_redir at beginning of buf. */ > + buf = &buf[sizeof(struct nat44_cfg_redir)]; > + space = sizeof(struct nat44_cfg_redir); > + > + /* Extract local address. */ > + if (strchr(**av, ',') != NULL) { > + struct nat44_cfg_spool *spool; > + > + /* Setup LSNAT server pool. */ > + r->laddr.s_addr = INADDR_NONE; > + sep = strtok(**av, ","); > + while (sep != NULL) { > + spool = (struct nat44_cfg_spool *)buf; > + space += sizeof(struct nat44_cfg_spool); > + StrToAddr(sep, &spool->addr); > + spool->port = ~0; > + r->spool_cnt++; > + /* Point to the next possible nat44_cfg_spool. */ > + buf = &buf[sizeof(struct nat44_cfg_spool)]; > + sep = strtok(NULL, ","); > + } > + } else > + StrToAddr(**av, &r->laddr); > + (*av)++; (*ac)--; > + > + /* Extract public address. 
*/ > + StrToAddr(**av, &r->paddr); > + (*av)++; (*ac)--; > + > + return (space); > +} > + > +static int > +estimate_redir_port(int *ac, char ***av) > +{ > + size_t space = sizeof(struct nat44_cfg_redir); > + char *sep = **av; > + u_int c = 0; > + > + (void)ac; /* UNUSED */ > + while ((sep = strchr(sep, ',')) != NULL) { > + c++; > + sep++; > + } > + > + if (c > 0) > + c++; > + > + space += c * sizeof(struct nat44_cfg_spool); > + > + return (space); > +} > + > +static int > +setup_redir_port(char *buf, int *ac, char ***av) > +{ > + struct nat44_cfg_redir *r; > + char *sep, *protoName, *lsnat = NULL; > + size_t space; > + u_short numLocalPorts; > + port_range portRange; > + > + numLocalPorts = 0; > + > + r = (struct nat44_cfg_redir *)buf; > + r->mode = REDIR_PORT; > + /* Skip nat44_cfg_redir at beginning of buf. */ > + buf = &buf[sizeof(struct nat44_cfg_redir)]; > + space = sizeof(struct nat44_cfg_redir); > + > + /* > + * Extract protocol. > + */ > + r->proto = StrToProto(**av); > + protoName = **av; > + (*av)++; (*ac)--; > + > + /* > + * Extract local address. > + */ > + if (strchr(**av, ',') != NULL) { > + r->laddr.s_addr = INADDR_NONE; > + r->lport = ~0; > + numLocalPorts = 1; > + lsnat = **av; > + } else { > + /* > + * The sctp nat does not allow the port numbers to be mapped to > + * new port numbers. Therefore, no ports are to be specified > + * in the target port field. > + */ > + if (r->proto == IPPROTO_SCTP) { > + if (strchr(**av, ':')) > + errx(EX_DATAERR, "redirect_port:" > + "port numbers do not change in sctp, so do " > + "not specify them as part of the target"); > + else > + StrToAddr(**av, &r->laddr); > + } else { > + if (StrToAddrAndPortRange(**av, &r->laddr, protoName, > + &portRange) != 0) > + errx(EX_DATAERR, "redirect_port: " > + "invalid local port range"); > + > + r->lport = GETLOPORT(portRange); > + numLocalPorts = GETNUMPORTS(portRange); > + } > + } > + (*av)++; (*ac)--; > + > + /* > + * Extract public port and optionally address. 
> + */ > + if (strchr(**av, ':') != NULL) { > + if (StrToAddrAndPortRange(**av, &r->paddr, protoName, > + &portRange) != 0) > + errx(EX_DATAERR, "redirect_port: " > + "invalid public port range"); > + } else { > + r->paddr.s_addr = INADDR_ANY; > + if (StrToPortRange(**av, protoName, &portRange) != 0) > + errx(EX_DATAERR, "redirect_port: " > + "invalid public port range"); > + } > + > + r->pport = GETLOPORT(portRange); > + if (r->proto == IPPROTO_SCTP) { /* so the logic below still works */ > + numLocalPorts = GETNUMPORTS(portRange); > + r->lport = r->pport; > + } > + r->pport_cnt = GETNUMPORTS(portRange); > + (*av)++; (*ac)--; > + > + /* > + * Extract remote address and optionally port. > + */ > + /* > + * NB: isdigit(**av) => we've to check that next parameter is really an > + * option for this redirect entry, else stop here processing arg[cv]. > + */ > + if (*ac != 0 && isdigit(***av)) { > + if (strchr(**av, ':') != NULL) { > + if (StrToAddrAndPortRange(**av, &r->raddr, protoName, > + &portRange) != 0) > + errx(EX_DATAERR, "redirect_port: " > + "invalid remote port range"); > + } else { > + SETLOPORT(portRange, 0); > + SETNUMPORTS(portRange, 1); > + StrToAddr(**av, &r->raddr); > + } > + (*av)++; (*ac)--; > + } else { > + SETLOPORT(portRange, 0); > + SETNUMPORTS(portRange, 1); > + r->raddr.s_addr = INADDR_ANY; > + } > + r->rport = GETLOPORT(portRange); > + r->rport_cnt = GETNUMPORTS(portRange); > + > + /* > + * Make sure port ranges match up, then add the redirect ports. > + */ > + if (numLocalPorts != r->pport_cnt) > + errx(EX_DATAERR, "redirect_port: " > + "port ranges must be equal in size"); > + > + /* Remote port range is allowed to be '0' which means all ports. */ > + if (r->rport_cnt != numLocalPorts && > + (r->rport_cnt != 1 || r->rport != 0)) > + errx(EX_DATAERR, "redirect_port: remote port must" > + "be 0 or equal to local port range in size"); > + > + /* Setup LSNAT server pool. 
*/ > + if (lsnat != NULL) { > + struct nat44_cfg_spool *spool; > + > + sep = strtok(lsnat, ","); > + while (sep != NULL) { > + spool = (struct nat44_cfg_spool *)buf; > + space += sizeof(struct nat44_cfg_spool); > + /* > + * The sctp nat does not allow the port numbers to > + * be mapped to new port numbers. Therefore, no ports > + * are to be specified in the target port field. > + */ > + if (r->proto == IPPROTO_SCTP) { > + if (strchr (sep, ':')) { > + errx(EX_DATAERR, "redirect_port:" > + "port numbers do not change in " > + "sctp, so do not specify them as " > + "part of the target"); > + } else { > + StrToAddr(sep, &spool->addr); > + spool->port = r->pport; > + } > + } else { > + if (StrToAddrAndPortRange(sep, &spool->addr, > + protoName, &portRange) != 0) > + errx(EX_DATAERR, "redirect_port:" > + "invalid local port range"); > + if (GETNUMPORTS(portRange) != 1) > + errx(EX_DATAERR, "redirect_port: " > + "local port must be single in " > + "this context"); > + spool->port = GETLOPORT(portRange); > + } > + r->spool_cnt++; > + /* Point to the next possible nat44_cfg_spool. */ > + buf = &buf[sizeof(struct nat44_cfg_spool)]; > + sep = strtok(NULL, ","); > + } > + } > + > + return (space); > +} > + > +static int > +setup_redir_proto(char *buf, int *ac, char ***av) > +{ > + struct nat44_cfg_redir *r; > + struct protoent *protoent; > + size_t space; > + > + r = (struct nat44_cfg_redir *)buf; > + r->mode = REDIR_PROTO; > + /* Skip nat44_cfg_redir at beginning of buf. */ > + buf = &buf[sizeof(struct nat44_cfg_redir)]; > + space = sizeof(struct nat44_cfg_redir); > + > + /* > + * Extract protocol. > + */ > + protoent = getprotobyname(**av); > + if (protoent == NULL) > + errx(EX_DATAERR, "redirect_proto: unknown protocol %s", **av); > + else > + r->proto = protoent->p_proto; > + > + (*av)++; (*ac)--; > + > + /* > + * Extract local address. > + */ > + StrToAddr(**av, &r->laddr); > + > + (*av)++; (*ac)--; > + > + /* > + * Extract optional public address. 
> + */ > + if (*ac == 0) { > + r->paddr.s_addr = INADDR_ANY; > + r->raddr.s_addr = INADDR_ANY; > + } else { > + /* see above in setup_redir_port() */ > + if (isdigit(***av)) { > + StrToAddr(**av, &r->paddr); > + (*av)++; (*ac)--; > + > + /* > + * Extract optional remote address. > + */ > + /* see above in setup_redir_port() */ > + if (*ac != 0 && isdigit(***av)) { > + StrToAddr(**av, &r->raddr); > + (*av)++; (*ac)--; > + } > + } > + } > + > + return (space); > +} > + > +static void > +nat_show_log(struct nat44_cfg_nat *n, void *arg) > +{ > + char *buf; > + > + buf = (char *)(n + 1); > + if (buf[0] != '\0') > + printf("nat %s: %s\n", n->name, buf); > +} > + > +static void > +nat_show_cfg(struct nat44_cfg_nat *n, void *arg) > +{ > + int i, cnt, flag, off; > + struct nat44_cfg_redir *t; > + struct nat44_cfg_spool *s; > + caddr_t buf; > + struct protoent *p; > + > + buf = (caddr_t)n; > + flag = 1; > + off = sizeof(*n); > + printf("ipfw nat %s config", n->name); > + if (strlen(n->if_name) != 0) > + printf(" if %s", n->if_name); > + else if (n->ip.s_addr != 0) > + printf(" ip %s", inet_ntoa(n->ip)); > + while (n->mode != 0) { > + if (n->mode & PKT_ALIAS_LOG) { > + printf(" log"); > + n->mode &= ~PKT_ALIAS_LOG; > + } else if (n->mode & PKT_ALIAS_DENY_INCOMING) { > + printf(" deny_in"); > + n->mode &= ~PKT_ALIAS_DENY_INCOMING; > + } else if (n->mode & PKT_ALIAS_SAME_PORTS) { > + printf(" same_ports"); > + n->mode &= ~PKT_ALIAS_SAME_PORTS; > + } else if (n->mode & PKT_ALIAS_SKIP_GLOBAL) { > + printf(" skip_global"); > + n->mode &= ~PKT_ALIAS_SKIP_GLOBAL; > + } else if (n->mode & PKT_ALIAS_UNREGISTERED_ONLY) { > + printf(" unreg_only"); > + n->mode &= ~PKT_ALIAS_UNREGISTERED_ONLY; > + } else if (n->mode & PKT_ALIAS_RESET_ON_ADDR_CHANGE) { > + printf(" reset"); > + n->mode &= ~PKT_ALIAS_RESET_ON_ADDR_CHANGE; > + } else if (n->mode & PKT_ALIAS_REVERSE) { > + printf(" reverse"); > + n->mode &= ~PKT_ALIAS_REVERSE; > + } else if (n->mode & PKT_ALIAS_PROXY_ONLY) { > + printf(" 
proxy_only"); > + n->mode &= ~PKT_ALIAS_PROXY_ONLY; > + } > + } > + /* Print all the redirect's data configuration. */ > + for (cnt = 0; cnt < n->redir_cnt; cnt++) { > + t = (struct nat44_cfg_redir *)&buf[off]; > + off += sizeof(struct nat44_cfg_redir); > + switch (t->mode) { > + case REDIR_ADDR: > + printf(" redirect_addr"); > + if (t->spool_cnt == 0) > + printf(" %s", inet_ntoa(t->laddr)); > + else > + for (i = 0; i < t->spool_cnt; i++) { > + s = (struct nat44_cfg_spool *)&buf[off]; > + if (i) > + printf(","); > + else > + printf(" "); > + printf("%s", inet_ntoa(s->addr)); > + off += sizeof(struct nat44_cfg_spool); > + } > + printf(" %s", inet_ntoa(t->paddr)); > + break; > + case REDIR_PORT: > + p = getprotobynumber(t->proto); > + printf(" redirect_port %s ", p->p_name); > + if (!t->spool_cnt) { > + printf("%s:%u", inet_ntoa(t->laddr), t->lport); > + if (t->pport_cnt > 1) > + printf("-%u", t->lport + > + t->pport_cnt - 1); > + } else > + for (i=0; i < t->spool_cnt; i++) { > + s = (struct nat44_cfg_spool *)&buf[off]; > + if (i) > + printf(","); > + printf("%s:%u", inet_ntoa(s->addr), > + s->port); > + off += sizeof(struct nat44_cfg_spool); > + } > + > + printf(" "); > + if (t->paddr.s_addr) > + printf("%s:", inet_ntoa(t->paddr)); > + printf("%u", t->pport); > + if (!t->spool_cnt && t->pport_cnt > 1) > + printf("-%u", t->pport + t->pport_cnt - 1); > + > + if (t->raddr.s_addr) { > + printf(" %s", inet_ntoa(t->raddr)); > + if (t->rport) { > + printf(":%u", t->rport); > + if (!t->spool_cnt && t->rport_cnt > 1) > + printf("-%u", t->rport + > + t->rport_cnt - 1); > + } > + } > + break; > + case REDIR_PROTO: > + p = getprotobynumber(t->proto); > + printf(" redirect_proto %s %s", p->p_name, > + inet_ntoa(t->laddr)); > + if (t->paddr.s_addr != 0) { > + printf(" %s", inet_ntoa(t->paddr)); > + if (t->raddr.s_addr) > + printf(" %s", inet_ntoa(t->raddr)); > + } > + break; > + default: > + errx(EX_DATAERR, "unknown redir mode"); > + break; > + } > + } > + printf("\n"); > +} > + 
> +void > +ipfw_config_nat(int ac, char **av) > +{ > + ipfw_obj_header *oh; > + struct nat44_cfg_nat *n; /* Nat instance configuration. */ > + int i, off, tok, ac1; > + char *id, *buf, **av1, *end; > + size_t len; > + > + av++; > + ac--; > + /* Nat id. */ > + if (ac == 0) > + errx(EX_DATAERR, "missing nat id"); > + id = *av; > + i = (int)strtol(id, &end, 0); > + if (i <= 0 || *end != '\0') > + errx(EX_DATAERR, "illegal nat id: %s", id); > + av++; > + ac--; > + if (ac == 0) > + errx(EX_DATAERR, "missing option"); > + > + len = sizeof(*oh) + sizeof(*n); > + ac1 = ac; > + av1 = av; > + while (ac1 > 0) { > + tok = match_token(nat_params, *av1); > + ac1--; > + av1++; > + switch (tok) { > + case TOK_IP: > + case TOK_IF: > + ac1--; > + av1++; > + break; > + case TOK_ALOG: > + case TOK_DENY_INC: > + case TOK_SAME_PORTS: > + case TOK_SKIP_GLOBAL: > + case TOK_UNREG_ONLY: > + case TOK_RESET_ADDR: > + case TOK_ALIAS_REV: > + case TOK_PROXY_ONLY: > + break; > + case TOK_REDIR_ADDR: > + if (ac1 < 2) > + errx(EX_DATAERR, "redirect_addr: " > + "not enough arguments"); > + len += estimate_redir_addr(&ac1, &av1); > + av1 += 2; > + ac1 -= 2; > + break; > + case TOK_REDIR_PORT: > + if (ac1 < 3) > + errx(EX_DATAERR, "redirect_port: " > + "not enough arguments"); > + av1++; > + ac1--; > + len += estimate_redir_port(&ac1, &av1); > + av1 += 2; > + ac1 -= 2; > + /* Skip optional remoteIP/port */ > + if (ac1 != 0 && isdigit(**av1)) { > + av1++; > + ac1--; > + } > + break; > + case TOK_REDIR_PROTO: > + if (ac1 < 2) > + errx(EX_DATAERR, "redirect_proto: " > + "not enough arguments"); > + len += sizeof(struct nat44_cfg_redir); > + av1 += 2; > + ac1 -= 2; > + /* Skip optional remoteIP/port */ > + if (ac1 != 0 && isdigit(**av1)) { > + av1++; > + ac1--; > + } > + if (ac1 != 0 && isdigit(**av1)) { > + av1++; > + ac1--; > + } > + break; > + default: > + errx(EX_DATAERR, "unrecognised option ``%s''", av1[-1]); > + } > + } > + > + if ((buf = malloc(len)) == NULL) > + errx(EX_OSERR, "malloc failed"); 
> + > + /* Offset in buf: save space for header at the beginning. */ > + off = sizeof(*oh) + sizeof(*n); > + memset(buf, 0, len); > + oh = (ipfw_obj_header *)buf; > + n = (struct nat44_cfg_nat *)(oh + 1); > + oh->ntlv.head.length = sizeof(oh->ntlv); > + snprintf(oh->ntlv.name, sizeof(oh->ntlv.name), "%d", i); > + snprintf(n->name, sizeof(n->name), "%d", i); > + > + while (ac > 0) { > + tok = match_token(nat_params, *av); > + ac--; > + av++; > + switch (tok) { > + case TOK_IP: > + if (ac == 0) > + errx(EX_DATAERR, "missing option"); > + if (!inet_aton(av[0], &(n->ip))) > + errx(EX_DATAERR, "bad ip address ``%s''", > + av[0]); > + ac--; > + av++; > + break; > + case TOK_IF: > + if (ac == 0) > + errx(EX_DATAERR, "missing option"); > + set_addr_dynamic(av[0], n); > + ac--; > + av++; > + break; > + case TOK_ALOG: > + n->mode |= PKT_ALIAS_LOG; > + break; > + case TOK_DENY_INC: > + n->mode |= PKT_ALIAS_DENY_INCOMING; > + break; > + case TOK_SAME_PORTS: > + n->mode |= PKT_ALIAS_SAME_PORTS; > + break; > + case TOK_UNREG_ONLY: > + n->mode |= PKT_ALIAS_UNREGISTERED_ONLY; > + break; > + case TOK_SKIP_GLOBAL: > + n->mode |= PKT_ALIAS_SKIP_GLOBAL; > + break; > + case TOK_RESET_ADDR: > + n->mode |= PKT_ALIAS_RESET_ON_ADDR_CHANGE; > + break; > + case TOK_ALIAS_REV: > + n->mode |= PKT_ALIAS_REVERSE; > + break; > + case TOK_PROXY_ONLY: > + n->mode |= PKT_ALIAS_PROXY_ONLY; > + break; > + /* > + * All the setup_redir_* functions work directly in > + * the final buffer, see above for details. 
> + */ > + case TOK_REDIR_ADDR: > + case TOK_REDIR_PORT: > + case TOK_REDIR_PROTO: > + switch (tok) { > + case TOK_REDIR_ADDR: > + i = setup_redir_addr(&buf[off], &ac, &av); > + break; > + case TOK_REDIR_PORT: > + i = setup_redir_port(&buf[off], &ac, &av); > + break; > + case TOK_REDIR_PROTO: > + i = setup_redir_proto(&buf[off], &ac, &av); > + break; > + } > + n->redir_cnt++; > + off += i; > + break; > + } > + } > + > + i = do_set3(IP_FW_NAT44_XCONFIG, &oh->opheader, len); > + if (i != 0) > + err(1, "setsockopt(%s)", "IP_FW_NAT44_XCONFIG"); > + > + if (!co.do_quiet) { > + /* After every modification, we show the resultant rule. */ > + int _ac = 3; > + const char *_av[] = {"show", "config", id}; > + ipfw_show_nat(_ac, (char **)(void *)_av); > + } > +} > + > +struct nat_list_arg { > + uint16_t cmd; > + int is_all; > +}; > + > +static int > +nat_show_data(struct nat44_cfg_nat *cfg, void *arg) > +{ > + struct nat_list_arg *nla; > + ipfw_obj_header *oh; > + > + nla = (struct nat_list_arg *)arg; > + > + switch (nla->cmd) { > + case IP_FW_NAT44_XGETCONFIG: > + if (nat_get_cmd(cfg->name, nla->cmd, &oh) != 0) { > + warnx("Error getting nat instance %s info", cfg->name); > + break; > + } > + nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); > + free(oh); > + break; > + case IP_FW_NAT44_XGETLOG: > + if (nat_get_cmd(cfg->name, nla->cmd, &oh) == 0) { > + nat_show_log((struct nat44_cfg_nat *)(oh + 1), NULL); > + free(oh); > + break; > + } > + /* Handle error */ > + if (nla->is_all != 0 && errno == ENOENT) > + break; > + warn("Error getting nat instance %s info", cfg->name); > + break; > + } > + > + return (0); > +} > + > +/* > + * Compare nat names. > + * Honor number comparison. 
> + */ > +static int > +natname_cmp(const void *a, const void *b) > +{ > + struct nat44_cfg_nat *ia, *ib; > + > + ia = (struct nat44_cfg_nat *)a; > + ib = (struct nat44_cfg_nat *)b; > + > + return (stringnum_cmp(ia->name, ib->name)); > +} > + > +/* > + * Retrieves nat list from kernel, > + * optionally sorts it and calls requested function for each table. > + * Returns 0 on success. > + */ > +static int > +nat_foreach(nat_cb_t *f, void *arg, int sort) > +{ > + ipfw_obj_lheader *olh; > + struct nat44_cfg_nat *cfg; > + size_t sz; > + int i, error; > + > + /* Start with reasonable default */ > + sz = sizeof(*olh) + 16 * sizeof(struct nat44_cfg_nat); > + > + for (;;) { > + if ((olh = calloc(1, sz)) == NULL) > + return (ENOMEM); > + > + olh->size = sz; > + if (do_get3(IP_FW_NAT44_LIST_NAT, &olh->opheader, &sz) != 0) { > + free(olh); > + if (errno == ENOMEM) { > + sz = olh->size; > + continue; > + } > + return (errno); > + } > + > + if (sort != 0) > + qsort(olh + 1, olh->count, olh->objsize, natname_cmp); > + > + cfg = (struct nat44_cfg_nat*)(olh + 1); > + for (i = 0; i < olh->count; i++) { > + error = f(cfg, arg); /* Ignore errors for now */ > + cfg = (struct nat44_cfg_nat *)((caddr_t)cfg + > + olh->objsize); > + } > + > + free(olh); > + break; > + } > + > + return (0); > +} > + > +static int > +nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh) > +{ > + ipfw_obj_header *oh; > + struct nat44_cfg_nat *cfg; > + size_t sz; > + > + /* Start with reasonable default */ > + sz = sizeof(*oh) + sizeof(*cfg) + 128; > + > + for (;;) { > + if ((oh = calloc(1, sz)) == NULL) > + return (ENOMEM); > + cfg = (struct nat44_cfg_nat *)(oh + 1); > + oh->ntlv.head.length = sizeof(oh->ntlv); > + strlcpy(oh->ntlv.name, name, sizeof(oh->ntlv.name)); > + strlcpy(cfg->name, name, sizeof(cfg->name)); > + > + if (do_get3(cmd, &oh->opheader, &sz) != 0) { > + sz = cfg->size; > + free(oh); > + if (errno == ENOMEM) > + continue; > + return (errno); > + } > + > + *ooh = oh; > + break; > + } > 
+ > + return (0); > +} > + > +void > +ipfw_show_nat(int ac, char **av) > +{ > + ipfw_obj_header *oh; > + char *name; > + int cmd; > + struct nat_list_arg nla; > + > + ac--; > + av++; > + > + if (co.test_only) > + return; > + > + /* Parse parameters. */ > + cmd = 0; /* XXX: Change to IP_FW_NAT44_XGETLOG @ MFC */ > + name = NULL; > + for ( ; ac != 0; ac--, av++) { > + if (!strncmp(av[0], "config", strlen(av[0]))) { > + cmd = IP_FW_NAT44_XGETCONFIG; > + continue; > + } > + if (strcmp(av[0], "log") == 0) { > + cmd = IP_FW_NAT44_XGETLOG; > + continue; > + } > + if (name != NULL) > + err(EX_USAGE,"only one instance name may be specified"); > + name = av[0]; > + } > + > + if (cmd == 0) > + errx(EX_USAGE, "Please specify action. Available: config,log"); > + > + if (name == NULL) { > + memset(&nla, 0, sizeof(nla)); > + nla.cmd = cmd; > + nla.is_all = 1; > + nat_foreach(nat_show_data, &nla, 1); > + } else { > + if (nat_get_cmd(name, cmd, &oh) != 0) > + err(EX_OSERR, "Error getting nat %s instance info", name); > + nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); > + free(oh); > + } > +} > + > diff --git a/example/ipfw/ipfw/tables.c b/example/ipfw/ipfw/tables.c > new file mode 100644 > index 0000000..e75b59a > --- /dev/null > +++ b/example/ipfw/ipfw/tables.c > @@ -0,0 +1,2013 @@ > +/* > + * Copyright (c) 2014 Yandex LLC > + * Copyright (c) 2014 Alexander V. Chernikov > + * > + * Redistribution and use in source forms, with and without modification, > + * are permitted provided that this entire comment appears intact. > + * > + * Redistribution in binary form may occur without any restrictions. > + * Obviously, it would be nice if you gave credit where credit is due > + * but requiring it would be too onerous. > + * > + * This software is provided ``AS IS'' without any warranties of any kind. > + * > + * in-kernel ipfw tables support. 
> + * > + * $FreeBSD: head/sbin/ipfw/tables.c 273241 2014-10-17 20:47:55Z melifaro $ > + */ > + > + > +#include <sys/types.h> > +#include <sys/param.h> > +#include <sys/socket.h> > +#include <sys/sysctl.h> > + > +#include <ctype.h> > +#include <err.h> > +#include <errno.h> > +#include <netdb.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sysexits.h> > + > +#include <net/if.h> > +#include <netinet/in.h> > +#include <netinet/ip_fw.h> > +#include <arpa/inet.h> > + > +#include "ipfw2.h" > + > +static void table_modify_record(ipfw_obj_header *oh, int ac, char *av[], > + int add, int quiet, int update, int atomic); > +static int table_flush(ipfw_obj_header *oh); > +static int table_destroy(ipfw_obj_header *oh); > +static int table_do_create(ipfw_obj_header *oh, ipfw_xtable_info *i); > +static int table_do_modify(ipfw_obj_header *oh, ipfw_xtable_info *i); > +static int table_do_swap(ipfw_obj_header *oh, char *second); > +static void table_create(ipfw_obj_header *oh, int ac, char *av[]); > +static void table_modify(ipfw_obj_header *oh, int ac, char *av[]); > +static void table_lookup(ipfw_obj_header *oh, int ac, char *av[]); > +static void table_lock(ipfw_obj_header *oh, int lock); > +static int table_swap(ipfw_obj_header *oh, char *second); > +static int table_get_info(ipfw_obj_header *oh, ipfw_xtable_info *i); > +static int table_show_info(ipfw_xtable_info *i, void *arg); > +static void table_fill_ntlv(ipfw_obj_ntlv *ntlv, char *name, uint32_t set, > + uint16_t uidx); > + > +static int table_flush_one(ipfw_xtable_info *i, void *arg); > +static int table_show_one(ipfw_xtable_info *i, void *arg); > +static int table_do_get_list(ipfw_xtable_info *i, ipfw_obj_header **poh); > +static void table_show_list(ipfw_obj_header *oh, int need_header); > +static void table_show_entry(ipfw_xtable_info *i, ipfw_obj_tentry *tent); > + > +static void tentry_fill_key(ipfw_obj_header *oh, ipfw_obj_tentry *tent, > + char *key, int add, uint8_t *ptype, 
uint32_t *pvmask, ipfw_xtable_info *xi); > +static void tentry_fill_value(ipfw_obj_header *oh, ipfw_obj_tentry *tent, > + char *arg, uint8_t type, uint32_t vmask); > +static void table_show_value(char *buf, size_t bufsize, ipfw_table_value *v, > + uint32_t vmask, int print_ip); > + > +typedef int (table_cb_t)(ipfw_xtable_info *i, void *arg); > +static int tables_foreach(table_cb_t *f, void *arg, int sort); > + > +#ifndef s6_addr32 > +#define s6_addr32 __u6_addr.__u6_addr32 > +#endif > + > +static struct _s_x tabletypes[] = { > + { "addr", IPFW_TABLE_ADDR }, > + { "iface", IPFW_TABLE_INTERFACE }, > + { "number", IPFW_TABLE_NUMBER }, > + { "flow", IPFW_TABLE_FLOW }, > + { NULL, 0 } > +}; > + > +static struct _s_x tablevaltypes[] = { > + { "skipto", IPFW_VTYPE_SKIPTO }, > + { "pipe", IPFW_VTYPE_PIPE }, > + { "fib", IPFW_VTYPE_FIB }, > + { "nat", IPFW_VTYPE_NAT }, > + { "dscp", IPFW_VTYPE_DSCP }, > + { "tag", IPFW_VTYPE_TAG }, > + { "divert", IPFW_VTYPE_DIVERT }, > + { "netgraph", IPFW_VTYPE_NETGRAPH }, > + { "limit", IPFW_VTYPE_LIMIT }, > + { "ipv4", IPFW_VTYPE_NH4 }, > + { "ipv6", IPFW_VTYPE_NH6 }, > + { NULL, 0 } > +}; > + > +static struct _s_x tablecmds[] = { > + { "add", TOK_ADD }, > + { "delete", TOK_DEL }, > + { "create", TOK_CREATE }, > + { "destroy", TOK_DESTROY }, > + { "flush", TOK_FLUSH }, > + { "modify", TOK_MODIFY }, > + { "swap", TOK_SWAP }, > + { "info", TOK_INFO }, > + { "detail", TOK_DETAIL }, > + { "list", TOK_LIST }, > + { "lookup", TOK_LOOKUP }, > + { "atomic", TOK_ATOMIC }, > + { "lock", TOK_LOCK }, > + { "unlock", TOK_UNLOCK }, > + { NULL, 0 } > +}; > + > +static int > +lookup_host (char *host, struct in_addr *ipaddr) > +{ > + struct hostent *he; > + > + if (!inet_aton(host, ipaddr)) { > + if ((he = gethostbyname(host)) == NULL) > + return(-1); > + *ipaddr = *(struct in_addr *)he->h_addr_list[0]; > + } > + return(0); > +} > + > +static int > +get_token(struct _s_x *table, char *string, char *errbase) > +{ > + int tcmd; > + > + if ((tcmd = 
match_token_relaxed(table, string)) < 0) > + errx(EX_USAGE, "%s %s %s", > + (tcmd == 0) ? "invalid" : "ambiguous", errbase, string); > + > + return (tcmd); > +} > + > +/* > + * This one handles all table-related commands > + * ipfw table NAME create ... > + * ipfw table NAME modify ... > + * ipfw table NAME destroy > + * ipfw table NAME swap NAME > + * ipfw table NAME lock > + * ipfw table NAME unlock > + * ipfw table NAME add addr[/masklen] [value] > + * ipfw table NAME add [addr[/masklen] value] [addr[/masklen] value] .. > + * ipfw table NAME delete addr[/masklen] [addr[/masklen]] .. > + * ipfw table NAME lookup addr > + * ipfw table {NAME | all} flush > + * ipfw table {NAME | all} list > + * ipfw table {NAME | all} info > + * ipfw table {NAME | all} detail > + */ > +void > +ipfw_table_handler(int ac, char *av[]) > +{ > + int do_add, is_all; > + int atomic, error, tcmd; > + ipfw_xtable_info i; > + ipfw_obj_header oh; > + char *tablename; > + uint32_t set; > + void *arg; > + > + memset(&oh, 0, sizeof(oh)); > + is_all = 0; > + if (co.use_set != 0) > + set = co.use_set - 1; > + else > + set = 0; > + > + ac--; av++; > + NEED1("table needs name"); > + tablename = *av; > + > + if (table_check_name(tablename) == 0) { > + table_fill_ntlv(&oh.ntlv, *av, set, 1); > + oh.idx = 1; > + } else { > + if (strcmp(tablename, "all") == 0) > + is_all = 1; > + else > + errx(EX_USAGE, "table name %s is invalid", tablename); > + } > + ac--; av++; > + NEED1("table needs command"); > + > + tcmd = get_token(tablecmds, *av, "table command"); > + /* Check if atomic operation was requested */ > + atomic = 0; > + if (tcmd == TOK_ATOMIC) { > + ac--; av++; > + NEED1("atomic needs command"); > + tcmd = get_token(tablecmds, *av, "table command"); > + switch (tcmd) { > + case TOK_ADD: > + break; > + default: > + errx(EX_USAGE, "atomic is not compatible with %s", *av); > + } > + atomic = 1; > + } > + > + switch (tcmd) { > + case TOK_LIST: > + case TOK_INFO: > + case TOK_DETAIL: > + case TOK_FLUSH: 
> + break; > + default: > + if (is_all != 0) > + errx(EX_USAGE, "table name required"); > + } > + > + switch (tcmd) { > + case TOK_ADD: > + case TOK_DEL: > + do_add = **av == 'a'; > + ac--; av++; > + table_modify_record(&oh, ac, av, do_add, co.do_quiet, > + co.do_quiet, atomic); > + break; > + case TOK_CREATE: > + ac--; av++; > + table_create(&oh, ac, av); > + break; > + case TOK_MODIFY: > + ac--; av++; > + table_modify(&oh, ac, av); > + break; > + case TOK_DESTROY: > + if (table_destroy(&oh) != 0) > + err(EX_OSERR, "failed to destroy table %s", tablename); > + break; > + case TOK_FLUSH: > + if (is_all == 0) { > + if ((error = table_flush(&oh)) != 0) > + err(EX_OSERR, "failed to flush table %s info", > + tablename); > + } else { > + error = tables_foreach(table_flush_one, &oh, 1); > + if (error != 0) > + err(EX_OSERR, "failed to flush tables list"); > + } > + break; > + case TOK_SWAP: > + ac--; av++; > + NEED1("second table name required"); > + table_swap(&oh, *av); > + break; > + case TOK_LOCK: > + case TOK_UNLOCK: > + table_lock(&oh, (tcmd == TOK_LOCK)); > + break; > + case TOK_DETAIL: > + case TOK_INFO: > + arg = (tcmd == TOK_DETAIL) ? 
(void *)1 : NULL; > + if (is_all == 0) { > + if ((error = table_get_info(&oh, &i)) != 0) > + err(EX_OSERR, "failed to request table info"); > + table_show_info(&i, arg); > + } else { > + error = tables_foreach(table_show_info, arg, 1); > + if (error != 0) > + err(EX_OSERR, "failed to request tables list"); > + } > + break; > + case TOK_LIST: > + if (is_all == 0) { > + ipfw_xtable_info i; > + if ((error = table_get_info(&oh, &i)) != 0) > + err(EX_OSERR, "failed to request table info"); > + table_show_one(&i, NULL); > + } else { > + error = tables_foreach(table_show_one, NULL, 1); > + if (error != 0) > + err(EX_OSERR, "failed to request tables list"); > + } > + break; > + case TOK_LOOKUP: > + ac--; av++; > + table_lookup(&oh, ac, av); > + break; > + } > +} > + > +static void > +table_fill_ntlv(ipfw_obj_ntlv *ntlv, char *name, uint32_t set, uint16_t uidx) > +{ > + > + ntlv->head.type = IPFW_TLV_TBL_NAME; > + ntlv->head.length = sizeof(ipfw_obj_ntlv); > + ntlv->idx = uidx; > + ntlv->set = set; > + strlcpy(ntlv->name, name, sizeof(ntlv->name)); > +} > + > +static void > +table_fill_objheader(ipfw_obj_header *oh, ipfw_xtable_info *i) > +{ > + > + oh->idx = 1; > + table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1); > +} > + > +static struct _s_x tablenewcmds[] = { > + { "type", TOK_TYPE }, > + { "valtype", TOK_VALTYPE }, > + { "algo", TOK_ALGO }, > + { "limit", TOK_LIMIT }, > + { "locked", TOK_LOCK }, > + { NULL, 0 } > +}; > + > +static struct _s_x flowtypecmds[] = { > + { "src-ip", IPFW_TFFLAG_SRCIP }, > + { "proto", IPFW_TFFLAG_PROTO }, > + { "src-port", IPFW_TFFLAG_SRCPORT }, > + { "dst-ip", IPFW_TFFLAG_DSTIP }, > + { "dst-port", IPFW_TFFLAG_DSTPORT }, > + { NULL, 0 } > +}; > + > +int > +table_parse_type(uint8_t ttype, char *p, uint8_t *tflags) > +{ > + uint32_t fset, fclear; > + char *e; > + > + /* Parse type options */ > + switch(ttype) { > + case IPFW_TABLE_FLOW: > + fset = fclear = 0; > + if (fill_flags(flowtypecmds, p, &e, &fset, &fclear) != 0) > + errx(EX_USAGE, 
> + "unable to parse flow option %s", e); > + *tflags = fset; > + break; > + default: > + return (EX_USAGE); > + } > + > + return (0); > +} > + > +void > +table_print_type(char *tbuf, size_t size, uint8_t type, uint8_t tflags) > +{ > + const char *tname; > + int l; > + > + if ((tname = match_value(tabletypes, type)) == NULL) > + tname = "unknown"; > + > + l = snprintf(tbuf, size, "%s", tname); > + tbuf += l; > + size -= l; > + > + switch(type) { > + case IPFW_TABLE_FLOW: > + if (tflags != 0) { > + *tbuf++ = ':'; > + l--; > + print_flags_buffer(tbuf, size, flowtypecmds, tflags); > + } > + break; > + } > +} > + > +/* > + * Creates new table > + * > + * ipfw table NAME create [ type { addr | iface | number | flow } ] > + * [ algo algoname ] > + */ > +static void > +table_create(ipfw_obj_header *oh, int ac, char *av[]) > +{ > + ipfw_xtable_info xi; > + int error, tcmd, val; > + uint32_t fset, fclear; > + size_t sz; > + char *e, *p; > + char tbuf[128]; > + > + sz = sizeof(tbuf); > + memset(&xi, 0, sizeof(xi)); > + > + while (ac > 0) { > + tcmd = get_token(tablenewcmds, *av, "option"); > + ac--; av++; > + > + switch (tcmd) { > + case TOK_LIMIT: > + NEED1("limit value required"); > + xi.limit = strtol(*av, NULL, 10); > + ac--; av++; > + break; > + case TOK_TYPE: > + NEED1("table type required"); > + /* Type may have suboptions after ':' */ > + if ((p = strchr(*av, ':')) != NULL) > + *p++ = '\0'; > + val = match_token(tabletypes, *av); > + if (val == -1) { > + concat_tokens(tbuf, sizeof(tbuf), tabletypes, > + ", "); > + errx(EX_USAGE, > + "Unknown tabletype: %s. 
Supported: %s", > + *av, tbuf); > + } > + xi.type = val; > + if (p != NULL) { > + error = table_parse_type(val, p, &xi.tflags); > + if (error != 0) > + errx(EX_USAGE, > + "Unsupported suboptions: %s", p); > + } > + ac--; av++; > + break; > + case TOK_VALTYPE: > + NEED1("table value type required"); > + fset = fclear = 0; > + val = fill_flags(tablevaltypes, *av, &e, &fset, &fclear); > + if (val != -1) { > + xi.vmask = fset; > + ac--; av++; > + break; > + } > + concat_tokens(tbuf, sizeof(tbuf), tablevaltypes, ", "); > + errx(EX_USAGE, "Unknown value type: %s. Supported: %s", > + e, tbuf); > + break; > + case TOK_ALGO: > + NEED1("table algorithm name required"); > + if (strlen(*av) > sizeof(xi.algoname)) > + errx(EX_USAGE, "algorithm name too long"); > + strlcpy(xi.algoname, *av, sizeof(xi.algoname)); > + ac--; av++; > + break; > + case TOK_LOCK: > + xi.flags |= IPFW_TGFLAGS_LOCKED; > + break; > + } > + } > + > + /* Set some defaults to preserve compability */ > + if (xi.algoname[0] == '\0' && xi.type == 0) > + xi.type = IPFW_TABLE_ADDR; > + if (xi.vmask == 0) > + xi.vmask = IPFW_VTYPE_LEGACY; > + > + if ((error = table_do_create(oh, &xi)) != 0) > + err(EX_OSERR, "Table creation failed"); > +} > + > +/* > + * Creates new table > + * > + * Request: [ ipfw_obj_header ipfw_xtable_info ] > + * > + * Returns 0 on success. 
> + */ > +static int > +table_do_create(ipfw_obj_header *oh, ipfw_xtable_info *i) > +{ > + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)]; > + int error; > + > + memcpy(tbuf, oh, sizeof(*oh)); > + memcpy(tbuf + sizeof(*oh), i, sizeof(*i)); > + oh = (ipfw_obj_header *)tbuf; > + > + error = do_set3(IP_FW_TABLE_XCREATE, &oh->opheader, sizeof(tbuf)); > + > + return (error); > +} > + > +/* > + * Modifies existing table > + * > + * ipfw table NAME modify [ limit number ] > + */ > +static void > +table_modify(ipfw_obj_header *oh, int ac, char *av[]) > +{ > + ipfw_xtable_info xi; > + int tcmd; > + size_t sz; > + char tbuf[128]; > + > + sz = sizeof(tbuf); > + memset(&xi, 0, sizeof(xi)); > + > + while (ac > 0) { > + tcmd = get_token(tablenewcmds, *av, "option"); > + ac--; av++; > + > + switch (tcmd) { > + case TOK_LIMIT: > + NEED1("limit value required"); > + xi.limit = strtol(*av, NULL, 10); > + xi.mflags |= IPFW_TMFLAGS_LIMIT; > + ac--; av++; > + break; > + default: > + errx(EX_USAGE, "cmd is not supported for modificatiob"); > + } > + } > + > + if (table_do_modify(oh, &xi) != 0) > + err(EX_OSERR, "Table modification failed"); > +} > + > +/* > + * Modifies existing table. > + * > + * Request: [ ipfw_obj_header ipfw_xtable_info ] > + * > + * Returns 0 on success. > + */ > +static int > +table_do_modify(ipfw_obj_header *oh, ipfw_xtable_info *i) > +{ > + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)]; > + int error; > + > + memcpy(tbuf, oh, sizeof(*oh)); > + memcpy(tbuf + sizeof(*oh), i, sizeof(*i)); > + oh = (ipfw_obj_header *)tbuf; > + > + error = do_set3(IP_FW_TABLE_XMODIFY, &oh->opheader, sizeof(tbuf)); > + > + return (error); > +} > + > +/* > + * Locks or unlocks given table > + */ > +static void > +table_lock(ipfw_obj_header *oh, int lock) > +{ > + ipfw_xtable_info xi; > + > + memset(&xi, 0, sizeof(xi)); > + > + xi.mflags |= IPFW_TMFLAGS_LOCK; > + xi.flags |= (lock != 0) ? 
IPFW_TGFLAGS_LOCKED : 0; > + > + if (table_do_modify(oh, &xi) != 0) > + err(EX_OSERR, "Table %s failed", lock != 0 ? "lock" : "unlock"); > +} > + > +/* > + * Destroys given table specified by @oh->ntlv. > + * Returns 0 on success. > + */ > +static int > +table_destroy(ipfw_obj_header *oh) > +{ > + > + if (do_set3(IP_FW_TABLE_XDESTROY, &oh->opheader, sizeof(*oh)) != 0) > + return (-1); > + > + return (0); > +} > + > +/* > + * Flushes given table specified by @oh->ntlv. > + * Returns 0 on success. > + */ > +static int > +table_flush(ipfw_obj_header *oh) > +{ > + > + if (do_set3(IP_FW_TABLE_XFLUSH, &oh->opheader, sizeof(*oh)) != 0) > + return (-1); > + > + return (0); > +} > + > +static int > +table_do_swap(ipfw_obj_header *oh, char *second) > +{ > + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ntlv)]; > + int error; > + > + memset(tbuf, 0, sizeof(tbuf)); > + memcpy(tbuf, oh, sizeof(*oh)); > + oh = (ipfw_obj_header *)tbuf; > + table_fill_ntlv((ipfw_obj_ntlv *)(oh + 1), second, oh->ntlv.set, 1); > + > + error = do_set3(IP_FW_TABLE_XSWAP, &oh->opheader, sizeof(tbuf)); > + > + return (error); > +} > + > +/* > + * Swaps given table with @second one. > + */ > +static int > +table_swap(ipfw_obj_header *oh, char *second) > +{ > + int error; > + > + if (table_check_name(second) != 0) > + errx(EX_USAGE, "table name %s is invalid", second); > + > + error = table_do_swap(oh, second); > + > + switch (error) { > + case EINVAL: > + errx(EX_USAGE, "Unable to swap table: check types"); > + case EFBIG: > + errx(EX_USAGE, "Unable to swap table: check limits"); > + } > + > + return (0); > +} > + > + > +/* > + * Retrieves table in given table specified by @oh->ntlv. > + * it inside @i. > + * Returns 0 on success. 
> + */ > +static int > +table_get_info(ipfw_obj_header *oh, ipfw_xtable_info *i) > +{ > + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)]; > + size_t sz; > + > + sz = sizeof(tbuf); > + memset(tbuf, 0, sizeof(tbuf)); > + memcpy(tbuf, oh, sizeof(*oh)); > + oh = (ipfw_obj_header *)tbuf; > + > + if (do_get3(IP_FW_TABLE_XINFO, &oh->opheader, &sz) != 0) > + return (errno); > + > + if (sz < sizeof(tbuf)) > + return (EINVAL); > + > + *i = *(ipfw_xtable_info *)(oh + 1); > + > + return (0); > +} > + > +static struct _s_x tablealgoclass[] = { > + { "hash", IPFW_TACLASS_HASH }, > + { "array", IPFW_TACLASS_ARRAY }, > + { "radix", IPFW_TACLASS_RADIX }, > + { NULL, 0 } > +}; > + > +struct ta_cldata { > + uint8_t taclass; > + uint8_t spare4; > + uint16_t itemsize; > + uint16_t itemsize6; > + uint32_t size; > + uint32_t count; > +}; > + > +/* > + * Print global/per-AF table @i algorithm info. > + */ > +static void > +table_show_tainfo(ipfw_xtable_info *i, struct ta_cldata *d, > + const char *af, const char *taclass) > +{ > + > + switch (d->taclass) { > + case IPFW_TACLASS_HASH: > + case IPFW_TACLASS_ARRAY: > + printf(" %salgorithm %s info\n", af, taclass); > + if (d->itemsize == d->itemsize6) > + printf(" size: %u items: %u itemsize: %u\n", > + d->size, d->count, d->itemsize); > + else > + printf(" size: %u items: %u " > + "itemsize4: %u itemsize6: %u\n", > + d->size, d->count, > + d->itemsize, d->itemsize6); > + break; > + case IPFW_TACLASS_RADIX: > + printf(" %salgorithm %s info\n", af, taclass); > + if (d->itemsize == d->itemsize6) > + printf(" items: %u itemsize: %u\n", > + d->count, d->itemsize); > + else > + printf(" items: %u " > + "itemsize4: %u itemsize6: %u\n", > + d->count, d->itemsize, d->itemsize6); > + break; > + default: > + printf(" algo class: %s\n", taclass); > + } > +} > + > +static void > +table_print_valheader(char *buf, size_t bufsize, uint32_t vmask) > +{ > + > + if (vmask == IPFW_VTYPE_LEGACY) { > + snprintf(buf, bufsize, "legacy"); > + return; 
> + } > + > + print_flags_buffer(buf, bufsize, tablevaltypes, vmask); > +} > + > +/* > + * Prints table info struct @i in human-readable form. > + */ > +static int > +table_show_info(ipfw_xtable_info *i, void *arg) > +{ > + const char *vtype; > + ipfw_ta_tinfo *tainfo; > + int afdata, afitem; > + struct ta_cldata d; > + char ttype[64], tvtype[64]; > + > + table_print_type(ttype, sizeof(ttype), i->type, i->tflags); > + table_print_valheader(tvtype, sizeof(tvtype), i->vmask); > + > + printf("--- table(%s), set(%u) ---\n", i->tablename, i->set); > + if ((i->flags & IPFW_TGFLAGS_LOCKED) != 0) > + printf(" kindex: %d, type: %s, locked\n", i->kidx, ttype); > + else > + printf(" kindex: %d, type: %s\n", i->kidx, ttype); > + printf(" references: %u, valtype: %s\n", i->refcnt, tvtype); > + printf(" algorithm: %s\n", i->algoname); > + printf(" items: %u, size: %u\n", i->count, i->size); > + if (i->limit > 0) > + printf(" limit: %u\n", i->limit); > + > + /* Print algo-specific info if requested & set */ > + if (arg == NULL) > + return (0); > + > + if ((i->ta_info.flags & IPFW_TATFLAGS_DATA) == 0) > + return (0); > + tainfo = &i->ta_info; > + > + afdata = 0; > + afitem = 0; > + if (tainfo->flags & IPFW_TATFLAGS_AFDATA) > + afdata = 1; > + if (tainfo->flags & IPFW_TATFLAGS_AFITEM) > + afitem = 1; > + > + memset(&d, 0, sizeof(d)); > + d.taclass = tainfo->taclass4; > + d.size = tainfo->size4; > + d.count = tainfo->count4; > + d.itemsize = tainfo->itemsize4; > + if (afdata == 0 && afitem != 0) > + d.itemsize6 = tainfo->itemsize6; > + else > + d.itemsize6 = d.itemsize; > + if ((vtype = match_value(tablealgoclass, d.taclass)) == NULL) > + vtype = "unknown"; > + > + if (afdata == 0) { > + table_show_tainfo(i, &d, "", vtype); > + } else { > + table_show_tainfo(i, &d, "IPv4 ", vtype); > + memset(&d, 0, sizeof(d)); > + d.taclass = tainfo->taclass6; > + if ((vtype = match_value(tablealgoclass, d.taclass)) == NULL) > + vtype = "unknown"; > + d.size = tainfo->size6; > + d.count = 
tainfo->count6; > + d.itemsize = tainfo->itemsize6; > + d.itemsize6 = d.itemsize; > + table_show_tainfo(i, &d, "IPv6 ", vtype); > + } > + > + return (0); > +} > + > + > +/* > + * Function wrappers which can be used either > + * as is or as foreach function parameter. > + */ > + > +static int > +table_show_one(ipfw_xtable_info *i, void *arg) > +{ > + ipfw_obj_header *oh = NULL; // XXX uninitialized > + int error; > + > + if ((error = table_do_get_list(i, &oh)) != 0) { > + err(EX_OSERR, "Error requesting table %s list", i->tablename); > + return (error); > + } > + > + table_show_list(oh, 1); > + > + free(oh); > + return (0); > +} > + > +static int > +table_flush_one(ipfw_xtable_info *i, void *arg) > +{ > + ipfw_obj_header *oh; > + > + oh = (ipfw_obj_header *)arg; > + > + table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1); > + > + return (table_flush(oh)); > +} > + > +static int > +table_do_modify_record(int cmd, ipfw_obj_header *oh, > + ipfw_obj_tentry *tent, int count, int atomic) > +{ > + ipfw_obj_ctlv *ctlv; > + ipfw_obj_tentry *tent_base; > + caddr_t pbuf; > + char xbuf[sizeof(*oh) + sizeof(ipfw_obj_ctlv) + sizeof(*tent)]; > + int error, i; > + size_t sz; > + > + sz = sizeof(*ctlv) + sizeof(*tent) * count; > + if (count == 1) { > + memset(xbuf, 0, sizeof(xbuf)); > + pbuf = xbuf; > + } else { > + if ((pbuf = calloc(1, sizeof(*oh) + sz)) == NULL) > + return (ENOMEM); > + } > + > + memcpy(pbuf, oh, sizeof(*oh)); > + oh = (ipfw_obj_header *)pbuf; > + oh->opheader.version = 1; > + > + ctlv = (ipfw_obj_ctlv *)(oh + 1); > + ctlv->count = count; > + ctlv->head.length = sz; > + if (atomic != 0) > + ctlv->flags |= IPFW_CTF_ATOMIC; > + > + tent_base = tent; > + memcpy(ctlv + 1, tent, sizeof(*tent) * count); > + tent = (ipfw_obj_tentry *)(ctlv + 1); > + for (i = 0; i < count; i++, tent++) { > + tent->head.length = sizeof(ipfw_obj_tentry); > + tent->idx = oh->idx; > + } > + > + sz += sizeof(*oh); > + error = do_get3(cmd, &oh->opheader, &sz); > + tent = (ipfw_obj_tentry 
*)(ctlv + 1); > + /* Copy result back to provided buffer */ > + memcpy(tent_base, ctlv + 1, sizeof(*tent) * count); > + > + if (pbuf != xbuf) > + free(pbuf); > + > + return (error); > +} > + > +static void > +table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add, > + int quiet, int update, int atomic) > +{ > + ipfw_obj_tentry *ptent, tent, *tent_buf; > + ipfw_xtable_info xi; > + uint8_t type; > + uint32_t vmask; > + int cmd, count, error, i, ignored; > + char *texterr, *etxt, *px; > + > + if (ac == 0) > + errx(EX_USAGE, "address required"); > + > + if (add != 0) { > + cmd = IP_FW_TABLE_XADD; > + texterr = "Adding record failed"; > + } else { > + cmd = IP_FW_TABLE_XDEL; > + texterr = "Deleting record failed"; > + } > + > + /* > + * Calculate number of entries: > + * Assume [key val] x N for add > + * and > + * key x N for delete > + */ > + count = (add != 0) ? ac / 2 + 1 : ac; > + > + if (count <= 1) { > + /* Adding single entry with/without value */ > + memset(&tent, 0, sizeof(tent)); > + tent_buf = &tent; > + } else { > + > + if ((tent_buf = calloc(count, sizeof(tent))) == NULL) > + errx(EX_OSERR, > + "Unable to allocate memory for all entries"); > + } > + ptent = tent_buf; > + > + memset(&xi, 0, sizeof(xi)); > + count = 0; > + while (ac > 0) { > + tentry_fill_key(oh, ptent, *av, add, &type, &vmask, &xi); > + > + /* > + * compability layer: auto-create table if not exists > + */ > + if (xi.tablename[0] == '\0') { > + xi.type = type; > + xi.vmask = vmask; > + strlcpy(xi.tablename, oh->ntlv.name, > + sizeof(xi.tablename)); > + fprintf(stderr, "DEPRECATED: inserting data info " > + "non-existent table %s. 
(auto-created)\n", > + xi.tablename); > + table_do_create(oh, &xi); > + } > + > + oh->ntlv.type = type; > + ac--; av++; > + > + if (add != 0 && ac > 0) { > + tentry_fill_value(oh, ptent, *av, type, vmask); > + ac--; av++; > + } > + > + if (update != 0) > + ptent->head.flags |= IPFW_TF_UPDATE; > + > + count++; > + ptent++; > + } > + > + error = table_do_modify_record(cmd, oh, tent_buf, count, atomic); > + > + quiet = 0; > + > + /* > + * Compatibility stuff: do not yell on duplicate keys or > + * failed deletions. > + */ > + if (error == 0 || (error == EEXIST && add != 0) || > + (error == ENOENT && add == 0)) { > + if (quiet != 0) { > + if (tent_buf != &tent) > + free(tent_buf); > + return; > + } > + } > + > + /* Report results back */ > + ptent = tent_buf; > + for (i = 0; i < count; ptent++, i++) { > + ignored = 0; > + switch (ptent->result) { > + case IPFW_TR_ADDED: > + px = "added"; > + break; > + case IPFW_TR_DELETED: > + px = "deleted"; > + break; > + case IPFW_TR_UPDATED: > + px = "updated"; > + break; > + case IPFW_TR_LIMIT: > + px = "limit"; > + ignored = 1; > + break; > + case IPFW_TR_ERROR: > + px = "error"; > + ignored = 1; > + break; > + case IPFW_TR_NOTFOUND: > + px = "notfound"; > + ignored = 1; > + break; > + case IPFW_TR_EXISTS: > + px = "exists"; > + ignored = 1; > + break; > + case IPFW_TR_IGNORED: > + px = "ignored"; > + ignored = 1; > + break; > + default: > + px = "unknown"; > + ignored = 1; > + } > + > + if (error != 0 && atomic != 0 && ignored == 0) > + printf("%s(reverted): ", px); > + else > + printf("%s: ", px); > + > + table_show_entry(&xi, ptent); > + } > + > + if (tent_buf != &tent) > + free(tent_buf); > + > + if (error == 0) > + return; > + /* Get real OS error */ > + error = errno; > + > + /* Try to provide more human-readable error */ > + switch (error) { > + case EEXIST: > + etxt = "record already exists"; > + break; > + case EFBIG: > + etxt = "limit hit"; > + break; > + case ESRCH: > + etxt = "table not found"; > + break; > + case 
ENOENT: > + etxt = "record not found"; > + break; > + case EACCES: > + etxt = "table is locked"; > + break; > + default: > + etxt = strerror(error); > + } > + > + errx(EX_OSERR, "%s: %s", texterr, etxt); > +} > + > +static int > +table_do_lookup(ipfw_obj_header *oh, char *key, ipfw_xtable_info *xi, > + ipfw_obj_tentry *xtent) > +{ > + char xbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_obj_tentry)]; > + ipfw_obj_tentry *tent; > + uint8_t type; > + uint32_t vmask; > + size_t sz; > + > + memcpy(xbuf, oh, sizeof(*oh)); > + oh = (ipfw_obj_header *)xbuf; > + tent = (ipfw_obj_tentry *)(oh + 1); > + > + memset(tent, 0, sizeof(*tent)); > + tent->head.length = sizeof(*tent); > + tent->idx = 1; > + > + tentry_fill_key(oh, tent, key, 0, &type, &vmask, xi); > + oh->ntlv.type = type; > + > + sz = sizeof(xbuf); > + if (do_get3(IP_FW_TABLE_XFIND, &oh->opheader, &sz) != 0) > + return (errno); > + > + if (sz < sizeof(xbuf)) > + return (EINVAL); > + > + *xtent = *tent; > + > + return (0); > +} > + > +static void > +table_lookup(ipfw_obj_header *oh, int ac, char *av[]) > +{ > + ipfw_obj_tentry xtent; > + ipfw_xtable_info xi; > + char key[64]; > + int error; > + > + if (ac == 0) > + errx(EX_USAGE, "address required"); > + > + strlcpy(key, *av, sizeof(key)); > + > + memset(&xi, 0, sizeof(xi)); > + error = table_do_lookup(oh, key, &xi, &xtent); > + > + switch (error) { > + case 0: > + break; > + case ESRCH: > + errx(EX_UNAVAILABLE, "Table %s not found", oh->ntlv.name); > + case ENOENT: > + errx(EX_UNAVAILABLE, "Entry %s not found", *av); > + case ENOTSUP: > + errx(EX_UNAVAILABLE, "Table %s algo does not support " > + "\"lookup\" method", oh->ntlv.name); > + default: > + err(EX_OSERR, "getsockopt(IP_FW_TABLE_XFIND)"); > + } > + > + table_show_entry(&xi, &xtent); > +} > + > +static void > +tentry_fill_key_type(char *arg, ipfw_obj_tentry *tentry, uint8_t type, > + uint8_t tflags) > +{ > + char *p, *pp; > + int mask, af; > + struct in6_addr *paddr, tmp; > + struct tflow_entry *tfe; > + uint32_t 
key, *pkey; > + uint16_t port; > + struct protoent *pent; > + struct servent *sent; > + int masklen; > + > + mask = 0; // XXX uninitialized ? > + masklen = 0; > + af = 0; > + paddr = (struct in6_addr *)&tentry->k; > + > + switch (type) { > + case IPFW_TABLE_ADDR: > + /* Remove / if exists */ > + if ((p = strchr(arg, '/')) != NULL) { > + *p = '\0'; > + mask = atoi(p + 1); > + } > + > + if (inet_pton(AF_INET, arg, paddr) == 1) { > + if (p != NULL && mask > 32) > + errx(EX_DATAERR, "bad IPv4 mask width: %s", > + p + 1); > + > + masklen = p ? mask : 32; > + af = AF_INET; > + } else if (inet_pton(AF_INET6, arg, paddr) == 1) { > + if (IN6_IS_ADDR_V4COMPAT(paddr)) > + errx(EX_DATAERR, > + "Use IPv4 instead of v4-compatible"); > + if (p != NULL && mask > 128) > + errx(EX_DATAERR, "bad IPv6 mask width: %s", > + p + 1); > + > + masklen = p ? mask : 128; > + af = AF_INET6; > + } else { > + /* Assume FQDN */ > + if (lookup_host(arg, (struct in_addr *)paddr) != 0) > + errx(EX_NOHOST, "hostname ``%s'' unknown", arg); > + > + masklen = 32; > + type = IPFW_TABLE_ADDR; > + af = AF_INET; > + } > + break; > + case IPFW_TABLE_INTERFACE: > + /* Assume interface name. 
Copy significant data only */ > + mask = MIN(strlen(arg), IF_NAMESIZE - 1); > + memcpy(paddr, arg, mask); > + /* Set mask to exact match */ > + masklen = 8 * IF_NAMESIZE; > + break; > + case IPFW_TABLE_NUMBER: > + /* Port or any other key */ > + key = strtol(arg, &p, 10); > + if (*p != '\0') > + errx(EX_DATAERR, "Invalid number: %s", arg); > + > + pkey = (uint32_t *)paddr; > + *pkey = key; > + masklen = 32; > + break; > + case IPFW_TABLE_FLOW: > + /* Assume [src-ip][,proto][,src-port][,dst-ip][,dst-port] */ > + tfe = &tentry->k.flow; > + af = 0; > + > + /* Handle <ipv4|ipv6> */ > + if ((tflags & IPFW_TFFLAG_SRCIP) != 0) { > + if ((p = strchr(arg, ',')) != NULL) > + *p++ = '\0'; > + /* Determine family using temporary storage */ > + if (inet_pton(AF_INET, arg, &tmp) == 1) { > + if (af != 0 && af != AF_INET) > + errx(EX_DATAERR, > + "Inconsistent address family\n"); > + af = AF_INET; > + memcpy(&tfe->a.a4.sip, &tmp, 4); > + } else if (inet_pton(AF_INET6, arg, &tmp) == 1) { > + if (af != 0 && af != AF_INET6) > + errx(EX_DATAERR, > + "Inconsistent address family\n"); > + af = AF_INET6; > + memcpy(&tfe->a.a6.sip6, &tmp, 16); > + } > + > + arg = p; > + } > + > + /* Handle <proto-num|proto-name> */ > + if ((tflags & IPFW_TFFLAG_PROTO) != 0) { > + if (arg == NULL) > + errx(EX_DATAERR, "invalid key: proto missing"); > + if ((p = strchr(arg, ',')) != NULL) > + *p++ = '\0'; > + > + key = strtol(arg, &pp, 10); > + if (*pp != '\0') { > + if ((pent = getprotobyname(arg)) == NULL) > + errx(EX_DATAERR, "Unknown proto: %s", > + arg); > + else > + key = pent->p_proto; > + } > + > + if (key > 255) > + errx(EX_DATAERR, "Bad protocol number: %u",key); > + > + tfe->proto = key; > + > + arg = p; > + } > + > + /* Handle <port-num|service-name> */ > + if ((tflags & IPFW_TFFLAG_SRCPORT) != 0) { > + if (arg == NULL) > + errx(EX_DATAERR, "invalid key: src port missing"); > + if ((p = strchr(arg, ',')) != NULL) > + *p++ = '\0'; > + > + if ((port = htons(strtol(arg, NULL, 10))) == 0) { > + if 
((sent = getservbyname(arg, NULL)) == NULL) > + errx(EX_DATAERR, "Unknown service: %s", > + arg); > + else > + key = sent->s_port; > + } > + > + tfe->sport = port; > + > + arg = p; > + } > + > + /* Handle <ipv4|ipv6>*/ > + if ((tflags & IPFW_TFFLAG_DSTIP) != 0) { > + if (arg == NULL) > + errx(EX_DATAERR, "invalid key: dst ip missing"); > + if ((p = strchr(arg, ',')) != NULL) > + *p++ = '\0'; > + /* Determine family using temporary storage */ > + if (inet_pton(AF_INET, arg, &tmp) == 1) { > + if (af != 0 && af != AF_INET) > + errx(EX_DATAERR, > + "Inconsistent address family"); > + af = AF_INET; > + memcpy(&tfe->a.a4.dip, &tmp, 4); > + } else if (inet_pton(AF_INET6, arg, &tmp) == 1) { > + if (af != 0 && af != AF_INET6) > + errx(EX_DATAERR, > + "Inconsistent address family"); > + af = AF_INET6; > + memcpy(&tfe->a.a6.dip6, &tmp, 16); > + } > + > + arg = p; > + } > + > + /* Handle <port-num|service-name> */ > + if ((tflags & IPFW_TFFLAG_DSTPORT) != 0) { > + if (arg == NULL) > + errx(EX_DATAERR, "invalid key: dst port missing"); > + if ((p = strchr(arg, ',')) != NULL) > + *p++ = '\0'; > + > + if ((port = htons(strtol(arg, NULL, 10))) == 0) { > + if ((sent = getservbyname(arg, NULL)) == NULL) > + errx(EX_DATAERR, "Unknown service: %s", > + arg); > + else > + key = sent->s_port; > + } > + > + tfe->dport = port; > + > + arg = p; > + } > + > + tfe->af = af; > + > + break; > + > + default: > + errx(EX_DATAERR, "Unsupported table type: %d", type); > + } > + > + tentry->subtype = af; > + tentry->masklen = masklen; > +} > + > +static void > +tentry_fill_key(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *key, > + int add, uint8_t *ptype, uint32_t *pvmask, ipfw_xtable_info *xi) > +{ > + uint8_t type, tflags; > + uint32_t vmask; > + int error; > + char *del; > + > + type = 0; > + tflags = 0; > + vmask = 0; > + > + if (xi->tablename[0] == '\0') > + error = table_get_info(oh, xi); > + else > + error = 0; > + > + if (error == 0) { > + /* Table found. 
*/ > + type = xi->type; > + tflags = xi->tflags; > + vmask = xi->vmask; > + } else { > + if (error != ESRCH) > + errx(EX_OSERR, "Error requesting table %s info", > + oh->ntlv.name); > + if (add == 0) > + errx(EX_DATAERR, "Table %s does not exist", > + oh->ntlv.name); > + /* > + * Table does not exist. > + * Compability layer: try to interpret data as ADDR > + * before failing. > + */ > + if ((del = strchr(key, '/')) != NULL) > + *del = '\0'; > + if (inet_pton(AF_INET, key, &tent->k.addr6) == 1 || > + inet_pton(AF_INET6, key, &tent->k.addr6) == 1) { > + /* OK Prepare and send */ > + type = IPFW_TABLE_ADDR; > + vmask = IPFW_VTYPE_LEGACY; > + } else { > + /* Inknown key */ > + errx(EX_USAGE, "Table %s does not exist, cannot guess " > + "key '%s' type", oh->ntlv.name, key); > + } > + if (del != NULL) > + *del = '/'; > + } > + > + tentry_fill_key_type(key, tent, type, tflags); > + > + *ptype = type; > + *pvmask = vmask; > +} > + > +static void > +set_legacy_value(uint32_t val, ipfw_table_value *v) > +{ > + v->tag = val; > + v->pipe = val; > + v->divert = val; > + v->skipto = val; > + v->netgraph = val; > + v->fib = val; > + v->nat = val; > + v->nh4 = val; > + v->dscp = (uint8_t)val; > + v->limit = val; > +} > + > +static void > +tentry_fill_value(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *arg, > + uint8_t type, uint32_t vmask) > +{ > + uint32_t a4, flag, val, vm; > + ipfw_table_value *v; > + uint32_t i; > + int dval; > + char *comma, *e, *etype, *n, *p; > + > + v = &tent->v.value; > + vm = vmask; > + > + /* Compat layer: keep old behavior for legacy value types */ > + if (vmask == IPFW_VTYPE_LEGACY) { > + /* Try to interpret as number first */ > + val = strtoul(arg, &p, 0); > + if (*p == '\0') { > + set_legacy_value(val, v); > + return; > + } > + if (inet_pton(AF_INET, arg, &val) == 1) { > + set_legacy_value(ntohl(val), v); > + return; > + } > + /* Try hostname */ > + if (lookup_host(arg, (struct in_addr *)&val) == 0) { > + set_legacy_value(val, v); > + return; > 
+ } > + errx(EX_OSERR, "Unable to parse value %s", arg); > + } > + > + /* > + * Shorthands: handle single value if vmask consists > + * of numbers only. e.g.: > + * vmask = "fib,skipto" -> treat input "1" as "1,1" > + */ > + > + n = arg; > + etype = NULL; > + for (i = 1; i < (1 << 31); i *= 2) { > + if ((flag = (vmask & i)) == 0) > + continue; > + vmask &= ~flag; > + > + if ((comma = strchr(n, ',')) != NULL) > + *comma = '\0'; > + > + switch (flag) { > + case IPFW_VTYPE_TAG: > + v->tag = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "tag"; > + break; > + case IPFW_VTYPE_PIPE: > + v->pipe = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "pipe"; > + break; > + case IPFW_VTYPE_DIVERT: > + v->divert = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "divert"; > + break; > + case IPFW_VTYPE_SKIPTO: > + v->skipto = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "skipto"; > + break; > + case IPFW_VTYPE_NETGRAPH: > + v->netgraph = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "netgraph"; > + break; > + case IPFW_VTYPE_FIB: > + v->fib = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "fib"; > + break; > + case IPFW_VTYPE_NAT: > + v->nat = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "nat"; > + break; > + case IPFW_VTYPE_LIMIT: > + v->limit = strtol(n, &e, 10); > + if (*e != '\0') > + etype = "limit"; > + break; > + case IPFW_VTYPE_NH4: > + if (strchr(n, '.') != NULL && > + inet_pton(AF_INET, n, &a4) == 1) { > + v->nh4 = ntohl(a4); > + break; > + } > + if (lookup_host(n, (struct in_addr *)&v->nh4) == 0) > + break; > + etype = "ipv4"; > + break; > + case IPFW_VTYPE_DSCP: > + if (isalpha(*n)) { > + if ((dval = match_token(f_ipdscp, n)) != -1) { > + v->dscp = dval; > + break; > + } else > + etype = "DSCP code"; > + } else { > + v->dscp = strtol(n, &e, 10); > + if (v->dscp > 63 || *e != '\0') > + etype = "DSCP value"; > + } > + break; > + case IPFW_VTYPE_NH6: > + if (strchr(n, ':') != NULL && > + inet_pton(AF_INET6, n, &v->nh6) == 1) > + break; > + 
etype = "ipv6"; > + break; > + } > + > + if (etype != NULL) > + errx(EX_USAGE, "Unable to parse %s as %s", n, etype); > + > + if (comma != NULL) > + *comma++ = ','; > + > + if ((n = comma) != NULL) > + continue; > + > + /* End of input. */ > + if (vmask != 0) > + errx(EX_USAGE, "Not enough fields inside value"); > + } > +} > + > +/* > + * Compare table names. > + * Honor number comparison. > + */ > +static int > +tablename_cmp(const void *a, const void *b) > +{ > + ipfw_xtable_info *ia, *ib; > + > + ia = (ipfw_xtable_info *)a; > + ib = (ipfw_xtable_info *)b; > + > + return (stringnum_cmp(ia->tablename, ib->tablename)); > +} > + > +/* > + * Retrieves table list from kernel, > + * optionally sorts it and calls requested function for each table. > + * Returns 0 on success. > + */ > +static int > +tables_foreach(table_cb_t *f, void *arg, int sort) > +{ > + ipfw_obj_lheader *olh; > + ipfw_xtable_info *info; > + size_t sz; > + int i, error; > + > + /* Start with reasonable default */ > + sz = sizeof(*olh) + 16 * sizeof(ipfw_xtable_info); > + > + for (;;) { > + if ((olh = calloc(1, sz)) == NULL) > + return (ENOMEM); > + > + olh->size = sz; > + if (do_get3(IP_FW_TABLES_XLIST, &olh->opheader, &sz) != 0) { > + sz = olh->size; > + free(olh); > + if (errno != ENOMEM) > + return (errno); > + continue; > + } > + > + if (sort != 0) > + qsort(olh + 1, olh->count, olh->objsize, tablename_cmp); > + > + info = (ipfw_xtable_info *)(olh + 1); > + for (i = 0; i < olh->count; i++) { > + error = f(info, arg); /* Ignore errors for now */ > + info = (ipfw_xtable_info *)((caddr_t)info + olh->objsize); > + } > + > + free(olh); > + break; > + } > + > + return (0); > +} > + > + > +/* > + * Retrieves all entries for given table @i in > + * eXtended format. Allocate buffer large enough > + * to store result. Called needs to free it later. > + * > + * Returns 0 on success. 
> + */ > +static int > +table_do_get_list(ipfw_xtable_info *i, ipfw_obj_header **poh) > +{ > + ipfw_obj_header *oh; > + size_t sz; > + int c; > + > + sz = 0; > + oh = NULL; > + for (c = 0; c < 8; c++) { > + if (sz < i->size) > + sz = i->size + 44; > + if (oh != NULL) > + free(oh); > + if ((oh = calloc(1, sz)) == NULL) > + continue; > + table_fill_objheader(oh, i); > + oh->opheader.version = 1; /* Current version */ > + if (do_get3(IP_FW_TABLE_XLIST, &oh->opheader, &sz) == 0) { > + *poh = oh; > + return (0); > + } > + > + if (errno != ENOMEM) > + break; > + } > + free(oh); > + > + return (errno); > +} > + > +/* > + * Shows all entries from @oh in human-readable format > + */ > +static void > +table_show_list(ipfw_obj_header *oh, int need_header) > +{ > + ipfw_obj_tentry *tent; > + uint32_t count; > + ipfw_xtable_info *i; > + > + i = (ipfw_xtable_info *)(oh + 1); > + tent = (ipfw_obj_tentry *)(i + 1); > + > + if (need_header) > + printf("--- table(%s), set(%u) ---\n", i->tablename, i->set); > + > + count = i->count; > + while (count > 0) { > + table_show_entry(i, tent); > + tent = (ipfw_obj_tentry *)((caddr_t)tent + tent->head.length); > + count--; > + } > +} > + > +static void > +table_show_value(char *buf, size_t bufsize, ipfw_table_value *v, > + uint32_t vmask, int print_ip) > +{ > + uint32_t flag, i, l; > + size_t sz; > + struct in_addr a4; > + char abuf[INET6_ADDRSTRLEN]; > + > + sz = bufsize; > + > + /* > + * Some shorthands for printing values: > + * legacy assumes all values are equal, so keep the first one. 
> + */ > + if (vmask == IPFW_VTYPE_LEGACY) { > + if (print_ip != 0) { > + flag = htonl(v->tag); > + inet_ntop(AF_INET, &flag, buf, sz); > + } else > + snprintf(buf, sz, "%u", v->tag); > + return; > + } > + > + for (i = 1; i < (1 << 31); i *= 2) { > + if ((flag = (vmask & i)) == 0) > + continue; > + l = 0; > + > + switch (flag) { > + case IPFW_VTYPE_TAG: > + l = snprintf(buf, sz, "%u,", v->tag); > + break; > + case IPFW_VTYPE_PIPE: > + l = snprintf(buf, sz, "%u,", v->pipe); > + break; > + case IPFW_VTYPE_DIVERT: > + l = snprintf(buf, sz, "%d,", v->divert); > + break; > + case IPFW_VTYPE_SKIPTO: > + l = snprintf(buf, sz, "%d,", v->skipto); > + break; > + case IPFW_VTYPE_NETGRAPH: > + l = snprintf(buf, sz, "%u,", v->netgraph); > + break; > + case IPFW_VTYPE_FIB: > + l = snprintf(buf, sz, "%u,", v->fib); > + break; > + case IPFW_VTYPE_NAT: > + l = snprintf(buf, sz, "%u,", v->nat); > + break; > + case IPFW_VTYPE_LIMIT: > + l = snprintf(buf, sz, "%u,", v->limit); > + break; > + case IPFW_VTYPE_NH4: > + a4.s_addr = htonl(v->nh4); > + inet_ntop(AF_INET, &a4, abuf, sizeof(abuf)); > + l = snprintf(buf, sz, "%s,", abuf); > + break; > + case IPFW_VTYPE_DSCP: > + l = snprintf(buf, sz, "%d,", v->dscp); > + break; > + case IPFW_VTYPE_NH6: > + inet_ntop(AF_INET6, &v->nh6, abuf, sizeof(abuf)); > + l = snprintf(buf, sz, "%s,", abuf); > + break; > + } > + > + buf += l; > + sz -= l; > + } > + > + if (sz != bufsize) > + *(buf - 1) = '\0'; > +} > + > +static void > +table_show_entry(ipfw_xtable_info *i, ipfw_obj_tentry *tent) > +{ > + char *comma, tbuf[128], pval[128]; > + void *paddr; > + struct tflow_entry *tfe; > + > + table_show_value(pval, sizeof(pval), &tent->v.value, i->vmask, > + co.do_value_as_ip); > + > + switch (i->type) { > + case IPFW_TABLE_ADDR: > + /* IPv4 or IPv6 prefixes */ > + inet_ntop(tent->subtype, &tent->k, tbuf, sizeof(tbuf)); > + printf("%s/%u %s\n", tbuf, tent->masklen, pval); > + break; > + case IPFW_TABLE_INTERFACE: > + /* Interface names */ > + printf("%s 
%s\n", tent->k.iface, pval); > + break; > + case IPFW_TABLE_NUMBER: > + /* numbers */ > + printf("%u %s\n", tent->k.key, pval); > + break; > + case IPFW_TABLE_FLOW: > + /* flows */ > + tfe = &tent->k.flow; > + comma = ""; > + > + if ((i->tflags & IPFW_TFFLAG_SRCIP) != 0) { > + if (tfe->af == AF_INET) > + paddr = &tfe->a.a4.sip; > + else > + paddr = &tfe->a.a6.sip6; > + > + inet_ntop(tfe->af, paddr, tbuf, sizeof(tbuf)); > + printf("%s%s", comma, tbuf); > + comma = ","; > + } > + > + if ((i->tflags & IPFW_TFFLAG_PROTO) != 0) { > + printf("%s%d", comma, tfe->proto); > + comma = ","; > + } > + > + if ((i->tflags & IPFW_TFFLAG_SRCPORT) != 0) { > + printf("%s%d", comma, ntohs(tfe->sport)); > + comma = ","; > + } > + if ((i->tflags & IPFW_TFFLAG_DSTIP) != 0) { > + if (tfe->af == AF_INET) > + paddr = &tfe->a.a4.dip; > + else > + paddr = &tfe->a.a6.dip6; > + > + inet_ntop(tfe->af, paddr, tbuf, sizeof(tbuf)); > + printf("%s%s", comma, tbuf); > + comma = ","; > + } > + > + if ((i->tflags & IPFW_TFFLAG_DSTPORT) != 0) { > + printf("%s%d", comma, ntohs(tfe->dport)); > + comma = ","; > + } > + > + printf(" %s\n", pval); > + } > +} > + > +static int > +table_do_get_stdlist(uint16_t opcode, ipfw_obj_lheader **polh) > +{ > + ipfw_obj_lheader req, *olh; > + size_t sz; > + > + memset(&req, 0, sizeof(req)); > + sz = sizeof(req); > + > + if (do_get3(opcode, &req.opheader, &sz) != 0) > + if (errno != ENOMEM) > + return (errno); > + > + sz = req.size; > + if ((olh = calloc(1, sz)) == NULL) > + return (ENOMEM); > + > + olh->size = sz; > + if (do_get3(opcode, &olh->opheader, &sz) != 0) { > + free(olh); > + return (errno); > + } > + > + *polh = olh; > + return (0); > +} > + > +static int > +table_do_get_algolist(ipfw_obj_lheader **polh) > +{ > + > + return (table_do_get_stdlist(IP_FW_TABLES_ALIST, polh)); > +} > + > +static int > +table_do_get_vlist(ipfw_obj_lheader **polh) > +{ > + > + return (table_do_get_stdlist(IP_FW_TABLE_VLIST, polh)); > +} > + > +void > +ipfw_list_ta(int ac, char 
*av[]) > +{ > + ipfw_obj_lheader *olh; > + ipfw_ta_info *info; > + int error, i; > + const char *atype; > + > + error = table_do_get_algolist(&olh); > + if (error != 0) > + err(EX_OSERR, "Unable to request algorithm list"); > + > + info = (ipfw_ta_info *)(olh + 1); > + for (i = 0; i < olh->count; i++) { > + if ((atype = match_value(tabletypes, info->type)) == NULL) > + atype = "unknown"; > + printf("--- %s ---\n", info->algoname); > + printf(" type: %s\n refcount: %u\n", atype, info->refcnt); > + > + info = (ipfw_ta_info *)((caddr_t)info + olh->objsize); > + } > + > + free(olh); > +} > + > + > +/* Copy of current kernel table_value structure */ > +struct _table_value { > + uint32_t tag; /* O_TAG/O_TAGGED */ > + uint32_t pipe; /* O_PIPE/O_QUEUE */ > + uint16_t divert; /* O_DIVERT/O_TEE */ > + uint16_t skipto; /* skipto, CALLRET */ > + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ > + uint32_t fib; /* O_SETFIB */ > + uint32_t nat; /* O_NAT */ > + uint32_t nh4; > + uint8_t dscp; > + uint8_t spare0[3]; > + /* -- 32 bytes -- */ > + struct in6_addr nh6; > + uint32_t limit; /* O_LIMIT */ > + uint32_t spare1; > + uint64_t refcnt; /* Number of references */ > +}; > + > +int > +compare_values(const void *_a, const void *_b) > +{ > + struct _table_value *a, *b; > + > + a = (struct _table_value *)_a; > + b = (struct _table_value *)_b; > + > + if (a->spare1 < b->spare1) > + return (-1); > + else if (a->spare1 > b->spare1) > + return (1); > + > + return (0); > +} > + > +void > +ipfw_list_values(int ac, char *av[]) > +{ > + ipfw_obj_lheader *olh; > + struct _table_value *v; > + int error, i; > + uint32_t vmask; > + char buf[128]; > + > + error = table_do_get_vlist(&olh); > + if (error != 0) > + err(EX_OSERR, "Unable to request value list"); > + > + vmask = 0x7FFFFFFF; /* Similar to IPFW_VTYPE_LEGACY */ > + > + table_print_valheader(buf, sizeof(buf), vmask); > + printf("HEADER: %s\n", buf); > + v = (struct _table_value *)(olh + 1); > + qsort(v, olh->count, olh->objsize, 
compare_values); > + for (i = 0; i < olh->count; i++) { > + table_show_value(buf, sizeof(buf), (ipfw_table_value *)v, > + vmask, 0); > + printf("[%u] refs=%lu %s\n", v->spare1, (u_long)v->refcnt, buf); > + v = (struct _table_value *)((caddr_t)v + olh->objsize); > + } > + > + free(olh); > +} > + > +int > +compare_ntlv(const void *_a, const void *_b) > +{ > + ipfw_obj_ntlv *a, *b; > + > + a = (ipfw_obj_ntlv *)_a; > + b = (ipfw_obj_ntlv *)_b; > + > + if (a->set < b->set) > + return (-1); > + else if (a->set > b->set) > + return (1); > + > + if (a->idx < b->idx) > + return (-1); > + else if (a->idx > b->idx) > + return (1); > + > + return (0); > +} > + > +int > +compare_kntlv(const void *k, const void *v) > +{ > + ipfw_obj_ntlv *ntlv; > + uint16_t key; > + > + key = *((uint16_t *)k); > + ntlv = (ipfw_obj_ntlv *)v; > + > + if (key < ntlv->idx) > + return (-1); > + else if (key > ntlv->idx) > + return (1); > + > + return (0); > +} > + > +/* > + * Finds table name in @ctlv by @idx. > + * Uses the following facts: > + * 1) All TLVs are the same size > + * 2) Kernel implementation provides already sorted list. > + * > + * Returns table name or NULL. > + */ > +char * > +table_search_ctlv(ipfw_obj_ctlv *ctlv, uint16_t idx) > +{ > + ipfw_obj_ntlv *ntlv; > + > + ntlv = bsearch(&idx, (ctlv + 1), ctlv->count, ctlv->objsize, > + compare_kntlv); > + > + if (ntlv != 0) > + return (ntlv->name); > + > + return (NULL); > +} > + > +void > +table_sort_ctlv(ipfw_obj_ctlv *ctlv) > +{ > + > + qsort(ctlv + 1, ctlv->count, ctlv->objsize, compare_ntlv); > +} > + > +int > +table_check_name(char *tablename) > +{ > + int c, i, l; > + > + /* > + * Check if tablename is null-terminated and contains > + * valid symbols only. 
Valid mask is: > + * [a-zA-Z0-9\-_\.]{1,63} > + */ > + l = strlen(tablename); > + if (l == 0 || l >= 64) > + return (EINVAL); > + for (i = 0; i < l; i++) { > + c = tablename[i]; > + if (isalpha(c) || isdigit(c) || c == '_' || > + c == '-' || c == '.') > + continue; > + return (EINVAL); > + } > + > + /* Restrict some 'special' names */ > + if (strcmp(tablename, "all") == 0) > + return (EINVAL); > + > + return (0); > +} > + > diff --git a/example/ipfw/sys/net/pfil.h b/example/ipfw/sys/net/pfil.h > new file mode 100644 > index 0000000..6fa0c25 > --- /dev/null > +++ b/example/ipfw/sys/net/pfil.h > @@ -0,0 +1,148 @@ > +/* $FreeBSD: head/sys/net/pfil.h 254777 2013-08-24 12:03:24Z andre $ */ > +/* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ > + > +/*- > + * Copyright (c) 1996 Matthew R. Green > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 3. The name of the author may not be used to endorse or promote products > + * derived from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR > + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES > + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
> + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, > + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, > + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; > + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED > + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, > + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#ifndef _NET_PFIL_H_ > +#define _NET_PFIL_H_ > + > +#include <sys/systm.h> > +#include <sys/queue.h> > +#include <sys/_lock.h> > +#include <sys/_mutex.h> > +#include <sys/lock.h> > +#include <sys/rmlock.h> > + > +struct mbuf; > +struct ifnet; > +struct inpcb; > + > +typedef int (*pfil_func_t)(void *, struct mbuf **, struct ifnet *, int, > + struct inpcb *); > + > +/* > + * The packet filter hooks are designed for anything to call them to > + * possibly intercept the packet. Multiple filter hooks are chained > + * together and after each other in the specified order. > + */ > +struct packet_filter_hook { > + TAILQ_ENTRY(packet_filter_hook) pfil_chain; > + pfil_func_t pfil_func; > + void *pfil_arg; > +}; > + > +#define PFIL_IN 0x00000001 > +#define PFIL_OUT 0x00000002 > +#define PFIL_WAITOK 0x00000004 > +#define PFIL_ALL (PFIL_IN|PFIL_OUT) > + > +typedef TAILQ_HEAD(pfil_chain, packet_filter_hook) pfil_chain_t; > + > +#define PFIL_TYPE_AF 1 /* key is AF_* type */ > +#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ > + > +#define PFIL_FLAG_PRIVATE_LOCK 0x01 /* Personal lock instead of global */ > + > +/* > + * A pfil head is created by each protocol or packet intercept point. > + * Each packet is then run through the hook chain for inspection. 
> + */ > +struct pfil_head { > + pfil_chain_t ph_in; > + pfil_chain_t ph_out; > + int ph_type; > + int ph_nhooks; > +#if defined( __linux__ ) || defined( _WIN32 ) > + rwlock_t ph_mtx; > +#else > + struct rmlock *ph_plock; /* Pointer to the used lock */ > + struct rmlock ph_lock; /* Private lock storage */ > + int flags; > +#endif > + union { > + u_long phu_val; > + void *phu_ptr; > + } ph_un; > +#define ph_af ph_un.phu_val > +#define ph_ifnet ph_un.phu_ptr > + LIST_ENTRY(pfil_head) ph_list; > +}; > + > +/* Public functions for pfil hook management by packet filters. */ > +struct pfil_head *pfil_head_get(int, u_long); > +int pfil_add_hook(pfil_func_t, void *, int, struct pfil_head *); > +int pfil_remove_hook(pfil_func_t, void *, int, struct pfil_head *); > +#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) > + > +/* Public functions to run the packet inspection by protocols. */ > +int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, > + int, struct inpcb *inp); > + > +/* Public functions for pfil head management by protocols. */ > +int pfil_head_register(struct pfil_head *); > +int pfil_head_unregister(struct pfil_head *); > + > +/* Public pfil locking functions for self managed locks by packet filters. */ > +struct rm_priotracker; /* Do not require including rmlock header */ > +int pfil_try_rlock(struct pfil_head *, struct rm_priotracker *); > +void pfil_rlock(struct pfil_head *, struct rm_priotracker *); > +void pfil_runlock(struct pfil_head *, struct rm_priotracker *); > +void pfil_wlock(struct pfil_head *); > +void pfil_wunlock(struct pfil_head *); > +int pfil_wowned(struct pfil_head *ph); > + > +/* Internal pfil locking functions. 
*/ > +#define PFIL_LOCK_INIT_REAL(l, t) \ > + rm_init_flags(l, "PFil " t " rmlock", RM_RECURSE) > +#define PFIL_LOCK_DESTROY_REAL(l) \ > + rm_destroy(l) > +#define PFIL_LOCK_INIT(p) do { \ > + if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) { \ > + PFIL_LOCK_INIT_REAL(&(p)->ph_lock, "private"); \ > + (p)->ph_plock = &(p)->ph_lock; \ > + } else \ > + (p)->ph_plock = &V_pfil_lock; \ > +} while (0) > +#define PFIL_LOCK_DESTROY(p) do { \ > + if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) \ > + PFIL_LOCK_DESTROY_REAL((p)->ph_plock); \ > +} while (0) > + > +#define PFIL_TRY_RLOCK(p, t) rm_try_rlock((p)->ph_plock, (t)) > +#define PFIL_RLOCK(p, t) rm_rlock((p)->ph_plock, (t)) > +#define PFIL_WLOCK(p) rm_wlock((p)->ph_plock) > +#define PFIL_RUNLOCK(p, t) rm_runlock((p)->ph_plock, (t)) > +#define PFIL_WUNLOCK(p) rm_wunlock((p)->ph_plock) > +#define PFIL_WOWNED(p) rm_wowned((p)->ph_plock) > + > +/* Internal locking macros for global/vnet pfil_head_list. */ > +#define PFIL_HEADLIST_LOCK() mtx_lock(&pfil_global_lock) > +#define PFIL_HEADLIST_UNLOCK() mtx_unlock(&pfil_global_lock) > + > +#endif /* _NET_PFIL_H_ */ > diff --git a/example/ipfw/sys/net/radix.c b/example/ipfw/sys/net/radix.c > new file mode 100644 > index 0000000..b423662 > --- /dev/null > +++ b/example/ipfw/sys/net/radix.c > @@ -0,0 +1,1208 @@ > +/*- > + * Copyright (c) 1988, 1989, 1993 > + * The Regents of the University of California. All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 4. 
Neither the name of the University nor the names of its contributors > + * may be used to endorse or promote products derived from this software > + * without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * @(#)radix.c 8.5 (Berkeley) 5/19/95 > + * $FreeBSD: head/sys/net/radix.c 272385 2014-10-01 21:24:58Z melifaro $ > + */ > + > +/* > + * Routines to build and maintain radix trees for routing lookups. > + */ > +#include <sys/param.h> > +#ifdef _KERNEL > +#include <sys/lock.h> > +#include <sys/mutex.h> > +#include <sys/rwlock.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/syslog.h> > +#include <net/radix.h> > +#include "opt_mpath.h" > +#ifdef RADIX_MPATH > +#include <net/radix_mpath.h> > +#endif > +#else /* !_KERNEL */ > +#include <stdio.h> > +#include <strings.h> > +#include <stdlib.h> > +#define log(x, arg...) fprintf(stderr, ## arg) > +#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) > +#define min(a, b) ((a) < (b) ? 
(a) : (b) ) > +#include <net/radix.h> > +#endif /* !_KERNEL */ > + > +static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, > + walktree_f_t *f, void *w); > +static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); > +static struct radix_node > + *rn_insert(void *, struct radix_node_head *, int *, > + struct radix_node [2]), > + *rn_newpair(void *, int, struct radix_node[2]), > + *rn_search(void *, struct radix_node *), > + *rn_search_m(void *, struct radix_node *, void *); > + > +static void rn_detachhead_internal(void **head); > +static int rn_inithead_internal(void **head, int off); > + > +#define RADIX_MAX_KEY_LEN 32 > + > +static char rn_zeros[RADIX_MAX_KEY_LEN]; > +static char rn_ones[RADIX_MAX_KEY_LEN] = { > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > +}; > + > + > +static int rn_lexobetter(void *m_arg, void *n_arg); > +static struct radix_mask * > + rn_new_radix_mask(struct radix_node *tt, > + struct radix_mask *next); > +static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, > + int skip); > + > +/* > + * The data structure for the keys is a radix tree with one way > + * branching removed. The index rn_bit at an internal node n represents a bit > + * position to be tested. The tree is arranged so that all descendants > + * of a node n have keys whose bits all agree up to position rn_bit - 1. > + * (We say the index of n is rn_bit.) > + * > + * There is at least one descendant which has a one bit at position rn_bit, > + * and at least one with a zero there. > + * > + * A route is determined by a pair of key and mask. We require the > + * bit-wise logical AND of the key and mask to be the key. > + * We define the index of a route associated with the mask to be > + * the first bit number in the mask where 0 occurs (with bit number 0 > + * representing the highest order bit). 
> + * > + * We say a mask is normal if every bit is 0, past the index of the mask. > + * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, > + * and m is a normal mask, then the route applies to every descendant of n. > + * If the index(m) < rn_bit, this implies the trailing last few bits of k > + * before bit b are all 0, (and hence consequently true of every descendant > + * of n), so the route applies to all descendants of the node as well. > + * > + * Similar logic shows that a non-normal mask m such that > + * index(m) <= index(n) could potentially apply to many children of n. > + * Thus, for each non-host route, we attach its mask to a list at an internal > + * node as high in the tree as we can go. > + * > + * The present version of the code makes use of normal routes in short- > + * circuiting an explicit mask and compare operation when testing whether > + * a key satisfies a normal route, and also in remembering the unique leaf > + * that governs a subtree. > + */ > + > +/* > + * Most of the functions in this code assume that the key/mask arguments > + * are sockaddr-like structures, where the first byte is an u_char > + * indicating the size of the entire structure. > + * > + * To make the assumption more explicit, we use the LEN() macro to access > + * this field. It is safe to pass an expression with side effects > + * to LEN() as the argument is evaluated only once. > + * We cast the result to int as this is the dominant usage. > + */ > +#define LEN(x) ( (int) (*(const u_char *)(x)) ) > + > +/* > + * XXX THIS NEEDS TO BE FIXED > + * In the code, pointers to keys and masks are passed as either > + * 'void *' (because callers used to pass pointers of various kinds), or > + * 'caddr_t' (which is fine for pointer arithmetic, but not very > + * clean when you dereference it to access data). Furthermore, caddr_t > + * is really 'char *', while the natural type to operate on keys and > + * masks would be 'u_char'. 
This mismatch requires a lot of casts and > + * intermediate variables to adapt types that clutter the code. > + */ > + > +/* > + * Search a node in the tree matching the key. > + */ > +static struct radix_node * > +rn_search(void *v_arg, struct radix_node *head) > +{ > + struct radix_node *x; > + caddr_t v; > + > + for (x = head, v = v_arg; x->rn_bit >= 0;) { > + if (x->rn_bmask & v[x->rn_offset]) > + x = x->rn_right; > + else > + x = x->rn_left; > + } > + return (x); > +} > + > +/* > + * Same as above, but with an additional mask. > + * XXX note this function is used only once. > + */ > +static struct radix_node * > +rn_search_m(void *v_arg, struct radix_node *head, void *m_arg) > +{ > + struct radix_node *x; > + caddr_t v = v_arg, m = m_arg; > + > + for (x = head; x->rn_bit >= 0;) { > + if ((x->rn_bmask & m[x->rn_offset]) && > + (x->rn_bmask & v[x->rn_offset])) > + x = x->rn_right; > + else > + x = x->rn_left; > + } > + return (x); > +} > + > +int > +rn_refines(void *m_arg, void *n_arg) > +{ > + caddr_t m = m_arg, n = n_arg; > + caddr_t lim, lim2 = lim = n + LEN(n); > + int longer = LEN(n++) - LEN(m++); > + int masks_are_equal = 1; > + > + if (longer > 0) > + lim -= longer; > + while (n < lim) { > + if (*n & ~(*m)) > + return (0); > + if (*n++ != *m++) > + masks_are_equal = 0; > + } > + while (n < lim2) > + if (*n++) > + return (0); > + if (masks_are_equal && (longer < 0)) > + for (lim2 = m - longer; m < lim2; ) > + if (*m++) > + return (1); > + return (!masks_are_equal); > +} > + > +/* > + * Search for exact match in given @head. > + * Assume host bits are cleared in @v_arg if @m_arg is not NULL > + * Note that prefixes with /32 or /128 masks are treated differently > + * from host routes. 
> + */ > +struct radix_node * > +rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) > +{ > + struct radix_node *x; > + caddr_t netmask; > + > + if (m_arg != NULL) { > + /* > + * Most common case: search exact prefix/mask > + */ > + x = rn_addmask(m_arg, head->rnh_masks, 1, > + head->rnh_treetop->rn_offset); > + if (x == NULL) > + return (NULL); > + netmask = x->rn_key; > + > + x = rn_match(v_arg, head); > + > + while (x != NULL && x->rn_mask != netmask) > + x = x->rn_dupedkey; > + > + return (x); > + } > + > + /* > + * Search for host address. > + */ > + if ((x = rn_match(v_arg, head)) == NULL) > + return (NULL); > + > + /* Check if found key is the same */ > + if (LEN(x->rn_key) != LEN(v_arg) || bcmp(x->rn_key, v_arg, LEN(v_arg))) > + return (NULL); > + > + /* Check if this is not host route */ > + if (x->rn_mask != NULL) > + return (NULL); > + > + return (x); > +} > + > +static int > +rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip) > +{ > + char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; > + char *cplim; > + int length = min(LEN(cp), LEN(cp2)); > + > + if (cp3 == NULL) > + cp3 = rn_ones; > + else > + length = min(length, LEN(cp3)); > + cplim = cp + length; cp3 += skip; cp2 += skip; > + for (cp += skip; cp < cplim; cp++, cp2++, cp3++) > + if ((*cp ^ *cp2) & *cp3) > + return (0); > + return (1); > +} > + > +/* > + * Search for longest-prefix match in given @head > + */ > +struct radix_node * > +rn_match(void *v_arg, struct radix_node_head *head) > +{ > + caddr_t v = v_arg; > + struct radix_node *t = head->rnh_treetop, *x; > + caddr_t cp = v, cp2; > + caddr_t cplim; > + struct radix_node *saved_t, *top = t; > + int off = t->rn_offset, vlen = LEN(cp), matched_off; > + int test, b, rn_bit; > + > + /* > + * Open code rn_search(v, top) to avoid overhead of extra > + * subroutine call. 
> + */ > + for (; t->rn_bit >= 0; ) { > + if (t->rn_bmask & cp[t->rn_offset]) > + t = t->rn_right; > + else > + t = t->rn_left; > + } > + /* > + * See if we match exactly as a host destination > + * or at least learn how many bits match, for normal mask finesse. > + * > + * It doesn't hurt us to limit how many bytes to check > + * to the length of the mask, since if it matches we had a genuine > + * match and the leaf we have is the most specific one anyway; > + * if it didn't match with a shorter length it would fail > + * with a long one. This wins big for class B&C netmasks which > + * are probably the most common case... > + */ > + if (t->rn_mask) > + vlen = *(u_char *)t->rn_mask; > + cp += off; cp2 = t->rn_key + off; cplim = v + vlen; > + for (; cp < cplim; cp++, cp2++) > + if (*cp != *cp2) > + goto on1; > + /* > + * This extra grot is in case we are explicitly asked > + * to look up the default. Ugh! > + * > + * Never return the root node itself, it seems to cause a > + * lot of confusion. > + */ > + if (t->rn_flags & RNF_ROOT) > + t = t->rn_dupedkey; > + return (t); > +on1: > + test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ > + for (b = 7; (test >>= 1) > 0;) > + b--; > + matched_off = cp - v; > + b += matched_off << 3; > + rn_bit = -1 - b; > + /* > + * If there is a host route in a duped-key chain, it will be first. > + */ > + if ((saved_t = t)->rn_mask == 0) > + t = t->rn_dupedkey; > + for (; t; t = t->rn_dupedkey) > + /* > + * Even if we don't match exactly as a host, > + * we may match if the leaf we wound up at is > + * a route to a net. 
> + */ > + if (t->rn_flags & RNF_NORMAL) { > + if (rn_bit <= t->rn_bit) > + return (t); > + } else if (rn_satisfies_leaf(v, t, matched_off)) > + return (t); > + t = saved_t; > + /* start searching up the tree */ > + do { > + struct radix_mask *m; > + t = t->rn_parent; > + m = t->rn_mklist; > + /* > + * If non-contiguous masks ever become important > + * we can restore the masking and open coding of > + * the search and satisfaction test and put the > + * calculation of "off" back before the "do". > + */ > + while (m) { > + if (m->rm_flags & RNF_NORMAL) { > + if (rn_bit <= m->rm_bit) > + return (m->rm_leaf); > + } else { > + off = min(t->rn_offset, matched_off); > + x = rn_search_m(v, t, m->rm_mask); > + while (x && x->rn_mask != m->rm_mask) > + x = x->rn_dupedkey; > + if (x && rn_satisfies_leaf(v, x, off)) > + return (x); > + } > + m = m->rm_mklist; > + } > + } while (t != top); > + return (0); > +} > + > +#ifdef RN_DEBUG > +int rn_nodenum; > +struct radix_node *rn_clist; > +int rn_saveinfo; > +int rn_debug = 1; > +#endif > + > +/* > + * Whenever we add a new leaf to the tree, we also add a parent node, > + * so we allocate them as an array of two elements: the first one must be > + * the leaf (see RNTORT() in route.c), the second one is the parent. > + * This routine initializes the relevant fields of the nodes, so that > + * the leaf is the left child of the parent node, and both nodes have > + * (almost) all fields filled as appropriate. > + * (XXX some fields are left unset, see the '#if 0' section). > + * The function returns a pointer to the parent node. > + */ > + > +static struct radix_node * > +rn_newpair(void *v, int b, struct radix_node nodes[2]) > +{ > + struct radix_node *tt = nodes, *t = tt + 1; > + t->rn_bit = b; > + t->rn_bmask = 0x80 >> (b & 7); > + t->rn_left = tt; > + t->rn_offset = b >> 3; > + > +#if 0 /* XXX perhaps we should fill these fields as well. 
*/ > + t->rn_parent = t->rn_right = NULL; > + > + tt->rn_mask = NULL; > + tt->rn_dupedkey = NULL; > + tt->rn_bmask = 0; > +#endif > + tt->rn_bit = -1; > + tt->rn_key = (caddr_t)v; > + tt->rn_parent = t; > + tt->rn_flags = t->rn_flags = RNF_ACTIVE; > + tt->rn_mklist = t->rn_mklist = 0; > +#ifdef RN_DEBUG > + tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; > + tt->rn_twin = t; > + tt->rn_ybro = rn_clist; > + rn_clist = tt; > +#endif > + return (t); > +} > + > +static struct radix_node * > +rn_insert(void *v_arg, struct radix_node_head *head, int *dupentry, > + struct radix_node nodes[2]) > +{ > + caddr_t v = v_arg; > + struct radix_node *top = head->rnh_treetop; > + int head_off = top->rn_offset, vlen = LEN(v); > + struct radix_node *t = rn_search(v_arg, top); > + caddr_t cp = v + head_off; > + int b; > + struct radix_node *p, *tt, *x; > + /* > + * Find first bit at which v and t->rn_key differ > + */ > + caddr_t cp2 = t->rn_key + head_off; > + int cmp_res; > + caddr_t cplim = v + vlen; > + > + while (cp < cplim) > + if (*cp2++ != *cp++) > + goto on1; > + *dupentry = 1; > + return (t); > +on1: > + *dupentry = 0; > + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; > + for (b = (cp - v) << 3; cmp_res; b--) > + cmp_res >>= 1; > + > + x = top; > + cp = v; > + do { > + p = x; > + if (cp[x->rn_offset] & x->rn_bmask) > + x = x->rn_right; > + else > + x = x->rn_left; > + } while (b > (unsigned) x->rn_bit); > + /* x->rn_bit < b && x->rn_bit >= 0 */ > +#ifdef RN_DEBUG > + if (rn_debug) > + log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); > +#endif > + t = rn_newpair(v_arg, b, nodes); > + tt = t->rn_left; > + if ((cp[p->rn_offset] & p->rn_bmask) == 0) > + p->rn_left = t; > + else > + p->rn_right = t; > + x->rn_parent = t; > + t->rn_parent = p; /* frees x, p as temp vars below */ > + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { > + t->rn_right = x; > + } else { > + t->rn_right = tt; > + t->rn_left = x; > + } > +#ifdef RN_DEBUG > + if (rn_debug) > + log(LOG_DEBUG, "rn_insert: 
Coming Out:\n"), traverse(p); > +#endif > + return (tt); > +} > + > +struct radix_node * > +rn_addmask(void *n_arg, struct radix_node_head *maskhead, int search, int skip) > +{ > + unsigned char *netmask = n_arg; > + unsigned char *cp, *cplim; > + struct radix_node *x; > + int b = 0, mlen, j; > + int maskduplicated, isnormal; > + struct radix_node *saved_x; > + unsigned char addmask_key[RADIX_MAX_KEY_LEN]; > + > + if ((mlen = LEN(netmask)) > RADIX_MAX_KEY_LEN) > + mlen = RADIX_MAX_KEY_LEN; > + if (skip == 0) > + skip = 1; > + if (mlen <= skip) > + return (maskhead->rnh_nodes); > + > + bzero(addmask_key, RADIX_MAX_KEY_LEN); > + if (skip > 1) > + bcopy(rn_ones + 1, addmask_key + 1, skip - 1); > + bcopy(netmask + skip, addmask_key + skip, mlen - skip); > + /* > + * Trim trailing zeroes. > + */ > + for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) > + cp--; > + mlen = cp - addmask_key; > + if (mlen <= skip) > + return (maskhead->rnh_nodes); > + *addmask_key = mlen; > + x = rn_search(addmask_key, maskhead->rnh_treetop); > + if (bcmp(addmask_key, x->rn_key, mlen) != 0) > + x = 0; > + if (x || search) > + return (x); > + R_Zalloc(x, struct radix_node *, RADIX_MAX_KEY_LEN + 2 * sizeof (*x)); > + if ((saved_x = x) == 0) > + return (0); > + netmask = cp = (unsigned char *)(x + 2); > + bcopy(addmask_key, cp, mlen); > + x = rn_insert(cp, maskhead, &maskduplicated, x); > + if (maskduplicated) { > + log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); > + Free(saved_x); > + return (x); > + } > + /* > + * Calculate index of mask, and check for normalcy. > + * First find the first byte with a 0 bit, then if there are > + * more bits left (remember we already trimmed the trailing 0's), > + * the bits should be contiguous, otherwise we have got > + * a non-contiguous mask. 
> + */ > +#define CONTIG(_c) (((~(_c) + 1) & (_c)) == (unsigned char)(~(_c) + 1)) > + cplim = netmask + mlen; > + isnormal = 1; > + for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) > + cp++; > + if (cp != cplim) { > + for (j = 0x80; (j & *cp) != 0; j >>= 1) > + b++; > + if (!CONTIG(*cp) || cp != (cplim - 1)) > + isnormal = 0; > + } > + b += (cp - netmask) << 3; > + x->rn_bit = -1 - b; > + if (isnormal) > + x->rn_flags |= RNF_NORMAL; > + return (x); > +} > + > +static int /* XXX: arbitrary ordering for non-contiguous masks */ > +rn_lexobetter(void *m_arg, void *n_arg) > +{ > + u_char *mp = m_arg, *np = n_arg, *lim; > + > + if (LEN(mp) > LEN(np)) > + return (1); /* not really, but need to check longer one first */ > + if (LEN(mp) == LEN(np)) > + for (lim = mp + LEN(mp); mp < lim;) > + if (*mp++ > *np++) > + return (1); > + return (0); > +} > + > +static struct radix_mask * > +rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next) > +{ > + struct radix_mask *m; > + > + R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); > + if (m == NULL) { > + log(LOG_ERR, "Failed to allocate route mask\n"); > + return (0); > + } > + bzero(m, sizeof(*m)); > + m->rm_bit = tt->rn_bit; > + m->rm_flags = tt->rn_flags; > + if (tt->rn_flags & RNF_NORMAL) > + m->rm_leaf = tt; > + else > + m->rm_mask = tt->rn_mask; > + m->rm_mklist = next; > + tt->rn_mklist = m; > + return (m); > +} > + > +struct radix_node * > +rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, > + struct radix_node treenodes[2]) > +{ > + caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; > + struct radix_node *t, *x = 0, *tt; > + struct radix_node *saved_tt, *top = head->rnh_treetop; > + short b = 0, b_leaf = 0; > + int keyduplicated; > + caddr_t mmask; > + struct radix_mask *m, **mp; > + > + /* > + * In dealing with non-contiguous masks, there may be > + * many different routes which have the same mask. 
> + * We will find it useful to have a unique pointer to > + * the mask to speed avoiding duplicate references at > + * nodes and possibly save time in calculating indices. > + */ > + if (netmask) { > + x = rn_addmask(netmask, head->rnh_masks, 0, top->rn_offset); > + if (x == NULL) > + return (0); > + b_leaf = x->rn_bit; > + b = -1 - x->rn_bit; > + netmask = x->rn_key; > + } > + /* > + * Deal with duplicated keys: attach node to previous instance > + */ > + saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); > + if (keyduplicated) { > + for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { > +#ifdef RADIX_MPATH > + /* permit multipath, if enabled for the family */ > + if (rn_mpath_capable(head) && netmask == tt->rn_mask) { > + /* > + * go down to the end of multipaths, so that > + * new entry goes into the end of rn_dupedkey > + * chain. > + */ > + do { > + t = tt; > + tt = tt->rn_dupedkey; > + } while (tt && t->rn_mask == tt->rn_mask); > + break; > + } > +#endif > + if (tt->rn_mask == netmask) > + return (0); > + if (netmask == 0 || > + (tt->rn_mask && > + ((b_leaf < tt->rn_bit) /* index(netmask) > node */ > + || rn_refines(netmask, tt->rn_mask) > + || rn_lexobetter(netmask, tt->rn_mask)))) > + break; > + } > + /* > + * If the mask is not duplicated, we wouldn't > + * find it among possible duplicate key entries > + * anyway, so the above test doesn't hurt. > + * > + * We sort the masks for a duplicated key the same way as > + * in a masklist -- most specific to least specific. > + * This may require the unfortunate nuisance of relocating > + * the head of the list. > + * > + * We also reverse, or doubly link the list through the > + * parent pointer. 
> + */ > + if (tt == saved_tt) { > + struct radix_node *xx = x; > + /* link in at head of list */ > + (tt = treenodes)->rn_dupedkey = t; > + tt->rn_flags = t->rn_flags; > + tt->rn_parent = x = t->rn_parent; > + t->rn_parent = tt; /* parent */ > + if (x->rn_left == t) > + x->rn_left = tt; > + else > + x->rn_right = tt; > + saved_tt = tt; x = xx; > + } else { > + (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; > + t->rn_dupedkey = tt; > + tt->rn_parent = t; /* parent */ > + if (tt->rn_dupedkey) /* parent */ > + tt->rn_dupedkey->rn_parent = tt; /* parent */ > + } > +#ifdef RN_DEBUG > + t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; > + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; > +#endif > + tt->rn_key = (caddr_t) v; > + tt->rn_bit = -1; > + tt->rn_flags = RNF_ACTIVE; > + } > + /* > + * Put mask in tree. > + */ > + if (netmask) { > + tt->rn_mask = netmask; > + tt->rn_bit = x->rn_bit; > + tt->rn_flags |= x->rn_flags & RNF_NORMAL; > + } > + t = saved_tt->rn_parent; > + if (keyduplicated) > + goto on2; > + b_leaf = -1 - t->rn_bit; > + if (t->rn_right == saved_tt) > + x = t->rn_left; > + else > + x = t->rn_right; > + /* Promote general routes from below */ > + if (x->rn_bit < 0) { > + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) > + if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { > + *mp = m = rn_new_radix_mask(x, 0); > + if (m) > + mp = &m->rm_mklist; > + } > + } else if (x->rn_mklist) { > + /* > + * Skip over masks whose index is > that of new node > + */ > + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) > + if (m->rm_bit >= b_leaf) > + break; > + t->rn_mklist = m; *mp = 0; > + } > +on2: > + /* Add new route to highest possible ancestor's list */ > + if ((netmask == 0) || (b > t->rn_bit )) > + return (tt); /* can't lift at all */ > + b_leaf = tt->rn_bit; > + do { > + x = t; > + t = t->rn_parent; > + } while (b <= t->rn_bit && x != top); > + /* > + * Search through routes associated with node to > + * insert new 
route according to index. > + * Need same criteria as when sorting dupedkeys to avoid > + * double loop on deletion. > + */ > + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { > + if (m->rm_bit < b_leaf) > + continue; > + if (m->rm_bit > b_leaf) > + break; > + if (m->rm_flags & RNF_NORMAL) { > + mmask = m->rm_leaf->rn_mask; > + if (tt->rn_flags & RNF_NORMAL) { > +#if !defined(RADIX_MPATH) > + log(LOG_ERR, > + "Non-unique normal route, mask not entered\n"); > +#endif > + return (tt); > + } > + } else > + mmask = m->rm_mask; > + if (mmask == netmask) { > + m->rm_refs++; > + tt->rn_mklist = m; > + return (tt); > + } > + if (rn_refines(netmask, mmask) > + || rn_lexobetter(netmask, mmask)) > + break; > + } > + *mp = rn_new_radix_mask(tt, *mp); > + return (tt); > +} > + > +struct radix_node * > +rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) > +{ > + struct radix_node *t, *p, *x, *tt; > + struct radix_mask *m, *saved_m, **mp; > + struct radix_node *dupedkey, *saved_tt, *top; > + caddr_t v, netmask; > + int b, head_off, vlen; > + > + v = v_arg; > + netmask = netmask_arg; > + x = head->rnh_treetop; > + tt = rn_search(v, x); > + head_off = x->rn_offset; > + vlen = LEN(v); > + saved_tt = tt; > + top = x; > + if (tt == 0 || > + bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) > + return (0); > + /* > + * Delete our route from mask lists. 
> + */ > + if (netmask) { > + x = rn_addmask(netmask, head->rnh_masks, 1, head_off); > + if (x == NULL) > + return (0); > + netmask = x->rn_key; > + while (tt->rn_mask != netmask) > + if ((tt = tt->rn_dupedkey) == 0) > + return (0); > + } > + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) > + goto on1; > + if (tt->rn_flags & RNF_NORMAL) { > + if (m->rm_leaf != tt || m->rm_refs > 0) { > + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); > + return (0); /* dangling ref could cause disaster */ > + } > + } else { > + if (m->rm_mask != tt->rn_mask) { > + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); > + goto on1; > + } > + if (--m->rm_refs >= 0) > + goto on1; > + } > + b = -1 - tt->rn_bit; > + t = saved_tt->rn_parent; > + if (b > t->rn_bit) > + goto on1; /* Wasn't lifted at all */ > + do { > + x = t; > + t = t->rn_parent; > + } while (b <= t->rn_bit && x != top); > + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) > + if (m == saved_m) { > + *mp = m->rm_mklist; > + Free(m); > + break; > + } > + if (m == 0) { > + log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); > + if (tt->rn_flags & RNF_NORMAL) > + return (0); /* Dangling ref to us */ > + } > +on1: > + /* > + * Eliminate us from tree > + */ > + if (tt->rn_flags & RNF_ROOT) > + return (0); > +#ifdef RN_DEBUG > + /* Get us out of the creation list */ > + for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} > + if (t) t->rn_ybro = tt->rn_ybro; > +#endif > + t = tt->rn_parent; > + dupedkey = saved_tt->rn_dupedkey; > + if (dupedkey) { > + /* > + * Here, tt is the deletion target and > + * saved_tt is the head of the dupekey chain. 
> + */ > + if (tt == saved_tt) { > + /* remove from head of chain */ > + x = dupedkey; x->rn_parent = t; > + if (t->rn_left == tt) > + t->rn_left = x; > + else > + t->rn_right = x; > + } else { > + /* find node in front of tt on the chain */ > + for (x = p = saved_tt; p && p->rn_dupedkey != tt;) > + p = p->rn_dupedkey; > + if (p) { > + p->rn_dupedkey = tt->rn_dupedkey; > + if (tt->rn_dupedkey) /* parent */ > + tt->rn_dupedkey->rn_parent = p; > + /* parent */ > + } else log(LOG_ERR, "rn_delete: couldn't find us\n"); > + } > + t = tt + 1; > + if (t->rn_flags & RNF_ACTIVE) { > +#ifndef RN_DEBUG > + *++x = *t; > + p = t->rn_parent; > +#else > + b = t->rn_info; > + *++x = *t; > + t->rn_info = b; > + p = t->rn_parent; > +#endif > + if (p->rn_left == t) > + p->rn_left = x; > + else > + p->rn_right = x; > + x->rn_left->rn_parent = x; > + x->rn_right->rn_parent = x; > + } > + goto out; > + } > + if (t->rn_left == tt) > + x = t->rn_right; > + else > + x = t->rn_left; > + p = t->rn_parent; > + if (p->rn_right == t) > + p->rn_right = x; > + else > + p->rn_left = x; > + x->rn_parent = p; > + /* > + * Demote routes attached to us. > + */ > + if (t->rn_mklist) { > + if (x->rn_bit >= 0) { > + for (mp = &x->rn_mklist; (m = *mp);) > + mp = &m->rm_mklist; > + *mp = t->rn_mklist; > + } else { > + /* If there are any key,mask pairs in a sibling > + duped-key chain, some subset will appear sorted > + in the same order attached to our mklist */ > + for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) > + if (m == x->rn_mklist) { > + struct radix_mask *mm = m->rm_mklist; > + x->rn_mklist = 0; > + if (--(m->rm_refs) < 0) > + Free(m); > + m = mm; > + } > + if (m) > + log(LOG_ERR, > + "rn_delete: Orphaned Mask %p at %p\n", > + m, x); > + } > + } > + /* > + * We may be holding an active internal node in the tree. 
> + */ > + x = tt + 1; > + if (t != x) { > +#ifndef RN_DEBUG > + *t = *x; > +#else > + b = t->rn_info; > + *t = *x; > + t->rn_info = b; > +#endif > + t->rn_left->rn_parent = t; > + t->rn_right->rn_parent = t; > + p = x->rn_parent; > + if (p->rn_left == x) > + p->rn_left = t; > + else > + p->rn_right = t; > + } > +out: > + tt->rn_flags &= ~RNF_ACTIVE; > + tt[1].rn_flags &= ~RNF_ACTIVE; > + return (tt); > +} > + > +/* > + * This is the same as rn_walktree() except for the parameters and the > + * exit. > + */ > +static int > +rn_walktree_from(struct radix_node_head *h, void *a, void *m, > + walktree_f_t *f, void *w) > +{ > + int error; > + struct radix_node *base, *next; > + u_char *xa = (u_char *)a; > + u_char *xm = (u_char *)m; > + struct radix_node *rn, *last = NULL; /* shut up gcc */ > + int stopping = 0; > + int lastb; > + > + KASSERT(m != NULL, ("%s: mask needs to be specified", __func__)); > + > + /* > + * rn_search_m is sort-of-open-coded here. We cannot use the > + * function because we need to keep track of the last node seen. > + */ > + /* printf("about to search\n"); */ > + for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { > + last = rn; > + /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", > + rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ > + if (!(rn->rn_bmask & xm[rn->rn_offset])) { > + break; > + } > + if (rn->rn_bmask & xa[rn->rn_offset]) { > + rn = rn->rn_right; > + } else { > + rn = rn->rn_left; > + } > + } > + /* printf("done searching\n"); */ > + > + /* > + * Two cases: either we stepped off the end of our mask, > + * in which case last == rn, or we reached a leaf, in which > + * case we want to start from the leaf. > + */ > + if (rn->rn_bit >= 0) > + rn = last; > + lastb = last->rn_bit; > + > + /* printf("rn %p, lastb %d\n", rn, lastb);*/ > + > + /* > + * This gets complicated because we may delete the node > + * while applying the function f to it, so we need to calculate > + * the successor node in advance. 
> + */ > + while (rn->rn_bit >= 0) > + rn = rn->rn_left; > + > + while (!stopping) { > + /* printf("node %p (%d)\n", rn, rn->rn_bit); */ > + base = rn; > + /* If at right child go back up, otherwise, go right */ > + while (rn->rn_parent->rn_right == rn > + && !(rn->rn_flags & RNF_ROOT)) { > + rn = rn->rn_parent; > + > + /* if went up beyond last, stop */ > + if (rn->rn_bit <= lastb) { > + stopping = 1; > + /* printf("up too far\n"); */ > + /* > + * XXX we should jump to the 'Process leaves' > + * part, because the values of 'rn' and 'next' > + * we compute will not be used. Not a big deal > + * because this loop will terminate, but it is > + * inefficient and hard to understand! > + */ > + } > + } > + > + /* > + * At the top of the tree, no need to traverse the right > + * half, prevent the traversal of the entire tree in the > + * case of default route. > + */ > + if (rn->rn_parent->rn_flags & RNF_ROOT) > + stopping = 1; > + > + /* Find the next *leaf* since next node might vanish, too */ > + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) > + rn = rn->rn_left; > + next = rn; > + /* Process leaves */ > + while ((rn = base) != 0) { > + base = rn->rn_dupedkey; > + /* printf("leaf %p\n", rn); */ > + if (!(rn->rn_flags & RNF_ROOT) > + && (error = (*f)(rn, w))) > + return (error); > + } > + rn = next; > + > + if (rn->rn_flags & RNF_ROOT) { > + /* printf("root, stopping"); */ > + stopping = 1; > + } > + > + } > + return (0); > +} > + > +static int > +rn_walktree(struct radix_node_head *h, walktree_f_t *f, void *w) > +{ > + int error; > + struct radix_node *base, *next; > + struct radix_node *rn = h->rnh_treetop; > + /* > + * This gets complicated because we may delete the node > + * while applying the function f to it, so we need to calculate > + * the successor node in advance. 
> + */ > + > + /* First time through node, go left */ > + while (rn->rn_bit >= 0) > + rn = rn->rn_left; > + for (;;) { > + base = rn; > + /* If at right child go back up, otherwise, go right */ > + while (rn->rn_parent->rn_right == rn > + && (rn->rn_flags & RNF_ROOT) == 0) > + rn = rn->rn_parent; > + /* Find the next *leaf* since next node might vanish, too */ > + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) > + rn = rn->rn_left; > + next = rn; > + /* Process leaves */ > + while ((rn = base)) { > + base = rn->rn_dupedkey; > + if (!(rn->rn_flags & RNF_ROOT) > + && (error = (*f)(rn, w))) > + return (error); > + } > + rn = next; > + if (rn->rn_flags & RNF_ROOT) > + return (0); > + } > + /* NOTREACHED */ > +} > + > +/* > + * Allocate and initialize an empty tree. This has 3 nodes, which are > + * part of the radix_node_head (in the order <left,root,right>) and are > + * marked RNF_ROOT so they cannot be freed. > + * The leaves have all-zero and all-one keys, with significant > + * bits starting at 'off'. > + * Return 1 on success, 0 on error. > + */ > +static int > +rn_inithead_internal(void **head, int off) > +{ > + struct radix_node_head *rnh; > + struct radix_node *t, *tt, *ttt; > + if (*head) > + return (1); > + R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); > + if (rnh == 0) > + return (0); > + *head = rnh; > + t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); > + ttt = rnh->rnh_nodes + 2; > + t->rn_right = ttt; > + t->rn_parent = t; > + tt = t->rn_left; /* ... 
which in turn is rnh->rnh_nodes */ > + tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; > + tt->rn_bit = -1 - off; > + *ttt = *tt; > + ttt->rn_key = rn_ones; > + rnh->rnh_addaddr = rn_addroute; > + rnh->rnh_deladdr = rn_delete; > + rnh->rnh_matchaddr = rn_match; > + rnh->rnh_lookup = rn_lookup; > + rnh->rnh_walktree = rn_walktree; > + rnh->rnh_walktree_from = rn_walktree_from; > + rnh->rnh_treetop = t; > + return (1); > +} > + > +static void > +rn_detachhead_internal(void **head) > +{ > + struct radix_node_head *rnh; > + > + KASSERT((head != NULL && *head != NULL), > + ("%s: head already freed", __func__)); > + rnh = *head; > + > + /* Free <left,root,right> nodes. */ > + Free(rnh); > + > + *head = NULL; > +} > + > +int > +rn_inithead(void **head, int off) > +{ > + struct radix_node_head *rnh; > + > + if (*head != NULL) > + return (1); > + > + if (rn_inithead_internal(head, off) == 0) > + return (0); > + > + rnh = (struct radix_node_head *)(*head); > + > + if (rn_inithead_internal((void **)&rnh->rnh_masks, 0) == 0) { > + rn_detachhead_internal(head); > + return (0); > + } > + > + return (1); > +} > + > +static int > +rn_freeentry(struct radix_node *rn, void *arg) > +{ > + struct radix_node_head * const rnh = arg; > + struct radix_node *x; > + > + x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); > + if (x != NULL) > + Free(x); > + return (0); > +} > + > +int > +rn_detachhead(void **head) > +{ > + struct radix_node_head *rnh; > + > + KASSERT((head != NULL && *head != NULL), > + ("%s: head already freed", __func__)); > + > + rnh = *head; > + > + rn_walktree(rnh->rnh_masks, rn_freeentry, rnh->rnh_masks); > + rn_detachhead_internal((void **)&rnh->rnh_masks); > + rn_detachhead_internal(head); > + return (1); > +} > + > diff --git a/example/ipfw/sys/net/radix.h b/example/ipfw/sys/net/radix.h > new file mode 100644 > index 0000000..43742fa > --- /dev/null > +++ b/example/ipfw/sys/net/radix.h > @@ -0,0 +1,168 @@ > +/*- > + * Copyright (c) 1988, 1989, 1993 > + * The 
Regents of the University of California. All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 4. Neither the name of the University nor the names of its contributors > + * may be used to endorse or promote products derived from this software > + * without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + * > + * @(#)radix.h 8.2 (Berkeley) 10/31/94 > + * $FreeBSD: head/sys/net/radix.h 262758 2014-03-04 23:55:04Z gnn $ > + */ > + > +#ifndef _RADIX_H_ > +#define _RADIX_H_ > + > +#ifdef _KERNEL > +#include <sys/_lock.h> > +#include <sys/_mutex.h> > +#include <sys/_rwlock.h> > +#endif > + > +#ifdef MALLOC_DECLARE > +MALLOC_DECLARE(M_RTABLE); > +#endif > + > +/* > + * Radix search tree node layout. > + */ > + > +struct radix_node { > + struct radix_mask *rn_mklist; /* list of masks contained in subtree */ > + struct radix_node *rn_parent; /* parent */ > + short rn_bit; /* bit offset; -1-index(netmask) */ > + char rn_bmask; /* node: mask for bit test*/ > + u_char rn_flags; /* enumerated next */ > +#define RNF_NORMAL 1 /* leaf contains normal route */ > +#define RNF_ROOT 2 /* leaf is root leaf for tree */ > +#define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ > + union { > + struct { /* leaf only data: */ > + caddr_t rn_Key; /* object of search */ > + caddr_t rn_Mask; /* netmask, if present */ > + struct radix_node *rn_Dupedkey; > + } rn_leaf; > + struct { /* node only data: */ > + int rn_Off; /* where to start compare */ > + struct radix_node *rn_L;/* progeny */ > + struct radix_node *rn_R;/* progeny */ > + } rn_node; > + } rn_u; > +#ifdef RN_DEBUG > + int rn_info; > + struct radix_node *rn_twin; > + struct radix_node *rn_ybro; > +#endif > +}; > + > +#define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey > +#define rn_key rn_u.rn_leaf.rn_Key > +#define rn_mask rn_u.rn_leaf.rn_Mask > +#define rn_offset rn_u.rn_node.rn_Off > +#define rn_left rn_u.rn_node.rn_L > +#define rn_right rn_u.rn_node.rn_R > + > +/* > + * Annotations to tree concerning potential routes applying to subtrees. > + */ > + > +struct radix_mask { > + short rm_bit; /* bit offset; -1-index(netmask) */ > + char rm_unused; /* cf. rn_bmask */ > + u_char rm_flags; /* cf. 
rn_flags */ > + struct radix_mask *rm_mklist; /* more masks to try */ > + union { > + caddr_t rmu_mask; /* the mask */ > + struct radix_node *rmu_leaf; /* for normal routes */ > + } rm_rmu; > + int rm_refs; /* # of references to this struct */ > +}; > + > +#define rm_mask rm_rmu.rmu_mask > +#define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ > + > +typedef int walktree_f_t(struct radix_node *, void *); > + > +struct radix_node_head { > + struct radix_node *rnh_treetop; > + u_int rnh_gen; /* generation counter */ > + int rnh_multipath; /* multipath capable ? */ > + struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ > + (void *v, void *mask, > + struct radix_node_head *head, struct radix_node nodes[]); > + struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ > + (void *v, void *mask, struct radix_node_head *head); > + struct radix_node *(*rnh_matchaddr) /* longest match for sockaddr */ > + (void *v, struct radix_node_head *head); > + struct radix_node *(*rnh_lookup) /*exact match for sockaddr*/ > + (void *v, void *mask, struct radix_node_head *head); > + int (*rnh_walktree) /* traverse tree */ > + (struct radix_node_head *head, walktree_f_t *f, void *w); > + int (*rnh_walktree_from) /* traverse tree below a */ > + (struct radix_node_head *head, void *a, void *m, > + walktree_f_t *f, void *w); > + void (*rnh_close) /* do something when the last ref drops */ > + (struct radix_node *rn, struct radix_node_head *head); > + struct radix_node rnh_nodes[3]; /* empty tree for common case */ > + struct radix_node_head *rnh_masks; /* Storage for our masks */ > +#ifdef _KERNEL > + struct rwlock rnh_lock; /* locks entire radix tree */ > +#endif > +}; > + > +#ifndef _KERNEL > +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) > +#define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) > +#define R_Free(p) free((char *)p); > +#else > +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) > 
+#define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) > +#define Free(p) free((caddr_t)p, M_RTABLE); > + > +#define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ > + rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) > +#define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock) > +#define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) > +#define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) > +#define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) > +#define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) > + > + > +#define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) > +#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) > +#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) > +#endif /* _KERNEL */ > + > +int rn_inithead(void **, int); > +int rn_detachhead(void **); > +int rn_refines(void *, void *); > +struct radix_node > + *rn_addmask(void *, struct radix_node_head *, int, int), > + *rn_addroute (void *, void *, struct radix_node_head *, > + struct radix_node [2]), > + *rn_delete(void *, void *, struct radix_node_head *), > + *rn_lookup (void *v_arg, void *m_arg, > + struct radix_node_head *head), > + *rn_match(void *, struct radix_node_head *); > + > +#endif /* _RADIX_H_ */ > diff --git a/example/ipfw/sys/netgraph/ng_ipfw.h b/example/ipfw/sys/netgraph/ng_ipfw.h > new file mode 100644 > index 0000000..c60426e > --- /dev/null > +++ b/example/ipfw/sys/netgraph/ng_ipfw.h > @@ -0,0 +1,33 @@ > +/*- > + * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org> > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * $FreeBSD: head/sys/netgraph/ng_ipfw.h 201124 2009-12-28 12:29:13Z luigi $ > + */ > + > +#ifndef _NG_IPFW_H > +#define _NG_IPFW_H > +#define NG_IPFW_NODE_TYPE "ipfw" > +#define NGM_IPFW_COOKIE 1105988990 > +#endif /* _NG_IPFW_H */ > diff --git a/example/ipfw/sys/netinet/in_cksum.c b/example/ipfw/sys/netinet/in_cksum.c > new file mode 100644 > index 0000000..8d95ce5 > --- /dev/null > +++ b/example/ipfw/sys/netinet/in_cksum.c > @@ -0,0 +1,146 @@ > +/*- > + * Copyright (c) 1988, 1992, 1993 > + * The Regents of the University of California. All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 4. Neither the name of the University nor the names of its contributors > + * may be used to endorse or promote products derived from this software > + * without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netinet/in_cksum.c 238941 2012-07-31 08:04:49Z luigi $"); > + > +#include <sys/param.h> > +#include <sys/mbuf.h> > + > +/* > + * Checksum routine for Internet Protocol family headers (Portable Version). > + * > + * This routine is very heavily used in the network > + * code and should be modified for each CPU to be as fast as possible. > + */ > + > +#define ADDCARRY(x) (x > 65535 ? 
x -= 65535 : x) > +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} > + > +int > +in_cksum(struct mbuf *m, int len) > +{ > + register u_short *w; > + register int sum = 0; > + register int mlen = 0; > + int byte_swapped = 0; > + > + union { > + char c[2]; > + u_short s; > + } s_util; > + union { > + u_short s[2]; > + long l; > + } l_util; > + > + for (;m && len; m = m->m_next) { > + if (m->m_len == 0) > + continue; > + w = mtod(m, u_short *); > + if (mlen == -1) { > + /* > + * The first byte of this mbuf is the continuation > + * of a word spanning between this mbuf and the > + * last mbuf. > + * > + * s_util.c[0] is already saved when scanning previous > + * mbuf. > + */ > + s_util.c[1] = *(char *)w; > + sum += s_util.s; > + w = (u_short *)((char *)w + 1); > + mlen = m->m_len - 1; > + len--; > + } else > + mlen = m->m_len; > + if (len < mlen) > + mlen = len; > + len -= mlen; > + /* > + * Force to even boundary. > + */ > + if ((1 & (uintptr_t) w) && (mlen > 0)) { > + REDUCE; > + sum <<= 8; > + s_util.c[0] = *(u_char *)w; > + w = (u_short *)((char *)w + 1); > + mlen--; > + byte_swapped = 1; > + } > + /* > + * Unroll the loop to make overhead from > + * branches &c small. 
> + */ > + while ((mlen -= 32) >= 0) { > + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; > + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; > + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; > + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; > + w += 16; > + } > + mlen += 32; > + while ((mlen -= 8) >= 0) { > + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; > + w += 4; > + } > + mlen += 8; > + if (mlen == 0 && byte_swapped == 0) > + continue; > + REDUCE; > + while ((mlen -= 2) >= 0) { > + sum += *w++; > + } > + if (byte_swapped) { > + REDUCE; > + sum <<= 8; > + byte_swapped = 0; > + if (mlen == -1) { > + s_util.c[1] = *(char *)w; > + sum += s_util.s; > + mlen = 0; > + } else > + mlen = -1; > + } else if (mlen == -1) > + s_util.c[0] = *(char *)w; > + } > + if (len) > + printf("cksum: out of data\n"); > + if (mlen == -1) { > + /* The last mbuf has odd # of bytes. Follow the > + standard (the odd byte may be shifted left by 8 bits > + or not as determined by endian-ness of the machine) */ > + s_util.c[1] = 0; > + sum += s_util.s; > + } > + REDUCE; > + return (~sum & 0xffff); > +} > diff --git a/example/ipfw/sys/netinet/ip_dummynet.h b/example/ipfw/sys/netinet/ip_dummynet.h > new file mode 100644 > index 0000000..3378e82 > --- /dev/null > +++ b/example/ipfw/sys/netinet/ip_dummynet.h > @@ -0,0 +1,264 @@ > +/*- > + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa > + * Portions Copyright (c) 2000 Akamba Corp. > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * $FreeBSD: head/sys/netinet/ip_dummynet.h 266941 2014-06-01 07:28:24Z hiren $ > + */ > + > +#ifndef _IP_DUMMYNET_H > +#define _IP_DUMMYNET_H > + > +/* > + * Definition of the kernel-userland API for dummynet. > + * > + * Setsockopt() and getsockopt() pass a batch of objects, each > + * of them starting with a "struct dn_id" which should fully identify > + * the object and its relation with others in the sequence. > + * The first object in each request should have > + * type= DN_CMD_*, id = DN_API_VERSION. > + * For other objects, type and subtype specify the object, len indicates > + * the total length including the header, and 'id' identifies the specific > + * object. > + * > + * Most objects are numbered with an identifier in the range 1..65535. > + * DN_MAX_ID indicates the first value outside the range. 
> + */ > + > +#define DN_API_VERSION 12500000 > +#define DN_MAX_ID 0x10000 > + > +struct dn_id { > + uint16_t len; /* total obj len including this header */ > + uint8_t type; > + uint8_t subtype; > + uint32_t id; /* generic id */ > +}; > + > +/* > + * These values are in the type field of struct dn_id. > + * To preserve the ABI, never rearrange the list or delete > + * entries with the exception of DN_LAST > + */ > +enum { > + DN_NONE = 0, > + DN_LINK = 1, > + DN_FS, > + DN_SCH, > + DN_SCH_I, > + DN_QUEUE, > + DN_DELAY_LINE, > + DN_PROFILE, > + DN_FLOW, /* struct dn_flow */ > + DN_TEXT, /* opaque text is the object */ > + > + DN_CMD_CONFIG = 0x80, /* objects follow */ > + DN_CMD_DELETE, /* subtype + list of entries */ > + DN_CMD_GET, /* subtype + list of entries */ > + DN_CMD_FLUSH, > + /* for compatibility with FreeBSD 7.2/8 */ > + DN_COMPAT_PIPE, > + DN_COMPAT_QUEUE, > + DN_GET_COMPAT, > + > + /* special commands for emulation of sysctl variables */ > + DN_SYSCTL_GET, > + DN_SYSCTL_SET, > + > + DN_LAST, > +}; > + > +enum { /* subtype for schedulers, flowset and the like */ > + DN_SCHED_UNKNOWN = 0, > + DN_SCHED_FIFO = 1, > + DN_SCHED_WF2QP = 2, > + /* others are in individual modules */ > +}; > + > +enum { /* user flags */ > + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ > + DN_NOERROR = 0x0002, /* do not report errors */ > + DN_QHT_HASH = 0x0004, /* qht is a hash table */ > + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ > + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ > + DN_IS_RED = 0x0020, > + DN_IS_GENTLE_RED= 0x0040, > + DN_IS_ECN = 0x0080, > + DN_PIPE_CMD = 0x1000, /* pipe config... */ > +}; > + > +/* > + * link template. > + */ > +struct dn_link { > + struct dn_id oid; > + > + /* > + * Userland sets bw and delay in bits/s and milliseconds. > + * The kernel converts this back and forth to bits/tick and ticks. > + * XXX what about burst ? > + */ > + int32_t link_nr; > + int bandwidth; /* bit/s or bits/tick. 
*/ > + int delay; /* ms and ticks */ > + uint64_t burst; /* scaled. bits*Hz XXX */ > +}; > + > +/* > + * A flowset, which is a template for flows. Contains parameters > + * from the command line: id, target scheduler, queue sizes, plr, > + * flow masks, buckets for the flow hash, and possibly scheduler- > + * specific parameters (weight, quantum and so on). > + */ > +struct dn_fs { > + struct dn_id oid; > + uint32_t fs_nr; /* the flowset number */ > + uint32_t flags; /* userland flags */ > + int qsize; /* queue size in slots or bytes */ > + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ > + uint32_t buckets; /* buckets used for the queue hash table */ > + > + struct ipfw_flow_id flow_mask; > + uint32_t sched_nr; /* the scheduler we attach to */ > + /* generic scheduler parameters. Leave them at -1 if unset. > + * Now we use 0: weight, 1: lmax, 2: priority > + */ > + int par[4]; > + > + /* RED/GRED parameters. > + * weight and probabilities are in the range 0..1 represented > + * in fixed point arithmetic with SCALE_RED fractional bits. > + */ > +#define SCALE_RED 16 > +#define SCALE(x) ( (x) << SCALE_RED ) > +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) > +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) > + int w_q ; /* queue weight (scaled) */ > + int max_th ; /* maximum threshold for queue (scaled) */ > + int min_th ; /* minimum threshold for queue (scaled) */ > + int max_p ; /* maximum value for p_b (scaled) */ > + > +}; > + > +/* > + * dn_flow collects flow_id and stats for queues and scheduler > + * instances, and is used to pass this info to userland. > + * oid.type/oid.subtype describe the object, oid.id is the number > + * of the parent object. 
> + */ > +struct dn_flow { > + struct dn_id oid; > + struct ipfw_flow_id fid; > + uint64_t tot_pkts; /* statistics counters */ > + uint64_t tot_bytes; > + uint32_t length; /* Queue length, in packets */ > + uint32_t len_bytes; /* Queue length, in bytes */ > + uint32_t drops; > +}; > + > + > +/* > + * Scheduler template, mostly indicating the name, number, > + * sched_mask and buckets. > + */ > +struct dn_sch { > + struct dn_id oid; > + uint32_t sched_nr; /* N, scheduler number */ > + uint32_t buckets; /* number of buckets for the instances */ > + uint32_t flags; /* have_mask, ... */ > + > + char name[16]; /* null terminated */ > + /* mask to select the appropriate scheduler instance */ > + struct ipfw_flow_id sched_mask; /* M */ > +}; > + > + > +/* A delay profile is attached to a link. > + * Note that a profile, as any other object, cannot be longer than 2^16 > + */ > +#define ED_MAX_SAMPLES_NO 1024 > +struct dn_profile { > + struct dn_id oid; > + /* fields to simulate a delay profile */ > +#define ED_MAX_NAME_LEN 32 > + char name[ED_MAX_NAME_LEN]; > + int link_nr; > + int loss_level; > + int _bandwidth; // XXX use link bandwidth? unused ? > + int samples_no; /* actual len of samples[] */ > + int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ > +}; > + > + > + > +/* > + * Overall structure of dummynet > + > +In dummynet, packets are selected with the firewall rules, and passed > +to two different objects: PIPE or QUEUE (bad name). > + > +A QUEUE defines a classifier, which groups packets into flows > +according to a 'mask', puts them into independent queues (one > +per flow) with configurable size and queue management policy, > +and passes flows to a scheduler: > + > + (flow_mask|sched_mask) sched_mask > + +---------+ weight Wx +-------------+ > + | |->-[flow]-->--| |-+ > + -->--| QUEUE x | ... | | | > + | |->-[flow]-->--| SCHEDuler N | | > + +---------+ | | | > + ... 
| +--[LINK N]-->-- > + +---------+ weight Wy | | +--[LINK N]-->-- > + | |->-[flow]-->--| | | > + -->--| QUEUE y | ... | | | > + | |->-[flow]-->--| | | > + +---------+ +-------------+ | > + +-------------+ > + > +Many QUEUE objects can connect to the same scheduler, each > +QUEUE object can have its own set of parameters. > + > +In turn, the SCHEDuler 'forks' multiple instances according > +to a 'sched_mask', each instance manages its own set of queues > +and transmits on a private instance of a configurable LINK. > + > +A PIPE is a simplified version of the above, where there > +is no flow_mask, and each scheduler instance handles a single queue. > + > +The following data structures (visible from userland) describe > +the objects used by dummynet: > + > + + dn_link, contains the main configuration parameters related > + to delay and bandwidth; > + + dn_profile describes a delay profile; > + + dn_flow describes the flow status (flow id, statistics) > + > + + dn_sch describes a scheduler > + + dn_fs describes a flowset (msk, weight, queue parameters) > + > + * > + */ > + > +#endif /* _IP_DUMMYNET_H */ > diff --git a/example/ipfw/sys/netinet/ip_fw.h b/example/ipfw/sys/netinet/ip_fw.h > new file mode 100644 > index 0000000..7ec6f87 > --- /dev/null > +++ b/example/ipfw/sys/netinet/ip_fw.h > @@ -0,0 +1,1009 @@ > +/*- > + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * $FreeBSD: head/sys/netinet/ip_fw.h 273035 2014-10-13 13:49:28Z melifaro $ > + */ > + > +#ifndef _IPFW2_H > +#define _IPFW2_H > + > +/* > + * The default rule number. By the design of ip_fw, the default rule > + * is the last one, so its number can also serve as the highest number > + * allowed for a rule. The ip_fw code relies on both meanings of this > + * constant. > + */ > +#define IPFW_DEFAULT_RULE 65535 > + > +#define RESVD_SET 31 /*set for default and persistent rules*/ > +#define IPFW_MAX_SETS 32 /* Number of sets supported by ipfw*/ > + > +/* > + * Default number of ipfw tables. > + */ > +#define IPFW_TABLES_MAX 65535 > +#define IPFW_TABLES_DEFAULT 128 > + > +/* > + * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit > + * argument between 1 and 65534. The value 0 (IP_FW_TARG) is used > + * to represent 'tablearg' value, e.g. indicate the use of a 'tablearg' > + * result of the most recent table() lookup. > + * Note that 16bit is only a historical limit, resulting from > + * the use of a 16-bit fields for that value. In reality, we can have > + * 2^32 pipes, queues, tag values and so on. 
> + */ > +#define IPFW_ARG_MIN 1 > +#define IPFW_ARG_MAX 65534 > +#define IP_FW_TABLEARG 65535 /* Compat value for old clients */ > +#define IP_FW_TARG 0 /* Current tablearg value */ > + > +/* > + * Number of entries in the call stack of the call/return commands. > + * Call stack currently is an uint16_t array with rule numbers. > + */ > +#define IPFW_CALLSTACK_SIZE 16 > + > +/* IP_FW3 header/opcodes */ > +typedef struct _ip_fw3_opheader { > + uint16_t opcode; /* Operation opcode */ > + uint16_t version; /* Opcode version */ > + uint16_t reserved[2]; /* Align to 64-bit boundary */ > +} ip_fw3_opheader; > + > +/* IP_FW3 opcodes */ > +#define IP_FW_TABLE_XADD 86 /* add entry */ > +#define IP_FW_TABLE_XDEL 87 /* delete entry */ > +#define IP_FW_TABLE_XGETSIZE 88 /* get table size (deprecated) */ > +#define IP_FW_TABLE_XLIST 89 /* list table contents */ > +#define IP_FW_TABLE_XDESTROY 90 /* destroy table */ > +#define IP_FW_TABLES_XLIST 92 /* list all tables */ > +#define IP_FW_TABLE_XINFO 93 /* request info for one table */ > +#define IP_FW_TABLE_XFLUSH 94 /* flush table data */ > +#define IP_FW_TABLE_XCREATE 95 /* create new table */ > +#define IP_FW_TABLE_XMODIFY 96 /* modify existing table */ > +#define IP_FW_XGET 97 /* Retrieve configuration */ > +#define IP_FW_XADD 98 /* add rule */ > +#define IP_FW_XDEL 99 /* del rule */ > +#define IP_FW_XMOVE 100 /* move rules to different set */ > +#define IP_FW_XZERO 101 /* clear accounting */ > +#define IP_FW_XRESETLOG 102 /* zero rules logs */ > +#define IP_FW_SET_SWAP 103 /* Swap between 2 sets */ > +#define IP_FW_SET_MOVE 104 /* Move one set to another one */ > +#define IP_FW_SET_ENABLE 105 /* Enable/disable sets */ > +#define IP_FW_TABLE_XFIND 106 /* finds an entry */ > +#define IP_FW_XIFLIST 107 /* list tracked interfaces */ > +#define IP_FW_TABLES_ALIST 108 /* list table algorithms */ > +#define IP_FW_TABLE_XSWAP 109 /* swap two tables */ > +#define IP_FW_TABLE_VLIST 110 /* dump table value hash */ > + > +#define 
IP_FW_NAT44_XCONFIG 111 /* Create/modify NAT44 instance */ > +#define IP_FW_NAT44_DESTROY 112 /* Destroys NAT44 instance */ > +#define IP_FW_NAT44_XGETCONFIG 113 /* Get NAT44 instance config */ > +#define IP_FW_NAT44_LIST_NAT 114 /* List all NAT44 instances */ > +#define IP_FW_NAT44_XGETLOG 115 /* Get log from NAT44 instance */ > + > +#define IP_FW_DUMP_SOPTCODES 116 /* Dump available sopts/versions */ > + > +/* > + * The kernel representation of ipfw rules is made of a list of > + * 'instructions' (for all practical purposes equivalent to BPF > + * instructions), which specify which fields of the packet > + * (or its metadata) should be analysed. > + * > + * Each instruction is stored in a structure which begins with > + * "ipfw_insn", and can contain extra fields depending on the > + * instruction type (listed below). > + * Note that the code is written so that individual instructions > + * have a size which is a multiple of 32 bits. This means that, if > + * such structures contain pointers or other 64-bit entities, > + * (there is just one instance now) they may end up unaligned on > + * 64-bit architectures, so they must be handled with care. > + * > + * "enum ipfw_opcodes" are the opcodes supported. We can have up > + * to 256 different opcodes. When adding new opcodes, they should > + * be appended to the end of the opcode list before O_LAST_OPCODE; > + * this will prevent the ABI from being broken; otherwise users > + * will have to recompile ipfw(8) when they update the kernel. 
> + */ > + > +enum ipfw_opcodes { /* arguments (4 byte each) */ > + O_NOP, > + > + O_IP_SRC, /* u32 = IP */ > + O_IP_SRC_MASK, /* ip = IP/mask */ > + O_IP_SRC_ME, /* none */ > + O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ > + > + O_IP_DST, /* u32 = IP */ > + O_IP_DST_MASK, /* ip = IP/mask */ > + O_IP_DST_ME, /* none */ > + O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ > + > + O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ > + O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ > + O_PROTO, /* arg1=protocol */ > + > + O_MACADDR2, /* 2 mac addr:mask */ > + O_MAC_TYPE, /* same as srcport */ > + > + O_LAYER2, /* none */ > + O_IN, /* none */ > + O_FRAG, /* none */ > + > + O_RECV, /* none */ > + O_XMIT, /* none */ > + O_VIA, /* none */ > + > + O_IPOPT, /* arg1 = 2*u8 bitmap */ > + O_IPLEN, /* arg1 = len */ > + O_IPID, /* arg1 = id */ > + > + O_IPTOS, /* arg1 = id */ > + O_IPPRECEDENCE, /* arg1 = precedence << 5 */ > + O_IPTTL, /* arg1 = TTL */ > + > + O_IPVER, /* arg1 = version */ > + O_UID, /* u32 = id */ > + O_GID, /* u32 = id */ > + O_ESTAB, /* none (tcp established) */ > + O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ > + O_TCPWIN, /* arg1 = desired win */ > + O_TCPSEQ, /* u32 = desired seq. */ > + O_TCPACK, /* u32 = desired seq. */ > + O_ICMPTYPE, /* u32 = icmp bitmap */ > + O_TCPOPTS, /* arg1 = 2*u8 bitmap */ > + > + O_VERREVPATH, /* none */ > + O_VERSRCREACH, /* none */ > + > + O_PROBE_STATE, /* none */ > + O_KEEP_STATE, /* none */ > + O_LIMIT, /* ipfw_insn_limit */ > + O_LIMIT_PARENT, /* dyn_type, not an opcode. */ > + > + /* > + * These are really 'actions'. 
> + */ > + > + O_LOG, /* ipfw_insn_log */ > + O_PROB, /* u32 = match probability */ > + > + O_CHECK_STATE, /* none */ > + O_ACCEPT, /* none */ > + O_DENY, /* none */ > + O_REJECT, /* arg1=icmp arg (same as deny) */ > + O_COUNT, /* none */ > + O_SKIPTO, /* arg1=next rule number */ > + O_PIPE, /* arg1=pipe number */ > + O_QUEUE, /* arg1=queue number */ > + O_DIVERT, /* arg1=port number */ > + O_TEE, /* arg1=port number */ > + O_FORWARD_IP, /* fwd sockaddr */ > + O_FORWARD_MAC, /* fwd mac */ > + O_NAT, /* nope */ > + O_REASS, /* none */ > + > + /* > + * More opcodes. > + */ > + O_IPSEC, /* has ipsec history */ > + O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ > + O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ > + O_ANTISPOOF, /* none */ > + O_JAIL, /* u32 = id */ > + O_ALTQ, /* u32 = altq classif. qid */ > + O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ > + O_TCPDATALEN, /* arg1 = tcp data len */ > + O_IP6_SRC, /* address without mask */ > + O_IP6_SRC_ME, /* my addresses */ > + O_IP6_SRC_MASK, /* address with the mask */ > + O_IP6_DST, > + O_IP6_DST_ME, > + O_IP6_DST_MASK, > + O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ > + O_ICMP6TYPE, /* icmp6 packet type filtering */ > + O_EXT_HDR, /* filtering for ipv6 extension header */ > + O_IP6, > + > + /* > + * actions for ng_ipfw > + */ > + O_NETGRAPH, /* send to ng_ipfw */ > + O_NGTEE, /* copy to ng_ipfw */ > + > + O_IP4, > + > + O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ > + > + O_TAG, /* arg1=tag number */ > + O_TAGGED, /* arg1=tag number */ > + > + O_SETFIB, /* arg1=FIB number */ > + O_FIB, /* arg1=FIB desired fib number */ > + > + O_SOCKARG, /* socket argument */ > + > + O_CALLRETURN, /* arg1=called rule number */ > + > + O_FORWARD_IP6, /* fwd sockaddr_in6 */ > + > + O_DSCP, /* 2 u32 = DSCP mask */ > + O_SETDSCP, /* arg1=DSCP value */ > + O_IP_FLOW_LOOKUP, /* arg1=table number, u32=value */ > + > + O_LAST_OPCODE /* not an opcode! 
*/ > +}; > + > + > +/* > + * Extension headers are filtered only for presence using a bit > + * vector with a flag for each header. > + */ > +#define EXT_FRAGMENT 0x1 > +#define EXT_HOPOPTS 0x2 > +#define EXT_ROUTING 0x4 > +#define EXT_AH 0x8 > +#define EXT_ESP 0x10 > +#define EXT_DSTOPTS 0x20 > +#define EXT_RTHDR0 0x40 > +#define EXT_RTHDR2 0x80 > + > +/* > + * Template for instructions. > + * > + * ipfw_insn is used for all instructions which require no operands, > + * a single 16-bit value (arg1), or a couple of 8-bit values. > + * > + * For other instructions which require different/larger arguments > + * we have derived structures, ipfw_insn_*. > + * > + * The size of the instruction (in 32-bit words) is in the low > + * 6 bits of "len". The 2 remaining bits are used to implement > + * NOT and OR on individual instructions. Given a type, you can > + * compute the length to be put in "len" using F_INSN_SIZE(t). > + * > + * F_NOT negates the match result of the instruction. > + * > + * F_OR is used to build "or" blocks. By default, instructions > + * are evaluated as part of a logical AND. An "or" block > + * { X or Y or Z } contains F_OR set in all but the last > + * instruction of the block. A match will cause the code > + * to skip past the last instruction of the block. > + * > + * NOTA BENE: in a couple of places we assume that > + * sizeof(ipfw_insn) == sizeof(u_int32_t); > + * this needs to be fixed. > + * > + */ > +typedef struct _ipfw_insn { /* template for instructions */ > + u_int8_t opcode; > + u_int8_t len; /* number of 32-bit words */ > +#define F_NOT 0x80 > +#define F_OR 0x40 > +#define F_LEN_MASK 0x3f > +#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) > + > + u_int16_t arg1; > +} ipfw_insn; > + > +/* > + * F_INSN_SIZE(type) computes the size, in 4-byte words, of > + * a given type. > + */ > +#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) > + > +/* > + * This is used to store an array of 16-bit entries (ports etc.) 
> + */ > +typedef struct _ipfw_insn_u16 { > + ipfw_insn o; > + u_int16_t ports[2]; /* there may be more */ > +} ipfw_insn_u16; > + > +/* > + * This is used to store an array of 32-bit entries > + * (uid, single IPv4 addresses etc.) > + */ > +typedef struct _ipfw_insn_u32 { > + ipfw_insn o; > + u_int32_t d[1]; /* one or more */ > +} ipfw_insn_u32; > + > +/* > + * This is used to store IP addr-mask pairs. > + */ > +typedef struct _ipfw_insn_ip { > + ipfw_insn o; > + struct in_addr addr; > + struct in_addr mask; > +} ipfw_insn_ip; > + > +/* > + * This is used to forward to a given address (ip). > + */ > +typedef struct _ipfw_insn_sa { > + ipfw_insn o; > + struct sockaddr_in sa; > +} ipfw_insn_sa; > + > +/* > + * This is used to forward to a given address (ipv6). > + */ > +typedef struct _ipfw_insn_sa6 { > + ipfw_insn o; > + struct sockaddr_in6 sa; > +} ipfw_insn_sa6; > + > +/* > + * This is used for MAC addr-mask pairs. > + */ > +typedef struct _ipfw_insn_mac { > + ipfw_insn o; > + u_char addr[12]; /* dst[6] + src[6] */ > + u_char mask[12]; /* dst[6] + src[6] */ > +} ipfw_insn_mac; > + > +/* > + * This is used for interface match rules (recv xx, xmit xx). > + */ > +typedef struct _ipfw_insn_if { > + ipfw_insn o; > + union { > + struct in_addr ip; > + int glob; > + uint16_t kidx; > + } p; > + char name[IFNAMSIZ]; > +} ipfw_insn_if; > + > +/* > + * This is used for storing an altq queue id number. > + */ > +typedef struct _ipfw_insn_altq { > + ipfw_insn o; > + u_int32_t qid; > +} ipfw_insn_altq; > + > +/* > + * This is used for limit rules. > + */ > +typedef struct _ipfw_insn_limit { > + ipfw_insn o; > + u_int8_t _pad; > + u_int8_t limit_mask; /* combination of DYN_* below */ > +#define DYN_SRC_ADDR 0x1 > +#define DYN_SRC_PORT 0x2 > +#define DYN_DST_ADDR 0x4 > +#define DYN_DST_PORT 0x8 > + > + u_int16_t conn_limit; > +} ipfw_insn_limit; > + > +/* > + * This is used for log instructions. 
> + */ > +typedef struct _ipfw_insn_log { > + ipfw_insn o; > + u_int32_t max_log; /* how many do we log -- 0 = all */ > + u_int32_t log_left; /* how many left to log */ > +} ipfw_insn_log; > + > +/* Legacy NAT structures, compat only */ > +#ifndef _KERNEL > +/* > + * Data structures required by both ipfw(8) and ipfw(4) but not part of the > + * management API are protected by IPFW_INTERNAL. > + */ > +#ifdef IPFW_INTERNAL > +/* Server pool support (LSNAT). */ > +struct cfg_spool { > + LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ > + struct in_addr addr; > + u_short port; > +}; > +#endif > + > +/* Redirect modes id. */ > +#define REDIR_ADDR 0x01 > +#define REDIR_PORT 0x02 > +#define REDIR_PROTO 0x04 > + > +#ifdef IPFW_INTERNAL > +/* Nat redirect configuration. */ > +struct cfg_redir { > + LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ > + u_int16_t mode; /* type of redirect mode */ > + struct in_addr laddr; /* local ip address */ > + struct in_addr paddr; /* public ip address */ > + struct in_addr raddr; /* remote ip address */ > + u_short lport; /* local port */ > + u_short pport; /* public port */ > + u_short rport; /* remote port */ > + u_short pport_cnt; /* number of public ports */ > + u_short rport_cnt; /* number of remote ports */ > + int proto; /* protocol: tcp/udp */ > + struct alias_link **alink; > + /* num of entry in spool chain */ > + u_int16_t spool_cnt; > + /* chain of spool instances */ > + LIST_HEAD(spool_chain, cfg_spool) spool_chain; > +}; > +#endif > + > +#ifdef IPFW_INTERNAL > +/* Nat configuration data struct. 
*/ > +struct cfg_nat { > + /* chain of nat instances */ > + LIST_ENTRY(cfg_nat) _next; > + int id; /* nat id */ > + struct in_addr ip; /* nat ip address */ > + char if_name[IF_NAMESIZE]; /* interface name */ > + int mode; /* aliasing mode */ > + struct libalias *lib; /* libalias instance */ > + /* number of entry in spool chain */ > + int redir_cnt; > + /* chain of redir instances */ > + LIST_HEAD(redir_chain, cfg_redir) redir_chain; > +}; > +#endif > + > +#define SOF_NAT sizeof(struct cfg_nat) > +#define SOF_REDIR sizeof(struct cfg_redir) > +#define SOF_SPOOL sizeof(struct cfg_spool) > + > +#endif /* ifndef _KERNEL */ > + > + > +struct nat44_cfg_spool { > + struct in_addr addr; > + uint16_t port; > + uint16_t spare; > +}; > +#define NAT44_REDIR_ADDR 0x01 > +#define NAT44_REDIR_PORT 0x02 > +#define NAT44_REDIR_PROTO 0x04 > + > +/* Nat redirect configuration. */ > +struct nat44_cfg_redir { > + struct in_addr laddr; /* local ip address */ > + struct in_addr paddr; /* public ip address */ > + struct in_addr raddr; /* remote ip address */ > + uint16_t lport; /* local port */ > + uint16_t pport; /* public port */ > + uint16_t rport; /* remote port */ > + uint16_t pport_cnt; /* number of public ports */ > + uint16_t rport_cnt; /* number of remote ports */ > + uint16_t mode; /* type of redirect mode */ > + uint16_t spool_cnt; /* num of entry in spool chain */ > + uint16_t spare; > + uint32_t proto; /* protocol: tcp/udp */ > +}; > + > +/* Nat configuration data struct. */ > +struct nat44_cfg_nat { > + char name[64]; /* nat name */ > + char if_name[64]; /* interface name */ > + uint32_t size; /* structure size incl. redirs */ > + struct in_addr ip; /* nat IPv4 address */ > + uint32_t mode; /* aliasing mode */ > + uint32_t redir_cnt; /* number of entry in spool chain */ > +}; > + > +/* Nat command. 
*/ > +typedef struct _ipfw_insn_nat { > + ipfw_insn o; > + struct cfg_nat *nat; > +} ipfw_insn_nat; > + > +/* Apply ipv6 mask on ipv6 addr */ > +#define APPLY_MASK(addr,mask) \ > + (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ > + (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ > + (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ > + (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; > + > +/* Structure for ipv6 */ > +typedef struct _ipfw_insn_ip6 { > + ipfw_insn o; > + struct in6_addr addr6; > + struct in6_addr mask6; > +} ipfw_insn_ip6; > + > +/* Used to support icmp6 types */ > +typedef struct _ipfw_insn_icmp6 { > + ipfw_insn o; > + uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h > + * define ICMP6_MAXTYPE > + * as follows: n = ICMP6_MAXTYPE/32 + 1 > + * Actually is 203 > + */ > +} ipfw_insn_icmp6; > + > +/* > + * Here we have the structure representing an ipfw rule. > + * > + * Layout: > + * struct ip_fw_rule > + * [ counter block, size = rule->cntr_len ] > + * [ one or more instructions, size = rule->cmd_len * 4 ] > + * > + * It starts with a general area (with link fields). > + * Counter block may be next (if rule->cntr_len > 0), > + * followed by an array of one or more instructions, which the code > + * accesses as an array of 32-bit values. rule->cmd_len represents > + * the total instructions length in u32 words, while act_ofs represents > + * rule action offset in u32 words. 
> + * > + * When assembling instruction, remember the following: > + * > + * + if a rule has a "keep-state" (or "limit") option, then the > + * first instruction (at r->cmd) MUST BE an O_PROBE_STATE > + * + if a rule has a "log" option, then the first action > + * (at ACTION_PTR(r)) MUST be O_LOG > + * + if a rule has an "altq" option, it comes after "log" > + * + if a rule has an O_TAG option, it comes after "log" and "altq" > + * > + * > + * All structures (excluding instructions) are u64-aligned. > + * Please keep this. > + */ > + > +struct ip_fw_rule { > + uint16_t act_ofs; /* offset of action in 32-bit units */ > + uint16_t cmd_len; /* # of 32-bit words in cmd */ > + uint16_t spare; > + uint8_t set; /* rule set (0..31) */ > + uint8_t flags; /* rule flags */ > + uint32_t rulenum; /* rule number */ > + uint32_t id; /* rule id */ > + > + ipfw_insn cmd[1]; /* storage for commands */ > +}; > +#define IPFW_RULE_NOOPT 0x01 /* Has no options in body */ > + > +/* Unaligned version */ > + > +/* Base ipfw rule counter block. */ > +struct ip_fw_bcounter { > + uint16_t size; /* Size of counter block, bytes */ > + uint8_t flags; /* flags for given block */ > + uint8_t spare; > + uint32_t timestamp; /* tv_sec of last match */ > + uint64_t pcnt; /* Packet counter */ > + uint64_t bcnt; /* Byte counter */ > +}; > + > + > +#ifndef _KERNEL > +/* > + * Legacy rule format > + */ > +struct ip_fw { > + struct ip_fw *x_next; /* linked list of rules */ > + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ > + /* 'next_rule' is used to pass up 'set_disable' status */ > + > + uint16_t act_ofs; /* offset of action in 32-bit units */ > + uint16_t cmd_len; /* # of 32-bit words in cmd */ > + uint16_t rulenum; /* rule number */ > + uint8_t set; /* rule set (0..31) */ > + uint8_t _pad; /* padding */ > + uint32_t id; /* rule id */ > + > + /* These fields are present in all rules. 
*/ > + uint64_t pcnt; /* Packet counter */ > + uint64_t bcnt; /* Byte counter */ > + uint32_t timestamp; /* tv_sec of last match */ > + > + ipfw_insn cmd[1]; /* storage for commands */ > +}; > +#endif > + > +#define ACTION_PTR(rule) \ > + (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) > + > +#define RULESIZE(rule) (sizeof(*(rule)) + (rule)->cmd_len * 4 - 4) > + > + > +#if 1 // should be moved to in.h > +/* > + * This structure is used as a flow mask and a flow id for various > + * parts of the code. > + * addr_type is used in userland and kernel to mark the address type. > + * fib is used in the kernel to record the fib in use. > + * _flags is used in the kernel to store tcp flags for dynamic rules. > + */ > +struct ipfw_flow_id { > + uint32_t dst_ip; > + uint32_t src_ip; > + uint16_t dst_port; > + uint16_t src_port; > + uint8_t fib; > + uint8_t proto; > + uint8_t _flags; /* protocol-specific flags */ > + uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ > + struct in6_addr dst_ip6; > + struct in6_addr src_ip6; > + uint32_t flow_id6; > + uint32_t extra; /* queue/pipe or frag_id */ > +}; > +#endif > + > +#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) > + > +/* > + * Dynamic ipfw rule. > + */ > +typedef struct _ipfw_dyn_rule ipfw_dyn_rule; > + > +struct _ipfw_dyn_rule { > + ipfw_dyn_rule *next; /* linked list of rules. 
*/ > + struct ip_fw *rule; /* pointer to rule */ > + /* 'rule' is used to pass up the rule number (from the parent) */ > + > + ipfw_dyn_rule *parent; /* pointer to parent rule */ > + u_int64_t pcnt; /* packet match counter */ > + u_int64_t bcnt; /* byte match counter */ > + struct ipfw_flow_id id; /* (masked) flow id */ > + u_int32_t expire; /* expire time */ > + u_int32_t bucket; /* which bucket in hash table */ > + u_int32_t state; /* state of this rule (typically a > + * combination of TCP flags) > + */ > + u_int32_t ack_fwd; /* most recent ACKs in forward */ > + u_int32_t ack_rev; /* and reverse directions (used */ > + /* to generate keepalives) */ > + u_int16_t dyn_type; /* rule type */ > + u_int16_t count; /* refcount */ > +}; > + > +/* > + * Definitions for IP option names. > + */ > +#define IP_FW_IPOPT_LSRR 0x01 > +#define IP_FW_IPOPT_SSRR 0x02 > +#define IP_FW_IPOPT_RR 0x04 > +#define IP_FW_IPOPT_TS 0x08 > + > +/* > + * Definitions for TCP option names. > + */ > +#define IP_FW_TCPOPT_MSS 0x01 > +#define IP_FW_TCPOPT_WINDOW 0x02 > +#define IP_FW_TCPOPT_SACK 0x04 > +#define IP_FW_TCPOPT_TS 0x08 > +#define IP_FW_TCPOPT_CC 0x10 > + > +#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ > +#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ > + > +/* > + * These are used for lookup tables. 
> + */ > + > +#define IPFW_TABLE_ADDR 1 /* Table for holding IPv4/IPv6 prefixes */ > +#define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ > +#define IPFW_TABLE_NUMBER 3 /* Table for holding ports/uid/gid/etc */ > +#define IPFW_TABLE_FLOW 4 /* Table for holding flow data */ > +#define IPFW_TABLE_MAXTYPE 4 /* Maximum valid number */ > + > +#define IPFW_TABLE_CIDR IPFW_TABLE_ADDR /* compat */ > + > +/* Value types */ > +#define IPFW_VTYPE_LEGACY 0xFFFFFFFF /* All data is filled in */ > +#define IPFW_VTYPE_SKIPTO 0x00000001 /* skipto/call/callreturn */ > +#define IPFW_VTYPE_PIPE 0x00000002 /* pipe/queue */ > +#define IPFW_VTYPE_FIB 0x00000004 /* setfib */ > +#define IPFW_VTYPE_NAT 0x00000008 /* nat */ > +#define IPFW_VTYPE_DSCP 0x00000010 /* dscp */ > +#define IPFW_VTYPE_TAG 0x00000020 /* tag/untag */ > +#define IPFW_VTYPE_DIVERT 0x00000040 /* divert/tee */ > +#define IPFW_VTYPE_NETGRAPH 0x00000080 /* netgraph/ngtee */ > +#define IPFW_VTYPE_LIMIT 0x00000100 /* limit */ > +#define IPFW_VTYPE_NH4 0x00000200 /* IPv4 nexthop */ > +#define IPFW_VTYPE_NH6 0x00000400 /* IPv6 nexthop */ > + > +typedef struct _ipfw_table_entry { > + in_addr_t addr; /* network address */ > + u_int32_t value; /* value */ > + u_int16_t tbl; /* table number */ > + u_int8_t masklen; /* mask length */ > +} ipfw_table_entry; > + > +typedef struct _ipfw_table_xentry { > + uint16_t len; /* Total entry length */ > + uint8_t type; /* entry type */ > + uint8_t masklen; /* mask length */ > + uint16_t tbl; /* table number */ > + uint16_t flags; /* record flags */ > + uint32_t value; /* value */ > + union { > + /* Longest field needs to be aligned by 4-byte boundary */ > + struct in6_addr addr6; /* IPv6 address */ > + char iface[IF_NAMESIZE]; /* interface name */ > + } k; > +} ipfw_table_xentry; > +#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ > + > +typedef struct _ipfw_table { > + u_int32_t size; /* size of entries in bytes */ > + u_int32_t cnt; /* # of entries */ > + u_int16_t 
tbl; /* table number */ > + ipfw_table_entry ent[0]; /* entries */ > +} ipfw_table; > + > +typedef struct _ipfw_xtable { > + ip_fw3_opheader opheader; /* IP_FW3 opcode */ > + uint32_t size; /* size of entries in bytes */ > + uint32_t cnt; /* # of entries */ > + uint16_t tbl; /* table number */ > + uint8_t type; /* table type */ > + ipfw_table_xentry xent[0]; /* entries */ > +} ipfw_xtable; > + > +typedef struct _ipfw_obj_tlv { > + uint16_t type; /* TLV type */ > + uint16_t flags; /* TLV-specific flags */ > + uint32_t length; /* Total length, aligned to u64 */ > +} ipfw_obj_tlv; > +#define IPFW_TLV_TBL_NAME 1 > +#define IPFW_TLV_TBLNAME_LIST 2 > +#define IPFW_TLV_RULE_LIST 3 > +#define IPFW_TLV_DYNSTATE_LIST 4 > +#define IPFW_TLV_TBL_ENT 5 > +#define IPFW_TLV_DYN_ENT 6 > +#define IPFW_TLV_RULE_ENT 7 > +#define IPFW_TLV_TBLENT_LIST 8 > +#define IPFW_TLV_RANGE 9 > + > +/* Object name TLV */ > +typedef struct _ipfw_obj_ntlv { > + ipfw_obj_tlv head; /* TLV header */ > + uint16_t idx; /* Name index */ > + uint8_t spare; /* unused */ > + uint8_t type; /* object type, if applicable */ > + uint32_t set; /* set, if applicable */ > + char name[64]; /* Null-terminated name */ > +} ipfw_obj_ntlv; > + > +/* IPv4/IPv6 L4 flow description */ > +struct tflow_entry { > + uint8_t af; > + uint8_t proto; > + uint16_t spare; > + uint16_t sport; > + uint16_t dport; > + union { > + struct { > + struct in_addr sip; > + struct in_addr dip; > + } a4; > + struct { > + struct in6_addr sip6; > + struct in6_addr dip6; > + } a6; > + } a; > +}; > + > +typedef struct _ipfw_table_value { > + uint32_t tag; /* O_TAG/O_TAGGED */ > + uint32_t pipe; /* O_PIPE/O_QUEUE */ > + uint16_t divert; /* O_DIVERT/O_TEE */ > + uint16_t skipto; /* skipto, CALLRET */ > + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ > + uint32_t fib; /* O_SETFIB */ > + uint32_t nat; /* O_NAT */ > + uint32_t nh4; > + uint8_t dscp; > + uint8_t spare0[3]; > + struct in6_addr nh6; > + uint32_t limit; /* O_LIMIT */ > + uint32_t spare1; > + 
uint64_t reserved; > +} ipfw_table_value; > + > +/* Table entry TLV */ > +typedef struct _ipfw_obj_tentry { > + ipfw_obj_tlv head; /* TLV header */ > + uint8_t subtype; /* subtype (IPv4,IPv6) */ > + uint8_t masklen; /* mask length */ > + uint8_t result; /* request result */ > + uint8_t spare0; > + uint16_t idx; /* Table name index */ > + uint16_t spare1; > + union { > + /* Longest field needs to be aligned by 8-byte boundary */ > + struct in_addr addr; /* IPv4 address */ > + uint32_t key; /* uid/gid/port */ > + struct in6_addr addr6; /* IPv6 address */ > + char iface[IF_NAMESIZE]; /* interface name */ > + struct tflow_entry flow; > + } k; > + union { > + ipfw_table_value value; /* value data */ > + uint32_t kidx; /* value kernel index */ > + } v; > +} ipfw_obj_tentry; > +#define IPFW_TF_UPDATE 0x01 /* Update record if exists */ > +/* Container TLV */ > +#define IPFW_CTF_ATOMIC 0x01 /* Perform atomic operation */ > +/* Operation results */ > +#define IPFW_TR_IGNORED 0 /* Entry was ignored (rollback) */ > +#define IPFW_TR_ADDED 1 /* Entry was successfully added */ > +#define IPFW_TR_UPDATED 2 /* Entry was successfully updated*/ > +#define IPFW_TR_DELETED 3 /* Entry was successfully deleted*/ > +#define IPFW_TR_LIMIT 4 /* Entry was ignored (limit) */ > +#define IPFW_TR_NOTFOUND 5 /* Entry was not found */ > +#define IPFW_TR_EXISTS 6 /* Entry already exists */ > +#define IPFW_TR_ERROR 7 /* Request has failed (unknown) */ > + > +typedef struct _ipfw_obj_dyntlv { > + ipfw_obj_tlv head; > + ipfw_dyn_rule state; > +} ipfw_obj_dyntlv; > +#define IPFW_DF_LAST 0x01 /* Last state in chain */ > + > +/* Container TLVs */ > +typedef struct _ipfw_obj_ctlv { > + ipfw_obj_tlv head; /* TLV header */ > + uint32_t count; /* Number of sub-TLVs */ > + uint16_t objsize; /* Single object size */ > + uint8_t version; /* TLV version */ > + uint8_t flags; /* TLV-specific flags */ > +} ipfw_obj_ctlv; > + > +/* Range TLV */ > +typedef struct _ipfw_range_tlv { > + ipfw_obj_tlv head; /* TLV header 
*/ > + uint32_t flags; /* Range flags */ > + uint16_t start_rule; /* Range start */ > + uint16_t end_rule; /* Range end */ > + uint32_t set; /* Range set to match */ > + uint32_t new_set; /* New set to move/swap to */ > +} ipfw_range_tlv; > +#define IPFW_RCFLAG_RANGE 0x01 /* rule range is set */ > +#define IPFW_RCFLAG_ALL 0x02 /* match ALL rules */ > +#define IPFW_RCFLAG_SET 0x04 /* match rules in given set */ > +/* User-settable flags */ > +#define IPFW_RCFLAG_USER (IPFW_RCFLAG_RANGE | IPFW_RCFLAG_ALL | \ > + IPFW_RCFLAG_SET) > +/* Internally used flags */ > +#define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip default rule */ > + > +typedef struct _ipfw_ta_tinfo { > + uint32_t flags; /* Format flags */ > + uint32_t spare; > + uint8_t taclass4; /* algorithm class */ > + uint8_t spare4; > + uint16_t itemsize4; /* item size in runtime */ > + uint32_t size4; /* runtime structure size */ > + uint32_t count4; /* number of items in runtime */ > + uint8_t taclass6; /* algorithm class */ > + uint8_t spare6; > + uint16_t itemsize6; /* item size in runtime */ > + uint32_t size6; /* runtime structure size */ > + uint32_t count6; /* number of items in runtime */ > +} ipfw_ta_tinfo; > +#define IPFW_TACLASS_HASH 1 /* algo is based on hash */ > +#define IPFW_TACLASS_ARRAY 2 /* algo is based on array */ > +#define IPFW_TACLASS_RADIX 3 /* algo is based on radix tree */ > + > +#define IPFW_TATFLAGS_DATA 0x0001 /* Has data filled in */ > +#define IPFW_TATFLAGS_AFDATA 0x0002 /* Separate data per AF */ > +#define IPFW_TATFLAGS_AFITEM 0x0004 /* diff. items per AF */ > + > +typedef struct _ipfw_xtable_info { > + uint8_t type; /* table type (addr,iface,..) 
*/ > + uint8_t tflags; /* type flags */ > + uint16_t mflags; /* modification flags */ > + uint16_t flags; /* generic table flags */ > + uint16_t spare[3]; > + uint32_t vmask; /* bitmask with value types */ > + uint32_t set; /* set table is in */ > + uint32_t kidx; /* kernel index */ > + uint32_t refcnt; /* number of references */ > + uint32_t count; /* Number of records */ > + uint32_t size; /* Total size of records(export)*/ > + uint32_t limit; /* Max number of records */ > + char tablename[64]; /* table name */ > + char algoname[64]; /* algorithm name */ > + ipfw_ta_tinfo ta_info; /* additional algo stats */ > +} ipfw_xtable_info; > +/* Generic table flags */ > +#define IPFW_TGFLAGS_LOCKED 0x01 /* Tables is locked from changes*/ > +/* Table type-specific flags */ > +#define IPFW_TFFLAG_SRCIP 0x01 > +#define IPFW_TFFLAG_DSTIP 0x02 > +#define IPFW_TFFLAG_SRCPORT 0x04 > +#define IPFW_TFFLAG_DSTPORT 0x08 > +#define IPFW_TFFLAG_PROTO 0x10 > +/* Table modification flags */ > +#define IPFW_TMFLAGS_LIMIT 0x0002 /* Change limit value */ > +#define IPFW_TMFLAGS_LOCK 0x0004 /* Change table lock state */ > + > +typedef struct _ipfw_iface_info { > + char ifname[64]; /* interface name */ > + uint32_t ifindex; /* interface index */ > + uint32_t flags; /* flags */ > + uint32_t refcnt; /* number of references */ > + uint32_t gencnt; /* number of changes */ > + uint64_t spare; > +} ipfw_iface_info; > +#define IPFW_IFFLAG_RESOLVED 0x01 /* Interface exists */ > + > +typedef struct _ipfw_ta_info { > + char algoname[64]; /* algorithm name */ > + uint32_t type; /* lookup type */ > + uint32_t flags; > + uint32_t refcnt; > + uint32_t spare0; > + uint64_t spare1; > +} ipfw_ta_info; > + > +#define IPFW_OBJTYPE_TABLE 1 > +typedef struct _ipfw_obj_header { > + ip_fw3_opheader opheader; /* IP_FW3 opcode */ > + uint32_t spare; > + uint16_t idx; /* object name index */ > + uint8_t objtype; /* object type */ > + uint8_t objsubtype; /* object subtype */ > + ipfw_obj_ntlv ntlv; /* object name tlv 
*/ > +} ipfw_obj_header; > + > +typedef struct _ipfw_obj_lheader { > + ip_fw3_opheader opheader; /* IP_FW3 opcode */ > + uint32_t set_mask; /* disabled set mask */ > + uint32_t count; /* Total objects count */ > + uint32_t size; /* Total size (incl. header) */ > + uint32_t objsize; /* Size of one object */ > +} ipfw_obj_lheader; > + > +#define IPFW_CFG_GET_STATIC 0x01 > +#define IPFW_CFG_GET_STATES 0x02 > +#define IPFW_CFG_GET_COUNTERS 0x04 > +typedef struct _ipfw_cfg_lheader { > + ip_fw3_opheader opheader; /* IP_FW3 opcode */ > + uint32_t set_mask; /* enabled set mask */ > + uint32_t spare; > + uint32_t flags; /* Request flags */ > + uint32_t size; /* needed buffer size */ > + uint32_t start_rule; > + uint32_t end_rule; > +} ipfw_cfg_lheader; > + > +typedef struct _ipfw_range_header { > + ip_fw3_opheader opheader; /* IP_FW3 opcode */ > + ipfw_range_tlv range; > +} ipfw_range_header; > + > +typedef struct _ipfw_sopt_info { > + uint16_t opcode; > + uint8_t version; > + uint8_t dir; > + uint8_t spare; > + uint64_t refcnt; > +} ipfw_sopt_info; > + > +#endif /* _IPFW2_H */ > diff --git a/example/ipfw/sys/netinet/tcp.h b/example/ipfw/sys/netinet/tcp.h > new file mode 100644 > index 0000000..29c4313 > --- /dev/null > +++ b/example/ipfw/sys/netinet/tcp.h > @@ -0,0 +1,247 @@ > +/*- > + * Copyright (c) 1982, 1986, 1993 > + * The Regents of the University of California. All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 4. 
Neither the name of the University nor the names of its contributors > + * may be used to endorse or promote products derived from this software > + * without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 > + * $FreeBSD: head/sys/netinet/tcp.h 246210 2013-02-01 15:32:20Z jhb $ > + */ > + > +#ifndef _NETINET_TCP_H_ > +#define _NETINET_TCP_H_ > + > +#include <sys/cdefs.h> > +#include <sys/types.h> > + > +#if __BSD_VISIBLE > + > +typedef u_int32_t tcp_seq; > + > +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ > +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ > + > +/* > + * TCP header. > + * Per RFC 793, September, 1981. 
> + */ > +struct tcphdr { > + u_short th_sport; /* source port */ > + u_short th_dport; /* destination port */ > + tcp_seq th_seq; /* sequence number */ > + tcp_seq th_ack; /* acknowledgement number */ > +#if BYTE_ORDER == LITTLE_ENDIAN > + u_char th_x2:4, /* (unused) */ > + th_off:4; /* data offset */ > +#endif > +#if BYTE_ORDER == BIG_ENDIAN > + u_char th_off:4, /* data offset */ > + th_x2:4; /* (unused) */ > +#endif > + u_char th_flags; > +#define TH_FIN 0x01 > +#define TH_SYN 0x02 > +#define TH_RST 0x04 > +#define TH_PUSH 0x08 > +#define TH_ACK 0x10 > +#define TH_URG 0x20 > +#define TH_ECE 0x40 > +#define TH_CWR 0x80 > +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) > +#define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" > + > + u_short th_win; /* window */ > + u_short th_sum; /* checksum */ > + u_short th_urp; /* urgent pointer */ > +}; > + > +#define TCPOPT_EOL 0 > +#define TCPOLEN_EOL 1 > +#define TCPOPT_PAD 0 /* padding after EOL */ > +#define TCPOLEN_PAD 1 > +#define TCPOPT_NOP 1 > +#define TCPOLEN_NOP 1 > +#define TCPOPT_MAXSEG 2 > +#define TCPOLEN_MAXSEG 4 > +#define TCPOPT_WINDOW 3 > +#define TCPOLEN_WINDOW 3 > +#define TCPOPT_SACK_PERMITTED 4 > +#define TCPOLEN_SACK_PERMITTED 2 > +#define TCPOPT_SACK 5 > +#define TCPOLEN_SACKHDR 2 > +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ > +#define TCPOPT_TIMESTAMP 8 > +#define TCPOLEN_TIMESTAMP 10 > +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ > +#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ > +#define TCPOLEN_SIGNATURE 18 > + > +/* Miscellaneous constants */ > +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ > +#define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ > + > + > +/* > + * The default maximum segment size (MSS) to be used for new TCP connections > + * when path MTU discovery is not enabled. 
> + * > + * RFC879 derives the default MSS from the largest datagram size hosts are > + * minimally required to handle directly or through IP reassembly minus the > + * size of the IP and TCP header. With IPv6 the minimum MTU is specified > + * in RFC2460. > + * > + * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) > + * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) > + * > + * We use explicit numerical definition here to avoid header pollution. > + */ > +#define TCP_MSS 536 > +#define TCP6_MSS 1220 > + > +/* > + * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS > + * option. Allowing low values of MSS can consume significant resources and > + * be used to mount a resource exhaustion attack. > + * Connections requesting lower MSS values will be rounded up to this value > + * and the IP_DF flag will be cleared to allow fragmentation along the path. > + * > + * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting > + * it to "0" disables the minmss check. > + * > + * The default value is fine for TCP across the Internet's smallest official > + * link MTU (256 bytes for AX.25 packet radio). However, a connection is very > + * unlikely to come across such low MTU interfaces these days (anno domini 2003). > + */ > +#define TCP_MINMSS 216 > + > +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ > +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ > + > +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ > + > +#define TCP_MAXBURST 4 /* maximum segments in a burst */ > + > +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ > +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) > + /* max space left for options */ > +#endif /* __BSD_VISIBLE */ > + > +/* > + * User-settable options (used with setsockopt). These are discrete > + * values and are not masked together. Some values appear to be > + * bitmasks for historical reasons. 
> + */ > +#define TCP_NODELAY 1 /* don't delay send to coalesce packets */ > +#if __BSD_VISIBLE > +#define TCP_MAXSEG 2 /* set maximum segment size */ > +#define TCP_NOPUSH 4 /* don't push last block of write */ > +#define TCP_NOOPT 8 /* don't use TCP options */ > +#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ > +#define TCP_INFO 32 /* retrieve tcp_info structure */ > +#define TCP_CONGESTION 64 /* get/set congestion control algorithm */ > +#define TCP_KEEPINIT 128 /* N, time to establish connection */ > +#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ > +#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ > +#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ > + > +/* Start of reserved space for third-party user-settable options. */ > +#define TCP_VENDOR SO_VENDOR > + > +#define TCP_CA_NAME_MAX 16 /* max congestion control name length */ > + > +#define TCPI_OPT_TIMESTAMPS 0x01 > +#define TCPI_OPT_SACK 0x02 > +#define TCPI_OPT_WSCALE 0x04 > +#define TCPI_OPT_ECN 0x08 > +#define TCPI_OPT_TOE 0x10 > + > +/* > + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits > + * the caller to query certain information about the state of a TCP > + * connection. We provide an overlapping set of fields with the Linux > + * implementation, but since this is a fixed size structure, room has been > + * left for growth. In order to maximize potential future compatibility with > + * the Linux API, the same variable names and order have been adopted, and > + * padding left to make room for omitted fields in case they are added later. > + * > + * XXX: This is currently an unstable ABI/API, in that it is expected to > + * change. > + */ > +struct tcp_info { > + u_int8_t tcpi_state; /* TCP FSM state. */ > + u_int8_t __tcpi_ca_state; > + u_int8_t __tcpi_retransmits; > + u_int8_t __tcpi_probes; > + u_int8_t __tcpi_backoff; > + u_int8_t tcpi_options; /* Options enabled on conn. 
*/ > + u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ > + tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ > + > + u_int32_t tcpi_rto; /* Retransmission timeout (usec). */ > + u_int32_t __tcpi_ato; > + u_int32_t tcpi_snd_mss; /* Max segment size for send. */ > + u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ > + > + u_int32_t __tcpi_unacked; > + u_int32_t __tcpi_sacked; > + u_int32_t __tcpi_lost; > + u_int32_t __tcpi_retrans; > + u_int32_t __tcpi_fackets; > + > + /* Times; measurements in usecs. */ > + u_int32_t __tcpi_last_data_sent; > + u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ > + u_int32_t tcpi_last_data_recv; /* Time since last recv data. */ > + u_int32_t __tcpi_last_ack_recv; > + > + /* Metrics; variable units. */ > + u_int32_t __tcpi_pmtu; > + u_int32_t __tcpi_rcv_ssthresh; > + u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ > + u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ > + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ > + u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ > + u_int32_t __tcpi_advmss; > + u_int32_t __tcpi_reordering; > + > + u_int32_t __tcpi_rcv_rtt; > + u_int32_t tcpi_rcv_space; /* Advertised recv window. */ > + > + /* FreeBSD extensions to tcp_info. */ > + u_int32_t tcpi_snd_wnd; /* Advertised send window. */ > + u_int32_t tcpi_snd_bwnd; /* No longer used. */ > + u_int32_t tcpi_snd_nxt; /* Next egress seqno */ > + u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ > + u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ > + u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ > + u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ > + u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ > + > + /* Padding to grow without breaking ABI. */ > + u_int32_t __tcpi_pad[26]; /* Padding. 
*/ > +}; > +#endif > + > +#endif /* !_NETINET_TCP_H_ */ > diff --git a/example/ipfw/sys/netinet/udp.h b/example/ipfw/sys/netinet/udp.h > new file mode 100644 > index 0000000..c4e6e08 > --- /dev/null > +++ b/example/ipfw/sys/netinet/udp.h > @@ -0,0 +1,69 @@ > +/*- > + * Copyright (c) 1982, 1986, 1993 > + * The Regents of the University of California. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * 4. Neither the name of the University nor the names of its contributors > + * may be used to endorse or promote products derived from this software > + * without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + * > + * @(#)udp.h 8.1 (Berkeley) 6/10/93 > + * $FreeBSD: head/sys/netinet/udp.h 246210 2013-02-01 15:32:20Z jhb $ > + */ > + > +#ifndef _NETINET_UDP_H_ > +#define _NETINET_UDP_H_ > + > +/* > + * UDP protocol header. > + * Per RFC 768, September, 1981. > + */ > +struct udphdr { > + u_short uh_sport; /* source port */ > + u_short uh_dport; /* destination port */ > + u_short uh_ulen; /* udp length */ > + u_short uh_sum; /* udp checksum */ > +}; > + > +/* > + * User-settable options (used with setsockopt). > + */ > +#define UDP_ENCAP 1 > + > +/* Start of reserved space for third-party user-settable options. */ > +#define UDP_VENDOR SO_VENDOR > + > +/* > + * UDP Encapsulation of IPsec Packets options. > + */ > +/* Encapsulation types. */ > +#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ > +#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ > + > +/* Default ESP in UDP encapsulation port. */ > +#define UDP_ENCAP_ESPINUDP_PORT 500 > + > +/* Maximum UDP fragment size for ESP over UDP. */ > +#define UDP_ENCAP_ESPINUDP_MAXFRAGLEN 552 > + > +#endif > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_heap.c b/example/ipfw/sys/netpfil/ipfw/dn_heap.c > new file mode 100644 > index 0000000..b47bd28 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_heap.c > @@ -0,0 +1,552 @@ > +/*- > + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * Binary heap and hash tables, used in dummynet > + * > + * $FreeBSD: head/sys/netpfil/ipfw/dn_heap.c 240494 2012-09-14 11:51:49Z glebius $ > + */ > + > +#include <sys/cdefs.h> > +#include <sys/param.h> > +#ifdef _KERNEL > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/dn_heap.c 240494 2012-09-14 11:51:49Z glebius $"); > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/kernel.h> > +#include <netpfil/ipfw/dn_heap.h> > +#ifndef log > +#define log(x, arg...) > +#endif > + > +#else /* !_KERNEL */ > + > +#include <stdio.h> > +#include <dn_test.h> > +#include <strings.h> > +#include <stdlib.h> > + > +#include "dn_heap.h" > +#define log(x, arg...) fprintf(stderr, ## arg) > +#define panic(x...) fprintf(stderr, ## x), exit(1) > +#define MALLOC_DEFINE(a, b, c) > +static void *my_malloc(int s) { return malloc(s); } > +static void my_free(void *p) { free(p); } > +#define malloc(s, t, w) my_malloc(s) > +#define free(p, t) my_free(p) > +#endif /* !_KERNEL */ > + > +static MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); > + > +/* > + * Heap management functions. > + * > + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. 
> + * Some macros help finding parent/children so we can optimize them. > + * > + * heap_init() is called to expand the heap when needed. > + * Increment size in blocks of 16 entries. > + * Returns 1 on error, 0 on success > + */ > +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) > +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) > +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } > +#define HEAP_INCREMENT 15 > + > +static int > +heap_resize(struct dn_heap *h, unsigned int new_size) > +{ > + struct dn_heap_entry *p; > + > + if (h->size >= new_size ) /* have enough room */ > + return 0; > +#if 1 /* round to the next power of 2 */ > + new_size |= new_size >> 1; > + new_size |= new_size >> 2; > + new_size |= new_size >> 4; > + new_size |= new_size >> 8; > + new_size |= new_size >> 16; > +#else > + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; > +#endif > + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); > + if (p == NULL) { > + printf("--- %s, resize %d failed\n", __func__, new_size ); > + return 1; /* error */ > + } > + if (h->size > 0) { > + bcopy(h->p, p, h->size * sizeof(*p) ); > + free(h->p, M_DN_HEAP); > + } > + h->p = p; > + h->size = new_size; > + return 0; > +} > + > +int > +heap_init(struct dn_heap *h, int size, int ofs) > +{ > + if (heap_resize(h, size)) > + return 1; > + h->elements = 0; > + h->ofs = ofs; > + return 0; > +} > + > +/* > + * Insert element in heap. Normally, p != NULL, we insert p in > + * a new position and bubble up. If p == NULL, then the element is > + * already in place, and key is the position where to start the > + * bubble-up. > + * Returns 1 on failure (cannot allocate new heap entry) > + * > + * If ofs > 0 the position (index, int) of the element in the heap is > + * also stored in the element itself at the given offset in bytes. 
> + */ > +#define SET_OFFSET(h, i) do { \ > + if (h->ofs > 0) \ > + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ > + } while (0) > +/* > + * RESET_OFFSET is used for sanity checks. It sets ofs > + * to an invalid value. > + */ > +#define RESET_OFFSET(h, i) do { \ > + if (h->ofs > 0) \ > + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ > + } while (0) > + > +int > +heap_insert(struct dn_heap *h, uint64_t key1, void *p) > +{ > + int son = h->elements; > + > + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); > + if (p == NULL) { /* data already there, set starting point */ > + son = key1; > + } else { /* insert new element at the end, possibly resize */ > + son = h->elements; > + if (son == h->size) /* need resize... */ > + // XXX expand by 16 or so > + if (heap_resize(h, h->elements+16) ) > + return 1; /* failure... */ > + h->p[son].object = p; > + h->p[son].key = key1; > + h->elements++; > + } > + /* make sure that son >= father along the path */ > + while (son > 0) { > + int father = HEAP_FATHER(son); > + struct dn_heap_entry tmp; > + > + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) > + break; /* found right position */ > + /* son smaller than father, swap and repeat */ > + HEAP_SWAP(h->p[son], h->p[father], tmp); > + SET_OFFSET(h, son); > + son = father; > + } > + SET_OFFSET(h, son); > + return 0; > +} > + > +/* > + * remove top element from heap, or obj if obj != NULL > + */ > +void > +heap_extract(struct dn_heap *h, void *obj) > +{ > + int child, father, max = h->elements - 1; > + > + if (max < 0) { > + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); > + return; > + } > + if (obj == NULL) > + father = 0; /* default: move up smallest child */ > + else { /* extract specific element, index is at offset */ > + if (h->ofs <= 0) > + panic("%s: extract from middle not set on %p\n", > + __FUNCTION__, h); > + father = *((int *)((char *)obj + h->ofs)); > + if (father < 0 || father >= h->elements) { > + panic("%s: father %d out of 
bound 0..%d\n", > + __FUNCTION__, father, h->elements); > + } > + } > + /* > + * below, father is the index of the empty element, which > + * we replace at each step with the smallest child until we > + * reach the bottom level. > + */ > + // XXX why removing RESET_OFFSET increases runtime by 10% ? > + RESET_OFFSET(h, father); > + while ( (child = HEAP_LEFT(father)) <= max ) { > + if (child != max && > + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) > + child++; /* take right child, otherwise left */ > + h->p[father] = h->p[child]; > + SET_OFFSET(h, father); > + father = child; > + } > + h->elements--; > + if (father != max) { > + /* > + * Fill hole with last entry and bubble up, > + * reusing the insert code > + */ > + h->p[father] = h->p[max]; > + heap_insert(h, father, NULL); > + } > +} > + > +#if 0 > +/* > + * change object position and update references > + * XXX this one is never used! > + */ > +static void > +heap_move(struct dn_heap *h, uint64_t new_key, void *object) > +{ > + int temp, i, max = h->elements-1; > + struct dn_heap_entry *p, buf; > + > + if (h->ofs <= 0) > + panic("cannot move items on this heap"); > + p = h->p; /* shortcut */ > + > + i = *((int *)((char *)object + h->ofs)); > + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ > + p[i].key = new_key; > + for (; i>0 && > + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); > + i = temp ) { /* bubble up */ > + HEAP_SWAP(p[i], p[temp], buf); > + SET_OFFSET(h, i); > + } > + } else { /* must move down */ > + p[i].key = new_key; > + while ( (temp = HEAP_LEFT(i)) <= max ) { > + /* found left child */ > + if (temp != max && > + DN_KEY_LT(p[temp+1].key, p[temp].key)) > + temp++; /* select child with min key */ > + if (DN_KEY_LT(>p[temp].key, new_key)) { > + /* go down */ > + HEAP_SWAP(p[i], p[temp], buf); > + SET_OFFSET(h, i); > + } else > + break; > + i = temp; > + } > + } > + SET_OFFSET(h, i); > +} > +#endif /* heap_move, unused */ > + > +/* > + * heapify() will reorganize data inside an 
array to maintain the > + * heap property. It is needed when we delete a bunch of entries. > + */ > +static void > +heapify(struct dn_heap *h) > +{ > + int i; > + > + for (i = 0; i < h->elements; i++ ) > + heap_insert(h, i , NULL); > +} > + > +int > +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), > + uintptr_t arg) > +{ > + int i, ret, found; > + > + for (i = found = 0 ; i < h->elements ;) { > + ret = fn(h->p[i].object, arg); > + if (ret & HEAP_SCAN_DEL) { > + h->elements-- ; > + h->p[i] = h->p[h->elements] ; > + found++ ; > + } else > + i++ ; > + if (ret & HEAP_SCAN_END) > + break; > + } > + if (found) > + heapify(h); > + return found; > +} > + > +/* > + * cleanup the heap and free data structure > + */ > +void > +heap_free(struct dn_heap *h) > +{ > + if (h->size >0 ) > + free(h->p, M_DN_HEAP); > + bzero(h, sizeof(*h) ); > +} > + > +/* > + * hash table support. > + */ > + > +struct dn_ht { > + int buckets; /* how many buckets, really buckets - 1*/ > + int entries; /* how many entries */ > + int ofs; /* offset of link field */ > + uint32_t (*hash)(uintptr_t, int, void *arg); > + int (*match)(void *_el, uintptr_t key, int, void *); > + void *(*newh)(uintptr_t, int, void *); > + void **ht; /* bucket heads */ > +}; > +/* > + * Initialize, allocating bucket pointers inline. > + * Recycle previous record if possible. > + * If the 'newh' function is not supplied, we assume that the > + * key passed to ht_find is the same object to be stored in. > + */ > +struct dn_ht * > +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, > + uint32_t (*h)(uintptr_t, int, void *), > + int (*match)(void *, uintptr_t, int, void *), > + void *(*newh)(uintptr_t, int, void *)) > +{ > + int l; > + > + /* > + * Notes about rounding bucket size to a power of two. 
> + * Given the original bucket size, we compute the nearest lower and > + * higher power of two, minus 1 (respectively b_min and b_max) because > + * this value will be used to do an AND with the index returned > + * by hash function. > + * To choose between these two values, the original bucket size is > + * compared with b_min. If the original size is greater than 4/3 b_min, > + * we round the bucket size to b_max, else to b_min. > + * This ratio tries to round to the nearest power of two, favoring > + * the greater size if the difference between the two powers is relatively > + * big. > + * Rounding the bucket size to a power of two avoids the use of > + * modulo when calculating the correct bucket. > + * The ht->buckets variable stores the bucket size - 1 to simply > + * do an AND between the index returned by hash function and ht->buckets > + * instead of a modulo. > + */ > + int b_min; /* min buckets */ > + int b_max; /* max buckets */ > + int b_ori; /* original buckets */ > + > + if (h == NULL || match == NULL) { > + printf("--- missing hash or match function"); > + return NULL; > + } > + if (buckets < 1 || buckets > 65536) > + return NULL; > + > + b_ori = buckets; > + /* calculate next power of 2, - 1*/ > + buckets |= buckets >> 1; > + buckets |= buckets >> 2; > + buckets |= buckets >> 4; > + buckets |= buckets >> 8; > + buckets |= buckets >> 16; > + > + b_max = buckets; /* Next power */ > + b_min = buckets >> 1; /* Previous power */ > + > + /* Calculate the 'nearest' bucket size */ > + if (b_min * 4000 / 3000 < b_ori) > + buckets = b_max; > + else > + buckets = b_min; > + > + if (ht) { /* see if we can reuse */ > + if (buckets <= ht->buckets) { > + ht->buckets = buckets; > + } else { > + /* free pointers if not allocated inline */ > + if (ht->ht != (void *)(ht + 1)) > + free(ht->ht, M_DN_HEAP); > + free(ht, M_DN_HEAP); > + ht = NULL; > + } > + } > + if (ht == NULL) { > + /* Allocate buckets + 1 entries because buckets is used to > + * do the AND with the index 
returned by hash function > + */ > + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); > + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); > + } > + if (ht) { > + ht->ht = (void **)(ht + 1); > + ht->buckets = buckets; > + ht->ofs = ofs; > + ht->hash = h; > + ht->match = match; > + ht->newh = newh; > + } > + return ht; > +} > + > +/* dummy callback for dn_ht_free to unlink all */ > +static int > +do_del(void *obj, void *arg) > +{ > + return DNHT_SCAN_DEL; > +} > + > +void > +dn_ht_free(struct dn_ht *ht, int flags) > +{ > + if (ht == NULL) > + return; > + if (flags & DNHT_REMOVE) { > + (void)dn_ht_scan(ht, do_del, NULL); > + } else { > + if (ht->ht && ht->ht != (void *)(ht + 1)) > + free(ht->ht, M_DN_HEAP); > + free(ht, M_DN_HEAP); > + } > +} > + > +int > +dn_ht_entries(struct dn_ht *ht) > +{ > + return ht ? ht->entries : 0; > +} > + > +/* lookup and optionally create or delete element */ > +void * > +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) > +{ > + int i; > + void **pp, *p; > + > + if (ht == NULL) /* easy on an empty hash */ > + return NULL; > + i = (ht->buckets == 1) ? 0 : > + (ht->hash(key, flags, arg) & ht->buckets); > + > + for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { > + if (flags & DNHT_MATCH_PTR) { > + if (key == (uintptr_t)p) > + break; > + } else if (ht->match(p, key, flags, arg)) /* found match */ > + break; > + } > + if (p) { > + if (flags & DNHT_REMOVE) { > + /* link in the next element */ > + *pp = *(void **)((char *)p + ht->ofs); > + *(void **)((char *)p + ht->ofs) = NULL; > + ht->entries--; > + } > + } else if (flags & DNHT_INSERT) { > + // printf("%s before calling new, bucket %d ofs %d\n", > + // __FUNCTION__, i, ht->ofs); > + p = ht->newh ? 
ht->newh(key, flags, arg) : (void *)key; > + // printf("%s newh returns %p\n", __FUNCTION__, p); > + if (p) { > + ht->entries++; > + *(void **)((char *)p + ht->ofs) = ht->ht[i]; > + ht->ht[i] = p; > + } > + } > + return p; > +} > + > +/* > + * do a scan with the option to delete the object. Extract next before > + * running the callback because the element may be destroyed there. > + */ > +int > +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) > +{ > + int i, ret, found = 0; > + void **curp, *cur, *next; > + > + if (ht == NULL || fn == NULL) > + return 0; > + for (i = 0; i <= ht->buckets; i++) { > + curp = &ht->ht[i]; > + while ( (cur = *curp) != NULL) { > + next = *(void **)((char *)cur + ht->ofs); > + ret = fn(cur, arg); > + if (ret & DNHT_SCAN_DEL) { > + found++; > + ht->entries--; > + *curp = next; > + } else { > + curp = (void **)((char *)cur + ht->ofs); > + } > + if (ret & DNHT_SCAN_END) > + return (ret & DNHT_COPY_ERR) ? -1 : found; > + } > + } > + return found; > +} > + > +/* > + * Similar to dn_ht_scan(), except that the scan is performed only > + * in the bucket 'bucket'. The function returns a correct bucket number if > + * the original is invalid. > + * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] > + * pointer to the last entry processed. Moreover, the bucket number passed > + * by caller is decremented, because usually the caller increment it. 
> + */ > +int > +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), > + void *arg) > +{ > + int i, ret, found = 0; > + void **curp, *cur, *next; > + > + if (ht == NULL || fn == NULL) > + return 0; > + if (*bucket > ht->buckets) > + *bucket = 0; > + i = *bucket; > + > + curp = &ht->ht[i]; > + while ( (cur = *curp) != NULL) { > + next = *(void **)((char *)cur + ht->ofs); > + ret = fn(cur, arg); > + if (ret & DNHT_SCAN_DEL) { > + found++; > + ht->entries--; > + *curp = next; > + } else { > + curp = (void **)((char *)cur + ht->ofs); > + } > + if (ret & DNHT_SCAN_END) > + return found; > + } > + return found; > +} > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_heap.h b/example/ipfw/sys/netpfil/ipfw/dn_heap.h > new file mode 100644 > index 0000000..2b44d8e > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_heap.h > @@ -0,0 +1,192 @@ > +/*- > + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * Binary heap and hash tables, header file > + * > + * $FreeBSD: head/sys/netpfil/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $ > + */ > + > +#ifndef _IP_DN_HEAP_H > +#define _IP_DN_HEAP_H > + > +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) > +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) > + > +/* > + * This module implements a binary heap supporting random extraction. > + * > + * A heap entry contains an uint64_t key and a pointer to object. > + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' > + * > + * The heap is a struct dn_heap plus a dynamically allocated > + * array of dn_heap_entry entries. 'size' represents the size of > + * the array, 'elements' counts entries in use. The topmost > + * element has the smallest key. > + * The heap supports ordered insert, and extract from the top. > + * To extract an object from the middle of the heap, the object > + * must reserve an 'int32_t' to store the position of the object > + * in the heap itself, and the location of this field must be > + * passed as an argument to heap_init() -- use -1 if the feature > + * is not used. 
> + */ > +struct dn_heap_entry { > + uint64_t key; /* sorting key, smallest comes first */ > + void *object; /* object pointer */ > +}; > + > +struct dn_heap { > + int size; /* the size of the array */ > + int elements; /* elements in use */ > + int ofs; /* offset in the object of heap index */ > + struct dn_heap_entry *p; /* array of "size" entries */ > +}; > + > +enum { > + HEAP_SCAN_DEL = 1, > + HEAP_SCAN_END = 2, > +}; > + > +/* > + * heap_init() reinitializes the heap setting the size and the offset > + * of the index for random extraction (use -1 if not used). > + * The 'elements' counter is set to 0. > + * > + * SET_HEAP_OFS() indicates where, in the object, is stored the index > + * for random extractions from the heap. > + * > + * heap_free() frees the memory associated to a heap. > + * > + * heap_insert() adds a key-pointer pair to the heap > + * > + * HEAP_TOP() returns a pointer to the top element of the heap, > + * but makes no checks on its existence (XXX should we change ?) > + * > + * heap_extract() removes the entry at the top, or the given object > + * if one is passed. It returns nothing, so the pointer and > + * the key should have been read before. > + * > + * heap_scan() invokes a callback on each entry of the heap. > + * The callback can return a combination of HEAP_SCAN_DEL and > + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must > + * be removed, and HEAP_SCAN_END means to terminate the scan. > + * heap_scan() returns the number of elements removed. > + * Because the order is not guaranteed, we should use heap_scan() > + * only as a last resort mechanism. 
> + */ > +#define HEAP_TOP(h) ((h)->p) > +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) > +int heap_init(struct dn_heap *h, int size, int ofs); > +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); > +void heap_extract(struct dn_heap *h, void *obj); > +void heap_free(struct dn_heap *h); > +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); > + > +/*------------------------------------------------------ > + * This module implements a generic hash table with support for > + * running callbacks on the entire table. To avoid allocating > + * memory during hash table operations, objects must reserve > + * space for a link field. XXX if the heap is moderately full, > + * an SLIST suffices, and we can tolerate the cost of a hash > + * computation on each removal. > + * > + * dn_ht_init() initializes the table, setting the number of > + * buckets, the offset of the link field, the main callbacks. > + * Callbacks are: > + * > + * hash(key, flags, arg) called to return a bucket index. > + * match(obj, key, flags, arg) called to determine if key > + * matches the current 'obj' in the heap > + * newh(key, flags, arg) optional, used to allocate a new > + * object during insertions. > + * > + * dn_ht_free() frees the heap or unlink elements. > + * DNHT_REMOVE unlink elements, 0 frees the heap. > + * You need two calls to do both. > + * > + * dn_ht_find() is the main lookup function, which can also be > + * used to insert or delete elements in the hash table. > + * The final 'arg' is passed to all callbacks. > + * > + * dn_ht_scan() is used to invoke a callback on all entries of > + * the heap, or possibly on just one bucket. The callback > + * is invoked with a pointer to the object, and must return > + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the > + * removal of the object from the heap and the end of the > + * scan, respectively. 
> + * > + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans > + * only the specific bucket of the table. The bucket is an in-out > + * parameter and returns a valid bucket number if the original > + * is invalid. > + * > + * A combination of flags can be used to modify the operation > + * of the dn_ht_find(), and of the callbacks: > + * > + * DNHT_KEY_IS_OBJ means the key is the object pointer. > + * It is usually of interest for the hash and match functions. > + * > + * DNHT_MATCH_PTR during a lookup, match pointers instead > + * of calling match(). Normally used when removing specific > + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used > + * by the match function. > + * > + * DNHT_INSERT insert the element if not found. > + * Calls newh() to allocate a new object unless > + * DNHT_KEY_IS_OBJ is set. > + * > + * DNHT_UNIQUE only insert if object not found. > + * XXX should it imply DNHT_INSERT ? > + * > + * DNHT_REMOVE remove objects if we find them. > + */ > +struct dn_ht; /* should be opaque */ > + > +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, > + uint32_t (*hash)(uintptr_t, int, void *), > + int (*match)(void *, uintptr_t, int, void *), > + void *(*newh)(uintptr_t, int, void *)); > +void dn_ht_free(struct dn_ht *, int flags); > + > +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); > +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); > +int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); > +int dn_ht_entries(struct dn_ht *); > + > +enum { /* flags values. 
> + * first two are returned by the scan callback to indicate > + * to delete the matching element or to end the scan > + */ > + DNHT_SCAN_DEL = 0x0001, > + DNHT_SCAN_END = 0x0002, > + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ > + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ > + DNHT_INSERT = 0x0010, /* insert if not found */ > + DNHT_UNIQUE = 0x0020, /* report error if already there */ > + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ > + DNHT_COPY_ERR = 0x0080, /* error during a copy */ > +}; > + > +#endif /* _IP_DN_HEAP_H */ > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched.h b/example/ipfw/sys/netpfil/ipfw/dn_sched.h > new file mode 100644 > index 0000000..a81a9c0 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched.h > @@ -0,0 +1,192 @@ > +/* > + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * The API to write a packet scheduling algorithm for dummynet. > + * > + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched.h 258467 2013-11-22 05:02:37Z luigi $ > + */ > + > +#ifndef _DN_SCHED_H > +#define _DN_SCHED_H > + > +#define DN_MULTIQUEUE 0x01 > +/* > + * Descriptor for a scheduling algorithm. > + * Contains all function pointers for a given scheduler > + * This is typically created when a module is loaded, and stored > + * in a global list of schedulers. > + */ > +struct dn_alg { > + uint32_t type; /* the scheduler type */ > + const char *name; /* scheduler name */ > + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ > + > + /* > + * The following define the size of 3 optional data structures > + * that may need to be allocated at runtime, and are appended > + * to each of the base data structures: scheduler, sched.inst, > + * and queue. We don't have a per-flowset structure. > + */ > + /* + parameters attached to the template, e.g. > + * default queue sizes, weights, quantum size, and so on; > + */ > + size_t schk_datalen; > + > + /* + per-instance parameters, such as timestamps, > + * containers for queues, etc; > + */ > + size_t si_datalen; > + > + size_t q_datalen; /* per-queue parameters (e.g. S,F) */ > + > + /* > + * Methods implemented by the scheduler: > + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. > + * q is NULL for !MULTIQUEUE. 
> + * Return 0 on success, 1 on drop (packet consumed anyways). > + * Note that q should be interpreted only as a hint > + * on the flow that the mbuf belongs to: while a > + * scheduler will normally enqueue m into q, it is ok > + * to leave q alone and put the mbuf elsewhere. > + * This function is called in two cases: > + * - when a new packet arrives to the scheduler; > + * - when a scheduler is reconfigured. In this case the > + * call is issued by the new_queue callback, with a > + * non empty queue (q) and m pointing to the first > + * mbuf in the queue. For this reason, the function > + * should internally check for (m != q->mq.head) > + * before calling dn_enqueue(). > + * > + * dequeue Called when scheduler instance 's' can > + * dequeue a packet. Return NULL if none are available. > + * XXX what about non work-conserving ? > + * > + * config called on 'sched X config ...', normally writes > + * in the area of size sch_arg > + * > + * destroy called on 'sched delete', frees everything > + * in sch_arg (other parts are handled by more specific > + * functions) > + * > + * new_sched called when a new instance is created, e.g. > + * to create the local queue for !MULTIQUEUE, set V or > + * copy parameters for WFQ, and so on. > + * > + * free_sched called when deleting an instance, cleans > + * extra data in the per-instance area. > + * > + * new_fsk called when a flowset is linked to a scheduler, > + * e.g. to validate parameters such as weights etc. > + * free_fsk when a flowset is unlinked from a scheduler. > + * (probably unnecessary) > + * > + * new_queue called to set the per-queue parameters, > + * e.g. S and F, adjust sum of weights in the parent, etc. > + * > + * The new_queue callback is normally called from when > + * creating a new queue. In some cases (such as a > + * scheduler change or reconfiguration) it can be called > + * with a non empty queue. 
In case of > + * a non empty queue, the new_queue callback could > + * need to call the enqueue function. In this case, > + * the callback should eventually call enqueue() passing > + * as m the first element in the queue. > + * > + * free_queue actions related to a queue removal, e.g. undo > + * all the above. If the queue has data in it, also remove > + * from the scheduler. This can e.g. happen during a reconfigure. > + */ > + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, > + struct mbuf *); > + struct mbuf * (*dequeue)(struct dn_sch_inst *); > + > + int (*config)(struct dn_schk *); > + int (*destroy)(struct dn_schk*); > + int (*new_sched)(struct dn_sch_inst *); > + int (*free_sched)(struct dn_sch_inst *); > + int (*new_fsk)(struct dn_fsk *f); > + int (*free_fsk)(struct dn_fsk *f); > + int (*new_queue)(struct dn_queue *q); > + int (*free_queue)(struct dn_queue *q); > + > + /* run-time fields */ > + int ref_count; /* XXX number of instances in the system */ > + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ > +}; > + > +/* MSVC does not support initializers so we need this ugly macro */ > +#ifdef _WIN32 > +#define _SI(fld) > +#else > +#define _SI(fld) fld > +#endif > + > +/* > + * Additionally, dummynet exports some functions and macros > + * to be used by schedulers: > + */ > + > +void dn_free_pkts(struct mbuf *mnext); > +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); > +/* bound a variable between min and max */ > +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); > + > +/* > + * Extract the head of a queue, update stats. Must be the very last > + * thing done on a dequeue as the queue itself may go away. 
> + */ > +static __inline struct mbuf* > +dn_dequeue(struct dn_queue *q) > +{ > + struct mbuf *m = q->mq.head; > + if (m == NULL) > + return NULL; > + q->mq.head = m->m_nextpkt; > + q->mq.count--; > + > + /* Update stats for the queue */ > + q->ni.length--; > + q->ni.len_bytes -= m->m_pkthdr.len; > + if (q->_si) { > + q->_si->ni.length--; > + q->_si->ni.len_bytes -= m->m_pkthdr.len; > + } > + if (q->ni.length == 0) /* queue is now idle */ > + q->q_time = dn_cfg.curr_time; > + return m; > +} > + > +int dn_sched_modevent(module_t mod, int cmd, void *arg); > + > +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ > + static moduledata_t name##_mod = { \ > + #name, dn_sched_modevent, dnsched \ > + }; \ > + DECLARE_MODULE(name, name##_mod, \ > + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ > + MODULE_DEPEND(name, dummynet, 3, 3, 3) > +#endif /* _DN_SCHED_H */ > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c > new file mode 100644 > index 0000000..1119221 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c > @@ -0,0 +1,120 @@ > +/* > + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_fifo.c 240494 2012-09-14 11:51:49Z glebius $ > + */ > + > +#ifdef _KERNEL > +#include <sys/malloc.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/kernel.h> > +#include <sys/mbuf.h> > +#include <sys/module.h> > +#include <net/if.h> /* IFNAMSIZ */ > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* ipfw_rule_ref */ > +#include <netinet/ip_fw.h> /* flow_id */ > +#include <netinet/ip_dummynet.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > +#else > +#include <dn_test.h> > +#endif > + > +/* > + * This file implements a FIFO scheduler for a single queue. > + * The queue is allocated as part of the scheduler instance, > + * and there is a single flowset is in the template which stores > + * queue size and policy. > + * Enqueue and dequeue use the default library functions. > + */ > +static int > +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) > +{ > + /* XXX if called with q != NULL and m=NULL, this is a > + * re-enqueue from an existing scheduler, which we should > + * handle. 
> + */ > + return dn_enqueue((struct dn_queue *)(si+1), m, 0); > +} > + > +static struct mbuf * > +fifo_dequeue(struct dn_sch_inst *si) > +{ > + return dn_dequeue((struct dn_queue *)(si + 1)); > +} > + > +static int > +fifo_new_sched(struct dn_sch_inst *si) > +{ > + /* This scheduler instance contains the queue */ > + struct dn_queue *q = (struct dn_queue *)(si + 1); > + > + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); > + q->_si = si; > + q->fs = si->sched->fs; > + return 0; > +} > + > +static int > +fifo_free_sched(struct dn_sch_inst *si) > +{ > + struct dn_queue *q = (struct dn_queue *)(si + 1); > + dn_free_pkts(q->mq.head); > + bzero(q, sizeof(*q)); > + return 0; > +} > + > +/* > + * FIFO scheduler descriptor > + * contains the type of the scheduler, the name, the size of extra > + * data structures, and function pointers. > + */ > +static struct dn_alg fifo_desc = { > + _SI( .type = ) DN_SCHED_FIFO, > + _SI( .name = ) "FIFO", > + _SI( .flags = ) 0, > + > + _SI( .schk_datalen = ) 0, > + _SI( .si_datalen = ) sizeof(struct dn_queue), > + _SI( .q_datalen = ) 0, > + > + _SI( .enqueue = ) fifo_enqueue, > + _SI( .dequeue = ) fifo_dequeue, > + _SI( .config = ) NULL, > + _SI( .destroy = ) NULL, > + _SI( .new_sched = ) fifo_new_sched, > + _SI( .free_sched = ) fifo_free_sched, > + _SI( .new_fsk = ) NULL, > + _SI( .free_fsk = ) NULL, > + _SI( .new_queue = ) NULL, > + _SI( .free_queue = ) NULL, > +}; > + > +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c > new file mode 100644 > index 0000000..f0ca44e > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c > @@ -0,0 +1,229 @@ > +/* > + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. 
Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + */ > + > +/* > + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_prio.c 240494 2012-09-14 11:51:49Z glebius $ > + */ > +#ifdef _KERNEL > +#include <sys/malloc.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/kernel.h> > +#include <sys/mbuf.h> > +#include <sys/module.h> > +#include <net/if.h> /* IFNAMSIZ */ > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* ipfw_rule_ref */ > +#include <netinet/ip_fw.h> /* flow_id */ > +#include <netinet/ip_dummynet.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > +#else > +#include <dn_test.h> > +#endif > + > +#define DN_SCHED_PRIO 5 //XXX > + > +#if !defined(_KERNEL) || !defined(__linux__) /* fallback bit ops; 1UL matches the unsigned long BITMAP_T */ > +#define test_bit(ix, pData) ((*pData) & (1UL<<(ix))) > +#define __set_bit(ix, pData) (*pData) |= (1UL<<(ix)) > +#define __clear_bit(ix, pData) (*pData) &= ~(1UL<<(ix)) > +#endif > + > +#ifdef __MIPSEL__ > +#define __clear_bit(ix, pData) (*pData) &= ~(1UL<<(ix)) > +#endif > + > +/* Number of priorities, i.e. size of the array of queue pointers. Capped at 32 because prio_dequeue() scans the bitmap with ffs(), which only sees an int. */ > +#define BITMAP_T unsigned long > +#define MAXPRIO 32 > + > +/* > + * The scheduler instance contains an array of pointers to queues, > + * one for each priority, and a bitmap listing backlogged queues. > + */ > +struct prio_si { > + BITMAP_T bitmap; /* bit N set: q_array[N] is backlogged */ > + struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ > +}; > + > +/* > + * If a queue with the same priority is already backlogged, use > + * that one instead of the queue passed as argument.
> + */ > +static int > +prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) > +{ > + struct prio_si *si = (struct prio_si *)(_si + 1); > + int prio = q->fs->fs.par[0]; > + > + if (test_bit(prio, &si->bitmap) == 0) { > + /* No queue with this priority, insert */ > + __set_bit(prio, &si->bitmap); > + si->q_array[prio] = q; > + } else { /* use the existing queue */ > + q = si->q_array[prio]; > + } > + if (dn_enqueue(q, m, 0)) > + return 1; > + return 0; > +} > + > +/* > + * Packets are dequeued only from the highest priority queue. > + * ffs() returns the 1-based position of the lowest bit set in the bitmap; > + * subtracting 1 gives the index of the array slot that holds the pointer > + * to the highest priority backlogged queue. > + * After the dequeue, if this queue becomes empty, its index is removed > + * from the bitmap. > + * The scheduler is idle if the bitmap is empty. > + * > + * NOTE: highest priority is 0, lowest is MAXPRIO - 1 (see prio_new_fsk) > + */ > +static struct mbuf * > +prio_dequeue(struct dn_sch_inst *_si) > +{ > + struct prio_si *si = (struct prio_si *)(_si + 1); > + struct mbuf *m; > + struct dn_queue *q; > + int prio; > + > + if (si->bitmap == 0) /* scheduler idle */ > + return NULL; > + > + prio = ffs(si->bitmap) - 1; > + > + /* Take the highest priority queue in the scheduler */ > + q = si->q_array[prio]; > + // assert(q) > + > + m = dn_dequeue(q); > + if (q->mq.head == NULL) { > + /* Queue is now empty, remove from scheduler > + * and mark it > + */ > + si->q_array[prio] = NULL; > + __clear_bit(prio, &si->bitmap); > + } > + return m; > +} > + > +static int > +prio_new_sched(struct dn_sch_inst *_si) > +{ > + struct prio_si *si = (struct prio_si *)(_si + 1); > + > + bzero(si->q_array, sizeof(si->q_array)); > + si->bitmap = 0; > + > + return 0; > +} > + > +static int > +prio_new_fsk(struct dn_fsk *fs) > +{ > + /* Check if the priority is between 0 and MAXPRIO-1 */ > + ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); > + return 0; > +} > + > +static int
> +prio_new_queue(struct dn_queue *q) > +{ > + struct prio_si *si = (struct prio_si *)(q->_si + 1); > + int prio = q->fs->fs.par[0]; > + struct dn_queue *oldq; > + > + q->ni.oid.subtype = DN_SCHED_PRIO; > + > + if (q->mq.head == NULL) > + return 0; > + > + /* Queue already full, must insert in the scheduler or append > + * mbufs to existing queue. This partly duplicates prio_enqueue > + */ > + if (test_bit(prio, &si->bitmap) == 0) { > + /* No queue with this priority, insert */ > + __set_bit(prio, &si->bitmap); > + si->q_array[prio] = q; > + } else if ( (oldq = si->q_array[prio]) != q) { > + /* must append to the existing queue. > + * can simply append q->mq.head to q2->... > + * and add the counters to those of q2 > + */ > + oldq->mq.tail->m_nextpkt = q->mq.head; > + oldq->mq.tail = q->mq.tail; > + oldq->ni.length += q->ni.length; > + q->ni.length = 0; > + oldq->ni.len_bytes += q->ni.len_bytes; > + q->ni.len_bytes = 0; > + q->mq.tail = q->mq.head = NULL; > + } > + return 0; > +} > + > +static int > +prio_free_queue(struct dn_queue *q) > +{ > + int prio = q->fs->fs.par[0]; > + struct prio_si *si = (struct prio_si *)(q->_si + 1); > + > + if (si->q_array[prio] == q) { > + si->q_array[prio] = NULL; > + __clear_bit(prio, &si->bitmap); > + } > + return 0; > +} > + > + > +static struct dn_alg prio_desc = { > + _SI( .type = ) DN_SCHED_PRIO, > + _SI( .name = ) "PRIO", > + _SI( .flags = ) DN_MULTIQUEUE, > + > + /* we need extra space in the si and the queue */ > + _SI( .schk_datalen = ) 0, > + _SI( .si_datalen = ) sizeof(struct prio_si), > + _SI( .q_datalen = ) 0, > + > + _SI( .enqueue = ) prio_enqueue, > + _SI( .dequeue = ) prio_dequeue, > + > + _SI( .config = ) NULL, > + _SI( .destroy = ) NULL, > + _SI( .new_sched = ) prio_new_sched, > + _SI( .free_sched = ) NULL, > + > + _SI( .new_fsk = ) prio_new_fsk, > + _SI( .free_fsk = ) NULL, > + > + _SI( .new_queue = ) prio_new_queue, > + _SI( .free_queue = ) prio_free_queue, > +}; > + > + > +DECLARE_DNSCHED_MODULE(dn_prio, 
&prio_desc); > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c > new file mode 100644 > index 0000000..5cc5901 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c > @@ -0,0 +1,864 @@ > +/* > + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + */ > + > +/* > + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_qfq.c 240494 2012-09-14 11:51:49Z glebius $ > + */ > + > +#ifdef _KERNEL > +#include <sys/malloc.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/kernel.h> > +#include <sys/mbuf.h> > +#include <sys/module.h> > +#include <net/if.h> /* IFNAMSIZ */ > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* ipfw_rule_ref */ > +#include <netinet/ip_fw.h> /* flow_id */ > +#include <netinet/ip_dummynet.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > +#else > +#include <dn_test.h> > +#endif > + > +#ifdef QFQ_DEBUG > +struct qfq_sched; > +static void dump_sched(struct qfq_sched *q, const char *msg); > +#define NO(x) x > +#else > +#define NO(x) > +#endif > +#define DN_SCHED_QFQ 4 // XXX Where? > +typedef unsigned long bitmap; > + > +/* > + * bitmaps ops are critical. Some linux versions have __fls > + * and the bitmap ops. Some machines have ffs > + */ > +#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) > +int fls(unsigned int n) > +{ > + int i = 0; > + for (i = 0; n > 0; n >>= 1, i++) > + ; > + return i; > +} > +#endif > + > +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) > +static inline unsigned long __fls(unsigned long word) > +{ > + return fls(word) - 1; > +} > +#endif > + > +#if !defined(_KERNEL) || !defined(__linux__) > +#ifdef QFQ_DEBUG > +int test_bit(int ix, bitmap *p) > +{ > + if (ix < 0 || ix > 31) > + D("bad index %d", ix); > + return *p & (1<<ix); > +} > +void __set_bit(int ix, bitmap *p) > +{ > + if (ix < 0 || ix > 31) > + D("bad index %d", ix); > + *p |= (1<<ix); > +} > +void __clear_bit(int ix, bitmap *p) > +{ > + if (ix < 0 || ix > 31) > + D("bad index %d", ix); > + *p &= ~(1<<ix); > +} > +#else /* !QFQ_DEBUG */ > +/* XXX do we have fast version, or leave it to the compiler ? 
*/ > +#define test_bit(ix, pData) ((*pData) & (1UL<<(ix))) /* 1UL: the bitmap type is unsigned long */ > +#define __set_bit(ix, pData) (*pData) |= (1UL<<(ix)) > +#define __clear_bit(ix, pData) (*pData) &= ~(1UL<<(ix)) > +#endif /* !QFQ_DEBUG */ > +#endif /* !__linux__ */ > + > +#ifdef __MIPSEL__ > +#define __clear_bit(ix, pData) (*pData) &= ~(1UL<<(ix)) > +#endif > + > +/*-------------------------------------------*/ > +/* > + > +Virtual time computations. > + > +S, F and V are all computed in fixed point arithmetic with > +FRAC_BITS fractional bits. > + > + QFQ_MAX_INDEX is the maximum index allowed for a group. We need > + one bit per index. > + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. > + The layout of the bits is as below: > + > + [ MTU_SHIFT ][ FRAC_BITS ] > + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] > + ^.__grp->index = 0 > + *.__grp->slot_shift > + > + where MIN_SLOT_SHIFT is derived by difference from the others. > + > +The max group index corresponds to Lmax/w_min, where > +Lmax=1<<MTU_SHIFT, w_min = 1 . > +From this, and knowing how many groups (MAX_INDEX) we want, > +we can derive the shift corresponding to each group. > + > +Because we often need to compute > + F = S + len/w_i and V = V + len/wsum > +instead of storing w_i store the value > + inv_w = (1<<FRAC_BITS)/w_i > +so we can do F = S + len * inv_w * wsum. > +We use W_TOT in the formulas so we can easily move between > +static and adaptive weight sum. > + > +The per-scheduler-instance data contain all the data structures > +for the scheduler: bitmaps and bucket lists. > + > + */ > +/* > + * Maximum number of consecutive slots occupied by backlogged classes > + * inside a group. This is approx lmax/lmin + 5. > + * XXX check because it poses constraints on MAX_INDEX > + */ > +#define QFQ_MAX_SLOTS 32 > +/* > + * Shifts used for class<->group mapping. Class weights are > + * in the range [1, QFQ_MAX_WEIGHT], we want to map each class i to the > + * group with the smallest index that can support the L_i / r_i > + * configured for the class.
> + * > + * grp->index is the index of the group; and grp->slot_shift > + * is the shift for the corresponding (scaled) sigma_i. > + * > + * When computing the group index, we do (len<<FP_SHIFT)/weight, > + * then compute an FLS (which is like a log2()), and if the result > + * is below the MAX_INDEX region we use 0 (which is the same as > + * using a larger len). > + */ > +#define QFQ_MAX_INDEX 19 > +#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */ > + > +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) > +#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) > +//#define IWSUM (q->i_wsum) > +#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM) > + > +#define FRAC_BITS 30 /* fixed point arithmetic */ > +#define ONE_FP (1UL << FRAC_BITS) > + > +#define QFQ_MTU_SHIFT 11 /* log2(max_len) */ > +#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) > + > +/* > + * Possible group states, also indexes for the bitmaps array in > + * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3 > + */ > +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; > + > +struct qfq_group; > +/* > + * additional queue info. Some of this info should come from > + * the flowset, we copy them here for faster processing. > + * This is an overlay of the struct dn_queue > + */ > +struct qfq_class { > + struct dn_queue _q; > + uint64_t S, F; /* flow timestamps (exact) */ > + struct qfq_class *next; /* Link for the slot list. */ > + > + /* group we belong to. In principle we would need the index, > + * which is log_2(lmax/weight), but we never reference it > + * directly, only the group. > + */ > + struct qfq_group *grp; > + > + /* these are copied from the flowset. */ > + uint32_t inv_w; /* ONE_FP/weight */ > + uint32_t lmax; /* Max packet size for this flow. */ > +}; > + > +/* Group descriptor, see the paper for details. > + * Basically this contains the bucket lists > + */ > +struct qfq_group { > + uint64_t S, F; /* group timestamps (approx). */ > + unsigned int slot_shift; /* Slot shift. 
*/ > + unsigned int index; /* Group index. */ > + unsigned int front; /* Index of the front slot. */ > + bitmap full_slots; /* non-empty slots */ > + > + /* Array of lists of active classes. */ > + struct qfq_class *slots[QFQ_MAX_SLOTS]; > +}; > + > +/* scheduler instance descriptor. */ > +struct qfq_sched { > + uint64_t V; /* Precise virtual time. */ > + uint32_t wsum; /* weight sum */ > + NO(uint32_t i_wsum; /* ONE_FP/w_sum */ > + uint32_t _queued; /* debugging */ > + uint32_t loops; /* debugging */) > + bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ > + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ > +}; > + > +/*---- support functions ----------------------------*/ > + > +/* Generic comparison function, handling wraparound. */ > +static inline int qfq_gt(uint64_t a, uint64_t b) > +{ > + return (int64_t)(a - b) > 0; > +} > + > +/* Round a precise timestamp to its slotted value. */ > +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) > +{ > + return ts & ~((1ULL << shift) - 1); > +} > + > +/* return the pointer to the group with lowest index in the bitmap */ > +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, > + unsigned long bitmap) > +{ > + int index = ffs(bitmap) - 1; // zero-based > + return &q->groups[index]; > +} > + > +/* > + * Calculate a flow index, given its weight and maximum packet length. > + * index = log_2(maxlen/weight) but we need to apply the scaling. > + * This is used only once at flow creation. 
> + */ > +static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) > +{ > + uint64_t slot_size = (uint64_t)maxlen *inv_w; > + unsigned long size_map; > + int index = 0; > + > + size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); > + if (!size_map) > + goto out; > + > + index = __fls(size_map) + 1; // basically a log_2() > + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); > + > + if (index < 0) > + index = 0; > + > +out: > + ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); > + return index; > +} > +/*---- end support functions ----*/ > + > +/*-------- API calls --------------------------------*/ > +/* > + * Validate and copy parameters from flowset. > + */ > +static int > +qfq_new_queue(struct dn_queue *_q) > +{ > + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); > + struct qfq_class *cl = (struct qfq_class *)_q; > + int i; > + uint32_t w; /* approximated weight */ > + > + /* import parameters from the flowset. They should be correct > + * already. > + */ > + w = _q->fs->fs.par[0]; > + cl->lmax = _q->fs->fs.par[1]; > + if (!w || w > QFQ_MAX_WEIGHT) { > + w = 1; > + D("rounding weight to 1"); > + } > + cl->inv_w = ONE_FP/w; > + w = ONE_FP/cl->inv_w; > + if (q->wsum + w > QFQ_MAX_WSUM) > + return EINVAL; > + > + i = qfq_calc_index(cl->inv_w, cl->lmax); > + cl->grp = &q->groups[i]; > + q->wsum += w; > + // XXX cl->S = q->V; ? > + // XXX compute q->i_wsum > + return 0; > +} > + > +/* remove an empty queue */ > +static int > +qfq_free_queue(struct dn_queue *_q) > +{ > + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); > + struct qfq_class *cl = (struct qfq_class *)_q; > + if (cl->inv_w) { > + q->wsum -= ONE_FP/cl->inv_w; > + cl->inv_w = 0; /* reset weight to avoid run twice */ > + } > + return 0; > +} > + > +/* Calculate a mask to mimic what would be ffs_from(). 
*/ > +static inline unsigned long > +mask_from(unsigned long bitmap, int from) > +{ > + return bitmap & ~((1UL << from) - 1); > +} > + > +/* > + * The state computation relies on ER=0, IR=1, EB=2, IB=3 > + * First compute eligibility comparing grp->S, q->V, > + * then check if someone is blocking us and possibly add EB > + */ > +static inline unsigned int > +qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) > +{ > + /* if S > V we are not eligible */ > + unsigned int state = qfq_gt(grp->S, q->V); > + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); > + struct qfq_group *next; > + > + if (mask) { > + next = qfq_ffs(q, mask); > + if (qfq_gt(grp->F, next->F)) > + state |= EB; > + } > + > + return state; > +} > + > +/* > + * In principle > + * q->bitmaps[dst] |= q->bitmaps[src] & mask; > + * q->bitmaps[src] &= ~mask; > + * but we should make sure that src != dst > + */ > +static inline void > +qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) > +{ > + q->bitmaps[dst] |= q->bitmaps[src] & mask; > + q->bitmaps[src] &= ~mask; > +} > + > +static inline void > +qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) > +{ > + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); > + struct qfq_group *next; > + > + if (mask) { > + next = qfq_ffs(q, mask); > + if (!qfq_gt(next->F, old_finish)) > + return; > + } > + > + mask = (1UL << index) - 1; > + qfq_move_groups(q, mask, EB, ER); > + qfq_move_groups(q, mask, IB, IR); > +} > + > +/* > + * perhaps > + * > + old_V ^= q->V; > + old_V >>= QFQ_MIN_SLOT_SHIFT; > + if (old_V) { > + ... 
> + } > + * > + */ > +static inline void > +qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) > +{ > + unsigned long mask, vslot, old_vslot; > + > + vslot = q->V >> QFQ_MIN_SLOT_SHIFT; > + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; > + > + if (vslot != old_vslot) { > + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; > + qfq_move_groups(q, mask, IR, ER); > + qfq_move_groups(q, mask, IB, EB); > + } > +} > + > +/* > + * XXX we should make sure that slot becomes less than 32. > + * This is guaranteed by the input values. > + * roundedS is always cl->S rounded on grp->slot_shift bits. > + */ > +static inline void > +qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) > +{ > + uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; > + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; > + > + cl->next = grp->slots[i]; > + grp->slots[i] = cl; > + __set_bit(slot, &grp->full_slots); > +} > + > +/* > + * remove the entry from the slot > + */ > +static inline void > +qfq_front_slot_remove(struct qfq_group *grp) > +{ > + struct qfq_class **h = &grp->slots[grp->front]; > + > + *h = (*h)->next; > + if (!*h) > + __clear_bit(0, &grp->full_slots); > +} > + > +/* > + * Returns the first full queue in a group. As a side effect, > + * adjust the bucket list so the first non-empty bucket is at > + * position 0 in full_slots. > + */ > +static inline struct qfq_class * > +qfq_slot_scan(struct qfq_group *grp) > +{ > + int i; > + > + ND("grp %d full %x", grp->index, grp->full_slots); > + if (!grp->full_slots) > + return NULL; > + > + i = ffs(grp->full_slots) - 1; // zero-based > + if (i > 0) { > + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; > + grp->full_slots >>= i; > + } > + > + return grp->slots[grp->front]; > +} > + > +/* > + * adjust the bucket list. When the start time of a group decreases, > + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to > + * move the objects. 
The mask of occupied slots must be shifted > + * because we use ffs() to find the first non-empty slot. > + * This covers decreases in the group's start time, but what about > + * increases of the start time ? > + * Here too we should make sure that i is less than 32 > + */ > +static inline void > +qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) > +{ > + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; > + > + grp->full_slots <<= i; > + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; > +} > + > + > +static inline void > +qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) > +{ > + bitmap ineligible; > + > + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; > + if (ineligible) { > + if (!q->bitmaps[ER]) { > + struct qfq_group *grp; > + grp = qfq_ffs(q, ineligible); > + if (qfq_gt(grp->S, q->V)) > + q->V = grp->S; > + } > + qfq_make_eligible(q, old_V); > + } > +} > + > +/* > + * Updates the class, returns true if also the group needs to be updated. > + */ > +static inline int > +qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, > + struct qfq_class *cl) > +{ > + > + cl->S = cl->F; > + if (cl->_q.mq.head == NULL) { > + qfq_front_slot_remove(grp); > + } else { > + unsigned int len; > + uint64_t roundedS; > + > + len = cl->_q.mq.head->m_pkthdr.len; > + cl->F = cl->S + (uint64_t)len * cl->inv_w; > + roundedS = qfq_round_down(cl->S, grp->slot_shift); > + if (roundedS == grp->S) > + return 0; > + > + qfq_front_slot_remove(grp); > + qfq_slot_insert(grp, cl, roundedS); > + } > + return 1; > +} > + > +static struct mbuf * > +qfq_dequeue(struct dn_sch_inst *si) > +{ > + struct qfq_sched *q = (struct qfq_sched *)(si + 1); > + struct qfq_group *grp; > + struct qfq_class *cl; > + struct mbuf *m; > + uint64_t old_V; > + > + NO(q->loops++;) > + if (!q->bitmaps[ER]) { > + NO(if (q->queued) > + dump_sched(q, "start dequeue");) > + return NULL; > + } > + > + grp = qfq_ffs(q, q->bitmaps[ER]); > + > + cl = grp->slots[grp->front]; > + /* 
extract from the first bucket in the bucket list */ > + m = dn_dequeue(&cl->_q); > + > + if (!m) { > + D("BUG/* non-workconserving leaf */"); > + return NULL; > + } > + NO(q->queued--;) > + old_V = q->V; > + q->V += (uint64_t)m->m_pkthdr.len * IWSUM; > + ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); > + > + if (qfq_update_class(q, grp, cl)) { > + uint64_t old_F = grp->F; > + cl = qfq_slot_scan(grp); > + if (!cl) { /* group gone, remove from ER */ > + __clear_bit(grp->index, &q->bitmaps[ER]); > + // grp->S = grp->F + 1; // XXX debugging only > + } else { > + uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); > + unsigned int s; > + > + if (grp->S == roundedS) > + goto skip_unblock; > + grp->S = roundedS; > + grp->F = roundedS + (2ULL << grp->slot_shift); > + /* remove from ER and put in the new set */ > + __clear_bit(grp->index, &q->bitmaps[ER]); > + s = qfq_calc_state(q, grp); > + __set_bit(grp->index, &q->bitmaps[s]); > + } > + /* we need to unblock even if the group has gone away */ > + qfq_unblock_groups(q, grp->index, old_F); > + } > + > +skip_unblock: > + qfq_update_eligible(q, old_V); > + NO(if (!q->bitmaps[ER] && q->queued) > + dump_sched(q, "end dequeue");) > + > + return m; > +} > + > +/* > + * Assign a reasonable start time for a new flow k in group i. > + * Admissible values for \hat(F) are multiples of \sigma_i > + * no greater than V+\sigma_i . Larger values mean that > + * we had a wraparound so we consider the timestamp to be stale. > + * > + * If F is not stale and F >= V then we set S = F. > + * Otherwise we should assign S = V, but this may violate > + * the ordering in ER. So, if we have groups in ER, set S to > + * the F_j of the first group j which would be blocking us. > + * We are guaranteed not to move S backward because > + * otherwise our group i would still be blocked. 
> + */ > +static inline void > +qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) > +{ > + unsigned long mask; > + uint64_t limit, roundedF; > + int slot_shift = cl->grp->slot_shift; > + > + roundedF = qfq_round_down(cl->F, slot_shift); > + limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); /* 64-bit constant: slot_shift can exceed 31 */ > + > + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { > + /* timestamp was stale */ > + mask = mask_from(q->bitmaps[ER], cl->grp->index); > + if (mask) { > + struct qfq_group *next = qfq_ffs(q, mask); > + if (qfq_gt(roundedF, next->F)) { > + cl->S = next->F; > + return; > + } > + } > + cl->S = q->V; > + } else { /* timestamp is not stale */ > + cl->S = cl->F; > + } > +} > + > +static int > +qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) > +{ > + struct qfq_sched *q = (struct qfq_sched *)(si + 1); > + struct qfq_group *grp; > + struct qfq_class *cl = (struct qfq_class *)_q; > + uint64_t roundedS; > + int s; > + > + NO(q->loops++;) > + DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, > + _q, cl->inv_w, cl->grp->index); > + /* XXX verify that the packet obeys the parameters */ > + if (m != _q->mq.head) { > + if (dn_enqueue(_q, m, 0)) /* packet was dropped */ > + return 1; > + NO(q->queued++;) > + if (m != _q->mq.head) > + return 0; > + } > + /* If reach this point, queue q was idle */ > + grp = cl->grp; > + qfq_update_start(q, cl); /* adjust start time */ > + /* compute new finish time and rounded start. */ > + cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; > + roundedS = qfq_round_down(cl->S, grp->slot_shift); > + > + /* > + * insert cl in the correct bucket. > + * If cl->S >= grp->S we don't need to adjust the > + * bucket list and simply go to the insertion phase. > + * Otherwise grp->S is decreasing, we must make room > + * in the bucket list, and also recompute the group state. > + * Finally, if there were no flows in this group and nobody > + * was in ER make sure to adjust V.
> + */ > + if (grp->full_slots) { > + if (!qfq_gt(grp->S, cl->S)) > + goto skip_update; > + /* create a slot for this cl->S */ > + qfq_slot_rotate(q, grp, roundedS); > + /* group was surely ineligible, remove */ > + __clear_bit(grp->index, &q->bitmaps[IR]); > + __clear_bit(grp->index, &q->bitmaps[IB]); > + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) > + q->V = roundedS; > + > + grp->S = roundedS; > + grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i > + s = qfq_calc_state(q, grp); > + __set_bit(grp->index, &q->bitmaps[s]); > + ND("new state %d 0x%x", s, q->bitmaps[s]); > + ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); > +skip_update: > + qfq_slot_insert(grp, cl, roundedS); > + > + return 0; > +} > + > + > +#if 0 > +static inline void > +qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, > + struct qfq_class *cl, struct qfq_class **pprev) > +{ > + unsigned int i, offset; > + uint64_t roundedS; > + > + roundedS = qfq_round_down(cl->S, grp->slot_shift); > + offset = (roundedS - grp->S) >> grp->slot_shift; > + i = (grp->front + offset) % QFQ_MAX_SLOTS; > + > +#ifdef notyet > + if (!pprev) { > + pprev = &grp->slots[i]; > + while (*pprev && *pprev != cl) > + pprev = &(*pprev)->next; > + } > +#endif > + > + *pprev = cl->next; > + if (!grp->slots[i]) > + __clear_bit(offset, &grp->full_slots); > +} > + > +/* > + * called to forcibly destroy a queue. > + * If the queue is not in the front bucket, or if it has > + * other queues in the front bucket, we can simply remove > + * the queue with no other side effects. > + * Otherwise we must propagate the event up. > + * XXX description to be completed. > + */ > +static void > +qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, > + struct qfq_class **pprev) > +{ > + struct qfq_group *grp = &q->groups[cl->index]; > + unsigned long mask; > + uint64_t roundedS; > + int s; > + > + cl->F = cl->S; // not needed if the class goes away. 
> + qfq_slot_remove(q, grp, cl, pprev); > + > + if (!grp->full_slots) { > + /* nothing left in the group, remove from all sets. > + * Do ER last because if we were blocking other groups > + * we must unblock them. > + */ > + __clear_bit(grp->index, &q->bitmaps[IR]); > + __clear_bit(grp->index, &q->bitmaps[EB]); > + __clear_bit(grp->index, &q->bitmaps[IB]); > + > + if (test_bit(grp->index, &q->bitmaps[ER]) && > + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { > + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); > + if (mask) > + mask = ~((1UL << __fls(mask)) - 1); > + else > + mask = ~0UL; > + qfq_move_groups(q, mask, EB, ER); > + qfq_move_groups(q, mask, IB, IR); > + } > + __clear_bit(grp->index, &q->bitmaps[ER]); > + } else if (!grp->slots[grp->front]) { > + cl = qfq_slot_scan(grp); > + roundedS = qfq_round_down(cl->S, grp->slot_shift); > + if (grp->S != roundedS) { > + __clear_bit(grp->index, &q->bitmaps[ER]); > + __clear_bit(grp->index, &q->bitmaps[IR]); > + __clear_bit(grp->index, &q->bitmaps[EB]); > + __clear_bit(grp->index, &q->bitmaps[IB]); > + grp->S = roundedS; > + grp->F = roundedS + (2ULL << grp->slot_shift); > + s = qfq_calc_state(q, grp); > + __set_bit(grp->index, &q->bitmaps[s]); > + } > + } > + qfq_update_eligible(q, q->V); > +} > +#endif > + > +static int > +qfq_new_fsk(struct dn_fsk *f) > +{ > + ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); > + ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); > + ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); > + return 0; > +} > + > +/* > + * initialize a new scheduler instance > + */ > +static int > +qfq_new_sched(struct dn_sch_inst *si) > +{ > + struct qfq_sched *q = (struct qfq_sched *)(si + 1); > + struct qfq_group *grp; > + int i; > + > + for (i = 0; i <= QFQ_MAX_INDEX; i++) { > + grp = &q->groups[i]; > + grp->index = i; > + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - > + (QFQ_MAX_INDEX - i); > + } > + return 0; > +} > + > +/* > + * QFQ scheduler descriptor > + */ 
> +static struct dn_alg qfq_desc = { > + _SI( .type = ) DN_SCHED_QFQ, > + _SI( .name = ) "QFQ", > + _SI( .flags = ) DN_MULTIQUEUE, > + > + _SI( .schk_datalen = ) 0, > + _SI( .si_datalen = ) sizeof(struct qfq_sched), > + _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), > + > + _SI( .enqueue = ) qfq_enqueue, > + _SI( .dequeue = ) qfq_dequeue, > + > + _SI( .config = ) NULL, > + _SI( .destroy = ) NULL, > + _SI( .new_sched = ) qfq_new_sched, > + _SI( .free_sched = ) NULL, > + _SI( .new_fsk = ) qfq_new_fsk, > + _SI( .free_fsk = ) NULL, > + _SI( .new_queue = ) qfq_new_queue, > + _SI( .free_queue = ) qfq_free_queue, > +}; > + > +DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); > + > +#ifdef QFQ_DEBUG > +static void > +dump_groups(struct qfq_sched *q, uint32_t mask) > +{ > + int i, j; > + > + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { > + struct qfq_group *g = &q->groups[i]; > + > + if (0 == (mask & (1<<i))) > + continue; > + for (j = 0; j < QFQ_MAX_SLOTS; j++) { > + if (g->slots[j]) > + D(" bucket %d %p", j, g->slots[j]); > + } > + D("full_slots 0x%x", g->full_slots); > + D(" %2d S 0x%20llx F 0x%llx %c", i, > + g->S, g->F, > + mask & (1<<i) ? 
'1' : '0'); > + } > +} > + > +static void > +dump_sched(struct qfq_sched *q, const char *msg) > +{ > + D("--- in %s: ---", msg); > + ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V); > + D(" ER 0x%08x", q->bitmaps[ER]); > + D(" EB 0x%08x", q->bitmaps[EB]); > + D(" IR 0x%08x", q->bitmaps[IR]); > + D(" IB 0x%08x", q->bitmaps[IB]); > + dump_groups(q, 0xffffffff); > +}; > +#endif /* QFQ_DEBUG */ > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c > new file mode 100644 > index 0000000..28edb29 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c > @@ -0,0 +1,307 @@ > +/* > + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_rr.c 240494 2012-09-14 11:51:49Z glebius $ > + */ > + > +#ifdef _KERNEL > +#include <sys/malloc.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/kernel.h> > +#include <sys/mbuf.h> > +#include <sys/module.h> > +#include <net/if.h> /* IFNAMSIZ */ > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* ipfw_rule_ref */ > +#include <netinet/ip_fw.h> /* flow_id */ > +#include <netinet/ip_dummynet.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > +#else > +#include <dn_test.h> > +#endif > + > +#define DN_SCHED_RR 3 // XXX Where? 
> + > +struct rr_queue { > + struct dn_queue q; /* Standard queue */ > + int status; /* 1: queue is in the list */ > + int credit; /* Number of bytes to transmit */ > + int quantum; /* quantum * C */ > + struct rr_queue *qnext; /* */ > +}; > + > +/* struct rr_schk contains global config parameters > + * and is right after dn_schk > + */ > +struct rr_schk { > + int min_q; /* Min quantum */ > + int max_q; /* Max quantum */ > + int q_bytes; /* Bytes per quantum */ > +}; > + > +/* per-instance round robin list, right after dn_sch_inst */ > +struct rr_si { > + struct rr_queue *head, *tail; /* Pointer to current queue */ > +}; > + > +/* Append a queue to the rr list */ > +static inline void > +rr_append(struct rr_queue *q, struct rr_si *si) > +{ > + q->status = 1; /* mark as in-rr_list */ > + q->credit = q->quantum; /* initialize credit */ > + > + /* append to the tail */ > + if (si->head == NULL) > + si->head = q; > + else > + si->tail->qnext = q; > + si->tail = q; /* advance the tail pointer */ > + q->qnext = si->head; /* make it circular */ > +} > + > +/* Remove the head queue from circular list. */ > +static inline void > +rr_remove_head(struct rr_si *si) > +{ > + if (si->head == NULL) > + return; /* empty queue */ > + si->head->status = 0; > + > + if (si->head == si->tail) { > + si->head = si->tail = NULL; > + return; > + } > + > + si->head = si->head->qnext; > + si->tail->qnext = si->head; > +} > + > +/* Remove a queue from circular list. 
> + * XXX see if ti can be merge with remove_queue() > + */ > +static inline void > +remove_queue_q(struct rr_queue *q, struct rr_si *si) > +{ > + struct rr_queue *prev; > + > + if (q->status != 1) > + return; > + if (q == si->head) { > + rr_remove_head(si); > + return; > + } > + > + for (prev = si->head; prev; prev = prev->qnext) { > + if (prev->qnext != q) > + continue; > + prev->qnext = q->qnext; > + if (q == si->tail) > + si->tail = prev; > + q->status = 0; > + break; > + } > +} > + > + > +static inline void > +next_pointer(struct rr_si *si) > +{ > + if (si->head == NULL) > + return; /* empty queue */ > + > + si->head = si->head->qnext; > + si->tail = si->tail->qnext; > +} > + > +static int > +rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) > +{ > + struct rr_si *si; > + struct rr_queue *rrq; > + > + if (m != q->mq.head) { > + if (dn_enqueue(q, m, 0)) /* packet was dropped */ > + return 1; > + if (m != q->mq.head) > + return 0; > + } > + > + /* If reach this point, queue q was idle */ > + si = (struct rr_si *)(_si + 1); > + rrq = (struct rr_queue *)q; > + > + if (rrq->status == 1) /* Queue is already in the queue list */ > + return 0; > + > + /* Insert the queue in the queue list */ > + rr_append(rrq, si); > + > + return 0; > +} > + > +static struct mbuf * > +rr_dequeue(struct dn_sch_inst *_si) > +{ > + /* Access scheduler instance private data */ > + struct rr_si *si = (struct rr_si *)(_si + 1); > + struct rr_queue *rrq; > + uint64_t len; > + > + while ( (rrq = si->head) ) { > + struct mbuf *m = rrq->q.mq.head; > + if ( m == NULL) { > + /* empty queue, remove from list */ > + rr_remove_head(si); > + continue; > + } > + len = m->m_pkthdr.len; > + > + if (len > rrq->credit) { > + /* Packet too big */ > + rrq->credit += rrq->quantum; > + /* Try next queue */ > + next_pointer(si); > + } else { > + rrq->credit -= len; > + return dn_dequeue(&rrq->q); > + } > + } > + > + /* no packet to dequeue*/ > + return NULL; > +} > + > +static int > 
+rr_config(struct dn_schk *_schk) > +{ > + struct rr_schk *schk = (struct rr_schk *)(_schk + 1); > + ND("called"); > + > + /* use reasonable quantums (64..2k bytes, default 1500) */ > + schk->min_q = 64; > + schk->max_q = 2048; > + schk->q_bytes = 1500; /* quantum */ > + > + return 0; > +} > + > +static int > +rr_new_sched(struct dn_sch_inst *_si) > +{ > + struct rr_si *si = (struct rr_si *)(_si + 1); > + > + ND("called"); > + si->head = si->tail = NULL; > + > + return 0; > +} > + > +static int > +rr_free_sched(struct dn_sch_inst *_si) > +{ > + ND("called"); > + /* Nothing to do? */ > + return 0; > +} > + > +static int > +rr_new_fsk(struct dn_fsk *fs) > +{ > + struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); > + /* par[0] is the weight, par[1] is the quantum step */ > + ipdn_bound_var(&fs->fs.par[0], 1, > + 1, 65536, "RR weight"); > + ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, > + schk->min_q, schk->max_q, "RR quantum"); > + return 0; > +} > + > +static int > +rr_new_queue(struct dn_queue *_q) > +{ > + struct rr_queue *q = (struct rr_queue *)_q; > + > + _q->ni.oid.subtype = DN_SCHED_RR; > + > + q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; > + ND("called, q->quantum %d", q->quantum); > + q->credit = q->quantum; > + q->status = 0; > + > + if (_q->mq.head != NULL) { > + /* Queue NOT empty, insert in the queue list */ > + rr_append(q, (struct rr_si *)(_q->_si + 1)); > + } > + return 0; > +} > + > +static int > +rr_free_queue(struct dn_queue *_q) > +{ > + struct rr_queue *q = (struct rr_queue *)_q; > + > + ND("called"); > + if (q->status == 1) { > + struct rr_si *si = (struct rr_si *)(_q->_si + 1); > + remove_queue_q(q, si); > + } > + return 0; > +} > + > +/* > + * RR scheduler descriptor > + * contains the type of the scheduler, the name, the size of the > + * structures and function pointers. 
> + */ > +static struct dn_alg rr_desc = { > + _SI( .type = ) DN_SCHED_RR, > + _SI( .name = ) "RR", > + _SI( .flags = ) DN_MULTIQUEUE, > + > + _SI( .schk_datalen = ) 0, > + _SI( .si_datalen = ) sizeof(struct rr_si), > + _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), > + > + _SI( .enqueue = ) rr_enqueue, > + _SI( .dequeue = ) rr_dequeue, > + > + _SI( .config = ) rr_config, > + _SI( .destroy = ) NULL, > + _SI( .new_sched = ) rr_new_sched, > + _SI( .free_sched = ) rr_free_sched, > + _SI( .new_fsk = ) rr_new_fsk, > + _SI( .free_fsk = ) NULL, > + _SI( .new_queue = ) rr_new_queue, > + _SI( .free_queue = ) rr_free_queue, > +}; > + > + > +DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); > diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c > new file mode 100644 > index 0000000..c07f4c7 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c > @@ -0,0 +1,373 @@ > +/* > + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa > + * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
/*
 * MAX64(x, y): the later of two 64-bit timestamps, decided on the sign of
 * the 64-bit difference (y - x) so that the result stays correct when the
 * counters wrap around.  Both arguments are evaluated more than once and
 * must therefore be free of side effects.  The whole expansion is
 * parenthesized so that the macro can be used inside larger expressions.
 */
#ifndef MAX64
#define MAX64(x,y)	((((int64_t)((y) - (x))) > 0) ? (y) : (x))
#endif
We want FRAC_BITS >> WMAX_BITS to avoid too large > + * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w > + * using an unsigned 32-bit division, and to avoid wraparounds we need > + * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 > + * As an example > + * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 > + */ > +#ifndef FRAC_BITS > +#define FRAC_BITS 28 /* shift for fixed point arithmetic */ > +#define ONE_FP (1UL << FRAC_BITS) > +#endif > + > +/* > + * Private information for the scheduler instance: > + * sch_heap (key is Finish time) returns the next queue to serve > + * ne_heap (key is Start time) stores not-eligible queues > + * idle_heap (key=start/finish time) stores idle flows. It must > + * support extract-from-middle. > + * A flow is only in 1 of the three heaps. > + * XXX todo: use a more efficient data structure, e.g. a tree sorted > + * by F with min_subtree(S) in each node > + */ > +struct wf2qp_si { > + struct dn_heap sch_heap; /* top extract - key Finish time */ > + struct dn_heap ne_heap; /* top extract - key Start time */ > + struct dn_heap idle_heap; /* random extract - key Start=Finish time */ > + uint64_t V; /* virtual time */ > + uint32_t inv_wsum; /* inverse of sum of weights */ > + uint32_t wsum; /* sum of weights */ > +}; > + > +struct wf2qp_queue { > + struct dn_queue _q; > + uint64_t S, F; /* start time, finish time */ > + uint32_t inv_w; /* ONE_FP / weight */ > + int32_t heap_pos; /* position (index) of struct in heap */ > +}; > + > +/* > + * This file implements a WF2Q+ scheduler as it has been in dummynet > + * since 2000. > + * The scheduler supports per-flow queues and has O(log N) complexity. > + * > + * WF2Q+ needs to drain entries from the idle heap so that we > + * can keep the sum of weights up to date. We can do it whenever > + * we get a chance, or periodically, or following some other > + * strategy. The function idle_check() drains at most N elements > + * from the idle heap. 
/*
 * Drain entries from the idle heap of si.
 *
 * At most n entries are removed.  Without force only entries whose key
 * is before the virtual time si->V are taken; with force every entry
 * qualifies.  Each removed flow gets its timestamps marked invalid
 * (S = F + 1) and its weight subtracted from the sum of weights; the
 * inverse sum is refreshed while the sum stays positive.
 */
static void
idle_check(struct wf2qp_si *si, int n, int force)
{
	struct dn_heap *h = &si->idle_heap;
	while (n-- > 0 && h->elements > 0 &&
			(force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
		struct dn_queue *q = HEAP_TOP(h)->object;
		struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;

		heap_extract(h, NULL);
		/* XXX to let the flowset delete the queue we should
		 * mark it as 'unused' by the scheduler.
		 */
		alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
		si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
		if (si->wsum > 0)
			si->inv_wsum = ONE_FP/si->wsum;
	}
}

/*
 * Enqueue packet m on flow q.
 *
 * Returns 1 when dn_enqueue() reports that the packet was dropped and 0
 * otherwise.  When the flow was idle its start and finish times are
 * recomputed and the flow is stored in the not-eligible heap (S > V) or
 * in the scheduler heap.
 */
static int
wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
	struct dn_fsk *fs = q->fs;
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct wf2qp_queue *alg_fq;
	uint64_t len = m->m_pkthdr.len;

	if (m != q->mq.head) {
		if (dn_enqueue(q, m, 0)) /* packet was dropped */
			return 1;
		if (m != q->mq.head)	/* queue was already busy */
			return 0;
	}

	/* If we reach this point, queue q was idle */
	alg_fq = (struct wf2qp_queue *)q;

	if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
		/* F<S means timestamps are invalid ->brand new queue. */
		alg_fq->S = si->V;		/* init start time */
		si->wsum += fs->fs.par[0];	/* add weight of new queue. */
		si->inv_wsum = ONE_FP/si->wsum;
	} else { /* if it was idle then it was in the idle heap */
		heap_extract(&si->idle_heap, q);
		alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
	}
	alg_fq->F = alg_fq->S + len * alg_fq->inv_w;

	/* if nothing is backlogged, make sure this flow is eligible */
	if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
		si->V = MAX64(alg_fq->S, si->V);

	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the sch_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * ne_heap. Otherwise, we store in the sch_heap.
	 * Note that for all flows in sch_heap (SCH), S_i <= V,
	 * and for all flows in ne_heap (NEH), S_i > V.
	 * So when we need to compute max(V, min(S_i)) forall i in
	 * SCH+NEH, we only need to look into NEH.
	 */
	if (DN_KEY_LT(si->V, alg_fq->S)) {
		/* S>V means flow Not eligible. */
		if (si->sch_heap.elements == 0)
			D("++ ouch! not eligible but empty scheduler!");
		heap_insert(&si->ne_heap, alg_fq->S, q);
	} else {
		heap_insert(&si->sch_heap, alg_fq->F, q);
	}
	return 0;
}

/*
 * Return the next packet to transmit, or NULL when no flow is
 * backlogged (in which case the idle heap is drained and V and the sum
 * of weights are reset).
 *
 * XXX invariant: sch > 0 || V >= min(S in neh)
 */
static struct mbuf *
wf2qp_dequeue(struct dn_sch_inst *_si)
{
	/* Access scheduler instance private data */
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct mbuf *m;
	struct dn_queue *q;
	struct dn_heap *sch = &si->sch_heap;
	struct dn_heap *neh = &si->ne_heap;
	struct wf2qp_queue *alg_fq;

	if (sch->elements == 0 && neh->elements == 0) {
		/* we have nothing to do. We could kill the idle heap
		 * altogether and reset V
		 */
		idle_check(si, 0x7fffffff, 1);
		si->V = 0;
		si->wsum = 0;	/* should be set already */
		return NULL;	/* quick return if nothing to do */
	}
	idle_check(si, 1, 0);	/* drain something from the idle heap */

	/* make sure at least one element is eligible, bumping V
	 * and moving entries that have become eligible.
	 * We need to repeat the first part twice, before and
	 * after extracting the candidate, or enqueue() will
	 * find the data structure in a wrong state.
	 */
	m = NULL;
	for(;;) {
		/*
		 * Compute V = max(V, min(S_i)). Remember that all elements
		 * in sch have by definition S_i <= V so if sch is not empty,
		 * V is surely the max and we must not update it. Conversely,
		 * if sch is empty we only need to look at neh.
		 * We don't need to move the queues, as it will be done at the
		 * next enqueue
		 */
		if (sch->elements == 0 && neh->elements > 0) {
			si->V = MAX64(si->V, HEAP_TOP(neh)->key);
		}
		/* move every flow that has become eligible from neh to sch */
		while (neh->elements > 0 &&
		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
			q = HEAP_TOP(neh)->object;
			alg_fq = (struct wf2qp_queue *)q;
			heap_extract(neh, NULL);
			heap_insert(sch, alg_fq->F, q);
		}
		if (m)	/* pkt found in previous iteration */
			break;
		/* ok we have at least one eligible pkt */
		q = HEAP_TOP(sch)->object;
		alg_fq = (struct wf2qp_queue *)q;
		m = dn_dequeue(q);
		heap_extract(sch, NULL);	/* Remove queue from heap. */
		si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
		alg_fq->S = alg_fq->F;	/* Update start time. */
		if (q->mq.head == 0) {	/* not backlogged any more. */
			heap_insert(&si->idle_heap, alg_fq->F, q);
		} else {			/* Still backlogged. */
			/* Update F, store in neh or sch */
			uint64_t len = q->mq.head->m_pkthdr.len;
			alg_fq->F += len * alg_fq->inv_w;
			if (DN_KEY_LEQ(alg_fq->S, si->V)) {
				heap_insert(sch, alg_fq->F, q);
			} else {
				heap_insert(neh, alg_fq->S, q);
			}
		}
	}
	return m;
}

/*
 * Instance constructor: set up the three heaps, each keeping the heap
 * position in wf2qp_queue.heap_pos.  Returns ENOMEM (after releasing
 * all three heaps) when any heap_init() call fails, 0 otherwise.
 */
static int
wf2qp_new_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	int ofs = offsetof(struct wf2qp_queue, heap_pos);

	/* all heaps support extract from middle */
	if (heap_init(&si->idle_heap, 16, ofs) ||
	    heap_init(&si->sch_heap, 16, ofs) ||
	    heap_init(&si->ne_heap, 16, ofs)) {
		heap_free(&si->ne_heap);
		heap_free(&si->sch_heap);
		heap_free(&si->idle_heap);
		return ENOMEM;
	}
	return 0;
}

/* Instance destructor: release the three heaps. */
static int
wf2qp_free_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);

	heap_free(&si->sch_heap);
	heap_free(&si->ne_heap);
	heap_free(&si->idle_heap);

	return 0;
}

/* Flowset hook: pass the weight (par[0]) through ipdn_bound_var(). */
static int
wf2qp_new_fsk(struct dn_fsk *fs)
{
	ipdn_bound_var(&fs->fs.par[0], 1,
		1, 100, "WF2Q+ weight");
	return 0;
}

/*
 * Queue constructor: mark the timestamps invalid (S = F + 1), compute
 * the inverse weight, and schedule the queue immediately when it
 * already holds packets.
 */
static int
wf2qp_new_queue(struct dn_queue *_q)
{
	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;

	_q->ni.oid.subtype = DN_SCHED_WF2QP;
	q->F = 0;	/* not strictly necessary */
	q->S = q->F + 1;	/* mark timestamp as invalid. */
	q->inv_w = ONE_FP / _q->fs->fs.par[0];
	if (_q->mq.head != NULL) {
		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
	}
	return 0;
}
> + */ > +static int > +wf2qp_free_queue(struct dn_queue *q) > +{ > + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; > + struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); > + > + if (alg_fq->S >= alg_fq->F + 1) > + return 0; /* nothing to do, not in any heap */ > + si->wsum -= q->fs->fs.par[0]; > + if (si->wsum > 0) > + si->inv_wsum = ONE_FP/si->wsum; > + > + /* extract from the heap. XXX TODO we may need to adjust V > + * to make sure the invariants hold. > + */ > + if (q->mq.head == NULL) { > + heap_extract(&si->idle_heap, q); > + } else if (DN_KEY_LT(si->V, alg_fq->S)) { > + heap_extract(&si->ne_heap, q); > + } else { > + heap_extract(&si->sch_heap, q); > + } > + return 0; > +} > + > +/* > + * WF2Q+ scheduler descriptor > + * contains the type of the scheduler, the name, the size of the > + * structures and function pointers. > + */ > +static struct dn_alg wf2qp_desc = { > + _SI( .type = ) DN_SCHED_WF2QP, > + _SI( .name = ) "WF2Q+", > + _SI( .flags = ) DN_MULTIQUEUE, > + > + /* we need extra space in the si and the queue */ > + _SI( .schk_datalen = ) 0, > + _SI( .si_datalen = ) sizeof(struct wf2qp_si), > + _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - > + sizeof(struct dn_queue), > + > + _SI( .enqueue = ) wf2qp_enqueue, > + _SI( .dequeue = ) wf2qp_dequeue, > + > + _SI( .config = ) NULL, > + _SI( .destroy = ) NULL, > + _SI( .new_sched = ) wf2qp_new_sched, > + _SI( .free_sched = ) wf2qp_free_sched, > + > + _SI( .new_fsk = ) wf2qp_new_fsk, > + _SI( .free_fsk = ) NULL, > + > + _SI( .new_queue = ) wf2qp_new_queue, > + _SI( .free_queue = ) wf2qp_free_queue, > +}; > + > + > +DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c b/example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c > new file mode 100644 > index 0000000..753331f > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c > @@ -0,0 +1,846 @@ > +/*- > + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa > + * All rights reserved > + 
* > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + */ > + > +/* > + * $FreeBSD: head/sys/netpfil/ipfw/ip_dn_glue.c 266955 2014-06-01 20:19:17Z hiren $ > + * > + * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 > + */ > + > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/module.h> > +#include <sys/priv.h> > +#include <sys/proc.h> > +#include <sys/rwlock.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/time.h> > +#include <sys/taskqueue.h> > +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ > +#include <netinet/ip_fw.h> > +#include <netinet/ip_dummynet.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > + > +/* FREEBSD7.2 ip_dummynet.h r191715*/ > + > +struct dn_heap_entry7 { > + int64_t key; /* sorting key. Topmost element is smallest one */ > + void *object; /* object pointer */ > +}; > + > +struct dn_heap7 { > + int size; > + int elements; > + int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ > + struct dn_heap_entry7 *p; /* really an array of "size" entries */ > +}; > + > +/* Common to 7.2 and 8 */ > +struct dn_flow_set { > + SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ > + > + u_short fs_nr ; /* flow_set number */ > + u_short flags_fs; > +#define DNOLD_HAVE_FLOW_MASK 0x0001 > +#define DNOLD_IS_RED 0x0002 > +#define DNOLD_IS_GENTLE_RED 0x0004 > +#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ > +#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ > +#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. 
*/ > +#define DNOLD_IS_PIPE 0x4000 > +#define DNOLD_IS_QUEUE 0x8000 > + > + struct dn_pipe7 *pipe ; /* pointer to parent pipe */ > + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ > + > + int weight ; /* WFQ queue weight */ > + int qsize ; /* queue size in slots or bytes */ > + int plr ; /* pkt loss rate (2^31-1 means 100%) */ > + > + struct ipfw_flow_id flow_mask ; > + > + /* hash table of queues onto this flow_set */ > + int rq_size ; /* number of slots */ > + int rq_elements ; /* active elements */ > + struct dn_flow_queue7 **rq; /* array of rq_size entries */ > + > + u_int32_t last_expired ; /* do not expire too frequently */ > + int backlogged ; /* #active queues for this flowset */ > + > + /* RED parameters */ > +#define SCALE_RED 16 > +#define SCALE(x) ( (x) << SCALE_RED ) > +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) > +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) > + int w_q ; /* queue weight (scaled) */ > + int max_th ; /* maximum threshold for queue (scaled) */ > + int min_th ; /* minimum threshold for queue (scaled) */ > + int max_p ; /* maximum value for p_b (scaled) */ > + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ > + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ > + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ > + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ > + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ > + u_int lookup_depth ; /* depth of lookup table */ > + int lookup_step ; /* granularity inside the lookup table */ > + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ > + int avg_pkt_size ; /* medium packet size */ > + int max_pkt_size ; /* max packet size */ > +}; > +SLIST_HEAD(dn_flow_set_head, dn_flow_set); > + > +#define DN_IS_PIPE 0x4000 > +#define DN_IS_QUEUE 0x8000 > +struct dn_flow_queue7 { > + struct dn_flow_queue7 *next ; > + struct ipfw_flow_id id ; > + > + struct mbuf *head, *tail ; /* queue of packets */ > + u_int len ; > + u_int len_bytes ; > + > + 
u_long numbytes; > + > + u_int64_t tot_pkts ; /* statistics counters */ > + u_int64_t tot_bytes ; > + u_int32_t drops ; > + > + int hash_slot ; /* debugging/diagnostic */ > + > + /* RED parameters */ > + int avg ; /* average queue length est. (scaled) */ > + int count ; /* arrivals since last RED drop */ > + int random ; /* random value (scaled) */ > + u_int32_t q_time; /* start of queue idle time */ > + > + /* WF2Q+ support */ > + struct dn_flow_set *fs ; /* parent flow set */ > + int heap_pos ; /* position (index) of struct in heap */ > + int64_t sched_time ; /* current time when queue enters ready_heap */ > + > + int64_t S,F ; /* start time, finish time */ > +}; > + > +struct dn_pipe7 { /* a pipe */ > + SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ > + > + int pipe_nr ; /* number */ > + int bandwidth; /* really, bytes/tick. */ > + int delay ; /* really, ticks */ > + > + struct mbuf *head, *tail ; /* packets in delay line */ > + > + /* WF2Q+ */ > + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ > + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ > + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ > + > + int64_t V ; /* virtual time */ > + int sum; /* sum of weights of all active sessions */ > + > + int numbytes; > + > + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ > + > + /* > + * When the tx clock come from an interface (if_name[0] != '\0'), its name > + * is stored below, whereas the ifp is filled when the rule is configured. 
> + */ > + char if_name[IFNAMSIZ]; > + struct ifnet *ifp ; > + int ready ; /* set if ifp != NULL and we got a signal from it */ > + > + struct dn_flow_set fs ; /* used with fixed-rate flows */ > +}; > +SLIST_HEAD(dn_pipe_head7, dn_pipe7); > + > + > +/* FREEBSD8 ip_dummynet.h r196045 */ > +struct dn_flow_queue8 { > + struct dn_flow_queue8 *next ; > + struct ipfw_flow_id id ; > + > + struct mbuf *head, *tail ; /* queue of packets */ > + u_int len ; > + u_int len_bytes ; > + > + uint64_t numbytes ; /* credit for transmission (dynamic queues) */ > + int64_t extra_bits; /* extra bits simulating unavailable channel */ > + > + u_int64_t tot_pkts ; /* statistics counters */ > + u_int64_t tot_bytes ; > + u_int32_t drops ; > + > + int hash_slot ; /* debugging/diagnostic */ > + > + /* RED parameters */ > + int avg ; /* average queue length est. (scaled) */ > + int count ; /* arrivals since last RED drop */ > + int random ; /* random value (scaled) */ > + int64_t idle_time; /* start of queue idle time */ > + > + /* WF2Q+ support */ > + struct dn_flow_set *fs ; /* parent flow set */ > + int heap_pos ; /* position (index) of struct in heap */ > + int64_t sched_time ; /* current time when queue enters ready_heap */ > + > + int64_t S,F ; /* start time, finish time */ > +}; > + > +struct dn_pipe8 { /* a pipe */ > + SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ > + > + int pipe_nr ; /* number */ > + int bandwidth; /* really, bytes/tick. 
*/ > + int delay ; /* really, ticks */ > + > + struct mbuf *head, *tail ; /* packets in delay line */ > + > + /* WF2Q+ */ > + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ > + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ > + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ > + > + int64_t V ; /* virtual time */ > + int sum; /* sum of weights of all active sessions */ > + > + /* Same as in dn_flow_queue, numbytes can become large */ > + int64_t numbytes; /* bits I can transmit (more or less). */ > + uint64_t burst; /* burst size, scaled: bits * hz */ > + > + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ > + int64_t idle_time; /* start of pipe idle time */ > + > + char if_name[IFNAMSIZ]; > + struct ifnet *ifp ; > + int ready ; /* set if ifp != NULL and we got a signal from it */ > + > + struct dn_flow_set fs ; /* used with fixed-rate flows */ > + > + /* fields to simulate a delay profile */ > +#define ED_MAX_NAME_LEN 32 > + char name[ED_MAX_NAME_LEN]; > + int loss_level; > + int samples_no; > + int *samples; > +}; > + > +#define ED_MAX_SAMPLES_NO 1024 > +struct dn_pipe_max8 { > + struct dn_pipe8 pipe; > + int samples[ED_MAX_SAMPLES_NO]; > +}; > +SLIST_HEAD(dn_pipe_head8, dn_pipe8); > + > +/* > + * Changes from 7.2 to 8: > + * dn_pipe: > + * numbytes from int to int64_t > + * add burst (int64_t) > + * add idle_time (int64_t) > + * add profile > + * add struct dn_pipe_max > + * add flag DN_HAS_PROFILE > + * > + * dn_flow_queue > + * numbytes from u_long to int64_t > + * add extra_bits (int64_t) > + * q_time from u_int32_t to int64_t and name idle_time > + * > + * dn_flow_set unchanged > + * > + */ > + > +/* NOTE:XXX copied from dummynet.c */ > +#define O_NEXT(p, len) ((void *)((char *)p + len)) > +static void > +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) > +{ > + oid->len = len; > + oid->type = type; > + oid->subtype = 0; > + oid->id = id; > +} > +/* make room in 
the buffer and move the pointer forward */ > +static void * > +o_next(struct dn_id **o, int len, int type) > +{ > + struct dn_id *ret = *o; > + oid_fill(ret, len, type, 0); > + *o = O_NEXT(*o, len); > + return ret; > +} > + > + > +static size_t pipesize7 = sizeof(struct dn_pipe7); > +static size_t pipesize8 = sizeof(struct dn_pipe8); > +static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); > + > +/* Indicate 'ipfw' version > + * 1: from FreeBSD 7.2 > + * 0: from FreeBSD 8 > + * -1: unknown (for now is unused) > + * > + * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives > + * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, > + * it is suppose to be the FreeBSD 8 version. > + */ > +static int is7 = 0; > + > +static int > +convertflags2new(int src) > +{ > + int dst = 0; > + > + if (src & DNOLD_HAVE_FLOW_MASK) > + dst |= DN_HAVE_MASK; > + if (src & DNOLD_QSIZE_IS_BYTES) > + dst |= DN_QSIZE_BYTES; > + if (src & DNOLD_NOERROR) > + dst |= DN_NOERROR; > + if (src & DNOLD_IS_RED) > + dst |= DN_IS_RED; > + if (src & DNOLD_IS_GENTLE_RED) > + dst |= DN_IS_GENTLE_RED; > + if (src & DNOLD_HAS_PROFILE) > + dst |= DN_HAS_PROFILE; > + > + return dst; > +} > + > +static int > +convertflags2old(int src) > +{ > + int dst = 0; > + > + if (src & DN_HAVE_MASK) > + dst |= DNOLD_HAVE_FLOW_MASK; > + if (src & DN_IS_RED) > + dst |= DNOLD_IS_RED; > + if (src & DN_IS_GENTLE_RED) > + dst |= DNOLD_IS_GENTLE_RED; > + if (src & DN_NOERROR) > + dst |= DNOLD_NOERROR; > + if (src & DN_HAS_PROFILE) > + dst |= DNOLD_HAS_PROFILE; > + if (src & DN_QSIZE_BYTES) > + dst |= DNOLD_QSIZE_IS_BYTES; > + > + return dst; > +} > + > +static int > +dn_compat_del(void *v) > +{ > + struct dn_pipe7 *p = (struct dn_pipe7 *) v; > + struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; > + struct { > + struct dn_id oid; > + uintptr_t a[1]; /* add more if we want a list */ > + } cmd; > + > + /* XXX DN_API_VERSION ??? 
*/ > + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); > + > + if (is7) { > + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) > + return EINVAL; > + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) > + return EINVAL; > + } else { > + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) > + return EINVAL; > + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) > + return EINVAL; > + } > + > + if (p->pipe_nr != 0) { /* pipe x delete */ > + cmd.a[0] = p->pipe_nr; > + cmd.oid.subtype = DN_LINK; > + } else { /* queue x delete */ > + cmd.oid.subtype = DN_FS; > + cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; > + } > + > + return do_config(&cmd, cmd.oid.len); > +} > + > +static int > +dn_compat_config_queue(struct dn_fs *fs, void* v) > +{ > + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; > + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; > + struct dn_flow_set *f; > + > + if (is7) > + f = &p7->fs; > + else > + f = &p8->fs; > + > + fs->fs_nr = f->fs_nr; > + fs->sched_nr = f->parent_nr; > + fs->flow_mask = f->flow_mask; > + fs->buckets = f->rq_size; > + fs->qsize = f->qsize; > + fs->plr = f->plr; > + fs->par[0] = f->weight; > + fs->flags = convertflags2new(f->flags_fs); > + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { > + fs->w_q = f->w_q; > + fs->max_th = f->max_th; > + fs->min_th = f->min_th; > + fs->max_p = f->max_p; > + } > + > + return 0; > +} > + > +static int > +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, > + struct dn_fs *fs, void* v) > +{ > + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; > + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; > + int i = p7->pipe_nr; > + > + sch->sched_nr = i; > + sch->oid.subtype = 0; > + p->link_nr = i; > + fs->fs_nr = i + 2*DN_MAX_ID; > + fs->sched_nr = i + DN_MAX_ID; > + > + /* Common to 7 and 8 */ > + p->bandwidth = p7->bandwidth; > + p->delay = p7->delay; > + if (!is7) { > + /* FreeBSD 8 has burst */ > + p->burst = p8->burst; > + } > + > + /* fill the fifo flowset */ > + dn_compat_config_queue(fs, v); > + fs->fs_nr = i + 
2*DN_MAX_ID; > + fs->sched_nr = i + DN_MAX_ID; > + > + /* Move scheduler related parameter from fs to sch */ > + sch->buckets = fs->buckets; /*XXX*/ > + fs->buckets = 0; > + if (fs->flags & DN_HAVE_MASK) { > + sch->flags |= DN_HAVE_MASK; > + fs->flags &= ~DN_HAVE_MASK; > + sch->sched_mask = fs->flow_mask; > + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); > + } > + > + return 0; > +} > + > +static int > +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, > + void *v) > +{ > + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; > + > + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); > + > + pf->link_nr = p->link_nr; > + pf->loss_level = p8->loss_level; > +// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? > + pf->samples_no = p8->samples_no; > + strncpy(pf->name, p8->name,sizeof(pf->name)); > + bcopy(p8->samples, pf->samples, sizeof(pf->samples)); > + > + return 0; > +} > + > +/* > + * If p->pipe_nr != 0 the command is 'pipe x config', so need to create > + * the three main struct, else only a flowset is created > + */ > +static int > +dn_compat_configure(void *v) > +{ > + struct dn_id *buf = NULL, *base; > + struct dn_sch *sch = NULL; > + struct dn_link *p = NULL; > + struct dn_fs *fs = NULL; > + struct dn_profile *pf = NULL; > + int lmax; > + int error; > + > + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; > + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; > + > + int i; /* number of object to configure */ > + > + lmax = sizeof(struct dn_id); /* command header */ > + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + > + sizeof(struct dn_fs) + sizeof(struct dn_profile); > + > + base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO); > + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); > + base->id = DN_API_VERSION; > + > + /* pipe_nr is the same in p7 and p8 */ > + i = p7->pipe_nr; > + if (i != 0) { /* pipe config */ > + sch = o_next(&buf, sizeof(*sch), DN_SCH); > + p = o_next(&buf, sizeof(*p), DN_LINK); > + fs = o_next(&buf, 
sizeof(*fs), DN_FS); > + > + error = dn_compat_config_pipe(sch, p, fs, v); > + if (error) { > + free(buf, M_DUMMYNET); > + return error; > + } > + if (!is7 && p8->samples_no > 0) { > + /* Add profiles*/ > + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); > + error = dn_compat_config_profile(pf, p, v); > + if (error) { > + free(buf, M_DUMMYNET); > + return error; > + } > + } > + } else { /* queue config */ > + fs = o_next(&buf, sizeof(*fs), DN_FS); > + error = dn_compat_config_queue(fs, v); > + if (error) { > + free(buf, M_DUMMYNET); > + return error; > + } > + } > + error = do_config(base, (char *)buf - (char *)base); > + > + if (buf) > + free(buf, M_DUMMYNET); > + return error; > +} > + > +int > +dn_compat_calc_size(void) > +{ > + int need = 0; > + /* XXX use FreeBSD 8 struct size */ > + /* NOTE: > + * - half scheduler: schk_count/2 > + * - all flowset: fsk_count > + * - all flowset queues: queue_count > + * - all pipe queue: si_count > + */ > + need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; > + need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); > + need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); > + need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); > + > + return need; > +} > + > +int > +dn_c_copy_q (void *_ni, void *arg) > +{ > + struct copy_args *a = arg; > + struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; > + struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; > + struct dn_flow *ni = (struct dn_flow *)_ni; > + int size = 0; > + > + /* XXX hash slot not set */ > + /* No difference between 7.2/8 */ > + fq7->len = ni->length; > + fq7->len_bytes = ni->len_bytes; > + fq7->id = ni->fid; > + > + if (is7) { > + size = sizeof(struct dn_flow_queue7); > + fq7->tot_pkts = ni->tot_pkts; > + fq7->tot_bytes = ni->tot_bytes; > + fq7->drops = ni->drops; > + } else { > + size = sizeof(struct dn_flow_queue8); > + fq8->tot_pkts = ni->tot_pkts; > + fq8->tot_bytes = ni->tot_bytes; > + fq8->drops = ni->drops; > + } > + > + 
*a->start += size; > + return 0; > +} > + > +int > +dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) > +{ > + struct dn_link *l = &s->link; > + struct dn_fsk *f = s->fs; > + > + struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; > + struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; > + struct dn_flow_set *fs; > + int size = 0; > + > + if (is7) { > + fs = &pipe7->fs; > + size = sizeof(struct dn_pipe7); > + } else { > + fs = &pipe8->fs; > + size = sizeof(struct dn_pipe8); > + } > + > + /* These 4 field are the same in pipe7 and pipe8 */ > + pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; > + pipe7->bandwidth = l->bandwidth; > + pipe7->delay = l->delay * 1000 / hz; > + pipe7->pipe_nr = l->link_nr - DN_MAX_ID; > + > + if (!is7) { > + if (s->profile) { > + struct dn_profile *pf = s->profile; > + strncpy(pipe8->name, pf->name, sizeof(pf->name)); > + pipe8->loss_level = pf->loss_level; > + pipe8->samples_no = pf->samples_no; > + } > + pipe8->burst = div64(l->burst , 8 * hz); > + } > + > + fs->flow_mask = s->sch.sched_mask; > + fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; > + > + fs->parent_nr = l->link_nr - DN_MAX_ID; > + fs->qsize = f->fs.qsize; > + fs->plr = f->fs.plr; > + fs->w_q = f->fs.w_q; > + fs->max_th = f->max_th; > + fs->min_th = f->min_th; > + fs->max_p = f->fs.max_p; > + fs->rq_elements = nq; > + > + fs->flags_fs = convertflags2old(f->fs.flags); > + > + *a->start += size; > + return 0; > +} > + > + > +int > +dn_compat_copy_pipe(struct copy_args *a, void *_o) > +{ > + int have = a->end - *a->start; > + int need = 0; > + int pipe_size = sizeof(struct dn_pipe8); > + int queue_size = sizeof(struct dn_flow_queue8); > + int n_queue = 0; /* number of queues */ > + > + struct dn_schk *s = (struct dn_schk *)_o; > + /* calculate needed space: > + * - struct dn_pipe > + * - if there are instances, dn_queue * n_instances > + */ > + n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : > + (s->siht ? 
1 : 0)); > + need = pipe_size + queue_size * n_queue; > + if (have < need) { > + D("have %d < need %d", have, need); > + return 1; > + } > + /* copy pipe */ > + dn_c_copy_pipe(s, a, n_queue); > + > + /* copy queues */ > + if (s->sch.flags & DN_HAVE_MASK) > + dn_ht_scan(s->siht, dn_c_copy_q, a); > + else if (s->siht) > + dn_c_copy_q(s->siht, a); > + return 0; > +} > + > +int > +dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) > +{ > + struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; > + > + fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; > + fs->fs_nr = f->fs.fs_nr; > + fs->qsize = f->fs.qsize; > + fs->plr = f->fs.plr; > + fs->w_q = f->fs.w_q; > + fs->max_th = f->max_th; > + fs->min_th = f->min_th; > + fs->max_p = f->fs.max_p; > + fs->flow_mask = f->fs.flow_mask; > + fs->rq_elements = nq; > + fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); > + fs->parent_nr = f->fs.sched_nr; > + fs->weight = f->fs.par[0]; > + > + fs->flags_fs = convertflags2old(f->fs.flags); > + *a->start += sizeof(struct dn_flow_set); > + return 0; > +} > + > +int > +dn_compat_copy_queue(struct copy_args *a, void *_o) > +{ > + int have = a->end - *a->start; > + int need = 0; > + int fs_size = sizeof(struct dn_flow_set); > + int queue_size = sizeof(struct dn_flow_queue8); > + > + struct dn_fsk *fs = (struct dn_fsk *)_o; > + int n_queue = 0; /* number of queues */ > + > + n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : > + (fs->qht ? 
1 : 0)); > + > + need = fs_size + queue_size * n_queue; > + if (have < need) { > + D("have < need"); > + return 1; > + } > + > + /* copy flowset */ > + dn_c_copy_fs(fs, a, n_queue); > + > + /* copy queues */ > + if (fs->fs.flags & DN_HAVE_MASK) > + dn_ht_scan(fs->qht, dn_c_copy_q, a); > + else if (fs->qht) > + dn_c_copy_q(fs->qht, a); > + > + return 0; > +} > + > +int > +copy_data_helper_compat(void *_o, void *_arg) > +{ > + struct copy_args *a = _arg; > + > + if (a->type == DN_COMPAT_PIPE) { > + struct dn_schk *s = _o; > + if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { > + return 0; /* not old type */ > + } > + /* copy pipe parameters, and if instance exists, copy > + * other parameters and eventually queues. > + */ > + if(dn_compat_copy_pipe(a, _o)) > + return DNHT_SCAN_END; > + } else if (a->type == DN_COMPAT_QUEUE) { > + struct dn_fsk *fs = _o; > + if (fs->fs.fs_nr >= DN_MAX_ID) > + return 0; > + if (dn_compat_copy_queue(a, _o)) > + return DNHT_SCAN_END; > + } > + return 0; > +} > + > +/* Main function to manage old requests */ > +int > +ip_dummynet_compat(struct sockopt *sopt) > +{ > + int error=0; > + void *v = NULL; > + struct dn_id oid; > + > + /* Lenght of data, used to found ipfw version... 
*/ > + int len = sopt->sopt_valsize; > + > + /* len can be 0 if command was dummynet_flush */ > + if (len == pipesize7) { > + D("setting compatibility with FreeBSD 7.2"); > + is7 = 1; > + } > + else if (len == pipesize8 || len == pipesizemax8) { > + D("setting compatibility with FreeBSD 8"); > + is7 = 0; > + } > + > + switch (sopt->sopt_name) { > + default: > + printf("dummynet: -- unknown option %d", sopt->sopt_name); > + error = EINVAL; > + break; > + > + case IP_DUMMYNET_FLUSH: > + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); > + do_config(&oid, oid.len); > + break; > + > + case IP_DUMMYNET_DEL: > + v = malloc(len, M_TEMP, M_WAITOK); > + error = sooptcopyin(sopt, v, len, len); > + if (error) > + break; > + error = dn_compat_del(v); > + free(v, M_TEMP); > + break; > + > + case IP_DUMMYNET_CONFIGURE: > + v = malloc(len, M_TEMP, M_WAITOK); > + error = sooptcopyin(sopt, v, len, len); > + if (error) > + break; > + error = dn_compat_configure(v); > + free(v, M_TEMP); > + break; > + > + case IP_DUMMYNET_GET: { > + void *buf; > + int ret; > + int original_size = sopt->sopt_valsize; > + int size; > + > + ret = dummynet_get(sopt, &buf); > + if (ret) > + return 0;//XXX ? 
> + size = sopt->sopt_valsize; > + sopt->sopt_valsize = original_size; > + D("size=%d, buf=%p", size, buf); > + ret = sooptcopyout(sopt, buf, size); > + if (ret) > + printf(" %s ERROR sooptcopyout\n", __FUNCTION__); > + if (buf) > + free(buf, M_DUMMYNET); > + } > + } > + > + return error; > +} > + > + > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dn_io.c b/example/ipfw/sys/netpfil/ipfw/ip_dn_io.c > new file mode 100644 > index 0000000..6211221 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_dn_io.c > @@ -0,0 +1,960 @@ > +/*- > + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + */ > + > +/* > + * Dummynet portions related to packet handling. > + */ > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_dn_io.c 272089 2014-09-25 02:26:05Z sbruno $"); > + > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/module.h> > +#include <sys/mutex.h> > +#include <sys/priv.h> > +#include <sys/proc.h> > +#include <sys/rwlock.h> > +#include <sys/socket.h> > +#include <sys/time.h> > +#include <sys/sysctl.h> > + > +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ > +#include <net/netisr.h> > +#include <net/vnet.h> > + > +#include <netinet/in.h> > +#include <netinet/ip.h> /* ip_len, ip_off */ > +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ > +#include <netinet/ip_fw.h> > +#include <netinet/ip_dummynet.h> > +#include <netinet/if_ether.h> /* various ether_* routines */ > +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ > +#include <netinet6/ip6_var.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > + > +/* > + * We keep a private variable for the simulation time, but we could > + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) > + * instead of dn_cfg.curr_time > + */ > + > +struct dn_parms dn_cfg; > +//VNET_DEFINE(struct dn_parms, _base_dn_cfg); > + > +static long tick_last; /* Last tick duration (usec). */ > +static long tick_delta; /* Last vs standard tick diff (usec). */ > +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ > +static long tick_adjustment; /* Tick adjustments done. */ > +static long tick_lost; /* Lost(coalesced) ticks number. */ > +/* Adjusted vs non-adjusted curr_time difference (ticks). 
*/ > +static long tick_diff; > + > +static unsigned long io_pkt; > +static unsigned long io_pkt_fast; > +static unsigned long io_pkt_drop; > + > +/* > + * We use a heap to store entities for which we have pending timer events. > + * The heap is checked at every tick and all entities with expired events > + * are extracted. > + */ > + > +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); > + > +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); > + > +#ifdef SYSCTL_NODE > + > +/* > + * Because of the way the SYSBEGIN/SYSEND macros work on other > + * platforms, there should not be functions between them. > + * So keep the handlers outside the block. > + */ > +static int > +sysctl_hash_size(SYSCTL_HANDLER_ARGS) > +{ > + int error, value; > + > + value = dn_cfg.hash_size; > + error = sysctl_handle_int(oidp, &value, 0, req); > + if (error != 0 || req->newptr == NULL) > + return (error); > + if (value < 16 || value > 65536) > + return (EINVAL); > + dn_cfg.hash_size = value; > + return (0); > +} > + > +static int > +sysctl_limits(SYSCTL_HANDLER_ARGS) > +{ > + int error; > + long value; > + > + if (arg2 != 0) > + value = dn_cfg.slot_limit; > + else > + value = dn_cfg.byte_limit; > + error = sysctl_handle_long(oidp, &value, 0, req); > + > + if (error != 0 || req->newptr == NULL) > + return (error); > + if (arg2 != 0) { > + if (value < 1) > + return (EINVAL); > + dn_cfg.slot_limit = value; > + } else { > + if (value < 1500) > + return (EINVAL); > + dn_cfg.byte_limit = value; > + } > + return (0); > +} > + > +SYSBEGIN(f4) > + > +SYSCTL_DECL(_net_inet); > +SYSCTL_DECL(_net_inet_ip); > +static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); > + > +/* wrapper to pass dn_cfg fields to SYSCTL_* */ > +//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) > +#define DC(x) (&(dn_cfg.x)) > +/* parameters */ > + > + > +SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, > + CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, > + "I", "Default hash 
table size"); > + > + > +SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, > + CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, > + "L", "Upper limit in slots for pipe queue."); > +SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, > + CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, > + "L", "Upper limit in bytes for pipe queue."); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, > + CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, > + CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); > + > +/* RED parameters */ > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, > + CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, > + CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, > + CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); > + > +/* time adjustment */ > +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, > + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); > +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, > + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); > +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, > + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); > +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, > + CTLFLAG_RD, &tick_diff, 0, > + "Adjusted vs non-adjusted curr_time difference (ticks)."); > +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, > + CTLFLAG_RD, &tick_lost, 0, > + "Number of ticks coalesced by dummynet taskqueue."); > + > +/* Drain parameters */ > +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, > + CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); > +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, > + CTLFLAG_RD, DC(expire_cycle), 0, "Expire 
cycle for queues/pipes"); > + > +/* statistics */ > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, > + CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, > + CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, > + CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); > +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, > + CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); > +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, > + CTLFLAG_RD, &io_pkt, 0, > + "Number of packets passed to dummynet."); > +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, > + CTLFLAG_RD, &io_pkt_fast, 0, > + "Number of packets bypassed dummynet scheduler."); > +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, > + CTLFLAG_RD, &io_pkt_drop, 0, > + "Number of packets dropped by dummynet."); > +#undef DC > +SYSEND > + > +#endif > + > +static void dummynet_send(struct mbuf *); > + > +/* > + * Packets processed by dummynet have an mbuf tag associated with > + * them that carries their dummynet state. > + * Outside dummynet, only the 'rule' field is relevant, and it must > + * be at the beginning of the structure. > + */ > +struct dn_pkt_tag { > + struct ipfw_rule_ref rule; /* matching rule */ > + > + /* second part, dummynet specific */ > + int dn_dir; /* action when packet comes out.*/ > + /* see ip_fw_private.h */ > + uint64_t output_time; /* when the pkt is due for delivery*/ > + struct ifnet *ifp; /* interface, for ip_output */ > +// struct _ip6dn_args ip6opt; /* XXX ipv6 options, 192 bytes */ > +}; > + > +/* > + * Return the mbuf tag holding the dummynet state (it should > + * be the first one on the list). 
> + */ > +static struct dn_pkt_tag * > +dn_tag_get(struct mbuf *m) > +{ > + struct m_tag *mtag = m_tag_first(m); > + KASSERT(mtag != NULL && > + mtag->m_tag_cookie == MTAG_ABI_COMPAT && > + mtag->m_tag_id == PACKET_TAG_DUMMYNET, > + ("packet on dummynet queue w/o dummynet tag!")); > + return (struct dn_pkt_tag *)(mtag+1); > +} > + > +static inline void > +mq_append(struct mq *q, struct mbuf *m) > +{ > +#ifdef USERSPACE > + // buffers from netmap need to be copied > + // XXX note that the routine is not expected to fail > + ND("append %p to %p", m, q); > + if (m->m_flags & M_STACK) { > + struct mbuf *m_new; > + void *p; > + int l, ofs; > + > + ofs = m->m_data - m->__m_extbuf; > + // XXX allocate > + MGETHDR(m_new, M_NOWAIT, MT_DATA); > + ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", > + m, m->__m_extbuf, m->__m_extlen, ofs, m_new); > + p = m_new->__m_extbuf; /* new pointer */ > + l = m_new->__m_extlen; /* new len */ > + if (l <= m->__m_extlen) { > + panic("extlen too large"); > + } > + > + *m_new = *m; // copy > + m_new->m_flags &= ~M_STACK; > + m_new->__m_extbuf = p; // point to new buffer > + _pkt_copy(m->__m_extbuf, p, m->__m_extlen); > + m_new->m_data = p + ofs; > + m = m_new; > + } > +#endif /* USERSPACE */ > + if (q->head == NULL) > + q->head = m; > + else > + q->tail->m_nextpkt = m; > + q->count++; > + q->tail = m; > + m->m_nextpkt = NULL; > +} > + > +/* > + * Dispose a list of packet. Use a functions so if we need to do > + * more work, this is a central point to do it. 
> + */ > +void dn_free_pkts(struct mbuf *mnext) > +{ > + struct mbuf *m; > + > + while ((m = mnext) != NULL) { > + mnext = m->m_nextpkt; > + FREE_PKT(m); > + } > +} > + > +static int > +red_drops (struct dn_queue *q, int len) > +{ > + /* > + * RED algorithm > + * > + * RED calculates the average queue size (avg) using a low-pass filter > + * with an exponential weighted (w_q) moving average: > + * avg <- (1-w_q) * avg + w_q * q_size > + * where q_size is the queue length (measured in bytes or * packets). > + * > + * If q_size == 0, we compute the idle time for the link, and set > + * avg = (1 - w_q)^(idle/s) > + * where s is the time needed for transmitting a medium-sized packet. > + * > + * Now, if avg < min_th the packet is enqueued. > + * If avg > max_th the packet is dropped. Otherwise, the packet is > + * dropped with probability P function of avg. > + */ > + > + struct dn_fsk *fs = q->fs; > + int64_t p_b = 0; > + > + /* Queue in bytes or packets? */ > + uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? > + q->ni.len_bytes : q->ni.length; > + > + /* Average queue size estimation. */ > + if (q_size != 0) { > + /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ > + int diff = SCALE(q_size) - q->avg; > + int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); > + > + q->avg += (int)v; > + } else { > + /* > + * Queue is empty, find for how long the queue has been > + * empty and use a lookup table for computing > + * (1 - * w_q)^(idle_time/s) where s is the time to send a > + * (small) packet. > + * XXX check wraps... > + */ > + if (q->avg) { > + u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); > + > + q->avg = (t < fs->lookup_depth) ? > + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; > + } > + } > + > + /* Should i drop? 
*/ > + if (q->avg < fs->min_th) { > + q->count = -1; > + return (0); /* accept packet */ > + } > + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ > + if (fs->fs.flags & DN_IS_ECN) > + return (1); > + if (fs->fs.flags & DN_IS_GENTLE_RED) { > + /* > + * According to Gentle-RED, if avg is greater than > + * max_th the packet is dropped with a probability > + * p_b = c_3 * avg - c_4 > + * where c_3 = (1 - max_p) / max_th > + * c_4 = 1 - 2 * max_p > + */ > + p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - > + fs->c_4; > + } else { > + q->count = -1; > + return (1); > + } > + } else if (q->avg > fs->min_th) { > + if (fs->fs.flags & DN_IS_ECN) > + return (1); > + /* > + * We compute p_b using the linear dropping function > + * p_b = c_1 * avg - c_2 > + * where c_1 = max_p / (max_th - min_th) > + * c_2 = max_p * min_th / (max_th - min_th) > + */ > + p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; > + } > + > + if (fs->fs.flags & DN_QSIZE_BYTES) > + p_b = div64((p_b * len) , fs->max_pkt_size); > + if (++q->count == 0) > + q->random = random() & 0xffff; > + else { > + /* > + * q->count counts packets arrived since last drop, so a greater > + * value of q->count means a greater packet drop probability. > + */ > + if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { > + q->count = 0; > + /* After a drop we calculate a new random value. */ > + q->random = random() & 0xffff; > + return (1); /* drop */ > + } > + } > + /* End of RED algorithm. 
*/ > + > + return (0); /* accept */ > + > +} > + > +/* > + * ECN/ECT Processing (partially adopted from altq) > + */ > +static int > +ecn_mark(struct mbuf* m) > +{ > + struct ip *ip; > + ip = mtod(m, struct ip *); > + > + switch (ip->ip_v) { > + case IPVERSION: > + { > + u_int8_t otos; > + int sum; > + > + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) > + return (0); /* not-ECT */ > + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) > + return (1); /* already marked */ > + > + /* > + * ecn-capable but not marked, > + * mark CE and update checksum > + */ > + otos = ip->ip_tos; > + ip->ip_tos |= IPTOS_ECN_CE; > + /* > + * update checksum (from RFC1624) > + * HC' = ~(~HC + ~m + m') > + */ > + sum = ~ntohs(ip->ip_sum) & 0xffff; > + sum += (~otos & 0xffff) + ip->ip_tos; > + sum = (sum >> 16) + (sum & 0xffff); > + sum += (sum >> 16); /* add carry */ > + ip->ip_sum = htons(~sum & 0xffff); > + return (1); > + } > +#ifdef INET6 > + case (IPV6_VERSION >> 4): > + { > + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); > + u_int32_t flowlabel; > + > + flowlabel = ntohl(ip6->ip6_flow); > + if ((flowlabel >> 28) != 6) > + return (0); /* version mismatch! */ > + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == > + (IPTOS_ECN_NOTECT << 20)) > + return (0); /* not-ECT */ > + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == > + (IPTOS_ECN_CE << 20)) > + return (1); /* already marked */ > + /* > + * ecn-capable but not marked, mark CE > + */ > + flowlabel |= (IPTOS_ECN_CE << 20); > + ip6->ip6_flow = htonl(flowlabel); > + return (1); > + } > +#endif > + } > + return (0); > +} > + > +/* > + * Enqueue a packet in q, subject to space and queue management policy > + * (whose parameters are in q->fs). > + * Update stats for the queue and the scheduler. > + * Return 0 on success, 1 on drop. The packet is consumed anyways. 
> + */ > +int > +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) > +{ > + struct dn_fs *f; > + struct dn_flow *ni; /* stats for scheduler instance */ > + uint64_t len; > + > + if (q->fs == NULL || q->_si == NULL) { > + printf("%s fs %p si %p, dropping\n", > + __FUNCTION__, q->fs, q->_si); > + FREE_PKT(m); > + return 1; > + } > + f = &(q->fs->fs); > + ni = &q->_si->ni; > + len = m->m_pkthdr.len; > + /* Update statistics, then check reasons to drop pkt. */ > + q->ni.tot_bytes += len; > + q->ni.tot_pkts++; > + ni->tot_bytes += len; > + ni->tot_pkts++; > + if (drop) > + goto drop; > + if (f->plr && random() < f->plr) > + goto drop; > + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { > + if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) > + goto drop; > + } > + if (f->flags & DN_QSIZE_BYTES) { > + if (q->ni.len_bytes > f->qsize) > + goto drop; > + } else if (q->ni.length >= f->qsize) { > + goto drop; > + } > + mq_append(&q->mq, m); > + q->ni.length++; > + q->ni.len_bytes += len; > + ni->length++; > + ni->len_bytes += len; > + return (0); > + > +drop: > + io_pkt_drop++; > + q->ni.drops++; > + ni->drops++; > + FREE_PKT(m); > + return (1); > +} > + > +/* > + * Fetch packets from the delay line which are due now. If there are > + * leftover packets, reinsert the delay line in the heap. > + * Runs under scheduler lock. 
> + */ > +static void > +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) > +{ > + struct mbuf *m; > + struct dn_pkt_tag *pkt = NULL; > + > + dline->oid.subtype = 0; /* not in heap */ > + while ((m = dline->mq.head) != NULL) { > + pkt = dn_tag_get(m); > + if (!DN_KEY_LEQ(pkt->output_time, now)) > + break; > + dline->mq.head = m->m_nextpkt; > + dline->mq.count--; > + mq_append(q, m); > + } > + if (m != NULL) { > + dline->oid.subtype = 1; /* in heap */ > + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); > + } > +} > + > +/* > + * Convert the additional MAC overheads/delays into an equivalent > + * number of bits for the given data rate. The samples are > + * in milliseconds so we need to divide by 1000. > + */ > +static uint64_t > +extra_bits(struct mbuf *m, struct dn_schk *s) > +{ > + int index; > + uint64_t bits; > + struct dn_profile *pf = s->profile; > + > + if (!pf || pf->samples_no == 0) > + return 0; > + index = random() % pf->samples_no; > + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); > + if (index >= pf->loss_level) { > + struct dn_pkt_tag *dt = dn_tag_get(m); > + if (dt) > + dt->dn_dir = DIR_DROP; > + } > + return bits; > +} > + > +/* > + * Send traffic from a scheduler instance due by 'now'. > + * Return a pointer to the head of the queue. > + */ > +static struct mbuf * > +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) > +{ > + struct mq def_q; > + struct dn_schk *s = si->sched; > + struct mbuf *m = NULL; > + int delay_line_idle = (si->dline.mq.head == NULL); > + int done, bw; > + > + if (q == NULL) { > + q = &def_q; > + q->head = NULL; > + } > + > + bw = s->link.bandwidth; > + si->kflags &= ~DN_ACTIVE; > + > + if (bw > 0) > + si->credit += (now - si->sched_time) * bw; > + else > + si->credit = 0; > + si->sched_time = now; > + done = 0; > + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { > + uint64_t len_scaled; > + > + done++; > + len_scaled = (bw == 0) ? 
0 : hz * > + (m->m_pkthdr.len * 8 + extra_bits(m, s)); > + si->credit -= len_scaled; > + /* Move packet in the delay line */ > + dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; > + mq_append(&si->dline.mq, m); > + } > + > + /* > + * If credit >= 0 the instance is idle, mark time. > + * Otherwise put back in the heap, and adjust the output > + * time of the last inserted packet, m, which was too early. > + */ > + if (si->credit >= 0) { > + si->idle_time = now; > + } else { > + uint64_t t; > + KASSERT (bw > 0, ("bw=0 and credit<0 ?")); > + t = div64(bw - 1 - si->credit, bw); > + if (m) > + dn_tag_get(m)->output_time += t; > + si->kflags |= DN_ACTIVE; > + heap_insert(&dn_cfg.evheap, now + t, si); > + } > + if (delay_line_idle && done) > + transmit_event(q, &si->dline, now); > + return q->head; > +} > + > +/* > + * The timer handler for dummynet. Time is computed in ticks, but > + * the code is tolerant to the actual rate at which this is called. > + * Once complete, the function reschedules itself for the next tick. > + */ > +void > +dummynet_task(void *context, int pending) > +{ > + struct timeval t; > + struct mq q = { NULL, NULL }; /* queue to accumulate results */ > + > + CURVNET_SET((struct vnet *)context); > + > + DN_BH_WLOCK(); > + > + /* Update number of lost(coalesced) ticks. */ > + tick_lost += pending - 1; > + > + getmicrouptime(&t); > + /* Last tick duration (usec). */ > + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + > + (t.tv_usec - dn_cfg.prev_t.tv_usec); > + /* Last tick vs standard tick difference (usec). */ > + tick_delta = (tick_last * hz - 1000000) / hz; > + /* Accumulated tick difference (usec). */ > + tick_delta_sum += tick_delta; > + > + dn_cfg.prev_t = t; > + > + /* > + * Adjust curr_time if the accumulated tick difference is > + * greater than the 'standard' tick.
Since curr_time should > + * be monotonically increasing, we do positive adjustments > + * as required, and throttle curr_time in case of negative > + * adjustment. > + */ > + dn_cfg.curr_time++; > + if (tick_delta_sum - tick >= 0) { > + int diff = tick_delta_sum / tick; > + > + dn_cfg.curr_time += diff; > + tick_diff += diff; > + tick_delta_sum %= tick; > + tick_adjustment++; > + } else if (tick_delta_sum + tick <= 0) { > + dn_cfg.curr_time--; > + tick_diff--; > + tick_delta_sum += tick; > + tick_adjustment++; > + } > + > + /* serve pending events, accumulate in q */ > + for (;;) { > + struct dn_id *p; /* generic parameter to handler */ > + > + if (dn_cfg.evheap.elements == 0 || > + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) > + break; > + p = HEAP_TOP(&dn_cfg.evheap)->object; > + heap_extract(&dn_cfg.evheap, NULL); > + > + if (p->type == DN_SCH_I) { > + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); > + } else { /* extracted a delay line */ > + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); > + } > + } > + if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { > + dn_cfg.expire_cycle = 0; > + dn_drain_scheduler(); > + dn_drain_queue(); > + } > + > + DN_BH_WUNLOCK(); > + dn_reschedule(); > + if (q.head != NULL) > + dummynet_send(q.head); > + CURVNET_RESTORE(); > +} > + > +/* > + * forward a chain of packets to the proper destination. > + * This runs outside the dummynet lock. > + */ > +static void > +dummynet_send(struct mbuf *m) > +{ > + struct mbuf *n; > + > + for (; m != NULL; m = n) { > + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ > + struct m_tag *tag; > + int dst; > + > + n = m->m_nextpkt; > + m->m_nextpkt = NULL; > + tag = m_tag_first(m); > + if (tag == NULL) { /* should not happen */ > + dst = DIR_DROP; > + } else { > + struct dn_pkt_tag *pkt = dn_tag_get(m); > + /* extract the dummynet info, rename the tag > + * to carry reinject info. 
> + */ > + if (pkt->dn_dir == (DIR_OUT | PROTO_LAYER2) && > + pkt->ifp == NULL) { > + dst = DIR_DROP; > + } else { > + dst = pkt->dn_dir; > + ifp = pkt->ifp; > + tag->m_tag_cookie = MTAG_IPFW_RULE; > + tag->m_tag_id = 0; > + } > + } > + > + switch (dst) { > + case DIR_OUT: > + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); > + break ; > + > + case DIR_IN : > + netisr_dispatch(NETISR_IP, m); > + break; > + > +#ifdef INET6 > + case DIR_IN | PROTO_IPV6: > + netisr_dispatch(NETISR_IPV6, m); > + break; > + > + case DIR_OUT | PROTO_IPV6: > + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); > + break; > +#endif > + > + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ > + if (bridge_dn_p != NULL) > + ((*bridge_dn_p)(m, ifp)); > + else > + printf("dummynet: if_bridge not loaded\n"); > + > + break; > + > + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ > + /* > + * The Ethernet code assumes the Ethernet header is > + * contiguous in the first mbuf header. > + * Insure this is true. > + */ > + if (m->m_len < ETHER_HDR_LEN && > + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { > + printf("dummynet/ether: pullup failed, " > + "dropping packet\n"); > + break; > + } > + ether_demux(m->m_pkthdr.rcvif, m); > + break; > + > + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ > + ether_output_frame(ifp, m); > + break; > + > + case DIR_DROP: > + /* drop the packet after some time */ > + FREE_PKT(m); > + break; > + > + default: > + printf("dummynet: bad switch %d!\n", dst); > + FREE_PKT(m); > + break; > + } > + } > +} > + > +static inline int > +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) > +{ > + struct dn_pkt_tag *dt; > + struct m_tag *mtag; > + > + mtag = m_tag_get(PACKET_TAG_DUMMYNET, > + sizeof(*dt), M_NOWAIT | M_ZERO); > + if (mtag == NULL) > + return 1; /* Cannot allocate packet header. */ > + m_tag_prepend(m, mtag); /* Attach to mbuf chain. 
*/ > + dt = (struct dn_pkt_tag *)(mtag + 1); > + dt->rule = fwa->rule; > + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ > + dt->dn_dir = dir; > + dt->ifp = fwa->oif; > + /* dt->output_time is updated as we move through */ > + dt->output_time = dn_cfg.curr_time; > + return 0; > +} > + > + > +/* > + * dummynet hook for packets. > + * We use the argument to locate the flowset fs and the sched_set sch > + * associated to it. Then we apply flow_mask and sched_mask to > + * determine the queue and scheduler instances. > + * > + * dir where shall we send the packet after dummynet. > + * *m0 the mbuf with the packet > + * ifp the 'ifp' parameter from the caller. > + * NULL in ip_input, destination interface in ip_output, > + */ > +int > +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) > +{ > + struct mbuf *m = *m0; > + struct dn_fsk *fs = NULL; > + struct dn_sch_inst *si; > + struct dn_queue *q = NULL; /* default */ > + > + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + > + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); > + DN_BH_WLOCK(); > + io_pkt++; > + /* we could actually tag outside the lock, but who cares... */ > + if (tag_mbuf(m, dir, fwa)) > + goto dropit; > + if (dn_cfg.busy) { > + /* if the upper half is busy doing something expensive, > + * lets queue the packet and move forward > + */ > + mq_append(&dn_cfg.pending, m); > + m = *m0 = NULL; /* consumed */ > + goto done; /* already active, nothing to do */ > + } > + /* XXX locate_flowset could be optimised with a direct ref. */ > + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); > + if (fs == NULL) > + goto dropit; /* This queue/pipe does not exist!
*/ > + if (fs->sched == NULL) /* should not happen */ > + goto dropit; > + /* find scheduler instance, possibly applying sched_mask */ > + si = ipdn_si_find(fs->sched, &(fwa->f_id)); > + if (si == NULL) > + goto dropit; > + /* > + * If the scheduler supports multiple queues, find the right one > + * (otherwise it will be ignored by enqueue). > + */ > + if (fs->sched->fp->flags & DN_MULTIQUEUE) { > + q = ipdn_q_find(fs, si, &(fwa->f_id)); > + if (q == NULL) > + goto dropit; > + } > + if (fs->sched->fp->enqueue(si, q, m)) { > + /* packet was dropped by enqueue() */ > + m = *m0 = NULL; > + goto dropit; > + } > + > + if (si->kflags & DN_ACTIVE) { > + m = *m0 = NULL; /* consumed */ > + goto done; /* already active, nothing to do */ > + } > + > + /* compute the initial allowance */ > + if (si->idle_time < dn_cfg.curr_time) { > + /* Do this only on the first packet on an idle pipe */ > + struct dn_link *p = &fs->sched->link; > + > + si->sched_time = dn_cfg.curr_time; > + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; > + if (p->burst) { > + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; > + if (burst > p->burst) > + burst = p->burst; > + si->credit += burst; > + } > + } > + /* pass through scheduler and delay line */ > + m = serve_sched(NULL, si, dn_cfg.curr_time); > + > + /* optimization -- pass it back to ipfw for immediate send */ > + /* XXX Don't call dummynet_send() if scheduler returns the packet > + * just enqueued. This avoids a lock order reversal. > + * > + */ > + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { > + /* fast io, rename the tag * to carry reinject info.
*/ > + struct m_tag *tag = m_tag_first(m); > + > + tag->m_tag_cookie = MTAG_IPFW_RULE; > + tag->m_tag_id = 0; > + io_pkt_fast++; > + if (m->m_nextpkt != NULL) { > + printf("dummynet: fast io: pkt chain detected!\n"); > + m->m_nextpkt = NULL; > + } > + m = NULL; > + } else { > + *m0 = NULL; > + } > +done: > + DN_BH_WUNLOCK(); > + if (m) > + dummynet_send(m); > + return 0; > + > +dropit: > + io_pkt_drop++; > + DN_BH_WUNLOCK(); > + if (m) > + FREE_PKT(m); > + *m0 = NULL; > + return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; > +} > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dn_private.h b/example/ipfw/sys/netpfil/ipfw/ip_dn_private.h > new file mode 100644 > index 0000000..fdd7448 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_dn_private.h > @@ -0,0 +1,404 @@ > +/*- > + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * internal dummynet APIs. > + * > + * $FreeBSD: head/sys/netpfil/ipfw/ip_dn_private.h 258467 2013-11-22 05:02:37Z luigi $ > + */ > + > +#ifndef _IP_DN_PRIVATE_H > +#define _IP_DN_PRIVATE_H > + > +/* debugging support > + * use ND() to remove debugging, D() to print a line, > + * DX(level, ...) to print above a certain level > + * If you redefine D() you are expected to redefine all. > + */ > +#ifndef D > +#define ND(fmt, ...) do {} while (0) > +#define D1(fmt, ...) do {} while (0) > +#define D(fmt, ...) printf("%-10s " fmt "\n", \ > + __FUNCTION__, ## __VA_ARGS__) > +#define DX(lev, fmt, ...) 
do { \ > + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) > +#endif > + > +MALLOC_DECLARE(M_DUMMYNET); > + > +#ifndef __linux__ > +#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) > +#endif > + > +#define DN_LOCK_INIT() do { \ > + mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ > + mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ > + } while (0) > +#define DN_LOCK_DESTROY() do { \ > + mtx_destroy(&dn_cfg.uh_mtx); \ > + mtx_destroy(&dn_cfg.bh_mtx); \ > + } while (0) > +#if 0 /* not used yet */ > +#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) > +#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) > +#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) > +#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) > +#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) > +#endif > + > +#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) > +#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) > +#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) > +#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) > +#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) > + > +SLIST_HEAD(dn_schk_head, dn_schk); > +SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); > +SLIST_HEAD(dn_fsk_head, dn_fsk); > +SLIST_HEAD(dn_queue_head, dn_queue); > +SLIST_HEAD(dn_alg_head, dn_alg); > + > +struct mq { /* a basic queue of packets*/ > + struct mbuf *head, *tail; > + int count; > +}; > + > +static inline void > +set_oid(struct dn_id *o, int type, int len) > +{ > + o->type = type; > + o->len = len; > + o->subtype = 0; > +} > + > +/* > + * configuration and global data for a dummynet instance > + * > + * When a configuration is modified from userland, 'id' is incremented > + * so we can use the value to check for stale pointers. 
> + */ > +struct dn_parms { > + uint32_t id; /* configuration version */ > + > + /* defaults (sysctl-accessible) */ > + int red_lookup_depth; > + int red_avg_pkt_size; > + int red_max_pkt_size; > + int hash_size; > + int max_hash_size; > + long byte_limit; /* max queue sizes */ > + long slot_limit; > + > + int io_fast; > + int debug; > + > + /* timekeeping */ > + struct timeval prev_t; /* last time dummynet_tick ran */ > + struct dn_heap evheap; /* scheduled events */ > + > + /* counters of objects -- used for reporting space */ > + int schk_count; > + int si_count; > + int fsk_count; > + int queue_count; > + > + /* ticks and other stuff */ > + uint64_t curr_time; > + /* flowsets and schedulers are in hash tables, with 'hash_size' > + * buckets. fshash is looked up at every packet arrival > + * so better be generous if we expect many entries. > + */ > + struct dn_ht *fshash; > + struct dn_ht *schedhash; > + /* list of flowsets without a scheduler -- use sch_chain */ > + struct dn_fsk_head fsu; /* list of unlinked flowsets */ > + struct dn_alg_head schedlist; /* list of algorithms */ > + > + /* Store the fs/sch to scan when draining. The value is the > + * bucket number of the hash table. Expire can be disabled > + * with net.inet.ip.dummynet.expire=0, or it happens every > + * expire ticks. > + **/ > + int drain_fs; > + int drain_sch; > + uint32_t expire; > + uint32_t expire_cycle; /* tick count */ > + > + int init_done; > + > + /* if the upper half is busy doing something long, > + * can set the busy flag and we will enqueue packets in > + * a queue for later processing. > + */ > + int busy; > + struct mq pending; > + > +#ifdef _KERNEL > + /* > + * This file is normally used in the kernel, unless we do > + * some userland tests, in which case we do not need a mtx. > + * uh_mtx arbitrates between system calls and also > + * protects fshash, schedhash and fsunlinked. > + * These structures are readonly for the lower half. 
> + * bh_mtx protects all other structures which may be > + * modified upon packet arrivals > + */ > +#if defined( __linux__ ) || defined( _WIN32 ) > + spinlock_t uh_mtx; > + spinlock_t bh_mtx; > +#else > + struct mtx uh_mtx; > + struct mtx bh_mtx; > +#endif > + > +#endif /* _KERNEL */ > +}; > + > +/* > + * Delay line, contains all packets on output from a link. > + * Every scheduler instance has one. > + */ > +struct delay_line { > + struct dn_id oid; > + struct dn_sch_inst *si; > + struct mq mq; > +}; > + > +/* > + * The kernel side of a flowset. It is linked in a hash table > + * of flowsets, and in a list of children of their parent scheduler. > + * qht is either the queue or (if HAVE_MASK) a hash table queues. > + * Note that the mask to use is the (flow_mask|sched_mask), which > + * changes as we attach/detach schedulers. So we store it here. > + * > + * XXX If we want to add scheduler-specific parameters, we need to > + * put them in external storage because the scheduler may not be > + * available when the fsk is created. 
> + */ > +struct dn_fsk { /* kernel side of a flowset */ > + struct dn_fs fs; > + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ > + > + struct ipfw_flow_id fsk_mask; > + > + /* qht is a hash table of queues, or just a single queue > + * a bit in fs.flags tells us which one > + */ > + struct dn_ht *qht; > + struct dn_schk *sched; /* Sched we are linked to */ > + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ > + > + /* bucket index used by drain routine to drain queues for this > + * flowset > + */ > + int drain_bucket; > + /* Parameter realted to RED / GRED */ > + /* original values are in dn_fs*/ > + int w_q ; /* queue weight (scaled) */ > + int max_th ; /* maximum threshold for queue (scaled) */ > + int min_th ; /* minimum threshold for queue (scaled) */ > + int max_p ; /* maximum value for p_b (scaled) */ > + > + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ > + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ > + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ > + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ > + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ > + u_int lookup_depth ; /* depth of lookup table */ > + int lookup_step ; /* granularity inside the lookup table */ > + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ > + int avg_pkt_size ; /* medium packet size */ > + int max_pkt_size ; /* max packet size */ > +}; > + > +/* > + * A queue is created as a child of a flowset unless it belongs to > + * a !MULTIQUEUE scheduler. It is normally in a hash table in the > + * flowset. fs always points to the parent flowset. > + * si normally points to the sch_inst, unless the flowset has been > + * detached from the scheduler -- in this case si == NULL and we > + * should not enqueue. 
> + */ > +struct dn_queue { > + struct dn_flow ni; /* oid, flow_id, stats */ > + struct mq mq; /* packets queue */ > + struct dn_sch_inst *_si; /* owner scheduler instance */ > + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ > + struct dn_fsk *fs; /* parent flowset. */ > + > + /* RED parameters */ > + int avg; /* average queue length est. (scaled) */ > + int count; /* arrivals since last RED drop */ > + int random; /* random value (scaled) */ > + uint64_t q_time; /* start of queue idle time */ > + > +}; > + > +/* > + * The kernel side of a scheduler. Contains the userland config, > + * a link, pointer to extra config arguments from command line, > + * kernel flags, and a pointer to the scheduler methods. > + * It is stored in a hash table, and holds a list of all > + * flowsets and scheduler instances. > + * XXX sch must be at the beginning, see schk_hash(). > + */ > +struct dn_schk { > + struct dn_sch sch; > + struct dn_alg *fp; /* Pointer to scheduler functions */ > + struct dn_link link; /* The link, embedded */ > + struct dn_profile *profile; /* delay profile, if any */ > + struct dn_id *cfg; /* extra config arguments */ > + > + SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ > + > + struct dn_fsk_head fsk_list; /* all fsk linked to me */ > + struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ > + > + /* bucket index used by the drain routine to drain the scheduler > + * instance for this flowset. > + */ > + int drain_bucket; > + > + /* Hash table of all instances (through sch.sched_mask) > + * or single instance if no mask. Always valid. > + */ > + struct dn_ht *siht; > +}; > + > + > +/* > + * Scheduler instance. > + * Contains variables and all queues relative to a this instance. > + * This struct is created a runtime. 
> + */ > +struct dn_sch_inst { > + struct dn_flow ni; /* oid, flowid and stats */ > + SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ > + struct delay_line dline; > + struct dn_schk *sched; /* the template */ > + int kflags; /* DN_ACTIVE */ > + > + int64_t credit; /* bits I can transmit (more or less). */ > + uint64_t sched_time; /* time link was scheduled in ready_heap */ > + uint64_t idle_time; /* start of scheduler instance idle time */ > + > + /* q_count is the number of queues that this instance is using. > + * The counter is incremented or decremented when > + * a reference from the queue is created or deleted. > + * It is used to make sure that a scheduler instance can be safely > + * deleted by the drain routine. See notes below. > + */ > + int q_count; > + > +}; > + > +/* > + * NOTE about object drain. > + * The system will automatically (XXX check when) drain queues and > + * scheduler instances when they are idle. > + * A queue is idle when it has no packets; an instance is idle when > + * it is not in the evheap heap, and the corresponding delay line is empty. > + * A queue can be safely deleted when it is idle because of the scheduler > + * function xxx_free_queue() will remove any references to it. > + * An instance can be only deleted when no queues reference it. To be sure > + * of that, a counter (q_count) stores the number of queues that are pointing > + * to the instance. > + * > + * XXX > + * Order of scan: > + * - take all flowset in a bucket for the flowset hash table > + * - take all queues in a bucket for the flowset > + * - increment the queue bucket > + * - scan next flowset bucket > + * Nothing is done if a bucket contains no entries. > + * > + * The same schema is used for sceduler instances > + */ > + > + > +/* kernel-side flags. 
Linux has DN_DELETE in fcntl.h > + */ > +enum { > + /* 1 and 2 are reserved for the SCAN flags */ > + DN_DESTROY = 0x0004, /* destroy */ > + DN_DELETE_FS = 0x0008, /* destroy flowset */ > + DN_DETACH = 0x0010, > + DN_ACTIVE = 0x0020, /* object is in evheap */ > + DN_F_DLINE = 0x0040, /* object is a delay line */ > + DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed > + * by scheduler */ > + DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ > +}; > + > +extern struct dn_parms dn_cfg; > +//VNET_DECLARE(struct dn_parms, _base_dn_cfg); > +//#define dn_cfg VNET(_base_dn_cfg) > + > +int dummynet_io(struct mbuf **, int , struct ip_fw_args *); > +void dummynet_task(void *context, int pending); > +void dn_reschedule(void); > + > +struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, > + struct ipfw_flow_id *); > +struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); > + > +/* > + * copy_range is a template for requests for ranges of pipes/queues/scheds. > + * The number of ranges is variable and can be derived by o.len. > + * As a default, we use a small number of entries so that the struct > + * fits easily on the stack and is sufficient for most common requests. 
> + */ > +#define DEFAULT_RANGES 5 > +struct copy_range { > + struct dn_id o; > + uint32_t r[ 2 * DEFAULT_RANGES ]; > +}; > + > +struct copy_args { > + char **start; > + char *end; > + int flags; > + int type; > + struct copy_range *extra; /* extra filtering */ > +}; > + > +struct sockopt; > +int ip_dummynet_compat(struct sockopt *sopt); > +int dummynet_get(struct sockopt *sopt, void **compat); > +int dn_c_copy_q (void *_ni, void *arg); > +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); > +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); > +int dn_compat_copy_queue(struct copy_args *a, void *_o); > +int dn_compat_copy_pipe(struct copy_args *a, void *_o); > +int copy_data_helper_compat(void *_o, void *_arg); > +int dn_compat_calc_size(void); > +int do_config(void *p, int l); > + > +/* function to drain idle object */ > +void dn_drain_scheduler(void); > +void dn_drain_queue(void); > + > +#endif /* _IP_DN_PRIVATE_H */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dummynet.c b/example/ipfw/sys/netpfil/ipfw/ip_dummynet.c > new file mode 100644 > index 0000000..a4dcb4f > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_dummynet.c > @@ -0,0 +1,2334 @@ > +/*- > + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa > + * Portions Copyright (c) 2000 Akamba Corp. > + * All rights reserved > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_dummynet.c 272840 2014-10-09 19:32:35Z melifaro $"); > + > +/* > + * Configuration and internal object management for dummynet. 
> + */ > + > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/module.h> > +#include <sys/mutex.h> > +#include <sys/priv.h> > +#include <sys/proc.h> > +#include <sys/rwlock.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/time.h> > +#include <sys/taskqueue.h> > +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ > +#include <netinet/ip_fw.h> > +#include <netinet/ip_dummynet.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/dn_heap.h> > +#include <netpfil/ipfw/ip_dn_private.h> > +#include <netpfil/ipfw/dn_sched.h> > + > +/* which objects to copy */ > +#define DN_C_LINK 0x01 > +#define DN_C_SCH 0x02 > +#define DN_C_FLOW 0x04 > +#define DN_C_FS 0x08 > +#define DN_C_QUEUE 0x10 > + > +/* we use this argument in case of a schk_new */ > +struct schk_new_arg { > + struct dn_alg *fp; > + struct dn_sch *sch; > +}; > + > +/*---- callout hooks. ----*/ > +static struct callout dn_timeout; > +static struct task dn_task; > +static struct taskqueue *dn_tq = NULL; > + > +static void > +dummynet(void *arg) > +{ > + > + (void)arg; /* UNUSED */ > + taskqueue_enqueue_fast(dn_tq, &dn_task); > +} > + > +void > +dn_reschedule(void) > +{ > + callout_reset(&dn_timeout, 1, dummynet, NULL); > +} > +/*----- end of callout hooks -----*/ > + > +/* Return a scheduler descriptor given the type or name. 
*/ > +static struct dn_alg * > +find_sched_type(int type, char *name) > +{ > + struct dn_alg *d; > + > + SLIST_FOREACH(d, &dn_cfg.schedlist, next) { > + if (d->type == type || (name && !strcasecmp(d->name, name))) > + return d; > + } > + return NULL; /* not found */ > +} > + > +int > +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) > +{ > + int oldv = *v; > + const char *op = NULL; > + if (dflt < lo) > + dflt = lo; > + if (dflt > hi) > + dflt = hi; > + if (oldv < lo) { > + *v = dflt; > + op = "Bump"; > + } else if (oldv > hi) { > + *v = hi; > + op = "Clamp"; > + } else > + return *v; > + if (op && msg) > + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); > + return *v; > +} > + > +/*---- flow_id mask, hash and compare functions ---*/ > +/* > + * The flow_id includes the 5-tuple, the queue/pipe number > + * which we store in the extra area in host order, > + * and for ipv6 also the flow_id6. > + * XXX see if we want the tos byte (can store in 'flags') > + */ > +static struct ipfw_flow_id * > +flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) > +{ > + int is_v6 = IS_IP6_FLOW_ID(id); > + > + id->dst_port &= mask->dst_port; > + id->src_port &= mask->src_port; > + id->proto &= mask->proto; > + id->extra &= mask->extra; > + if (is_v6) { > + APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); > + APPLY_MASK(&id->src_ip6, &mask->src_ip6); > + id->flow_id6 &= mask->flow_id6; > + } else { > + id->dst_ip &= mask->dst_ip; > + id->src_ip &= mask->src_ip; > + } > + return id; > +} > + > +/* computes an OR of two masks, result in dst and also returned */ > +static struct ipfw_flow_id * > +flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) > +{ > + int is_v6 = IS_IP6_FLOW_ID(dst); > + > + dst->dst_port |= src->dst_port; > + dst->src_port |= src->src_port; > + dst->proto |= src->proto; > + dst->extra |= src->extra; > + if (is_v6) { > +#define OR_MASK(_d, _s) \ > + (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ > + 
(_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ > + (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ > + (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; > + OR_MASK(&dst->dst_ip6, &src->dst_ip6); > + OR_MASK(&dst->src_ip6, &src->src_ip6); > +#undef OR_MASK > + dst->flow_id6 |= src->flow_id6; > + } else { > + dst->dst_ip |= src->dst_ip; > + dst->src_ip |= src->src_ip; > + } > + return dst; > +} > + > +static int > +nonzero_mask(struct ipfw_flow_id *m) > +{ > + if (m->dst_port || m->src_port || m->proto || m->extra) > + return 1; > + if (IS_IP6_FLOW_ID(m)) { > + return > + m->dst_ip6.__u6_addr.__u6_addr32[0] || > + m->dst_ip6.__u6_addr.__u6_addr32[1] || > + m->dst_ip6.__u6_addr.__u6_addr32[2] || > + m->dst_ip6.__u6_addr.__u6_addr32[3] || > + m->src_ip6.__u6_addr.__u6_addr32[0] || > + m->src_ip6.__u6_addr.__u6_addr32[1] || > + m->src_ip6.__u6_addr.__u6_addr32[2] || > + m->src_ip6.__u6_addr.__u6_addr32[3] || > + m->flow_id6; > + } else { > + return m->dst_ip || m->src_ip; > + } > +} > + > +/* XXX we may want a better hash function */ > +static uint32_t > +flow_id_hash(struct ipfw_flow_id *id) > +{ > + uint32_t i; > + > + if (IS_IP6_FLOW_ID(id)) { > + uint32_t *d = (uint32_t *)&id->dst_ip6; > + uint32_t *s = (uint32_t *)&id->src_ip6; > + i = (d[0] ) ^ (d[1]) ^ > + (d[2] ) ^ (d[3]) ^ > + (d[0] >> 15) ^ (d[1] >> 15) ^ > + (d[2] >> 15) ^ (d[3] >> 15) ^ > + (s[0] << 1) ^ (s[1] << 1) ^ > + (s[2] << 1) ^ (s[3] << 1) ^ > + (s[0] << 16) ^ (s[1] << 16) ^ > + (s[2] << 16) ^ (s[3] << 16) ^ > + (id->dst_port << 1) ^ (id->src_port) ^ > + (id->extra) ^ > + (id->proto ) ^ (id->flow_id6); > + } else { > + i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ > + (id->src_ip << 1) ^ (id->src_ip >> 16) ^ > + (id->extra) ^ > + (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); > + } > + return i; > +} > + > +/* Like bcmp, returns 0 if ids match, 1 otherwise. 
*/ > +static int > +flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) > +{ > + int is_v6 = IS_IP6_FLOW_ID(id1); > + > + if (!is_v6) { > + if (IS_IP6_FLOW_ID(id2)) > + return 1; /* different address families */ > + > + return (id1->dst_ip == id2->dst_ip && > + id1->src_ip == id2->src_ip && > + id1->dst_port == id2->dst_port && > + id1->src_port == id2->src_port && > + id1->proto == id2->proto && > + id1->extra == id2->extra) ? 0 : 1; > + } > + /* the ipv6 case */ > + return ( > + !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && > + !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && > + id1->dst_port == id2->dst_port && > + id1->src_port == id2->src_port && > + id1->proto == id2->proto && > + id1->extra == id2->extra && > + id1->flow_id6 == id2->flow_id6) ? 0 : 1; > +} > +/*--------- end of flow-id mask, hash and compare ---------*/ > + > +/*--- support functions for the qht hashtable ---- > + * Entries are hashed by flow-id > + */ > +static uint32_t > +q_hash(uintptr_t key, int flags, void *arg) > +{ > + /* compute the hash slot from the flow id */ > + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? > + &((struct dn_queue *)key)->ni.fid : > + (struct ipfw_flow_id *)key; > + > + return flow_id_hash(id); > +} > + > +static int > +q_match(void *obj, uintptr_t key, int flags, void *arg) > +{ > + struct dn_queue *o = (struct dn_queue *)obj; > + struct ipfw_flow_id *id2; > + > + if (flags & DNHT_KEY_IS_OBJ) { > + /* compare pointers */ > + id2 = &((struct dn_queue *)key)->ni.fid; > + } else { > + id2 = (struct ipfw_flow_id *)key; > + } > + return (0 == flow_id_cmp(&o->ni.fid, id2)); > +} > + > +/* > + * create a new queue instance for the given 'key'. 
> + */ > +static void * > +q_new(uintptr_t key, int flags, void *arg) > +{ > + struct dn_queue *q, *template = arg; > + struct dn_fsk *fs = template->fs; > + int size = sizeof(*q) + fs->sched->fp->q_datalen; > + > + q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (q == NULL) { > + D("no memory for new queue"); > + return NULL; > + } > + > + set_oid(&q->ni.oid, DN_QUEUE, size); > + if (fs->fs.flags & DN_QHT_HASH) > + q->ni.fid = *(struct ipfw_flow_id *)key; > + q->fs = fs; > + q->_si = template->_si; > + q->_si->q_count++; > + > + if (fs->sched->fp->new_queue) > + fs->sched->fp->new_queue(q); > + dn_cfg.queue_count++; > + return q; > +} > + > +/* > + * Notify schedulers that a queue is going away. > + * If (flags & DN_DESTROY), also free the packets. > + * The version for callbacks is called q_delete_cb(). > + */ > +static void > +dn_delete_queue(struct dn_queue *q, int flags) > +{ > + struct dn_fsk *fs = q->fs; > + > + // D("fs %p si %p\n", fs, q->_si); > + /* notify the parent scheduler that the queue is going away */ > + if (fs && fs->sched->fp->free_queue) > + fs->sched->fp->free_queue(q); > + q->_si->q_count--; > + q->_si = NULL; > + if (flags & DN_DESTROY) { > + if (q->mq.head) > + dn_free_pkts(q->mq.head); > + bzero(q, sizeof(*q)); // safety > + free(q, M_DUMMYNET); > + dn_cfg.queue_count--; > + } > +} > + > +static int > +q_delete_cb(void *q, void *arg) > +{ > + int flags = (int)(uintptr_t)arg; > + dn_delete_queue(q, flags); > + return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; > +} > + > +/* > + * calls dn_delete_queue/q_delete_cb on all queues, > + * which notifies the parent scheduler and possibly drains packets. 
> + * flags & DN_DESTROY: drains queues and destroy qht; > + */ > +static void > +qht_delete(struct dn_fsk *fs, int flags) > +{ > + ND("fs %d start flags %d qht %p", > + fs->fs.fs_nr, flags, fs->qht); > + if (!fs->qht) > + return; > + if (fs->fs.flags & DN_QHT_HASH) { > + dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); > + if (flags & DN_DESTROY) { > + dn_ht_free(fs->qht, 0); > + fs->qht = NULL; > + } > + } else { > + dn_delete_queue((struct dn_queue *)(fs->qht), flags); > + if (flags & DN_DESTROY) > + fs->qht = NULL; > + } > +} > + > +/* > + * Find and possibly create the queue for a MULTIQUEUE scheduler. > + * We never call it for !MULTIQUEUE (the queue is in the sch_inst). > + */ > +struct dn_queue * > +ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, > + struct ipfw_flow_id *id) > +{ > + struct dn_queue template; > + > + template._si = si; > + template.fs = fs; > + > + if (fs->fs.flags & DN_QHT_HASH) { > + struct ipfw_flow_id masked_id; > + if (fs->qht == NULL) { > + fs->qht = dn_ht_init(NULL, fs->fs.buckets, > + offsetof(struct dn_queue, q_next), > + q_hash, q_match, q_new); > + if (fs->qht == NULL) > + return NULL; > + } > + masked_id = *id; > + flow_id_mask(&fs->fsk_mask, &masked_id); > + return dn_ht_find(fs->qht, (uintptr_t)&masked_id, > + DNHT_INSERT, &template); > + } else { > + if (fs->qht == NULL) > + fs->qht = q_new(0, 0, &template); > + return (struct dn_queue *)fs->qht; > + } > +} > +/*--- end of queue hash table ---*/ > + > +/*--- support functions for the sch_inst hashtable ---- > + * > + * These are hashed by flow-id > + */ > +static uint32_t > +si_hash(uintptr_t key, int flags, void *arg) > +{ > + /* compute the hash slot from the flow id */ > + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? 
> + &((struct dn_sch_inst *)key)->ni.fid : > + (struct ipfw_flow_id *)key; > + > + return flow_id_hash(id); > +} > + > +static int > +si_match(void *obj, uintptr_t key, int flags, void *arg) > +{ > + struct dn_sch_inst *o = obj; > + struct ipfw_flow_id *id2; > + > + id2 = (flags & DNHT_KEY_IS_OBJ) ? > + &((struct dn_sch_inst *)key)->ni.fid : > + (struct ipfw_flow_id *)key; > + return flow_id_cmp(&o->ni.fid, id2) == 0; > +} > + > +/* > + * create a new instance for the given 'key' > + * Allocate memory for instance, delay line and scheduler private data. > + */ > +static void * > +si_new(uintptr_t key, int flags, void *arg) > +{ > + struct dn_schk *s = arg; > + struct dn_sch_inst *si; > + int l = sizeof(*si) + s->fp->si_datalen; > + > + si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (si == NULL) > + goto error; > + > + /* Set length only for the part passed up to userland. */ > + set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); > + set_oid(&(si->dline.oid), DN_DELAY_LINE, > + sizeof(struct delay_line)); > + /* mark si and dline as outside the event queue */ > + si->ni.oid.id = si->dline.oid.id = -1; > + > + si->sched = s; > + si->dline.si = si; > + > + if (s->fp->new_sched && s->fp->new_sched(si)) { > + D("new_sched error"); > + goto error; > + } > + if (s->sch.flags & DN_HAVE_MASK) > + si->ni.fid = *(struct ipfw_flow_id *)key; > + > + dn_cfg.si_count++; > + return si; > + > +error: > + if (si) { > + bzero(si, sizeof(*si)); // safety > + free(si, M_DUMMYNET); > + } > + return NULL; > +} > + > +/* > + * Callback from siht to delete all scheduler instances. Remove > + * si and delay line from the system heap, destroy all queues. > + * We assume that all flowset have been notified and do not > + * point to us anymore. 
> + */ > +static int > +si_destroy(void *_si, void *arg) > +{ > + struct dn_sch_inst *si = _si; > + struct dn_schk *s = si->sched; > + struct delay_line *dl = &si->dline; > + > + if (dl->oid.subtype) /* remove delay line from event heap */ > + heap_extract(&dn_cfg.evheap, dl); > + dn_free_pkts(dl->mq.head); /* drain delay line */ > + if (si->kflags & DN_ACTIVE) /* remove si from event heap */ > + heap_extract(&dn_cfg.evheap, si); > + if (s->fp->free_sched) > + s->fp->free_sched(si); > + bzero(si, sizeof(*si)); /* safety */ > + free(si, M_DUMMYNET); > + dn_cfg.si_count--; > + return DNHT_SCAN_DEL; > +} > + > +/* > + * Find the scheduler instance for this packet. If we need to apply > + * a mask, do on a local copy of the flow_id to preserve the original. > + * Assume siht is always initialized if we have a mask. > + */ > +struct dn_sch_inst * > +ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) > +{ > + > + if (s->sch.flags & DN_HAVE_MASK) { > + struct ipfw_flow_id id_t = *id; > + flow_id_mask(&s->sch.sched_mask, &id_t); > + return dn_ht_find(s->siht, (uintptr_t)&id_t, > + DNHT_INSERT, s); > + } > + if (!s->siht) > + s->siht = si_new(0, 0, s); > + return (struct dn_sch_inst *)s->siht; > +} > + > +/* callback to flush credit for the scheduler instance */ > +static int > +si_reset_credit(void *_si, void *arg) > +{ > + struct dn_sch_inst *si = _si; > + struct dn_link *p = &si->sched->link; > + > + si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); > + return 0; > +} > + > +static void > +schk_reset_credit(struct dn_schk *s) > +{ > + if (s->sch.flags & DN_HAVE_MASK) > + dn_ht_scan(s->siht, si_reset_credit, NULL); > + else if (s->siht) > + si_reset_credit(s->siht, NULL); > +} > +/*---- end of sch_inst hashtable ---------------------*/ > + > +/*------------------------------------------------------- > + * flowset hash (fshash) support. Entries are hashed by fs_nr. 
> + * New allocations are put in the fsunlinked list, from which > + * they are removed when they point to a specific scheduler. > + */ > +static uint32_t > +fsk_hash(uintptr_t key, int flags, void *arg) > +{ > + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : > + ((struct dn_fsk *)key)->fs.fs_nr; > + > + return ( (i>>8)^(i>>4)^i ); > +} > + > +static int > +fsk_match(void *obj, uintptr_t key, int flags, void *arg) > +{ > + struct dn_fsk *fs = obj; > + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : > + ((struct dn_fsk *)key)->fs.fs_nr; > + > + return (fs->fs.fs_nr == i); > +} > + > +static void * > +fsk_new(uintptr_t key, int flags, void *arg) > +{ > + struct dn_fsk *fs; > + > + fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (fs) { > + set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); > + dn_cfg.fsk_count++; > + fs->drain_bucket = 0; > + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); > + } > + return fs; > +} > + > +/* > + * detach flowset from its current scheduler. Flags as follows: > + * DN_DETACH removes from the fsk_list > + * DN_DESTROY deletes individual queues > + * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). > + */ > +static void > +fsk_detach(struct dn_fsk *fs, int flags) > +{ > + if (flags & DN_DELETE_FS) > + flags |= DN_DESTROY; > + ND("fs %d from sched %d flags %s %s %s", > + fs->fs.fs_nr, fs->fs.sched_nr, > + (flags & DN_DELETE_FS) ? "DEL_FS":"", > + (flags & DN_DESTROY) ? "DEL":"", > + (flags & DN_DETACH) ? "DET":""); > + if (flags & DN_DETACH) { /* detach from the list */ > + struct dn_fsk_head *h; > + h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; > + SLIST_REMOVE(h, fs, dn_fsk, sch_chain); > + } > + /* Free the RED parameters, they will be recomputed on > + * subsequent attach if needed. 
> + */ > + if (fs->w_q_lookup) > + free(fs->w_q_lookup, M_DUMMYNET); > + fs->w_q_lookup = NULL; > + qht_delete(fs, flags); > + if (fs->sched && fs->sched->fp->free_fsk) > + fs->sched->fp->free_fsk(fs); > + fs->sched = NULL; > + if (flags & DN_DELETE_FS) { > + bzero(fs, sizeof(*fs)); /* safety */ > + free(fs, M_DUMMYNET); > + dn_cfg.fsk_count--; > + } else { > + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); > + } > +} > + > +/* > + * Detach or destroy all flowsets in a list. > + * flags specifies what to do: > + * DN_DESTROY: flush all queues > + * DN_DELETE_FS: DN_DESTROY + destroy flowset > + * DN_DELETE_FS implies DN_DESTROY > + */ > +static void > +fsk_detach_list(struct dn_fsk_head *h, int flags) > +{ > + struct dn_fsk *fs; > + int n = 0; /* only for stats */ > + > + ND("head %p flags %x", h, flags); > + while ((fs = SLIST_FIRST(h))) { > + SLIST_REMOVE_HEAD(h, sch_chain); > + n++; > + fsk_detach(fs, flags); > + } > + ND("done %d flowsets", n); > +} > + > +/* > + * called on 'queue X delete' -- removes the flowset from fshash, > + * deletes all queues for the flowset, and removes the flowset. > + */ > +static int > +delete_fs(int i, int locked) > +{ > + struct dn_fsk *fs; > + int err = 0; > + > + if (!locked) > + DN_BH_WLOCK(); > + fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); > + ND("fs %d found %p", i, fs); > + if (fs) { > + fsk_detach(fs, DN_DETACH | DN_DELETE_FS); > + err = 0; > + } else > + err = EINVAL; > + if (!locked) > + DN_BH_WUNLOCK(); > + return err; > +} > + > +/*----- end of flowset hashtable support -------------*/ > + > +/*------------------------------------------------------------ > + * Scheduler hash. When searching by index we pass sched_nr, > + * otherwise we pass struct dn_sch * which is the first field in > + * struct dn_schk so we can cast between the two. We use this trick > + * because in the create phase (but it should be fixed). 
> + */ > +static uint32_t > +schk_hash(uintptr_t key, int flags, void *_arg) > +{ > + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : > + ((struct dn_schk *)key)->sch.sched_nr; > + return ( (i>>8)^(i>>4)^i ); > +} > + > +static int > +schk_match(void *obj, uintptr_t key, int flags, void *_arg) > +{ > + struct dn_schk *s = (struct dn_schk *)obj; > + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : > + ((struct dn_schk *)key)->sch.sched_nr; > + return (s->sch.sched_nr == i); > +} > + > +/* > + * Create the entry and intialize with the sched hash if needed. > + * Leave s->fp unset so we can tell whether a dn_ht_find() returns > + * a new object or a previously existing one. > + */ > +static void * > +schk_new(uintptr_t key, int flags, void *arg) > +{ > + struct schk_new_arg *a = arg; > + struct dn_schk *s; > + int l = sizeof(*s) +a->fp->schk_datalen; > + > + s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (s == NULL) > + return NULL; > + set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); > + s->sch = *a->sch; // copy initial values > + s->link.link_nr = s->sch.sched_nr; > + SLIST_INIT(&s->fsk_list); > + /* initialize the hash table or create the single instance */ > + s->fp = a->fp; /* si_new needs this */ > + s->drain_bucket = 0; > + if (s->sch.flags & DN_HAVE_MASK) { > + s->siht = dn_ht_init(NULL, s->sch.buckets, > + offsetof(struct dn_sch_inst, si_next), > + si_hash, si_match, si_new); > + if (s->siht == NULL) { > + free(s, M_DUMMYNET); > + return NULL; > + } > + } > + s->fp = NULL; /* mark as a new scheduler */ > + dn_cfg.schk_count++; > + return s; > +} > + > +/* > + * Callback for sched delete. Notify all attached flowsets to > + * detach from the scheduler, destroy the internal flowset, and > + * all instances. The scheduler goes away too. 
> + * arg is 0 (only detach flowsets and destroy instances) > + * DN_DESTROY (detach & delete queues, delete schk) > + * or DN_DELETE_FS (delete queues and flowsets, delete schk) > + */ > +static int > +schk_delete_cb(void *obj, void *arg) > +{ > + struct dn_schk *s = obj; > + struct dn_profile **p = &s->profile; > + int i, lim = 1 /* how many profiles */; > + > +#if 0 > + int a = (int)arg; > + ND("sched %d arg %s%s", > + s->sch.sched_nr, > + a&DN_DESTROY ? "DEL ":"", > + a&DN_DELETE_FS ? "DEL_FS":""); > +#endif > + fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); > + /* no more flowset pointing to us now */ > + if (s->sch.flags & DN_HAVE_MASK) { > + dn_ht_scan(s->siht, si_destroy, NULL); > + dn_ht_free(s->siht, 0); > + } else if (s->siht) > + si_destroy(s->siht, NULL); > + > + for (i = 0; i < lim; i++) { > + if (p[i]) { > + free(p[i], M_DUMMYNET); > + p[i] = NULL; > + } > + } > + s->siht = NULL; > + if (s->fp->destroy) > + s->fp->destroy(s); > + bzero(s, sizeof(*s)); // safety > + free(obj, M_DUMMYNET); > + dn_cfg.schk_count--; > + return DNHT_SCAN_DEL; > +} > + > +/* > + * called on a 'sched X delete' command. Deletes a single scheduler. > + * This is done by removing from the schedhash, unlinking all > + * flowsets and deleting their traffic. 
> + */ > +static int > +delete_schk(int i) > +{ > + struct dn_schk *s; > + > + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); > + ND("%d %p", i, s); > + if (!s) > + return EINVAL; > + delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ > + /* then detach flowsets, delete traffic */ > + schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); > + return 0; > +} > +/*--- end of schk hashtable support ---*/ > + > +static int > +copy_obj(char **start, char *end, void *_o, const char *msg, int i) > +{ > + struct dn_id *o = _o; > + int have = end - *start; > + > + if (have < o->len || o->len == 0 || o->type == 0) { > + D("(WARN) type %d %s %d have %d need %d", > + o->type, msg, i, have, o->len); > + return 1; > + } > + ND("type %d %s %d len %d", o->type, msg, i, o->len); > + bcopy(_o, *start, o->len); > + if (o->type == DN_LINK) { > + /* Adjust burst parameter for link */ > + struct dn_link *l = (struct dn_link *)*start; > + l->burst = div64(l->burst, 8 * hz); > + /* convert back to milliseconds */ > + l->delay = l->delay * 1000 / hz; > + } else if (o->type == DN_SCH) { > + /* Set id->id to the number of instances */ > + struct dn_schk *s = _o; > + struct dn_id *id = (struct dn_id *)(*start); > + id->id = (s->sch.flags & DN_HAVE_MASK) ? > + dn_ht_entries(s->siht) : (s->siht ? 1 : 0); > + } > + *start += o->len; > + return 0; > +} > + > +/* Specific function to copy a queue. > + * Copies only the user-visible part of a queue (which is in > + * a struct dn_flow), and sets len accordingly. 
> + */ > +static int > +copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) > +{ > + struct dn_id *o = _o; > + int have = end - *start; > + int len = sizeof(struct dn_flow); /* see above comment */ > + > + if (have < len || o->len == 0 || o->type != DN_QUEUE) { > + D("ERROR type %d %s %d have %d need %d", > + o->type, msg, i, have, len); > + return 1; > + } > + ND("type %d %s %d len %d", o->type, msg, i, len); > + bcopy(_o, *start, len); > + ((struct dn_id*)(*start))->len = len; > + *start += len; > + return 0; > +} > + > +static int > +copy_q_cb(void *obj, void *arg) > +{ > + struct dn_queue *q = obj; > + struct copy_args *a = arg; > + struct dn_flow *ni = (struct dn_flow *)(*a->start); > + if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) > + return DNHT_SCAN_END; > + ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ > + ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); > + return 0; > +} > + > +static int > +copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) > +{ > + if (!fs->qht) > + return 0; > + if (fs->fs.flags & DN_QHT_HASH) > + dn_ht_scan(fs->qht, copy_q_cb, a); > + else > + copy_q_cb(fs->qht, a); > + return 0; > +} > + > +/* > + * This routine only copies the initial part of a profile ? 
XXX > + */ > +static int > +copy_profile(struct copy_args *a, struct dn_profile *p) > +{ > + int have = a->end - *a->start; > + /* XXX start with base length */ > + int profile_len = sizeof(struct dn_profile) - > + ED_MAX_SAMPLES_NO*sizeof(int); > + > + if (p == NULL) > + return 0; > + profile_len += p->samples_no * sizeof(int); /* add actual samples */ > + if (have < profile_len) { > + D("error have %d need %d", have, profile_len); > + return 1; > + } > + bcopy(p, *a->start, profile_len); > + ((struct dn_id *)(*a->start))->len = profile_len; > + *a->start += profile_len; > + return 0; > +} > + > +static int > +copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) > +{ > + struct dn_fs *ufs = (struct dn_fs *)(*a->start); > + if (!fs) > + return 0; > + ND("flowset %d", fs->fs.fs_nr); > + if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) > + return DNHT_SCAN_END; > + ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? > + dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); > + if (flags) { /* copy queues */ > + copy_q(a, fs, 0); > + } > + return 0; > +} > + > +static int > +copy_si_cb(void *obj, void *arg) > +{ > + struct dn_sch_inst *si = obj; > + struct copy_args *a = arg; > + struct dn_flow *ni = (struct dn_flow *)(*a->start); > + if (copy_obj(a->start, a->end, &si->ni, "inst", > + si->sched->sch.sched_nr)) > + return DNHT_SCAN_END; > + ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ > + ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); > + return 0; > +} > + > +static int > +copy_si(struct copy_args *a, struct dn_schk *s, int flags) > +{ > + if (s->sch.flags & DN_HAVE_MASK) > + dn_ht_scan(s->siht, copy_si_cb, a); > + else if (s->siht) > + copy_si_cb(s->siht, a); > + return 0; > +} > + > +/* > + * compute a list of children of a scheduler and copy up > + */ > +static int > +copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) > +{ > + struct dn_fsk *fs; > + struct dn_id *o; > + uint32_t *p; > + > + int n = 0, space = 
sizeof(*o); > + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { > + if (fs->fs.fs_nr < DN_MAX_ID) > + n++; > + } > + space += n * sizeof(uint32_t); > + DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); > + if (a->end - *(a->start) < space) > + return DNHT_SCAN_END; > + o = (struct dn_id *)(*(a->start)); > + o->len = space; > + *a->start += o->len; > + o->type = DN_TEXT; > + p = (uint32_t *)(o+1); > + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) > + if (fs->fs.fs_nr < DN_MAX_ID) > + *p++ = fs->fs.fs_nr; > + return 0; > +} > + > +static int > +copy_data_helper(void *_o, void *_arg) > +{ > + struct copy_args *a = _arg; > + uint32_t *r = a->extra->r; /* start of first range */ > + uint32_t *lim; /* first invalid pointer */ > + int n; > + > + lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); > + > + if (a->type == DN_LINK || a->type == DN_SCH) { > + /* pipe|sched show, we receive a dn_schk */ > + struct dn_schk *s = _o; > + > + n = s->sch.sched_nr; > + if (a->type == DN_SCH && n >= DN_MAX_ID) > + return 0; /* not a scheduler */ > + if (a->type == DN_LINK && n <= DN_MAX_ID) > + return 0; /* not a pipe */ > + > + /* see if the object is within one of our ranges */ > + for (;r < lim; r += 2) { > + if (n < r[0] || n > r[1]) > + continue; > + /* Found a valid entry, copy and we are done */ > + if (a->flags & DN_C_LINK) { > + if (copy_obj(a->start, a->end, > + &s->link, "link", n)) > + return DNHT_SCAN_END; > + if (copy_profile(a, s->profile)) > + return DNHT_SCAN_END | DNHT_COPY_ERR; > + if (copy_flowset(a, s->fs, 0)) > + return DNHT_SCAN_END; > + } > + if (a->flags & DN_C_SCH) { > + if (copy_obj(a->start, a->end, > + &s->sch, "sched", n)) > + return DNHT_SCAN_END | DNHT_COPY_ERR; > + /* list all attached flowsets */ > + if (copy_fsk_list(a, s, 0)) > + return DNHT_SCAN_END | DNHT_COPY_ERR; > + } > + if (a->flags & DN_C_FLOW) > + copy_si(a, s, 0); > + break; > + } > + } else if (a->type == DN_FS) { > + /* queue show, skip internal flowsets */ > + struct dn_fsk *fs = 
_o; > + > + n = fs->fs.fs_nr; > + if (n >= DN_MAX_ID) > + return 0; > + /* see if the object is within one of our ranges */ > + for (;r < lim; r += 2) { > + if (n < r[0] || n > r[1]) > + continue; > + if (copy_flowset(a, fs, 0)) > + return DNHT_SCAN_END | DNHT_COPY_ERR; > + copy_q(a, fs, 0); > + break; /* we are done */ > + } > + } > + return 0; > +} > + > +static inline struct dn_schk * > +locate_scheduler(int i) > +{ > + return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); > +} > + > +/* > + * red parameters are in fixed point arithmetic. > + */ > +static int > +config_red(struct dn_fsk *fs) > +{ > + int64_t s, idle, weight, w0; > + int t, i; > + > + fs->w_q = fs->fs.w_q; > + fs->max_p = fs->fs.max_p; > + ND("called"); > + /* Doing stuff that was in userland */ > + i = fs->sched->link.bandwidth; > + s = (i <= 0) ? 0 : > + hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; > + > + idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ > + fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); > + /* fs->lookup_step not scaled, */ > + if (!fs->lookup_step) > + fs->lookup_step = 1; > + w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled > + > + for (t = fs->lookup_step; t > 1; --t) > + weight = SCALE_MUL(weight, w0); > + fs->lookup_weight = (int)(weight); // scaled > + > + /* Now doing stuff that was in kerneland */ > + fs->min_th = SCALE(fs->fs.min_th); > + fs->max_th = SCALE(fs->fs.max_th); > + > + if (fs->fs.max_th == fs->fs.min_th) > + fs->c_1 = fs->max_p; > + else > + fs->c_1 = SCALE((int64_t)(fs->max_p)) / (fs->fs.max_th - fs->fs.min_th); > + fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); > + > + if (fs->fs.flags & DN_IS_GENTLE_RED) { > + fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; > + fs->c_4 = SCALE(1) - 2 * fs->max_p; > + } > + > + /* If the lookup table already exist, free and create it again. 
*/ > + if (fs->w_q_lookup) { > + free(fs->w_q_lookup, M_DUMMYNET); > + fs->w_q_lookup = NULL; > + } > + if (dn_cfg.red_lookup_depth == 0) { > + printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" > + "must be > 0\n"); > + fs->fs.flags &= ~DN_IS_RED; > + fs->fs.flags &= ~DN_IS_GENTLE_RED; > + return (EINVAL); > + } > + fs->lookup_depth = dn_cfg.red_lookup_depth; > + fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), > + M_DUMMYNET, M_NOWAIT); > + if (fs->w_q_lookup == NULL) { > + printf("dummynet: sorry, cannot allocate red lookup table\n"); > + fs->fs.flags &= ~DN_IS_RED; > + fs->fs.flags &= ~DN_IS_GENTLE_RED; > + return(ENOSPC); > + } > + > + /* Fill the lookup table with (1 - w_q)^x */ > + fs->w_q_lookup[0] = SCALE(1) - fs->w_q; > + > + for (i = 1; i < fs->lookup_depth; i++) > + fs->w_q_lookup[i] = > + SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); > + > + if (dn_cfg.red_avg_pkt_size < 1) > + dn_cfg.red_avg_pkt_size = 512; > + fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; > + if (dn_cfg.red_max_pkt_size < 1) > + dn_cfg.red_max_pkt_size = 1500; > + fs->max_pkt_size = dn_cfg.red_max_pkt_size; > + ND("exit"); > + return 0; > +} > + > +/* Scan all flowset attached to this scheduler and update red */ > +static void > +update_red(struct dn_schk *s) > +{ > + struct dn_fsk *fs; > + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { > + if (fs && (fs->fs.flags & DN_IS_RED)) > + config_red(fs); > + } > +} > + > +/* attach flowset to scheduler s, possibly requeue */ > +static void > +fsk_attach(struct dn_fsk *fs, struct dn_schk *s) > +{ > + ND("remove fs %d from fsunlinked, link to sched %d", > + fs->fs.fs_nr, s->sch.sched_nr); > + SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); > + fs->sched = s; > + SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); > + if (s->fp->new_fsk) > + s->fp->new_fsk(fs); > + /* XXX compute fsk_mask */ > + fs->fsk_mask = fs->fs.flow_mask; > + if (fs->sched->sch.flags & DN_HAVE_MASK) > + flow_id_or(&fs->sched->sch.sched_mask, 
&fs->fsk_mask); > + if (fs->qht) { > + /* > + * we must drain qht according to the old > + * type, and reinsert according to the new one. > + * The requeue is complex -- in general we need to > + * reclassify every single packet. > + * For the time being, let's hope qht is never set > + * when we reach this point. > + */ > + D("XXX TODO requeue from fs %d to sch %d", > + fs->fs.fs_nr, s->sch.sched_nr); > + fs->qht = NULL; > + } > + /* set the new type for qht */ > + if (nonzero_mask(&fs->fsk_mask)) > + fs->fs.flags |= DN_QHT_HASH; > + else > + fs->fs.flags &= ~DN_QHT_HASH; > + > + /* XXX config_red() can fail... */ > + if (fs->fs.flags & DN_IS_RED) > + config_red(fs); > +} > + > +/* update all flowsets which may refer to this scheduler */ > +static void > +update_fs(struct dn_schk *s) > +{ > + struct dn_fsk *fs, *tmp; > + > + SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { > + if (s->sch.sched_nr != fs->fs.sched_nr) { > + D("fs %d for sch %d not %d still unlinked", > + fs->fs.fs_nr, fs->fs.sched_nr, > + s->sch.sched_nr); > + continue; > + } > + fsk_attach(fs, s); > + } > +} > + > +/* > + * Configuration -- to preserve backward compatibility we use > + * the following scheme (N is 65536) > + * NUMBER SCHED LINK FLOWSET > + * 1 .. N-1 (1)WFQ (2)WFQ (3)queue > + * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 > + * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 > + * > + * "pipe i config" configures #1, #2 and #3 > + * "sched i config" configures #1 and possibly #6 > + * "queue i config" configures #3 > + * #1 is configured with 'pipe i config' or 'sched i config' > + * #2 is configured with 'pipe i config', and created if not > + * existing with 'sched i config' > + * #3 is configured with 'queue i config' > + * #4 is automatically configured after #1, can only be FIFO > + * #5 is automatically configured after #2 > + * #6 is automatically created when #1 is !MULTIQUEUE, > + * and can be updated. 
> + * #7 is automatically configured after #2 > + */ > + > +/* > + * configure a link (and its FIFO instance) > + */ > +static int > +config_link(struct dn_link *p, struct dn_id *arg) > +{ > + int i; > + > + if (p->oid.len != sizeof(*p)) { > + D("invalid pipe len %d", p->oid.len); > + return EINVAL; > + } > + i = p->link_nr; > + if (i <= 0 || i >= DN_MAX_ID) > + return EINVAL; > + /* > + * The config program passes parameters as follows: > + * bw = bits/second (0 means no limits), > + * delay = ms, must be translated into ticks. > + * qsize = slots/bytes > + * burst ??? > + */ > + p->delay = (p->delay * hz) / 1000; > + /* Scale burst size: bytes -> bits * hz */ > + p->burst *= 8 * hz; > + > + DN_BH_WLOCK(); > + /* do it twice, base link and FIFO link */ > + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { > + struct dn_schk *s = locate_scheduler(i); > + if (s == NULL) { > + DN_BH_WUNLOCK(); > + D("sched %d not found", i); > + return EINVAL; > + } > + /* remove profile if exists */ > + if (s->profile) { > + free(s->profile, M_DUMMYNET); > + s->profile = NULL; > + } > + /* copy all parameters */ > + s->link.oid = p->oid; > + s->link.link_nr = i; > + s->link.delay = p->delay; > + if (s->link.bandwidth != p->bandwidth) { > + /* XXX bandwidth changes, need to update red params */ > + s->link.bandwidth = p->bandwidth; > + update_red(s); > + } > + s->link.burst = p->burst; > + schk_reset_credit(s); > + } > + dn_cfg.id++; > + DN_BH_WUNLOCK(); > + return 0; > +} > + > +/* > + * configure a flowset. 
Can be called from inside with locked=1, > + */ > +static struct dn_fsk * > +config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) > +{ > + int i; > + struct dn_fsk *fs; > + > + if (nfs->oid.len != sizeof(*nfs)) { > + D("invalid flowset len %d", nfs->oid.len); > + return NULL; > + } > + i = nfs->fs_nr; > + if (i <= 0 || i >= 3*DN_MAX_ID) > + return NULL; > + ND("flowset %d", i); > + /* XXX other sanity checks */ > + if (nfs->flags & DN_QSIZE_BYTES) { > + ipdn_bound_var(&nfs->qsize, 16384, > + 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); > + } else { > + ipdn_bound_var(&nfs->qsize, 50, > + 1, dn_cfg.slot_limit, NULL); // "queue slot size"); > + } > + if (nfs->flags & DN_HAVE_MASK) { > + /* make sure we have some buckets */ > + ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size, > + 1, dn_cfg.max_hash_size, "flowset buckets"); > + } else { > + nfs->buckets = 1; /* we only need 1 */ > + } > + if (!locked) > + DN_BH_WLOCK(); > + do { /* exit with break when done */ > + struct dn_schk *s; > + int flags = nfs->sched_nr ? DNHT_INSERT : 0; > + int j; > + int oldc = dn_cfg.fsk_count; > + fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); > + if (fs == NULL) { > + D("missing sched for flowset %d", i); > + break; > + } > + /* grab some defaults from the existing one */ > + if (nfs->sched_nr == 0) /* reuse */ > + nfs->sched_nr = fs->fs.sched_nr; > + for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { > + if (nfs->par[j] == -1) /* reuse */ > + nfs->par[j] = fs->fs.par[j]; > + } > + if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { > + ND("flowset %d unchanged", i); > + break; /* no change, nothing to do */ > + } > + if (oldc != dn_cfg.fsk_count) /* new item */ > + dn_cfg.id++; > + s = locate_scheduler(nfs->sched_nr); > + /* detach from old scheduler if needed, preserving > + * queues if we need to reattach. Then update the > + * configuration, and possibly attach to the new sched. 
> + */ > + DX(2, "fs %d changed sched %d@%p to %d@%p", > + fs->fs.fs_nr, > + fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); > + if (fs->sched) { > + int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); > + flags |= DN_DESTROY; /* XXX temporary */ > + fsk_detach(fs, flags); > + } > + fs->fs = *nfs; /* copy configuration */ > + if (s != NULL) > + fsk_attach(fs, s); > + } while (0); > + if (!locked) > + DN_BH_WUNLOCK(); > + return fs; > +} > + > +/* > + * config/reconfig a scheduler and its FIFO variant. > + * For !MULTIQUEUE schedulers, also set up the flowset. > + * > + * On reconfigurations (detected because s->fp is set), > + * detach existing flowsets preserving traffic, preserve link, > + * and delete the old scheduler creating a new one. > + */ > +static int > +config_sched(struct dn_sch *_nsch, struct dn_id *arg) > +{ > + struct dn_schk *s; > + struct schk_new_arg a; /* argument for schk_new */ > + int i; > + struct dn_link p; /* copy of oldlink */ > + struct dn_profile *pf = NULL; /* copy of old link profile */ > + /* Used to preserv mask parameter */ > + struct ipfw_flow_id new_mask; > + int new_buckets = 0; > + int new_flags = 0; > + int pipe_cmd; > + int err = ENOMEM; > + > + a.sch = _nsch; > + if (a.sch->oid.len != sizeof(*a.sch)) { > + D("bad sched len %d", a.sch->oid.len); > + return EINVAL; > + } > + i = a.sch->sched_nr; > + if (i <= 0 || i >= DN_MAX_ID) > + return EINVAL; > + /* make sure we have some buckets */ > + if (a.sch->flags & DN_HAVE_MASK) > + ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size, > + 1, dn_cfg.max_hash_size, "sched buckets"); > + /* XXX other sanity checks */ > + bzero(&p, sizeof(p)); > + > + pipe_cmd = a.sch->flags & DN_PIPE_CMD; > + a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? 
> + if (pipe_cmd) { > + /* Copy mask parameter */ > + new_mask = a.sch->sched_mask; > + new_buckets = a.sch->buckets; > + new_flags = a.sch->flags; > + } > + DN_BH_WLOCK(); > +again: /* run twice, for wfq and fifo */ > + /* > + * lookup the type. If not supplied, use the previous one > + * or default to WF2Q+. Otherwise, return an error. > + */ > + dn_cfg.id++; > + a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); > + if (a.fp != NULL) { > + /* found. Lookup or create entry */ > + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); > + } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { > + /* No type. search existing s* or retry with WF2Q+ */ > + s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); > + if (s != NULL) { > + a.fp = s->fp; > + /* Scheduler exists, skip to FIFO scheduler > + * if command was pipe config... > + */ > + if (pipe_cmd) > + goto next; > + } else { > + /* New scheduler, create a wf2q+ with no mask > + * if command was pipe config... > + */ > + if (pipe_cmd) { > + /* clear mask parameter */ > + bzero(&a.sch->sched_mask, sizeof(new_mask)); > + a.sch->buckets = 0; > + a.sch->flags &= ~DN_HAVE_MASK; > + } > + a.sch->oid.subtype = DN_SCHED_WF2QP; > + goto again; > + } > + } else { > + D("invalid scheduler type %d %s", > + a.sch->oid.subtype, a.sch->name); > + err = EINVAL; > + goto error; > + } > + /* normalize name and subtype */ > + a.sch->oid.subtype = a.fp->type; > + bzero(a.sch->name, sizeof(a.sch->name)); > + strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); > + if (s == NULL) { > + D("cannot allocate scheduler %d", i); > + goto error; > + } > + /* restore existing link if any */ > + if (p.link_nr) { > + s->link = p; > + if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ > + s->profile = NULL; /* XXX maybe not needed */ > + } else { > + s->profile = malloc(sizeof(struct dn_profile), > + M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (s->profile == NULL) { > + D("cannot allocate profile"); > + goto error; //XXX > + } > + bcopy(pf, 
s->profile, sizeof(*pf)); > + } > + } > + p.link_nr = 0; > + if (s->fp == NULL) { > + DX(2, "sched %d new type %s", i, a.fp->name); > + } else if (s->fp != a.fp || > + bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { > + /* already existing. */ > + DX(2, "sched %d type changed from %s to %s", > + i, s->fp->name, a.fp->name); > + DX(4, " type/sub %d/%d -> %d/%d", > + s->sch.oid.type, s->sch.oid.subtype, > + a.sch->oid.type, a.sch->oid.subtype); > + if (s->link.link_nr == 0) > + D("XXX WARNING link 0 for sched %d", i); > + p = s->link; /* preserve link */ > + if (s->profile) {/* preserve profile */ > + if (!pf) > + pf = malloc(sizeof(*pf), > + M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (pf) /* XXX should issue a warning otherwise */ > + bcopy(s->profile, pf, sizeof(*pf)); > + } > + /* remove from the hash */ > + dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); > + /* Detach flowsets, preserve queues. */ > + // schk_delete_cb(s, NULL); > + // XXX temporarily, kill queues > + schk_delete_cb(s, (void *)DN_DESTROY); > + goto again; > + } else { > + DX(4, "sched %d unchanged type %s", i, a.fp->name); > + } > + /* complete initialization */ > + s->sch = *a.sch; > + s->fp = a.fp; > + s->cfg = arg; > + // XXX schk_reset_credit(s); > + /* create the internal flowset if needed, > + * trying to reuse existing ones if available > + */ > + if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { > + s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); > + if (!s->fs) { > + struct dn_fs fs; > + bzero(&fs, sizeof(fs)); > + set_oid(&fs.oid, DN_FS, sizeof(fs)); > + fs.fs_nr = i + DN_MAX_ID; > + fs.sched_nr = i; > + s->fs = config_fs(&fs, NULL, 1 /* locked */); > + } > + if (!s->fs) { > + schk_delete_cb(s, (void *)DN_DESTROY); > + D("error creating internal fs for %d", i); > + goto error; > + } > + } > + /* call init function after the flowset is created */ > + if (s->fp->config) > + s->fp->config(s); > + update_fs(s); > +next: > + if (i < DN_MAX_ID) { /* now configure the FIFO instance */ > + i += 
DN_MAX_ID; > + if (pipe_cmd) { > + /* Restore mask parameter for FIFO */ > + a.sch->sched_mask = new_mask; > + a.sch->buckets = new_buckets; > + a.sch->flags = new_flags; > + } else { > + /* sched config shouldn't modify the FIFO scheduler */ > + if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { > + /* FIFO already exist, don't touch it */ > + err = 0; /* and this is not an error */ > + goto error; > + } > + } > + a.sch->sched_nr = i; > + a.sch->oid.subtype = DN_SCHED_FIFO; > + bzero(a.sch->name, sizeof(a.sch->name)); > + goto again; > + } > + err = 0; > +error: > + DN_BH_WUNLOCK(); > + if (pf) > + free(pf, M_DUMMYNET); > + return err; > +} > + > +/* > + * attach a profile to a link > + */ > +static int > +config_profile(struct dn_profile *pf, struct dn_id *arg) > +{ > + struct dn_schk *s; > + int i, olen, err = 0; > + > + if (pf->oid.len < sizeof(*pf)) { > + D("short profile len %d", pf->oid.len); > + return EINVAL; > + } > + i = pf->link_nr; > + if (i <= 0 || i >= DN_MAX_ID) > + return EINVAL; > + /* XXX other sanity checks */ > + DN_BH_WLOCK(); > + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { > + struct dn_profile **pkpf, *kpf; > + > + s = locate_scheduler(i); > + > + if (s == NULL) { > + err = EINVAL; > + break; > + } > + dn_cfg.id++; > + pkpf = &s->profile; /* prepare to handle multiple profiles */ > + kpf = *pkpf; > + > + /* > + * If we had a profile and the new one does not fit, > + * or it is deleted, then we need to free memory. > + */ > + if (kpf && (pf->samples_no == 0 || > + kpf->oid.len < pf->oid.len)) { > + free(kpf, M_DUMMYNET); > + *pkpf = NULL; > + } > + if (pf->samples_no == 0) > + continue; > + /* > + * new profile, possibly allocate memory > + * and copy data. 
> + */ > + if (kpf == NULL) > + *pkpf = kpf = malloc(pf->oid.len, > + M_DUMMYNET, M_NOWAIT | M_ZERO); > + if (kpf == NULL) { > + D("no memory for profile %d", i); > + err = ENOMEM; > + break; > + } > + /* preserve larger length XXX double check */ > + olen = kpf->oid.len; > + if (olen < pf->oid.len) > + olen = pf->oid.len; > + bcopy(pf, kpf, pf->oid.len); > + kpf->oid.len = olen; > + } > + DN_BH_WUNLOCK(); > + return err; > +} > + > +/* > + * Delete all objects: > + */ > +static void > +dummynet_flush(void) > +{ > + > + /* delete all schedulers and related links/queues/flowsets */ > + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, > + (void *)(uintptr_t)DN_DELETE_FS); > + /* delete all remaining (unlinked) flowsets */ > + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); > + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); > + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); > + /* Reinitialize system heap... */ > + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); > +} > + > +/* > + * Main handler for configuration. We are guaranteed to be called > + * with an oid which is at least a dn_id. > + * - the first object is the command (config, delete, flush, ...) > + * - config_link must be issued after the corresponding config_sched > + * - parameters (DN_TXT) for an object must preceed the object > + * processed on a config_sched. 
> + */ > +int > +do_config(void *p, int l) > +{ > + struct dn_id *next, *o; > + int err = 0, err2 = 0; > + struct dn_id *arg = NULL; > + uintptr_t *a; > + > + o = p; > + if (o->id != DN_API_VERSION) { > + D("invalid api version got %d need %d", > + o->id, DN_API_VERSION); > + return EINVAL; > + } > + for (; l >= sizeof(*o); o = next) { > + struct dn_id *prev = arg; > + if (o->len < sizeof(*o) || l < o->len) { > + D("bad len o->len %d len %d", o->len, l); > + err = EINVAL; > + break; > + } > + l -= o->len; > + next = (struct dn_id *)((char *)o + o->len); > + err = 0; > + switch (o->type) { > + default: > + D("cmd %d not implemented", o->type); > + break; > + > +#ifdef EMULATE_SYSCTL > + /* sysctl emulation. > + * if we recognize the command, jump to the correct > + * handler and return > + */ > + case DN_SYSCTL_SET: > + err = kesysctl_emu_set(p, l); > + return err; > +#endif > + > + case DN_CMD_CONFIG: /* simply a header */ > + break; > + > + case DN_CMD_DELETE: > + /* the argument is in the first uintptr_t after o */ > + a = (uintptr_t *)(o+1); > + if (o->len < sizeof(*o) + sizeof(*a)) { > + err = EINVAL; > + break; > + } > + switch (o->subtype) { > + case DN_LINK: > + /* delete base and derived schedulers */ > + DN_BH_WLOCK(); > + err = delete_schk(*a); > + err2 = delete_schk(*a + DN_MAX_ID); > + DN_BH_WUNLOCK(); > + if (!err) > + err = err2; > + break; > + > + default: > + D("invalid delete type %d", > + o->subtype); > + err = EINVAL; > + break; > + > + case DN_FS: > + err = (*a <1 || *a >= DN_MAX_ID) ? 
> + EINVAL : delete_fs(*a, 0) ; > + break; > + } > + break; > + > + case DN_CMD_FLUSH: > + DN_BH_WLOCK(); > + dummynet_flush(); > + DN_BH_WUNLOCK(); > + break; > + case DN_TEXT: /* store argument the next block */ > + prev = NULL; > + arg = o; > + break; > + case DN_LINK: > + err = config_link((struct dn_link *)o, arg); > + break; > + case DN_PROFILE: > + err = config_profile((struct dn_profile *)o, arg); > + break; > + case DN_SCH: > + err = config_sched((struct dn_sch *)o, arg); > + break; > + case DN_FS: > + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); > + break; > + } > + if (prev) > + arg = NULL; > + if (err != 0) > + break; > + } > + return err; > +} > + > +static int > +compute_space(struct dn_id *cmd, struct copy_args *a) > +{ > + int x = 0, need = 0; > + int profile_size = sizeof(struct dn_profile) - > + ED_MAX_SAMPLES_NO*sizeof(int); > + /* note, this may be short */ > + > + /* NOTE about compute space: > + * NP = dn_cfg.schk_count > + * NSI = dn_cfg.si_count > + * NF = dn_cfg.fsk_count > + * NQ = dn_cfg.queue_count > + * - ipfw pipe show > + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler > + * link, scheduler template, flowset > + * integrated in scheduler and header > + * for flowset list > + * (NSI)*(dn_flow) all scheduler instance (includes > + * the queue instance) > + * - ipfw sched show > + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler > + * link, scheduler template, flowset > + * integrated in scheduler and header > + * for flowset list > + * (NSI * dn_flow) all scheduler instances > + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler > + * (NQ * dn_queue) all queue [XXXfor now not listed] > + * - ipfw queue show > + * (NF * dn_fs) all flowset > + * (NQ * dn_queue) all queues > + */ > + switch (cmd->subtype) { > + default: > + return -1; > + /* XXX where do LINK and SCH differ ? */ > + /* 'ipfw sched show' could list all queues associated to > + * a scheduler. 
This feature for now is disabled > + */ > + case DN_LINK: /* pipe show */ > + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; > + need += dn_cfg.schk_count * > + (sizeof(struct dn_fs) + profile_size) / 2; > + need += dn_cfg.fsk_count * sizeof(uint32_t); > + break; > + case DN_SCH: /* sched show */ > + need += dn_cfg.schk_count * > + (sizeof(struct dn_fs) + profile_size) / 2; > + need += dn_cfg.fsk_count * sizeof(uint32_t); > + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; > + break; > + case DN_FS: /* queue show */ > + x = DN_C_FS | DN_C_QUEUE; > + break; > + case DN_GET_COMPAT: /* compatibility mode */ > + need = dn_compat_calc_size(); > + break; > + } > + a->flags = x; > + if (x & DN_C_SCH) { > + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; > + /* NOT also, each fs might be attached to a sched */ > + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; > + } > + if (x & DN_C_FS) > + need += dn_cfg.fsk_count * sizeof(struct dn_fs); > + if (x & DN_C_LINK) { > + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; > + } > + /* > + * When exporting a queue to userland, only pass up the > + * struct dn_flow, which is the only visible part. > + */ > + > + if (x & DN_C_QUEUE) > + need += dn_cfg.queue_count * sizeof(struct dn_flow); > + if (x & DN_C_FLOW) > + need += dn_cfg.si_count * (sizeof(struct dn_flow)); > + return need; > +} > + > +/* > + * If compat != NULL dummynet_get is called in compatibility mode. 
> + * *compat will be the pointer to the buffer to pass to ipfw > + */ > +int > +dummynet_get(struct sockopt *sopt, void **compat) > +{ > + int have, i, need, error; > + char *start = NULL, *buf; > + size_t sopt_valsize; > + struct dn_id *cmd; > + struct copy_args a; > + struct copy_range r; > + int l = sizeof(struct dn_id); > + > + bzero(&a, sizeof(a)); > + bzero(&r, sizeof(r)); > + > + /* save and restore original sopt_valsize around copyin */ > + sopt_valsize = sopt->sopt_valsize; > + > + cmd = &r.o; > + > + if (!compat) { > + /* copy at least an oid, and possibly a full object */ > + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); > + sopt->sopt_valsize = sopt_valsize; > + if (error) > + goto done; > + l = cmd->len; > +#ifdef EMULATE_SYSCTL > + /* sysctl emulation. */ > + if (cmd->type == DN_SYSCTL_GET) > + return kesysctl_emu_get(sopt); > +#endif > + if (l > sizeof(r)) { > + /* request larger than default, allocate buffer */ > + cmd = malloc(l, M_DUMMYNET, M_WAITOK); > + error = sooptcopyin(sopt, cmd, l, l); > + sopt->sopt_valsize = sopt_valsize; > + if (error) > + goto done; > + } > + } else { /* compatibility */ > + error = 0; > + cmd->type = DN_CMD_GET; > + cmd->len = sizeof(struct dn_id); > + cmd->subtype = DN_GET_COMPAT; > + // cmd->id = sopt_valsize; > + D("compatibility mode"); > + } > + a.extra = (struct copy_range *)cmd; > + if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ > + uint32_t *rp = (uint32_t *)(cmd + 1); > + cmd->len += 2* sizeof(uint32_t); > + rp[0] = 1; > + rp[1] = DN_MAX_ID - 1; > + if (cmd->subtype == DN_LINK) { > + rp[0] += DN_MAX_ID; > + rp[1] += DN_MAX_ID; > + } > + } > + /* Count space (under lock) and allocate (outside lock). > + * Exit with lock held if we manage to get enough buffer. > + * Try a few times then give up. 
> + */ > + for (have = 0, i = 0; i < 10; i++) { > + DN_BH_WLOCK(); > + need = compute_space(cmd, &a); > + > + /* if there is a range, ignore value from compute_space() */ > + if (l > sizeof(*cmd)) > + need = sopt_valsize - sizeof(*cmd); > + > + if (need < 0) { > + DN_BH_WUNLOCK(); > + error = EINVAL; > + goto done; > + } > + need += sizeof(*cmd); > + cmd->id = need; > + if (have >= need) > + break; > + > + DN_BH_WUNLOCK(); > + if (start) > + free(start, M_DUMMYNET); > + start = NULL; > + if (need > sopt_valsize) > + break; > + > + have = need; > + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); > + } > + > + if (start == NULL) { > + if (compat) { > + *compat = NULL; > + error = 1; // XXX > + } else { > + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); > + } > + goto done; > + } > + ND("have %d:%lu sched %d, %d:%lu links %d, %d:%lu flowsets %d, " > + "%d:%lu si %d, %d:%lu queues %d", > + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, > + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, > + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, > + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, > + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); > + sopt->sopt_valsize = sopt_valsize; > + a.type = cmd->subtype; > + > + if (compat == NULL) { > + bcopy(cmd, start, sizeof(*cmd)); > + ((struct dn_id*)(start))->len = sizeof(struct dn_id); > + buf = start + sizeof(*cmd); > + } else > + buf = start; > + a.start = &buf; > + a.end = start + have; > + /* start copying other objects */ > + /* XXX set error in case of no space */ > + if (compat) { > + a.type = DN_COMPAT_PIPE; > + error = dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); > + a.type = DN_COMPAT_QUEUE; > + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); > + } else if (a.type == DN_FS) { > + error = dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); > + } else { > + error = dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); > + } > + DN_BH_WUNLOCK(); > + if (error < 0) { > + error = 0; 
/* we skip the sooptcopyout so we fail, hopefully */ > + goto done; > + } else { > + error = 0; /* all fine */ > + } > + > + if (compat) { > + *compat = start; > + sopt->sopt_valsize = buf - start; > + /* free() is done by ip_dummynet_compat() */ > + start = NULL; //XXX hack > + } else { > + error = sooptcopyout(sopt, start, buf - start); > + } > +done: > + if (cmd && cmd != &r.o) > + free(cmd, M_DUMMYNET); > + if (start) > + free(start, M_DUMMYNET); > + return error; > +} > + > +/* Callback called on scheduler instance to delete it if idle */ > +static int > +drain_scheduler_cb(void *_si, void *arg) > +{ > + struct dn_sch_inst *si = _si; > + > + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) > + return 0; > + > + if (si->sched->fp->flags & DN_MULTIQUEUE) { > + if (si->q_count == 0) > + return si_destroy(si, NULL); > + else > + return 0; > + } else { /* !DN_MULTIQUEUE */ > + if ((si+1)->ni.length == 0) > + return si_destroy(si, NULL); > + else > + return 0; > + } > + return 0; /* unreachable */ > +} > + > +/* Callback called on scheduler to check if it has instances */ > +static int > +drain_scheduler_sch_cb(void *_s, void *arg) > +{ > + struct dn_schk *s = _s; > + > + if (s->sch.flags & DN_HAVE_MASK) { > + dn_ht_scan_bucket(s->siht, &s->drain_bucket, > + drain_scheduler_cb, NULL); > + s->drain_bucket++; > + } else { > + if (s->siht) { > + if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) > + s->siht = NULL; > + } > + } > + return 0; > +} > + > +/* Called every tick, try to delete a 'bucket' of scheduler */ > +void > +dn_drain_scheduler(void) > +{ > + dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, > + drain_scheduler_sch_cb, NULL); > + dn_cfg.drain_sch++; > +} > + > +/* Callback called on queue to delete if it is idle */ > +static int > +drain_queue_cb(void *_q, void *arg) > +{ > + struct dn_queue *q = _q; > + > + if (q->ni.length == 0) { > + dn_delete_queue(q, DN_DESTROY); > + return DNHT_SCAN_DEL; /* queue is deleted */ > + } > + > + 
return 0; /* queue isn't deleted */ > +} > + > +/* Callback called on flowset used to check if it has queues */ > +static int > +drain_queue_fs_cb(void *_fs, void *arg) > +{ > + struct dn_fsk *fs = _fs; > + > + if (fs->fs.flags & DN_QHT_HASH) { > + /* Flowset has a hash table for queues */ > + dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, > + drain_queue_cb, NULL); > + fs->drain_bucket++; > + } else { > + /* No hash table for this flowset, null the pointer > + * if the queue is deleted > + */ > + if (fs->qht) { > + if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) > + fs->qht = NULL; > + } > + } > + return 0; > +} > + > +/* Called every tick, try to delete a 'bucket' of queue */ > +void > +dn_drain_queue(void) > +{ > + /* scan a bucket of flowset */ > + dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, > + drain_queue_fs_cb, NULL); > + dn_cfg.drain_fs++; > +} > + > +/* > + * Handler for the various dummynet socket options > + */ > +static int > +ip_dn_ctl(struct sockopt *sopt) > +{ > + void *p = NULL; > + int error, l; > + > + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); > + if (error) > + return (error); > + > + /* Disallow sets in really-really secure mode. 
*/ > + if (sopt->sopt_dir == SOPT_SET) { > + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); > + if (error) > + return (error); > + } > + > + switch (sopt->sopt_name) { > + default : > + D("dummynet: unknown option %d", sopt->sopt_name); > + error = EINVAL; > + break; > + > + case IP_DUMMYNET_FLUSH: > + case IP_DUMMYNET_CONFIGURE: > + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ > + case IP_DUMMYNET_GET: > + D("dummynet: compat option %d", sopt->sopt_name); > + error = ip_dummynet_compat(sopt); > + break; > + > + case IP_DUMMYNET3 : > + if (sopt->sopt_dir == SOPT_GET) { > + error = dummynet_get(sopt, NULL); > + break; > + } > + l = sopt->sopt_valsize; > + /* XXX bumped size to 16000 for 3 profiles */ > + if (l < sizeof(struct dn_id) || l > 16000) { > + D("argument len %d invalid", l); > + break; > + } > + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? > + error = sooptcopyin(sopt, p, l, l); > + if (error) > + break ; > + error = do_config(p, l); > + break; > + } > + > + if (p != NULL) > + free(p, M_TEMP); > + > + return error ; > +} > + > + > +static void > +ip_dn_init(void) > +{ > + if (dn_cfg.init_done) > + return; > + printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); > + dn_cfg.init_done = 1; > + /* Set defaults here. MSVC does not accept initializers, > + * and this is also useful for vimages > + */ > + /* queue limits */ > + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ > + dn_cfg.byte_limit = 1024 * 1024; > + dn_cfg.expire = 1; > + > + /* RED parameters */ > + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ > + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ > + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ > + > + /* hash tables */ > + dn_cfg.max_hash_size = 65536; /* max in the hash tables */ > + dn_cfg.hash_size = 64; /* default hash size */ > + > + /* create hash tables for schedulers and flowsets. > + * In both we search by key and by pointer. 
> + */ > + dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, > + offsetof(struct dn_schk, schk_next), > + schk_hash, schk_match, schk_new); > + dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, > + offsetof(struct dn_fsk, fsk_next), > + fsk_hash, fsk_match, fsk_new); > + > + /* bucket index to drain object */ > + dn_cfg.drain_fs = 0; > + dn_cfg.drain_sch = 0; > + > + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); > + SLIST_INIT(&dn_cfg.fsu); > + SLIST_INIT(&dn_cfg.schedlist); > + > + DN_LOCK_INIT(); > + > + TASK_INIT(&dn_task, 0, dummynet_task, curvnet); > + dn_tq = taskqueue_create_fast("dummynet", M_WAITOK, > + taskqueue_thread_enqueue, &dn_tq); > + taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); > + > + callout_init(&dn_timeout, CALLOUT_MPSAFE); > + dn_reschedule(); > + > + /* Initialize curr_time adjustment mechanics. */ > + getmicrouptime(&dn_cfg.prev_t); > +} > + > +static void > +ip_dn_destroy(int last) > +{ > + callout_drain(&dn_timeout); > + > + DN_BH_WLOCK(); > + if (last) { > + ND("removing last instance\n"); > + ip_dn_ctl_ptr = NULL; > + ip_dn_io_ptr = NULL; > + } > + > + dummynet_flush(); > + DN_BH_WUNLOCK(); > + taskqueue_drain(dn_tq, &dn_task); > + taskqueue_free(dn_tq); > + > + dn_ht_free(dn_cfg.schedhash, 0); > + dn_ht_free(dn_cfg.fshash, 0); > + heap_free(&dn_cfg.evheap); > + > + DN_LOCK_DESTROY(); > +} > + > +static int > +dummynet_modevent(module_t mod, int type, void *data) > +{ > + > + if (type == MOD_LOAD) { > + if (ip_dn_io_ptr) { > + printf("DUMMYNET already loaded\n"); > + return EEXIST ; > + } > + ip_dn_init(); > + ip_dn_ctl_ptr = ip_dn_ctl; > + ip_dn_io_ptr = dummynet_io; > + return 0; > + } else if (type == MOD_UNLOAD) { > + ip_dn_destroy(1 /* last */); > + return 0; > + } else > + return EOPNOTSUPP; > +} > + > +/* modevent helpers for the modules */ > +static int > +load_dn_sched(struct dn_alg *d) > +{ > + struct dn_alg *s; > + > + if (d == NULL) > + return 1; /* error */ > + ip_dn_init(); /* just in case, we 
need the lock */ > + > + /* Check that mandatory funcs exists */ > + if (d->enqueue == NULL || d->dequeue == NULL) { > + D("missing enqueue or dequeue for %s", d->name); > + return 1; > + } > + > + /* Search if scheduler already exists */ > + DN_BH_WLOCK(); > + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { > + if (strcmp(s->name, d->name) == 0) { > + D("%s already loaded", d->name); > + break; /* scheduler already exists */ > + } > + } > + if (s == NULL) > + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); > + DN_BH_WUNLOCK(); > + D("dn_sched %s %sloaded", d->name, s ? "not ":""); > + return s ? 1 : 0; > +} > + > +static int > +unload_dn_sched(struct dn_alg *s) > +{ > + struct dn_alg *tmp, *r; > + int err = EINVAL; > + > + ND("called for %s", s->name); > + > + DN_BH_WLOCK(); > + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { > + if (strcmp(s->name, r->name) != 0) > + continue; > + ND("ref_count = %d", r->ref_count); > + err = (r->ref_count != 0) ? EBUSY : 0; > + if (err == 0) > + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); > + break; > + } > + DN_BH_WUNLOCK(); > + D("dn_sched %s %sunloaded", s->name, err ? "not ":""); > + return err; > +} > + > +int > +dn_sched_modevent(module_t mod, int cmd, void *arg) > +{ > + struct dn_alg *sch = arg; > + > + if (cmd == MOD_LOAD) > + return load_dn_sched(sch); > + else if (cmd == MOD_UNLOAD) > + return unload_dn_sched(sch); > + else > + return EINVAL; > +} > + > +static moduledata_t dummynet_mod = { > + "dummynet", dummynet_modevent, NULL > +}; > + > +#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN > +#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ > +DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); > +MODULE_DEPEND(dummynet, ipfw, 3, 3, 3); > +MODULE_VERSION(dummynet, 3); > + > +/* > + * Starting up. Done in order after dummynet_modevent() has been called. > + * VNET_SYSINIT is also called for each existing vnet and each new vnet. 
> + */ > +//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); > + > +/* > + * Shutdown handlers up shop. These are done in REVERSE ORDER, but still > + * after dummynet_modevent() has been called. Not called on reboot. > + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. > + * or when the module is unloaded. > + */ > +//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); > + > +/* end of file */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw2.c b/example/ipfw/sys/netpfil/ipfw/ip_fw2.c > new file mode 100644 > index 0000000..7e94502 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw2.c > @@ -0,0 +1,2905 @@ > +/*- > + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw2.c 272840 2014-10-09 19:32:35Z melifaro $"); > + > +/* > + * The FreeBSD IP packet firewall, main file > + */ > + > +#include "opt_ipfw.h" > +#include "opt_ipdivert.h" > +#include "opt_inet.h" > +#ifndef INET > +#error "IPFIREWALL requires INET" > +#endif /* INET */ > +#include "opt_inet6.h" > +#include "opt_ipsec.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/condvar.h> > +#include <sys/counter.h> > +#include <sys/eventhandler.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/jail.h> > +#include <sys/module.h> > +#include <sys/priv.h> > +#include <sys/proc.h> > +#include <sys/rwlock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/sysctl.h> > +#include <sys/syslog.h> > +#include <sys/ucred.h> > +#include <net/ethernet.h> /* for ETHERTYPE_IP */ > +#include <net/if.h> > +#include <net/if_var.h> > +#include <net/route.h> > +#include <net/pfil.h> > +#include <net/vnet.h> > + > +#include <netpfil/pf/pf_mtag.h> > + > +#include <netinet/in.h> > +#include <netinet/in_var.h> > +#include <netinet/in_pcb.h> > +#include <netinet/ip.h> > +#include <netinet/ip_var.h> > +#include <netinet/ip_icmp.h> > +#include <netinet/ip_fw.h> > +#include <netinet/ip_carp.h> > +#include <netinet/pim.h> > +#include 
<netinet/tcp_var.h> > +#include <netinet/udp.h> > +#include <netinet/udp_var.h> > +#include <netinet/sctp.h> > + > +#include <netinet/ip6.h> > +#include <netinet/icmp6.h> > +#ifdef INET6 > +#include <netinet6/in6_pcb.h> > +#include <netinet6/scope6_var.h> > +#include <netinet6/ip6_var.h> > +#endif > + > +#include <netpfil/ipfw/ip_fw_private.h> > + > +#include <machine/in_cksum.h> /* XXX for in_cksum */ > + > +#ifdef MAC > +#include <security/mac/mac_framework.h> > +#endif > + > +/* > + * static variables followed by global ones. > + * All ipfw global variables are here. > + */ > + > +static VNET_DEFINE(int, fw_deny_unknown_exthdrs); > +#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) > + > +static VNET_DEFINE(int, fw_permit_single_frag6) = 1; > +#define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) > + > +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT > +static int default_to_accept = 1; > +#else > +static int default_to_accept; > +#endif > + > +VNET_DEFINE(int, autoinc_step); > +VNET_DEFINE(int, fw_one_pass) = 1; > + > +VNET_DEFINE(unsigned int, fw_tables_max); > +VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ > +/* Use 128 tables by default */ > +static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; > + > +#ifndef LINEAR_SKIPTO > +static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, > + int tablearg, int jump_backwards); > +#define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) > +#else > +static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, > + int tablearg, int jump_backwards); > +#define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) > +#endif > + > +/* > + * Each rule belongs to one of 32 different sets (0..31). > + * The variable set_disable contains one bit per set. > + * If the bit is set, all rules in the corresponding set > + * are disabled. 
Set RESVD_SET(31) is reserved for the default rule > + * and rules that are not deleted by the flush command, > + * and CANNOT be disabled. > + * Rules in set RESVD_SET can only be deleted individually. > + */ > +VNET_DEFINE(u_int32_t, set_disable); > +#define V_set_disable VNET(set_disable) > + > +VNET_DEFINE(int, fw_verbose); > +/* counter for ipfw_log(NULL...) */ > +VNET_DEFINE(u_int64_t, norule_counter); > +VNET_DEFINE(int, verbose_limit); > + > +/* layer3_chain contains the list of rules for layer 3 */ > +VNET_DEFINE(struct ip_fw_chain, layer3_chain); > + > +/* ipfw_vnet_ready controls when we are open for business */ > +VNET_DEFINE(int, ipfw_vnet_ready) = 0; > + > +VNET_DEFINE(int, ipfw_nat_ready) = 0; > + > +ipfw_nat_t *ipfw_nat_ptr = NULL; > +struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); > +ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; > +ipfw_nat_cfg_t *ipfw_nat_del_ptr; > +ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; > +ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; > + > +#ifdef SYSCTL_NODE > +uint32_t dummy_def = IPFW_DEFAULT_RULE; > +static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); > +static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); > + > +SYSBEGIN(f3) > + > +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); > +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, > + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, > + "Only do a single pass through ipfw when using dummynet(4)"); > +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, > + CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, > + "Rule number auto-increment step"); > +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, > + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, > + "Log matches to ipfw rules"); > +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, > + CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, > + "Set upper limit of matches of ipfw rules logged"); > +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, > + &dummy_def, 0, > + "The 
default/max possible rule number."); > +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, > + CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", > + "Maximum number of concurrently used tables"); > +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, > + CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", > + "Use per-set namespace for tables"); > +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, > + &default_to_accept, 0, > + "Make the default rule accept all packets."); > +TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); > +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, > + CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, > + "Number of static rules"); > + > +#ifdef INET6 > +SYSCTL_DECL(_net_inet6_ip6); > +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); > +SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, > + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, > + "Deny packets with unknown IPv6 Extension Headers"); > +SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, > + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, > + "Permit single packet IPv6 fragments"); > +#endif /* INET6 */ > + > +SYSEND > + > +#endif /* SYSCTL_NODE */ > + > + > +/* > + * Some macros used in the various matching options. 
> + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T > + * Other macros just cast void * into the appropriate type > + */ > +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) > +#define TCP(p) ((struct tcphdr *)(p)) > +#define SCTP(p) ((struct sctphdr *)(p)) > +#define UDP(p) ((struct udphdr *)(p)) > +#define ICMP(p) ((struct icmphdr *)(p)) > +#define ICMP6(p) ((struct icmp6_hdr *)(p)) > + > +static __inline int > +icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) > +{ > + int type = icmp->icmp_type; > + > + return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) ); > +} > + > +#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \ > + (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) ) > + > +static int > +is_icmp_query(struct icmphdr *icmp) > +{ > + int type = icmp->icmp_type; > + > + return (type <= ICMP_MAXTYPE && (TT & (1<<type)) ); > +} > +#undef TT > + > +/* > + * The following checks use two arrays of 8 or 16 bits to store the > + * bits that we want set or clear, respectively. They are in the > + * low and high half of cmd->arg1 or cmd->d[0]. > + * > + * We scan options and store the bits we find set. We succeed if > + * > + * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear > + * > + * The code is sometimes optimized not to store additional variables. 
> + */ > + > +static int > +flags_match(ipfw_insn *cmd, u_int8_t bits) > +{ > + u_char want_clear; > + bits = ~bits; > + > + if ( ((cmd->arg1 & 0xff) & bits) != 0) > + return 0; /* some bits we want set were clear */ > + want_clear = (cmd->arg1 >> 8) & 0xff; > + if ( (want_clear & bits) != want_clear) > + return 0; /* some bits we want clear were set */ > + return 1; > +} > + > +static int > +ipopts_match(struct ip *ip, ipfw_insn *cmd) > +{ > + int optlen, bits = 0; > + u_char *cp = (u_char *)(ip + 1); > + int x = (ip->ip_hl << 2) - sizeof (struct ip); > + > + for (; x > 0; x -= optlen, cp += optlen) { > + int opt = cp[IPOPT_OPTVAL]; > + > + if (opt == IPOPT_EOL) > + break; > + if (opt == IPOPT_NOP) > + optlen = 1; > + else { > + optlen = cp[IPOPT_OLEN]; > + if (optlen <= 0 || optlen > x) > + return 0; /* invalid or truncated */ > + } > + switch (opt) { > + > + default: > + break; > + > + case IPOPT_LSRR: > + bits |= IP_FW_IPOPT_LSRR; > + break; > + > + case IPOPT_SSRR: > + bits |= IP_FW_IPOPT_SSRR; > + break; > + > + case IPOPT_RR: > + bits |= IP_FW_IPOPT_RR; > + break; > + > + case IPOPT_TS: > + bits |= IP_FW_IPOPT_TS; > + break; > + } > + } > + return (flags_match(cmd, bits)); > +} > + > +static int > +tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) > +{ > + int optlen, bits = 0; > + u_char *cp = (u_char *)(tcp + 1); > + int x = (tcp->th_off << 2) - sizeof(struct tcphdr); > + > + for (; x > 0; x -= optlen, cp += optlen) { > + int opt = cp[0]; > + if (opt == TCPOPT_EOL) > + break; > + if (opt == TCPOPT_NOP) > + optlen = 1; > + else { > + optlen = cp[1]; > + if (optlen <= 0) > + break; > + } > + > + switch (opt) { > + > + default: > + break; > + > + case TCPOPT_MAXSEG: > + bits |= IP_FW_TCPOPT_MSS; > + break; > + > + case TCPOPT_WINDOW: > + bits |= IP_FW_TCPOPT_WINDOW; > + break; > + > + case TCPOPT_SACK_PERMITTED: > + case TCPOPT_SACK: > + bits |= IP_FW_TCPOPT_SACK; > + break; > + > + case TCPOPT_TIMESTAMP: > + bits |= IP_FW_TCPOPT_TS; > + break; > + > + } > + 
} > + return (flags_match(cmd, bits)); > +} > + > +static int > +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, > + uint32_t *tablearg) > +{ > + > + if (ifp == NULL) /* no iface with this packet, match fails */ > + return (0); > + > + /* Check by name or by IP address */ > + if (cmd->name[0] != '\0') { /* match by name */ > + if (cmd->name[0] == '\1') /* use tablearg to match */ > + return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0, > + &ifp->if_index, tablearg); > + /* Check name */ > + if (cmd->p.glob) { > + if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) > + return(1); > + } else { > + if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) > + return(1); > + } > + } else { > +#if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ > + struct ifaddr *ia; > + > + if_addr_rlock(ifp); > + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { > + if (ia->ifa_addr->sa_family != AF_INET) > + continue; > + if (cmd->p.ip.s_addr == ((struct sockaddr_in *) > + (ia->ifa_addr))->sin_addr.s_addr) { > + if_addr_runlock(ifp); > + return(1); /* match */ > + } > + } > + if_addr_runlock(ifp); > +#endif /* __FreeBSD__ */ > + } > + return(0); /* no match, fail ... */ > +} > + > +/* > + * The verify_path function checks if a route to the src exists and > + * if it is reachable via ifp (when provided). > + * > + * The 'verrevpath' option checks that the interface that an IP packet > + * arrives on is the same interface that traffic destined for the > + * packet's source address would be routed out of. > + * The 'versrcreach' option just checks that the source address is > + * reachable via any route (except default) in the routing table. > + * These two are a measure to block forged packets. This is also > + * commonly known as "anti-spoofing" or Unicast Reverse Path > + * Forwarding (Unicast RFP) in Cisco-ese. 
The name of the knobs > + * is purposely reminiscent of the Cisco IOS command, > + * > + * ip verify unicast reverse-path > + * ip verify unicast source reachable-via any > + * > + * which implements the same functionality. But note that the syntax > + * is misleading, and the check may be performed on all IP packets > + * whether unicast, multicast, or broadcast. > + */ > +static int > +verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) > +{ > +#if defined(USERSPACE) || !defined(__FreeBSD__) > + return 0; > +#else > + struct route ro; > + struct sockaddr_in *dst; > + > + bzero(&ro, sizeof(ro)); > + > + dst = (struct sockaddr_in *)&(ro.ro_dst); > + dst->sin_family = AF_INET; > + dst->sin_len = sizeof(*dst); > + dst->sin_addr = src; > + in_rtalloc_ign(&ro, 0, fib); > + > + if (ro.ro_rt == NULL) > + return 0; > + > + /* > + * If ifp is provided, check for equality with rtentry. > + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, > + * in order to pass packets injected back by if_simloop(): > + * routing entry (via lo0) for our own address > + * may exist, so we need to handle routing assymetry. > + */ > + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { > + RTFREE(ro.ro_rt); > + return 0; > + } > + > + /* if no ifp provided, check if rtentry is not default route */ > + if (ifp == NULL && > + satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { > + RTFREE(ro.ro_rt); > + return 0; > + } > + > + /* or if this is a blackhole/reject route */ > + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { > + RTFREE(ro.ro_rt); > + return 0; > + } > + > + /* found valid route */ > + RTFREE(ro.ro_rt); > + return 1; > +#endif /* __FreeBSD__ */ > +} > + > +#ifdef INET6 > +/* > + * ipv6 specific rules here... 
> + */ > +static __inline int > +icmp6type_match (int type, ipfw_insn_u32 *cmd) > +{ > + return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); > +} > + > +static int > +flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) > +{ > + int i; > + for (i=0; i <= cmd->o.arg1; ++i ) > + if (curr_flow == cmd->d[i] ) > + return 1; > + return 0; > +} > + > +/* support for IP6_*_ME opcodes */ > +static int > +search_ip6_addr_net (struct in6_addr * ip6_addr) > +{ > + struct ifnet *mdc; > + struct ifaddr *mdc2; > + struct in6_ifaddr *fdm; > + struct in6_addr copia; > + > + TAILQ_FOREACH(mdc, &V_ifnet, if_link) { > + if_addr_rlock(mdc); > + TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { > + if (mdc2->ifa_addr->sa_family == AF_INET6) { > + fdm = (struct in6_ifaddr *)mdc2; > + copia = fdm->ia_addr.sin6_addr; > + /* need for leaving scope_id in the sock_addr */ > + in6_clearscope(&copia); > + if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { > + if_addr_runlock(mdc); > + return 1; > + } > + } > + } > + if_addr_runlock(mdc); > + } > + return 0; > +} > + > +static int > +verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) > +{ > + struct route_in6 ro; > + struct sockaddr_in6 *dst; > + > + bzero(&ro, sizeof(ro)); > + > + dst = (struct sockaddr_in6 * )&(ro.ro_dst); > + dst->sin6_family = AF_INET6; > + dst->sin6_len = sizeof(*dst); > + dst->sin6_addr = *src; > + > + in6_rtalloc_ign(&ro, 0, fib); > + if (ro.ro_rt == NULL) > + return 0; > + > + /* > + * if ifp is provided, check for equality with rtentry > + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, > + * to support the case of sending packets to an address of our own. 
> + * (where the former interface is the first argument of if_simloop() > + * (=ifp), the latter is lo0) > + */ > + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { > + RTFREE(ro.ro_rt); > + return 0; > + } > + > + /* if no ifp provided, check if rtentry is not default route */ > + if (ifp == NULL && > + IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { > + RTFREE(ro.ro_rt); > + return 0; > + } > + > + /* or if this is a blackhole/reject route */ > + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { > + RTFREE(ro.ro_rt); > + return 0; > + } > + > + /* found valid route */ > + RTFREE(ro.ro_rt); > + return 1; > + > +} > + > +static int > +is_icmp6_query(int icmp6_type) > +{ > + if ((icmp6_type <= ICMP6_MAXTYPE) && > + (icmp6_type == ICMP6_ECHO_REQUEST || > + icmp6_type == ICMP6_MEMBERSHIP_QUERY || > + icmp6_type == ICMP6_WRUREQUEST || > + icmp6_type == ICMP6_FQDN_QUERY || > + icmp6_type == ICMP6_NI_QUERY)) > + return (1); > + > + return (0); > +} > + > +static void > +send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) > +{ > + struct mbuf *m; > + > + m = args->m; > + if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { > + struct tcphdr *tcp; > + tcp = (struct tcphdr *)((char *)ip6 + hlen); > + > + if ((tcp->th_flags & TH_RST) == 0) { > + struct mbuf *m0; > + m0 = ipfw_send_pkt(args->m, &(args->f_id), > + ntohl(tcp->th_seq), ntohl(tcp->th_ack), > + tcp->th_flags | TH_RST); > + if (m0 != NULL) > + ip6_output(m0, NULL, NULL, 0, NULL, NULL, > + NULL); > + } > + FREE_PKT(m); > + } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ > +#if 0 > + /* > + * Unlike above, the mbufs need to line up with the ip6 hdr, > + * as the contents are read. We need to m_adj() the > + * needed amount. > + * The mbuf will however be thrown away so we can adjust it. > + * Remember we did an m_pullup on it already so we > + * can make some assumptions about contiguousness. 
> + */ > + if (args->L3offset) > + m_adj(m, args->L3offset); > +#endif > + icmp6_error(m, ICMP6_DST_UNREACH, code, 0); > + } else > + FREE_PKT(m); > + > + args->m = NULL; > +} > + > +#endif /* INET6 */ > + > + > +/* > + * sends a reject message, consuming the mbuf passed as an argument. > + */ > +static void > +send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) > +{ > + > +#if 0 > + /* XXX When ip is not guaranteed to be at mtod() we will > + * need to account for this */ > + * The mbuf will however be thrown away so we can adjust it. > + * Remember we did an m_pullup on it already so we > + * can make some assumptions about contiguousness. > + */ > + if (args->L3offset) > + m_adj(m, args->L3offset); > +#endif > + if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ > + icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); > + } else if (args->f_id.proto == IPPROTO_TCP) { > + struct tcphdr *const tcp = > + L3HDR(struct tcphdr, mtod(args->m, struct ip *)); > + if ( (tcp->th_flags & TH_RST) == 0) { > + struct mbuf *m; > + m = ipfw_send_pkt(args->m, &(args->f_id), > + ntohl(tcp->th_seq), ntohl(tcp->th_ack), > + tcp->th_flags | TH_RST); > + if (m != NULL) > + ip_output(m, NULL, NULL, 0, NULL, NULL); > + } > + FREE_PKT(args->m); > + } else > + FREE_PKT(args->m); > + args->m = NULL; > +} > + > +/* > + * Support for uid/gid/jail lookup. These tests are expensive > + * (because we may need to look into the list of active sockets) > + * so we cache the results. ugid_lookupp is 0 if we have not > + * yet done a lookup, 1 if we succeeded, and -1 if we tried > + * and failed. The function always returns the match value. > + * We could actually spare the variable and use *uc, setting > + * it to '(void *)check_uidgid if we have no info, NULL if > + * we tried and failed, or any other value if successful. 
> + */ > +static int > +check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, > + struct ucred **uc) > +{ > +#if defined(USERSPACE) > + return 0; // not supported in userspace > +#else > +#ifndef __FreeBSD__ > + /* XXX */ > + return cred_check(insn, proto, oif, > + dst_ip, dst_port, src_ip, src_port, > + (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); > +#else /* FreeBSD */ > + struct in_addr src_ip, dst_ip; > + struct inpcbinfo *pi; > + struct ipfw_flow_id *id; > + struct inpcb *pcb, *inp; > + struct ifnet *oif; > + int lookupflags; > + int match; > + > + id = &args->f_id; > + inp = args->inp; > + oif = args->oif; > + > + /* > + * Check to see if the UDP or TCP stack supplied us with > + * the PCB. If so, rather then holding a lock and looking > + * up the PCB, we can use the one that was supplied. > + */ > + if (inp && *ugid_lookupp == 0) { > + INP_LOCK_ASSERT(inp); > + if (inp->inp_socket != NULL) { > + *uc = crhold(inp->inp_cred); > + *ugid_lookupp = 1; > + } else > + *ugid_lookupp = -1; > + } > + /* > + * If we have already been here and the packet has no > + * PCB entry associated with it, then we can safely > + * assume that this is a no match. 
> + */ > + if (*ugid_lookupp == -1) > + return (0); > + if (id->proto == IPPROTO_TCP) { > + lookupflags = 0; > + pi = &V_tcbinfo; > + } else if (id->proto == IPPROTO_UDP) { > + lookupflags = INPLOOKUP_WILDCARD; > + pi = &V_udbinfo; > + } else > + return 0; > + lookupflags |= INPLOOKUP_RLOCKPCB; > + match = 0; > + if (*ugid_lookupp == 0) { > + if (id->addr_type == 6) { > +#ifdef INET6 > + if (oif == NULL) > + pcb = in6_pcblookup_mbuf(pi, > + &id->src_ip6, htons(id->src_port), > + &id->dst_ip6, htons(id->dst_port), > + lookupflags, oif, args->m); > + else > + pcb = in6_pcblookup_mbuf(pi, > + &id->dst_ip6, htons(id->dst_port), > + &id->src_ip6, htons(id->src_port), > + lookupflags, oif, args->m); > +#else > + *ugid_lookupp = -1; > + return (0); > +#endif > + } else { > + src_ip.s_addr = htonl(id->src_ip); > + dst_ip.s_addr = htonl(id->dst_ip); > + if (oif == NULL) > + pcb = in_pcblookup_mbuf(pi, > + src_ip, htons(id->src_port), > + dst_ip, htons(id->dst_port), > + lookupflags, oif, args->m); > + else > + pcb = in_pcblookup_mbuf(pi, > + dst_ip, htons(id->dst_port), > + src_ip, htons(id->src_port), > + lookupflags, oif, args->m); > + } > + if (pcb != NULL) { > + INP_RLOCK_ASSERT(pcb); > + *uc = crhold(pcb->inp_cred); > + *ugid_lookupp = 1; > + INP_RUNLOCK(pcb); > + } > + if (*ugid_lookupp == 0) { > + /* > + * We tried and failed, set the variable to -1 > + * so we will not try again on this packet. > + */ > + *ugid_lookupp = -1; > + return (0); > + } > + } > + if (insn->o.opcode == O_UID) > + match = ((*uc)->cr_uid == (uid_t)insn->d[0]); > + else if (insn->o.opcode == O_GID) > + match = groupmember((gid_t)insn->d[0], *uc); > + else if (insn->o.opcode == O_JAIL) > + match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); > + return (match); > +#endif /* __FreeBSD__ */ > +#endif /* not supported in userspace */ > +} > + > +/* > + * Helper function to set args with info on the rule after the matching > + * one. 
slot is precise, whereas we guess rule_id as they are > + * assigned sequentially. > + */ > +static inline void > +set_match(struct ip_fw_args *args, int slot, > + struct ip_fw_chain *chain) > +{ > + args->rule.chain_id = chain->id; > + args->rule.slot = slot + 1; /* we use 0 as a marker */ > + args->rule.rule_id = 1 + chain->map[slot]->id; > + args->rule.rulenum = chain->map[slot]->rulenum; > +} > + > +#ifndef LINEAR_SKIPTO > +/* > + * Helper function to enable cached rule lookups using > + * cached_id and cached_pos fields in ipfw rule. > + */ > +static int > +jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, > + int tablearg, int jump_backwards) > +{ > + int f_pos; > + > + /* If possible use cached f_pos (in f->cached_pos), > + * whose version is written in f->cached_id > + * (horrible hacks to avoid changing the ABI). > + */ > + if (num != IP_FW_TARG && f->cached_id == chain->id) > + f_pos = f->cached_pos; > + else { > + int i = IP_FW_ARG_TABLEARG(chain, num, skipto); > + /* make sure we do not jump backward */ > + if (jump_backwards == 0 && i <= f->rulenum) > + i = f->rulenum + 1; > + if (chain->idxmap != NULL) > + f_pos = chain->idxmap[i]; > + else > + f_pos = ipfw_find_rule(chain, i, 0); > + /* update the cache */ > + if (num != IP_FW_TARG) { > + f->cached_id = chain->id; > + f->cached_pos = f_pos; > + } > + } > + > + return (f_pos); > +} > +#else > +/* > + * Helper function to enable real fast rule lookups. > + */ > +static int > +jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, > + int tablearg, int jump_backwards) > +{ > + int f_pos; > + > + num = IP_FW_ARG_TABLEARG(chain, num, skipto); > + /* make sure we do not jump backward */ > + if (jump_backwards == 0 && num <= f->rulenum) > + num = f->rulenum + 1; > + f_pos = chain->idxmap[num]; > + > + return (f_pos); > +} > +#endif > + > +#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) > +/* > + * The main check routine for the firewall. 
> + * > + * All arguments are in args so we can modify them and return them > + * back to the caller. > + * > + * Parameters: > + * > + * args->m (in/out) The packet; we set to NULL when/if we nuke it. > + * Starts with the IP header. > + * args->eh (in) Mac header if present, NULL for layer3 packet. > + * args->L3offset Number of bytes bypassed if we came from L2. > + * e.g. often sizeof(eh) ** NOTYET ** > + * args->oif Outgoing interface, NULL if packet is incoming. > + * The incoming interface is in the mbuf. (in) > + * args->divert_rule (in/out) > + * Skip up to the first rule past this rule number; > + * upon return, non-zero port number for divert or tee. > + * > + * args->rule Pointer to the last matching rule (in/out) > + * args->next_hop Socket we are forwarding to (out). > + * args->next_hop6 IPv6 next hop we are forwarding to (out). > + * args->f_id Addresses grabbed from the packet (out) > + * args->rule.info a cookie depending on rule action > + * > + * Return value: > + * > + * IP_FW_PASS the packet must be accepted > + * IP_FW_DENY the packet must be dropped > + * IP_FW_DIVERT divert packet, port in m_tag > + * IP_FW_TEE tee packet, port in m_tag > + * IP_FW_DUMMYNET to dummynet, pipe in args->cookie > + * IP_FW_NETGRAPH into netgraph, cookie args->cookie > + * args->rule contains the matching rule, > + * args->rule.info has additional information. > + * > + */ > +int > +ipfw_chk(struct ip_fw_args *args) > +{ > + > + /* > + * Local variables holding state while processing a packet: > + * > + * IMPORTANT NOTE: to speed up the processing of rules, there > + * are some assumption on the values of the variables, which > + * are documented here. Should you change them, please check > + * the implementation of the various instructions to make sure > + * that they still work. > + * > + * args->eh The MAC header. It is non-null for a layer2 > + * packet, it is NULL for a layer-3 packet. 
> + * **notyet** > + * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. > + * > + * m | args->m Pointer to the mbuf, as received from the caller. > + * It may change if ipfw_chk() does an m_pullup, or if it > + * consumes the packet because it calls send_reject(). > + * XXX This has to change, so that ipfw_chk() never modifies > + * or consumes the buffer. > + * ip is the beginning of the ip(4 or 6) header. > + * Calculated by adding the L3offset to the start of data. > + * (Until we start using L3offset, the packet is > + * supposed to start with the ip header). > + */ > + struct mbuf *m = args->m; > + struct ip *ip = mtod(m, struct ip *); > + > + /* > + * For rules which contain uid/gid or jail constraints, cache > + * a copy of the users credentials after the pcb lookup has been > + * executed. This will speed up the processing of rules with > + * these types of constraints, as well as decrease contention > + * on pcb related locks. > + */ > +#ifndef __FreeBSD__ > + struct bsd_ucred ucred_cache; > +#else > + struct ucred *ucred_cache = NULL; > +#endif > + int ucred_lookup = 0; > + > + /* > + * oif | args->oif If NULL, ipfw_chk has been called on the > + * inbound path (ether_input, ip_input). > + * If non-NULL, ipfw_chk has been called on the outbound path > + * (ether_output, ip_output). > + */ > + struct ifnet *oif = args->oif; > + > + int f_pos = 0; /* index of current rule in the array */ > + int retval = 0; > + > + /* > + * hlen The length of the IP header. > + */ > + u_int hlen = 0; /* hlen >0 means we have an IP pkt */ > + > + /* > + * offset The offset of a fragment. offset != 0 means that > + * we have a fragment at this offset of an IPv4 packet. > + * offset == 0 means that (if this is an IPv4 packet) > + * this is the first or only fragment. > + * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header > + * or there is a single packet fragement (fragement header added > + * without needed). 
We will treat a single packet fragment as if > + * there was no fragment header (or log/block depending on the > + * V_fw_permit_single_frag6 sysctl setting). > + */ > + u_short offset = 0; > + u_short ip6f_mf = 0; > + > + /* > + * Local copies of addresses. They are only valid if we have > + * an IP packet. > + * > + * proto The protocol. Set to 0 for non-ip packets, > + * or to the protocol read from the packet otherwise. > + * proto != 0 means that we have an IPv4 packet. > + * > + * src_port, dst_port port numbers, in HOST format. Only > + * valid for TCP and UDP packets. > + * > + * src_ip, dst_ip ip addresses, in NETWORK format. > + * Only valid for IPv4 packets. > + */ > + uint8_t proto; > + uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ > + struct in_addr src_ip, dst_ip; /* NOTE: network format */ > + uint16_t iplen=0; > + int pktlen; > + uint16_t etype = 0; /* Host order stored ether type */ > + > + /* > + * dyn_dir = MATCH_UNKNOWN when rules unchecked, > + * MATCH_NONE when checked and not matched (q = NULL), > + * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) > + */ > + int dyn_dir = MATCH_UNKNOWN; > + ipfw_dyn_rule *q = NULL; > + struct ip_fw_chain *chain = &V_layer3_chain; > + > + /* > + * We store in ulp a pointer to the upper layer protocol header. > + * In the ipv4 case this is easy to determine from the header, > + * but for ipv6 we might have some additional headers in the middle. > + * ulp is NULL if not found. > + */ > + void *ulp = NULL; /* upper layer protocol pointer. */ > + > + /* XXX ipv6 variables */ > + int is_ipv6 = 0; > + uint8_t icmp6_type = 0; > + uint16_t ext_hd = 0; /* bits vector for extension header filtering */ > + /* end of ipv6 variables */ > + > + int is_ipv4 = 0; > + > + int done = 0; /* flag to exit the outer loop */ > + IPFW_RLOCK_TRACKER; > + > + if (m->m_flags & M_SKIP_FIREWALL || (! 
V_ipfw_vnet_ready)) > + return (IP_FW_PASS); /* accept */ > + > + dst_ip.s_addr = 0; /* make sure it is initialized */ > + src_ip.s_addr = 0; /* make sure it is initialized */ > + pktlen = m->m_pkthdr.len; > + args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ > + proto = args->f_id.proto = 0; /* mark f_id invalid */ > + /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ > + > +/* > + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, > + * then it sets p to point at the offset "len" in the mbuf. WARNING: the > + * pointer might become stale after other pullups (but we never use it > + * this way). > + */ > +#define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) > +#define PULLUP_LEN(_len, p, T) \ > +do { \ > + int x = (_len) + T; \ > + if ((m)->m_len < x) { \ > + args->m = m = m_pullup(m, x); \ > + if (m == NULL) \ > + goto pullup_failed; \ > + } \ > + p = (mtod(m, char *) + (_len)); \ > +} while (0) > + > + /* > + * if we have an ether header, > + */ > + if (args->eh) > + etype = ntohs(args->eh->ether_type); > + > + /* Identify IP packets and fill up variables. 
*/ > + if (pktlen >= sizeof(struct ip6_hdr) && > + (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { > + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; > + is_ipv6 = 1; > + args->f_id.addr_type = 6; > + hlen = sizeof(struct ip6_hdr); > + proto = ip6->ip6_nxt; > + > + /* Search extension headers to find upper layer protocols */ > + while (ulp == NULL && offset == 0) { > + switch (proto) { > + case IPPROTO_ICMPV6: > + PULLUP_TO(hlen, ulp, struct icmp6_hdr); > + icmp6_type = ICMP6(ulp)->icmp6_type; > + break; > + > + case IPPROTO_TCP: > + PULLUP_TO(hlen, ulp, struct tcphdr); > + dst_port = TCP(ulp)->th_dport; > + src_port = TCP(ulp)->th_sport; > + /* save flags for dynamic rules */ > + args->f_id._flags = TCP(ulp)->th_flags; > + break; > + > + case IPPROTO_SCTP: > + PULLUP_TO(hlen, ulp, struct sctphdr); > + src_port = SCTP(ulp)->src_port; > + dst_port = SCTP(ulp)->dest_port; > + break; > + > + case IPPROTO_UDP: > + PULLUP_TO(hlen, ulp, struct udphdr); > + dst_port = UDP(ulp)->uh_dport; > + src_port = UDP(ulp)->uh_sport; > + break; > + > + case IPPROTO_HOPOPTS: /* RFC 2460 */ > + PULLUP_TO(hlen, ulp, struct ip6_hbh); > + ext_hd |= EXT_HOPOPTS; > + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; > + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; > + ulp = NULL; > + break; > + > + case IPPROTO_ROUTING: /* RFC 2460 */ > + PULLUP_TO(hlen, ulp, struct ip6_rthdr); > + switch (((struct ip6_rthdr *)ulp)->ip6r_type) { > + case 0: > + ext_hd |= EXT_RTHDR0; > + break; > + case 2: > + ext_hd |= EXT_RTHDR2; > + break; > + default: > + if (V_fw_verbose) > + printf("IPFW2: IPV6 - Unknown " > + "Routing Header type(%d)\n", > + ((struct ip6_rthdr *) > + ulp)->ip6r_type); > + if (V_fw_deny_unknown_exthdrs) > + return (IP_FW_DENY); > + break; > + } > + ext_hd |= EXT_ROUTING; > + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; > + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; > + ulp = NULL; > + break; > + > + case IPPROTO_FRAGMENT: /* RFC 2460 */ > + PULLUP_TO(hlen, 
ulp, struct ip6_frag); > + ext_hd |= EXT_FRAGMENT; > + hlen += sizeof (struct ip6_frag); > + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; > + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & > + IP6F_OFF_MASK; > + ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & > + IP6F_MORE_FRAG; > + if (V_fw_permit_single_frag6 == 0 && > + offset == 0 && ip6f_mf == 0) { > + if (V_fw_verbose) > + printf("IPFW2: IPV6 - Invalid " > + "Fragment Header\n"); > + if (V_fw_deny_unknown_exthdrs) > + return (IP_FW_DENY); > + break; > + } > + args->f_id.extra = > + ntohl(((struct ip6_frag *)ulp)->ip6f_ident); > + ulp = NULL; > + break; > + > + case IPPROTO_DSTOPTS: /* RFC 2460 */ > + PULLUP_TO(hlen, ulp, struct ip6_hbh); > + ext_hd |= EXT_DSTOPTS; > + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; > + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; > + ulp = NULL; > + break; > + > + case IPPROTO_AH: /* RFC 2402 */ > + PULLUP_TO(hlen, ulp, struct ip6_ext); > + ext_hd |= EXT_AH; > + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; > + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; > + ulp = NULL; > + break; > + > + case IPPROTO_ESP: /* RFC 2406 */ > + PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ > + /* Anything past Seq# is variable length and > + * data past this ext. header is encrypted. */ > + ext_hd |= EXT_ESP; > + break; > + > + case IPPROTO_NONE: /* RFC 2460 */ > + /* > + * Packet ends here, and IPv6 header has > + * already been pulled up. If ip6e_len!=0 > + * then octets must be ignored. > + */ > + ulp = ip; /* non-NULL to get out of loop. */ > + break; > + > + case IPPROTO_OSPFIGP: > + /* XXX OSPF header check? */ > + PULLUP_TO(hlen, ulp, struct ip6_ext); > + break; > + > + case IPPROTO_PIM: > + /* XXX PIM header check? 
*/ > + PULLUP_TO(hlen, ulp, struct pim); > + break; > + > + case IPPROTO_CARP: > + PULLUP_TO(hlen, ulp, struct carp_header); > + if (((struct carp_header *)ulp)->carp_version != > + CARP_VERSION) > + return (IP_FW_DENY); > + if (((struct carp_header *)ulp)->carp_type != > + CARP_ADVERTISEMENT) > + return (IP_FW_DENY); > + break; > + > + case IPPROTO_IPV6: /* RFC 2893 */ > + PULLUP_TO(hlen, ulp, struct ip6_hdr); > + break; > + > + case IPPROTO_IPV4: /* RFC 2893 */ > + PULLUP_TO(hlen, ulp, struct ip); > + break; > + > + default: > + if (V_fw_verbose) > + printf("IPFW2: IPV6 - Unknown " > + "Extension Header(%d), ext_hd=%x\n", > + proto, ext_hd); > + if (V_fw_deny_unknown_exthdrs) > + return (IP_FW_DENY); > + PULLUP_TO(hlen, ulp, struct ip6_ext); > + break; > + } /*switch */ > + } > + ip = mtod(m, struct ip *); > + ip6 = (struct ip6_hdr *)ip; > + args->f_id.src_ip6 = ip6->ip6_src; > + args->f_id.dst_ip6 = ip6->ip6_dst; > + args->f_id.src_ip = 0; > + args->f_id.dst_ip = 0; > + args->f_id.flow_id6 = ntohl(ip6->ip6_flow); > + } else if (pktlen >= sizeof(struct ip) && > + (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { > + is_ipv4 = 1; > + hlen = ip->ip_hl << 2; > + args->f_id.addr_type = 4; > + > + /* > + * Collect parameters into local variables for faster matching. > + */ > + proto = ip->ip_p; > + src_ip = ip->ip_src; > + dst_ip = ip->ip_dst; > + offset = ntohs(ip->ip_off) & IP_OFFMASK; > + iplen = ntohs(ip->ip_len); > + pktlen = iplen < pktlen ? 
iplen : pktlen; > + > + if (offset == 0) { > + switch (proto) { > + case IPPROTO_TCP: > + PULLUP_TO(hlen, ulp, struct tcphdr); > + dst_port = TCP(ulp)->th_dport; > + src_port = TCP(ulp)->th_sport; > + /* save flags for dynamic rules */ > + args->f_id._flags = TCP(ulp)->th_flags; > + break; > + > + case IPPROTO_SCTP: > + PULLUP_TO(hlen, ulp, struct sctphdr); > + src_port = SCTP(ulp)->src_port; > + dst_port = SCTP(ulp)->dest_port; > + break; > + > + case IPPROTO_UDP: > + PULLUP_TO(hlen, ulp, struct udphdr); > + dst_port = UDP(ulp)->uh_dport; > + src_port = UDP(ulp)->uh_sport; > + break; > + > + case IPPROTO_ICMP: > + PULLUP_TO(hlen, ulp, struct icmphdr); > + //args->f_id.flags = ICMP(ulp)->icmp_type; > + break; > + > + default: > + break; > + } > + } > + > + ip = mtod(m, struct ip *); > + args->f_id.src_ip = ntohl(src_ip.s_addr); > + args->f_id.dst_ip = ntohl(dst_ip.s_addr); > + } > +#undef PULLUP_TO > + if (proto) { /* we may have port numbers, store them */ > + args->f_id.proto = proto; > + args->f_id.src_port = src_port = ntohs(src_port); > + args->f_id.dst_port = dst_port = ntohs(dst_port); > + } > + > + IPFW_PF_RLOCK(chain); > + if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ > + IPFW_PF_RUNLOCK(chain); > + return (IP_FW_PASS); /* accept */ > + } > + if (args->rule.slot) { > + /* > + * Packet has already been tagged as a result of a previous > + * match on rule args->rule aka args->rule_id (PIPE, QUEUE, > + * REASS, NETGRAPH, DIVERT/TEE...) > + * Validate the slot and continue from the next one > + * if still present, otherwise do a lookup. > + */ > + f_pos = (args->rule.chain_id == chain->id) ? > + args->rule.slot : > + ipfw_find_rule(chain, args->rule.rulenum, > + args->rule.rule_id); > + } else { > + f_pos = 0; > + } > + > + /* > + * Now scan the rules, and parse microinstructions for each rule. > + * We have two nested loops and an inner switch. 
Sometimes we > + * need to break out of one or both loops, or re-enter one of > + * the loops with updated variables. Loop variables are: > + * > + * f_pos (outer loop) points to the current rule. > + * On output it points to the matching rule. > + * done (outer loop) is used as a flag to break the loop. > + * l (inner loop) residual length of current rule. > + * cmd points to the current microinstruction. > + * > + * We break the inner loop by setting l=0 and possibly > + * cmdlen=0 if we don't want to advance cmd. > + * We break the outer loop by setting done=1 > + * We can restart the inner loop by setting l>0 and f_pos, f, cmd > + * as needed. > + */ > + for (; f_pos < chain->n_rules; f_pos++) { > + ipfw_insn *cmd; > + uint32_t tablearg = 0; > + int l, cmdlen, skip_or; /* skip rest of OR block */ > + struct ip_fw *f; > + > + f = chain->map[f_pos]; > + if (V_set_disable & (1 << f->set) ) > + continue; > + > + skip_or = 0; > + for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; > + l -= cmdlen, cmd += cmdlen) { > + int match; > + > + /* > + * check_body is a jump target used when we find a > + * CHECK_STATE, and need to jump to the body of > + * the target rule. > + */ > + > +/* check_body: */ > + cmdlen = F_LEN(cmd); > + /* > + * An OR block (insn_1 || .. || insn_n) has the > + * F_OR bit set in all but the last instruction. > + * The first match will set "skip_or", and cause > + * the following instructions to be skipped until > + * past the one with the F_OR bit clear. > + */ > + if (skip_or) { /* skip this instruction */ > + if ((cmd->len & F_OR) == 0) > + skip_or = 0; /* next one is good */ > + continue; > + } > + match = 0; /* set to 1 if we succeed */ > + > + switch (cmd->opcode) { > + /* > + * The first set of opcodes compares the packet's > + * fields with some pattern, setting 'match' if a > + * match is found. At the end of the loop there is > + * logic to deal with F_NOT and F_OR flags associated > + * with the opcode. 
> + */ > + case O_NOP: > + match = 1; > + break; > + > + case O_FORWARD_MAC: > + printf("ipfw: opcode %d unimplemented\n", > + cmd->opcode); > + break; > + > + case O_GID: > + case O_UID: > + case O_JAIL: > + /* > + * We only check offset == 0 && proto != 0, > + * as this ensures that we have a > + * packet with the ports info. > + */ > + if (offset != 0) > + break; > + if (proto == IPPROTO_TCP || > + proto == IPPROTO_UDP) > + match = check_uidgid( > + (ipfw_insn_u32 *)cmd, > + args, &ucred_lookup, > +#ifdef __FreeBSD__ > + &ucred_cache); > +#else > + (void *)&ucred_cache); > +#endif > + break; > + > + case O_RECV: > + match = iface_match(m->m_pkthdr.rcvif, > + (ipfw_insn_if *)cmd, chain, &tablearg); > + break; > + > + case O_XMIT: > + match = iface_match(oif, (ipfw_insn_if *)cmd, > + chain, &tablearg); > + break; > + > + case O_VIA: > + match = iface_match(oif ? oif : > + m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, > + chain, &tablearg); > + break; > + > + case O_MACADDR2: > + if (args->eh != NULL) { /* have MAC header */ > + u_int32_t *want = (u_int32_t *) > + ((ipfw_insn_mac *)cmd)->addr; > + u_int32_t *mask = (u_int32_t *) > + ((ipfw_insn_mac *)cmd)->mask; > + u_int32_t *hdr = (u_int32_t *)args->eh; > + > + match = > + ( want[0] == (hdr[0] & mask[0]) && > + want[1] == (hdr[1] & mask[1]) && > + want[2] == (hdr[2] & mask[2]) ); > + } > + break; > + > + case O_MAC_TYPE: > + if (args->eh != NULL) { > + u_int16_t *p = > + ((ipfw_insn_u16 *)cmd)->ports; > + int i; > + > + for (i = cmdlen - 1; !match && i>0; > + i--, p += 2) > + match = (etype >= p[0] && > + etype <= p[1]); > + } > + break; > + > + case O_FRAG: > + match = (offset != 0); > + break; > + > + case O_IN: /* "out" is "not in" */ > + match = (oif == NULL); > + break; > + > + case O_LAYER2: > + match = (args->eh != NULL); > + break; > + > + case O_DIVERTED: > + { > + /* For diverted packets, args->rule.info > + * contains the divert port (in host format) > + * reason and direction. 
> + */ > + uint32_t i = args->rule.info; > + match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && > + cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); > + } > + break; > + > + case O_PROTO: > + /* > + * We do not allow an arg of 0 so the > + * check of "proto" only suffices. > + */ > + match = (proto == cmd->arg1); > + break; > + > + case O_IP_SRC: > + match = is_ipv4 && > + (((ipfw_insn_ip *)cmd)->addr.s_addr == > + src_ip.s_addr); > + break; > + > + case O_IP_SRC_LOOKUP: > + case O_IP_DST_LOOKUP: > + if (is_ipv4) { > + uint32_t key = > + (cmd->opcode == O_IP_DST_LOOKUP) ? > + dst_ip.s_addr : src_ip.s_addr; > + uint32_t v = 0; > + > + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { > + /* generic lookup. The key must be > + * in 32bit big-endian format. > + */ > + v = ((ipfw_insn_u32 *)cmd)->d[1]; > + if (v == 0) > + key = dst_ip.s_addr; > + else if (v == 1) > + key = src_ip.s_addr; > + else if (v == 6) /* dscp */ > + key = (ip->ip_tos >> 2) & 0x3f; > + else if (offset != 0) > + break; > + else if (proto != IPPROTO_TCP && > + proto != IPPROTO_UDP) > + break; > + else if (v == 2) > + key = dst_port; > + else if (v == 3) > + key = src_port; > +#ifndef USERSPACE > + else if (v == 4 || v == 5) { > + check_uidgid( > + (ipfw_insn_u32 *)cmd, > + args, &ucred_lookup, > +#ifdef __FreeBSD__ > + &ucred_cache); > + if (v == 4 /* O_UID */) > + key = ucred_cache->cr_uid; > + else if (v == 5 /* O_JAIL */) > + key = ucred_cache->cr_prison->pr_id; > +#else /* !__FreeBSD__ */ > + (void *)&ucred_cache); > + if (v ==4 /* O_UID */) > + key = ucred_cache.uid; > + else if (v == 5 /* O_JAIL */) > + key = ucred_cache.xid; > +#endif /* !__FreeBSD__ */ > + } > +#endif /* !USERSPACE */ > + else > + break; > + } > + match = ipfw_lookup_table(chain, > + cmd->arg1, key, &v); > + if (!match) > + break; > + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) > + match = > + ((ipfw_insn_u32 *)cmd)->d[0] == v; > + else > + tablearg = v; > + } else if (is_ipv6) { > + uint32_t v = 0; > + void *pkey = (cmd->opcode == 
O_IP_DST_LOOKUP) ? > + &args->f_id.dst_ip6: &args->f_id.src_ip6; > + match = ipfw_lookup_table_extended(chain, > + cmd->arg1, > + sizeof(struct in6_addr), > + pkey, &v); > + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) > + match = ((ipfw_insn_u32 *)cmd)->d[0] == v; > + if (match) > + tablearg = v; > + } > + break; > + > + case O_IP_FLOW_LOOKUP: > + { > + uint32_t v = 0; > + match = ipfw_lookup_table_extended(chain, > + cmd->arg1, 0, &args->f_id, &v); > + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) > + match = ((ipfw_insn_u32 *)cmd)->d[0] == v; > + if (match) > + tablearg = v; > + } > + break; > + case O_IP_SRC_MASK: > + case O_IP_DST_MASK: > + if (is_ipv4) { > + uint32_t a = > + (cmd->opcode == O_IP_DST_MASK) ? > + dst_ip.s_addr : src_ip.s_addr; > + uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; > + int i = cmdlen-1; > + > + for (; !match && i>0; i-= 2, p+= 2) > + match = (p[0] == (a & p[1])); > + } > + break; > + > + case O_IP_SRC_ME: > + if (is_ipv4) { > + struct ifnet *tif; > + > + INADDR_TO_IFP(src_ip, tif); > + match = (tif != NULL); > + break; > + } > +#ifdef INET6 > + /* FALLTHROUGH */ > + case O_IP6_SRC_ME: > + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); > +#endif > + break; > + > + case O_IP_DST_SET: > + case O_IP_SRC_SET: > + if (is_ipv4) { > + u_int32_t *d = (u_int32_t *)(cmd+1); > + u_int32_t addr = > + cmd->opcode == O_IP_DST_SET ? 
> + args->f_id.dst_ip : > + args->f_id.src_ip; > + > + if (addr < d[0]) > + break; > + addr -= d[0]; /* subtract base */ > + match = (addr < cmd->arg1) && > + ( d[ 1 + (addr>>5)] & > + (1<<(addr & 0x1f)) ); > + } > + break; > + > + case O_IP_DST: > + match = is_ipv4 && > + (((ipfw_insn_ip *)cmd)->addr.s_addr == > + dst_ip.s_addr); > + break; > + > + case O_IP_DST_ME: > + if (is_ipv4) { > + struct ifnet *tif; > + > + INADDR_TO_IFP(dst_ip, tif); > + match = (tif != NULL); > + break; > + } > +#ifdef INET6 > + /* FALLTHROUGH */ > + case O_IP6_DST_ME: > + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); > +#endif > + break; > + > + > + case O_IP_SRCPORT: > + case O_IP_DSTPORT: > + /* > + * offset == 0 && proto != 0 is enough > + * to guarantee that we have a > + * packet with port info. > + */ > + if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) > + && offset == 0) { > + u_int16_t x = > + (cmd->opcode == O_IP_SRCPORT) ? > + src_port : dst_port ; > + u_int16_t *p = > + ((ipfw_insn_u16 *)cmd)->ports; > + int i; > + > + for (i = cmdlen - 1; !match && i>0; > + i--, p += 2) > + match = (x>=p[0] && x<=p[1]); > + } > + break; > + > + case O_ICMPTYPE: > + match = (offset == 0 && proto==IPPROTO_ICMP && > + icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); > + break; > + > +#ifdef INET6 > + case O_ICMP6TYPE: > + match = is_ipv6 && offset == 0 && > + proto==IPPROTO_ICMPV6 && > + icmp6type_match( > + ICMP6(ulp)->icmp6_type, > + (ipfw_insn_u32 *)cmd); > + break; > +#endif /* INET6 */ > + > + case O_IPOPT: > + match = (is_ipv4 && > + ipopts_match(ip, cmd) ); > + break; > + > + case O_IPVER: > + match = (is_ipv4 && > + cmd->arg1 == ip->ip_v); > + break; > + > + case O_IPID: > + case O_IPLEN: > + case O_IPTTL: > + if (is_ipv4) { /* only for IP packets */ > + uint16_t x; > + uint16_t *p; > + int i; > + > + if (cmd->opcode == O_IPLEN) > + x = iplen; > + else if (cmd->opcode == O_IPTTL) > + x = ip->ip_ttl; > + else /* must be IPID */ > + x = ntohs(ip->ip_id); > + if (cmdlen == 
1) { > + match = (cmd->arg1 == x); > + break; > + } > + /* otherwise we have ranges */ > + p = ((ipfw_insn_u16 *)cmd)->ports; > + i = cmdlen - 1; > + for (; !match && i>0; i--, p += 2) > + match = (x >= p[0] && x <= p[1]); > + } > + break; > + > + case O_IPPRECEDENCE: > + match = (is_ipv4 && > + (cmd->arg1 == (ip->ip_tos & 0xe0)) ); > + break; > + > + case O_IPTOS: > + match = (is_ipv4 && > + flags_match(cmd, ip->ip_tos)); > + break; > + > + case O_DSCP: > + { > + uint32_t *p; > + uint16_t x; > + > + p = ((ipfw_insn_u32 *)cmd)->d; > + > + if (is_ipv4) > + x = ip->ip_tos >> 2; > + else if (is_ipv6) { > + uint8_t *v; > + v = &((struct ip6_hdr *)ip)->ip6_vfc; > + x = (*v & 0x0F) << 2; > + v++; > + x |= *v >> 6; > + } else > + break; > + > + /* DSCP bitmask is stored as low_u32 high_u32 */ > + if (x > 32) > + match = *(p + 1) & (1 << (x - 32)); > + else > + match = *p & (1 << x); > + } > + break; > + > + case O_TCPDATALEN: > + if (proto == IPPROTO_TCP && offset == 0) { > + struct tcphdr *tcp; > + uint16_t x; > + uint16_t *p; > + int i; > + > + tcp = TCP(ulp); > + x = iplen - > + ((ip->ip_hl + tcp->th_off) << 2); > + if (cmdlen == 1) { > + match = (cmd->arg1 == x); > + break; > + } > + /* otherwise we have ranges */ > + p = ((ipfw_insn_u16 *)cmd)->ports; > + i = cmdlen - 1; > + for (; !match && i>0; i--, p += 2) > + match = (x >= p[0] && x <= p[1]); > + } > + break; > + > + case O_TCPFLAGS: > + match = (proto == IPPROTO_TCP && offset == 0 && > + flags_match(cmd, TCP(ulp)->th_flags)); > + break; > + > + case O_TCPOPTS: > + if (proto == IPPROTO_TCP && offset == 0 && ulp){ > + PULLUP_LEN(hlen, ulp, > + (TCP(ulp)->th_off << 2)); > + match = tcpopts_match(TCP(ulp), cmd); > + } > + break; > + > + case O_TCPSEQ: > + match = (proto == IPPROTO_TCP && offset == 0 && > + ((ipfw_insn_u32 *)cmd)->d[0] == > + TCP(ulp)->th_seq); > + break; > + > + case O_TCPACK: > + match = (proto == IPPROTO_TCP && offset == 0 && > + ((ipfw_insn_u32 *)cmd)->d[0] == > + TCP(ulp)->th_ack); > + break; > 
+ > + case O_TCPWIN: > + if (proto == IPPROTO_TCP && offset == 0) { > + uint16_t x; > + uint16_t *p; > + int i; > + > + x = ntohs(TCP(ulp)->th_win); > + if (cmdlen == 1) { > + match = (cmd->arg1 == x); > + break; > + } > + /* Otherwise we have ranges. */ > + p = ((ipfw_insn_u16 *)cmd)->ports; > + i = cmdlen - 1; > + for (; !match && i > 0; i--, p += 2) > + match = (x >= p[0] && x <= p[1]); > + } > + break; > + > + case O_ESTAB: > + /* reject packets which have SYN only */ > + /* XXX should i also check for TH_ACK ? */ > + match = (proto == IPPROTO_TCP && offset == 0 && > + (TCP(ulp)->th_flags & > + (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); > + break; > + > + case O_ALTQ: { > + struct pf_mtag *at; > + struct m_tag *mtag; > + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; > + > + /* > + * ALTQ uses mbuf tags from another > + * packet filtering system - pf(4). > + * We allocate a tag in its format > + * and fill it in, pretending to be pf(4). > + */ > + match = 1; > + at = pf_find_mtag(m); > + if (at != NULL && at->qid != 0) > + break; > + mtag = m_tag_get(PACKET_TAG_PF, > + sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); > + if (mtag == NULL) { > + /* > + * Let the packet fall back to the > + * default ALTQ. > + */ > + break; > + } > + m_tag_prepend(m, mtag); > + at = (struct pf_mtag *)(mtag + 1); > + at->qid = altq->qid; > + at->hdr = ip; > + break; > + } > + > + case O_LOG: > + ipfw_log(chain, f, hlen, args, m, > + oif, offset | ip6f_mf, tablearg, ip); > + match = 1; > + break; > + > + case O_PROB: > + match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); > + break; > + > + case O_VERREVPATH: > + /* Outgoing packets automatically pass/match */ > + match = ((oif != NULL) || > + (m->m_pkthdr.rcvif == NULL) || > + ( > +#ifdef INET6 > + is_ipv6 ? 
> + verify_path6(&(args->f_id.src_ip6), > + m->m_pkthdr.rcvif, args->f_id.fib) : > +#endif > + verify_path(src_ip, m->m_pkthdr.rcvif, > + args->f_id.fib))); > + break; > + > + case O_VERSRCREACH: > + /* Outgoing packets automatically pass/match */ > + match = (hlen > 0 && ((oif != NULL) || > +#ifdef INET6 > + is_ipv6 ? > + verify_path6(&(args->f_id.src_ip6), > + NULL, args->f_id.fib) : > +#endif > + verify_path(src_ip, NULL, args->f_id.fib))); > + break; > + > + case O_ANTISPOOF: > + /* Outgoing packets automatically pass/match */ > + if (oif == NULL && hlen > 0 && > + ( (is_ipv4 && in_localaddr(src_ip)) > +#ifdef INET6 > + || (is_ipv6 && > + in6_localaddr(&(args->f_id.src_ip6))) > +#endif > + )) > + match = > +#ifdef INET6 > + is_ipv6 ? verify_path6( > + &(args->f_id.src_ip6), > + m->m_pkthdr.rcvif, > + args->f_id.fib) : > +#endif > + verify_path(src_ip, > + m->m_pkthdr.rcvif, > + args->f_id.fib); > + else > + match = 1; > + break; > + > + case O_IPSEC: > +#ifdef IPSEC > + match = (m_tag_find(m, > + PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); > +#endif > + /* otherwise no match */ > + break; > + > +#ifdef INET6 > + case O_IP6_SRC: > + match = is_ipv6 && > + IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, > + &((ipfw_insn_ip6 *)cmd)->addr6); > + break; > + > + case O_IP6_DST: > + match = is_ipv6 && > + IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, > + &((ipfw_insn_ip6 *)cmd)->addr6); > + break; > + case O_IP6_SRC_MASK: > + case O_IP6_DST_MASK: > + if (is_ipv6) { > + int i = cmdlen - 1; > + struct in6_addr p; > + struct in6_addr *d = > + &((ipfw_insn_ip6 *)cmd)->addr6; > + > + for (; !match && i > 0; d += 2, > + i -= F_INSN_SIZE(struct in6_addr) > + * 2) { > + p = (cmd->opcode == > + O_IP6_SRC_MASK) ? 
> + args->f_id.src_ip6: > + args->f_id.dst_ip6; > + APPLY_MASK(&p, &d[1]); > + match = > + IN6_ARE_ADDR_EQUAL(&d[0], > + &p); > + } > + } > + break; > + > + case O_FLOW6ID: > + match = is_ipv6 && > + flow6id_match(args->f_id.flow_id6, > + (ipfw_insn_u32 *) cmd); > + break; > + > + case O_EXT_HDR: > + match = is_ipv6 && > + (ext_hd & ((ipfw_insn *) cmd)->arg1); > + break; > + > + case O_IP6: > + match = is_ipv6; > + break; > +#endif > + > + case O_IP4: > + match = is_ipv4; > + break; > + > + case O_TAG: { > + struct m_tag *mtag; > + uint32_t tag = TARG(cmd->arg1, tag); > + > + /* Packet is already tagged with this tag? */ > + mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); > + > + /* We have `untag' action when F_NOT flag is > + * present. And we must remove this mtag from > + * mbuf and reset `match' to zero (`match' will > + * be inversed later). > + * Otherwise we should allocate new mtag and > + * push it into mbuf. > + */ > + if (cmd->len & F_NOT) { /* `untag' action */ > + if (mtag != NULL) > + m_tag_delete(m, mtag); > + match = 0; > + } else { > + if (mtag == NULL) { > + mtag = m_tag_alloc( MTAG_IPFW, > + tag, 0, M_NOWAIT); > + if (mtag != NULL) > + m_tag_prepend(m, mtag); > + } > + match = 1; > + } > + break; > + } > + > + case O_FIB: /* try match the specified fib */ > + if (args->f_id.fib == cmd->arg1) > + match = 1; > + break; > + > + case O_SOCKARG: { > +#ifndef USERSPACE /* not supported in userspace */ > + struct inpcb *inp = args->inp; > + struct inpcbinfo *pi; > + > + if (is_ipv6) /* XXX can we remove this ? */ > + break; > + > + if (proto == IPPROTO_TCP) > + pi = &V_tcbinfo; > + else if (proto == IPPROTO_UDP) > + pi = &V_udbinfo; > + else > + break; > + > + /* > + * XXXRW: so_user_cookie should almost > + * certainly be inp_user_cookie? 
> + */ > + > + /* For incomming packet, lookup up the > + inpcb using the src/dest ip/port tuple */ > + if (inp == NULL) { > + inp = in_pcblookup(pi, > + src_ip, htons(src_port), > + dst_ip, htons(dst_port), > + INPLOOKUP_RLOCKPCB, NULL); > + if (inp != NULL) { > + tablearg = > + inp->inp_socket->so_user_cookie; > + if (tablearg) > + match = 1; > + INP_RUNLOCK(inp); > + } > + } else { > + if (inp->inp_socket) { > + tablearg = > + inp->inp_socket->so_user_cookie; > + if (tablearg) > + match = 1; > + } > + } > +#endif /* !USERSPACE */ > + break; > + } > + > + case O_TAGGED: { > + struct m_tag *mtag; > + uint32_t tag = TARG(cmd->arg1, tag); > + > + if (cmdlen == 1) { > + match = m_tag_locate(m, MTAG_IPFW, > + tag, NULL) != NULL; > + break; > + } > + > + /* we have ranges */ > + for (mtag = m_tag_first(m); > + mtag != NULL && !match; > + mtag = m_tag_next(m, mtag)) { > + uint16_t *p; > + int i; > + > + if (mtag->m_tag_cookie != MTAG_IPFW) > + continue; > + > + p = ((ipfw_insn_u16 *)cmd)->ports; > + i = cmdlen - 1; > + for(; !match && i > 0; i--, p += 2) > + match = > + mtag->m_tag_id >= p[0] && > + mtag->m_tag_id <= p[1]; > + } > + break; > + } > + > + /* > + * The second set of opcodes represents 'actions', > + * i.e. the terminal part of a rule once the packet > + * matches all previous patterns. > + * Typically there is only one action for each rule, > + * and the opcode is stored at the end of the rule > + * (but there are exceptions -- see below). > + * > + * In general, here we set retval and terminate the > + * outer loop (would be a 'break 3' in some language, > + * but we need to set l=0, done=1) > + * > + * Exceptions: > + * O_COUNT and O_SKIPTO actions: > + * instead of terminating, we jump to the next rule > + * (setting l=0), or to the SKIPTO target (setting > + * f/f_len, cmd and l as needed), respectively. 
> + * > + * O_TAG, O_LOG and O_ALTQ action parameters: > + * perform some action and set match = 1; > + * > + * O_LIMIT and O_KEEP_STATE: these opcodes are > + * not real 'actions', and are stored right > + * before the 'action' part of the rule. > + * These opcodes try to install an entry in the > + * state tables; if successful, we continue with > + * the next opcode (match=1; break;), otherwise > + * the packet must be dropped (set retval, > + * break loops with l=0, done=1) > + * > + * O_PROBE_STATE and O_CHECK_STATE: these opcodes > + * cause a lookup of the state table, and a jump > + * to the 'action' part of the parent rule > + * if an entry is found, or > + * (CHECK_STATE only) a jump to the next rule if > + * the entry is not found. > + * The result of the lookup is cached so that > + * further instances of these opcodes become NOPs. > + * The jump to the next rule is done by setting > + * l=0, cmdlen=0. > + */ > + case O_LIMIT: > + case O_KEEP_STATE: > + if (ipfw_install_state(chain, f, > + (ipfw_insn_limit *)cmd, args, tablearg)) { > + /* error or limit violation */ > + retval = IP_FW_DENY; > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + } > + match = 1; > + break; > + > + case O_PROBE_STATE: > + case O_CHECK_STATE: > + /* > + * dynamic rules are checked at the first > + * keep-state or check-state occurrence, > + * with the result being stored in dyn_dir. > + * The compiler introduces a PROBE_STATE > + * instruction for us when we have a > + * KEEP_STATE (because PROBE_STATE needs > + * to be run first). > + */ > + if (dyn_dir == MATCH_UNKNOWN && > + (q = ipfw_lookup_dyn_rule(&args->f_id, > + &dyn_dir, proto == IPPROTO_TCP ? > + TCP(ulp) : NULL)) > + != NULL) { > + /* > + * Found dynamic entry, update stats > + * and jump to the 'action' part of > + * the parent rule by setting > + * f, cmd, l and clearing cmdlen. 
> + */ > + IPFW_INC_DYN_COUNTER(q, pktlen); > + /* XXX we would like to have f_pos > + * readily accessible in the dynamic > + * rule, instead of having to > + * lookup q->rule. > + */ > + f = q->rule; > + f_pos = ipfw_find_rule(chain, > + f->rulenum, f->id); > + cmd = ACTION_PTR(f); > + l = f->cmd_len - f->act_ofs; > + ipfw_dyn_unlock(q); > + cmdlen = 0; > + match = 1; > + break; > + } > + /* > + * Dynamic entry not found. If CHECK_STATE, > + * skip to next rule, if PROBE_STATE just > + * ignore and continue with next opcode. > + */ > + if (cmd->opcode == O_CHECK_STATE) > + l = 0; /* exit inner loop */ > + match = 1; > + break; > + > + case O_ACCEPT: > + retval = 0; /* accept */ > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + break; > + > + case O_PIPE: > + case O_QUEUE: > + set_match(args, f_pos, chain); > + args->rule.info = TARG(cmd->arg1, pipe); > + if (cmd->opcode == O_PIPE) > + args->rule.info |= IPFW_IS_PIPE; > + if (V_fw_one_pass) > + args->rule.info |= IPFW_ONEPASS; > + retval = IP_FW_DUMMYNET; > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + break; > + > + case O_DIVERT: > + case O_TEE: > + if (args->eh) /* not on layer 2 */ > + break; > + /* otherwise this is terminal */ > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + retval = (cmd->opcode == O_DIVERT) ? > + IP_FW_DIVERT : IP_FW_TEE; > + set_match(args, f_pos, chain); > + args->rule.info = TARG(cmd->arg1, divert); > + break; > + > + case O_COUNT: > + IPFW_INC_RULE_COUNTER(f, pktlen); > + l = 0; /* exit inner loop */ > + break; > + > + case O_SKIPTO: > + IPFW_INC_RULE_COUNTER(f, pktlen); > + f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); > + /* > + * Skip disabled rules, and re-enter > + * the inner loop with the correct > + * f_pos, f, l and cmd. 
> + * Also clear cmdlen and skip_or > + */ > + for (; f_pos < chain->n_rules - 1 && > + (V_set_disable & > + (1 << chain->map[f_pos]->set)); > + f_pos++) > + ; > + /* Re-enter the inner loop at the skipto rule. */ > + f = chain->map[f_pos]; > + l = f->cmd_len; > + cmd = f->cmd; > + match = 1; > + cmdlen = 0; > + skip_or = 0; > + continue; > + break; /* not reached */ > + > + case O_CALLRETURN: { > + /* > + * Implementation of `subroutine' call/return, > + * in the stack carried in an mbuf tag. This > + * is different from `skipto' in that any call > + * address is possible (`skipto' must prevent > + * backward jumps to avoid endless loops). > + * We have `return' action when F_NOT flag is > + * present. The `m_tag_id' field is used as > + * stack pointer. > + */ > + struct m_tag *mtag; > + uint16_t jmpto, *stack; > + > +#define IS_CALL ((cmd->len & F_NOT) == 0) > +#define IS_RETURN ((cmd->len & F_NOT) != 0) > + /* > + * Hand-rolled version of m_tag_locate() with > + * wildcard `type'. > + * If not already tagged, allocate new tag. > + */ > + mtag = m_tag_first(m); > + while (mtag != NULL) { > + if (mtag->m_tag_cookie == > + MTAG_IPFW_CALL) > + break; > + mtag = m_tag_next(m, mtag); > + } > + if (mtag == NULL && IS_CALL) { > + mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, > + IPFW_CALLSTACK_SIZE * > + sizeof(uint16_t), M_NOWAIT); > + if (mtag != NULL) > + m_tag_prepend(m, mtag); > + } > + > + /* > + * On error both `call' and `return' just > + * continue with next rule. 
> + */ > + if (IS_RETURN && (mtag == NULL || > + mtag->m_tag_id == 0)) { > + l = 0; /* exit inner loop */ > + break; > + } > + if (IS_CALL && (mtag == NULL || > + mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { > + printf("ipfw: call stack error, " > + "go to next rule\n"); > + l = 0; /* exit inner loop */ > + break; > + } > + > + IPFW_INC_RULE_COUNTER(f, pktlen); > + stack = (uint16_t *)(mtag + 1); > + > + /* > + * The `call' action may use cached f_pos > + * (in f->next_rule), whose version is written > + * in f->next_rule. > + * The `return' action, however, doesn't have > + * fixed jump address in cmd->arg1 and can't use > + * cache. > + */ > + if (IS_CALL) { > + stack[mtag->m_tag_id] = f->rulenum; > + mtag->m_tag_id++; > + f_pos = JUMP(chain, f, cmd->arg1, > + tablearg, 1); > + } else { /* `return' action */ > + mtag->m_tag_id--; > + jmpto = stack[mtag->m_tag_id] + 1; > + f_pos = ipfw_find_rule(chain, jmpto, 0); > + } > + > + /* > + * Skip disabled rules, and re-enter > + * the inner loop with the correct > + * f_pos, f, l and cmd. > + * Also clear cmdlen and skip_or > + */ > + for (; f_pos < chain->n_rules - 1 && > + (V_set_disable & > + (1 << chain->map[f_pos]->set)); f_pos++) > + ; > + /* Re-enter the inner loop at the dest rule. */ > + f = chain->map[f_pos]; > + l = f->cmd_len; > + cmd = f->cmd; > + cmdlen = 0; > + skip_or = 0; > + continue; > + break; /* NOTREACHED */ > + } > +#undef IS_CALL > +#undef IS_RETURN > + > + case O_REJECT: > + /* > + * Drop the packet and send a reject notice > + * if the packet is not ICMP (or is an ICMP > + * query), and it is not multicast/broadcast. 
> + */ > + if (hlen > 0 && is_ipv4 && offset == 0 && > + (proto != IPPROTO_ICMP || > + is_icmp_query(ICMP(ulp))) && > + !(m->m_flags & (M_BCAST|M_MCAST)) && > + !IN_MULTICAST(ntohl(dst_ip.s_addr))) { > + send_reject(args, cmd->arg1, iplen, ip); > + m = args->m; > + } > + /* FALLTHROUGH */ > +#ifdef INET6 > + case O_UNREACH6: > + if (hlen > 0 && is_ipv6 && > + ((offset & IP6F_OFF_MASK) == 0) && > + (proto != IPPROTO_ICMPV6 || > + (is_icmp6_query(icmp6_type) == 1)) && > + !(m->m_flags & (M_BCAST|M_MCAST)) && > + !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { > + send_reject6( > + args, cmd->arg1, hlen, > + (struct ip6_hdr *)ip); > + m = args->m; > + } > + /* FALLTHROUGH */ > +#endif > + case O_DENY: > + retval = IP_FW_DENY; > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + break; > + > + case O_FORWARD_IP: > +#ifndef USERSPACE /* allow forward in userspace */ > + if (args->eh) /* not valid on layer2 pkts */ > + break; > +#endif /* !USERSPACE */ > + if (q == NULL || q->rule != f || > + dyn_dir == MATCH_FORWARD) { > + struct sockaddr_in *sa; > + sa = &(((ipfw_insn_sa *)cmd)->sa); > + if (sa->sin_addr.s_addr == INADDR_ANY) { > + bcopy(sa, &args->hopstore, > + sizeof(*sa)); > + args->hopstore.sin_addr.s_addr = > + htonl(tablearg); > + args->next_hop = &args->hopstore; > + } else { > + args->next_hop = sa; > + } > + } > + retval = IP_FW_PASS; > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + break; > + > +#ifdef INET6 > + case O_FORWARD_IP6: > + if (args->eh) /* not valid on layer2 pkts */ > + break; > + if (q == NULL || q->rule != f || > + dyn_dir == MATCH_FORWARD) { > + struct sockaddr_in6 *sin6; > + > + sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); > + args->next_hop6 = sin6; > + } > + retval = IP_FW_PASS; > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + break; > +#endif > + > + case O_NETGRAPH: > + case O_NGTEE: > + set_match(args, f_pos, chain); > + args->rule.info = TARG(cmd->arg1, netgraph); > + if 
(V_fw_one_pass) > + args->rule.info |= IPFW_ONEPASS; > + retval = (cmd->opcode == O_NETGRAPH) ? > + IP_FW_NETGRAPH : IP_FW_NGTEE; > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + break; > + > + case O_SETFIB: { > + uint32_t fib; > + > + IPFW_INC_RULE_COUNTER(f, pktlen); > + fib = TARG(cmd->arg1, fib) & 0x7FFFF; > + if (fib >= rt_numfibs) > + fib = 0; > + M_SETFIB(m, fib); > + args->f_id.fib = fib; > + l = 0; /* exit inner loop */ > + break; > + } > + > + case O_SETDSCP: { > + uint16_t code; > + > + code = TARG(cmd->arg1, dscp) & 0x3F; > + l = 0; /* exit inner loop */ > + if (is_ipv4) { > + uint16_t a; > + > + a = ip->ip_tos; > + ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); > + a += ntohs(ip->ip_sum) - ip->ip_tos; > + ip->ip_sum = htons(a); > + } else if (is_ipv6) { > + uint8_t *v; > + > + v = &((struct ip6_hdr *)ip)->ip6_vfc; > + *v = (*v & 0xF0) | (code >> 2); > + v++; > + *v = (*v & 0x3F) | ((code & 0x03) << 6); > + } else > + break; > + > + IPFW_INC_RULE_COUNTER(f, pktlen); > + break; > + } > + > + case O_NAT: > + l = 0; /* exit inner loop */ > + done = 1; /* exit outer loop */ > + if (!IPFW_NAT_LOADED) { > + retval = IP_FW_DENY; > + break; > + } > + > + struct cfg_nat *t; > + int nat_id; > + > + set_match(args, f_pos, chain); > + /* Check if this is 'global' nat rule */ > + if (cmd->arg1 == 0) { > + retval = ipfw_nat_ptr(args, NULL, m); > + break; > + } > + t = ((ipfw_insn_nat *)cmd)->nat; > + if (t == NULL) { > + nat_id = TARG(cmd->arg1, nat); > + t = (*lookup_nat_ptr)(&chain->nat, nat_id); > + > + if (t == NULL) { > + retval = IP_FW_DENY; > + break; > + } > + if (cmd->arg1 != IP_FW_TARG) > + ((ipfw_insn_nat *)cmd)->nat = t; > + } > + retval = ipfw_nat_ptr(args, t, m); > + break; > + > + case O_REASS: { > + int ip_off; > + > + IPFW_INC_RULE_COUNTER(f, pktlen); > + l = 0; /* in any case exit inner loop */ > + ip_off = ntohs(ip->ip_off); > + > + /* if not fragmented, go to next rule */ > + if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) > + 
break; > + > + args->m = m = ip_reass(m); > + > + /* > + * do IP header checksum fixup. > + */ > + if (m == NULL) { /* fragment got swallowed */ > + retval = IP_FW_DENY; > + } else { /* good, packet complete */ > + int hlen; > + > + ip = mtod(m, struct ip *); > + hlen = ip->ip_hl << 2; > + ip->ip_sum = 0; > + if (hlen == sizeof(struct ip)) > + ip->ip_sum = in_cksum_hdr(ip); > + else > + ip->ip_sum = in_cksum(m, hlen); > + retval = IP_FW_REASS; > + set_match(args, f_pos, chain); > + } > + done = 1; /* exit outer loop */ > + break; > + } > + > + default: > + panic("-- unknown opcode %d\n", cmd->opcode); > + } /* end of switch() on opcodes */ > + /* > + * if we get here with l=0, then match is irrelevant. > + */ > + > + if (cmd->len & F_NOT) > + match = !match; > + > + if (match) { > + if (cmd->len & F_OR) > + skip_or = 1; > + } else { > + if (!(cmd->len & F_OR)) /* not an OR block, */ > + break; /* try next rule */ > + } > + > + } /* end of inner loop, scan opcodes */ > +#undef PULLUP_LEN > + > + if (done) > + break; > + > +/* next_rule:; */ /* try next rule */ > + > + } /* end of outer for, scan rules */ > + > + if (done) { > + struct ip_fw *rule = chain->map[f_pos]; > + /* Update statistics */ > + IPFW_INC_RULE_COUNTER(rule, pktlen); > + } else { > + retval = IP_FW_DENY; > + printf("ipfw: ouch!, skip past end of rules, denying packet\n"); > + } > + IPFW_PF_RUNLOCK(chain); > +#ifdef __FreeBSD__ > + if (ucred_cache != NULL) > + crfree(ucred_cache); > +#endif > + return (retval); > + > +pullup_failed: > + if (V_fw_verbose) > + printf("ipfw: pullup failed\n"); > + return (IP_FW_DENY); > +} > + > +/* > + * Set maximum number of tables that can be used in given VNET ipfw instance. 
> + */ > +#ifdef SYSCTL_NODE > +static int > +sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) > +{ > + int error; > + unsigned int ntables; > + > + ntables = V_fw_tables_max; > + > + error = sysctl_handle_int(oidp, &ntables, 0, req); > + /* Read operation or some error */ > + if ((error != 0) || (req->newptr == NULL)) > + return (error); > + > + return (ipfw_resize_tables(&V_layer3_chain, ntables)); > +} > + > +/* > + * Switches table namespace between global and per-set. > + */ > +static int > +sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) > +{ > + int error; > + unsigned int sets; > + > + sets = V_fw_tables_sets; > + > + error = sysctl_handle_int(oidp, &sets, 0, req); > + /* Read operation or some error */ > + if ((error != 0) || (req->newptr == NULL)) > + return (error); > + > + return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); > +} > +#endif > + > +/* > + * Module and VNET glue > + */ > + > +/* > + * Stuff that must be initialised only on boot or module load > + */ > +static int > +ipfw_init(void) > +{ > + int error = 0; > + > + /* > + * Only print out this stuff the first time around, > + * when called from the sysinit code. > + */ > + printf("ipfw2 " > +#ifdef INET6 > + "(+ipv6) " > +#endif > + "initialized, divert %s, nat %s, " > + "default to %s, logging ", > +#ifdef IPDIVERT > + "enabled", > +#else > + "loadable", > +#endif > +#ifdef IPFIREWALL_NAT > + "enabled", > +#else > + "loadable", > +#endif > + default_to_accept ? "accept" : "deny"); > + > + /* > + * Note: V_xxx variables can be accessed here but the vnet specific > + * initializer may not have been called yet for the VIMAGE case. > + * Tuneables will have been processed. We will print out values for > + * the default vnet. 
> + * XXX This should all be rationalized AFTER 8.0 > + */ > + if (V_fw_verbose == 0) > + printf("disabled\n"); > + else if (V_verbose_limit == 0) > + printf("unlimited\n"); > + else > + printf("limited to %d packets/entry by default\n", > + V_verbose_limit); > + > + /* Check user-supplied table count for validness */ > + if (default_fw_tables > IPFW_TABLES_MAX) > + default_fw_tables = IPFW_TABLES_MAX; > + > + ipfw_init_sopt_handler(); > + ipfw_log_bpf(1); /* init */ > + ipfw_iface_init(); > + return (error); > +} > + > +/* > + * Called for the removal of the last instance only on module unload. > + */ > +static void > +ipfw_destroy(void) > +{ > + > + ipfw_iface_destroy(); > + ipfw_log_bpf(0); /* uninit */ > + ipfw_destroy_sopt_handler(); > + printf("IP firewall unloaded\n"); > +} > + > +/* > + * Stuff that must be initialized for every instance > + * (including the first of course). > + */ > +static int > +vnet_ipfw_init(const void *unused) > +{ > + int error, first; > + struct ip_fw *rule = NULL; > + struct ip_fw_chain *chain; > + > + chain = &V_layer3_chain; > + > + first = IS_DEFAULT_VNET(curvnet) ? 
1 : 0; > + > + /* First set up some values that are compile time options */ > + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ > + V_fw_deny_unknown_exthdrs = 1; > +#ifdef IPFIREWALL_VERBOSE > + V_fw_verbose = 1; > +#endif > +#ifdef IPFIREWALL_VERBOSE_LIMIT > + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; > +#endif > +#ifdef IPFIREWALL_NAT > + LIST_INIT(&chain->nat); > +#endif > + > + ipfw_init_counters(); > + /* insert the default rule and create the initial map */ > + chain->n_rules = 1; > + chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); > + rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); > + > + /* Set initial number of tables */ > + V_fw_tables_max = default_fw_tables; > + error = ipfw_init_tables(chain, first); > + if (error) { > + printf("ipfw2: setting up tables failed\n"); > + free(chain->map, M_IPFW); > + free(rule, M_IPFW); > + return (ENOSPC); > + } > + > + /* fill and insert the default rule */ > + rule->act_ofs = 0; > + rule->rulenum = IPFW_DEFAULT_RULE; > + rule->cmd_len = 1; > + rule->set = RESVD_SET; > + rule->cmd[0].len = 1; > + rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; > + chain->default_rule = chain->map[0] = rule; > + chain->id = rule->id = 1; > + /* Pre-calculate rules length for legacy dump format */ > + chain->static_len = sizeof(struct ip_fw_rule0); > + > + IPFW_LOCK_INIT(chain); > + ipfw_dyn_init(chain); > +#ifdef LINEAR_SKIPTO > + ipfw_init_skipto_cache(chain); > +#endif > + > + /* First set up some values that are compile time options */ > + V_ipfw_vnet_ready = 1; /* Open for business */ > + > + /* > + * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. > + * Even if the latter two fail we still keep the module alive > + * because the sockopt and layer2 paths are still useful. > + * ipfw[6]_hook return 0 on success, ENOENT on failure, > + * so we can ignore the exact return value and just set a flag. 
> + * > + * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so > + * changes in the underlying (per-vnet) variables trigger > + * immediate hook()/unhook() calls. > + * In layer2 we have the same behaviour, except that V_ether_ipfw > + * is checked on each packet because there are no pfil hooks. > + */ > + V_ip_fw_ctl_ptr = ipfw_ctl3; > + error = ipfw_attach_hooks(1); > + return (error); > +} > + > +/* > + * Called for the removal of each instance. > + */ > +static int > +vnet_ipfw_uninit(const void *unused) > +{ > + struct ip_fw *reap; > + struct ip_fw_chain *chain = &V_layer3_chain; > + int i, last; > + > + V_ipfw_vnet_ready = 0; /* tell new callers to go away */ > + /* > + * disconnect from ipv4, ipv6, layer2 and sockopt. > + * Then grab, release and grab again the WLOCK so we make > + * sure the update is propagated and nobody will be in. > + */ > + (void)ipfw_attach_hooks(0 /* detach */); > + V_ip_fw_ctl_ptr = NULL; > + > + last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; > + > + IPFW_UH_WLOCK(chain); > + IPFW_UH_WUNLOCK(chain); > + IPFW_UH_WLOCK(chain); > + > + IPFW_WLOCK(chain); > + ipfw_dyn_uninit(0); /* run the callout_drain */ > + IPFW_WUNLOCK(chain); > + > + reap = NULL; > + IPFW_WLOCK(chain); > + for (i = 0; i < chain->n_rules; i++) > + ipfw_reap_add(chain, &reap, chain->map[i]); > + free(chain->map, M_IPFW); > +#ifdef LINEAR_SKIPTO > + ipfw_destroy_skipto_cache(chain); > +#endif > + IPFW_WUNLOCK(chain); > + IPFW_UH_WUNLOCK(chain); > + ipfw_destroy_tables(chain, last); > + if (reap != NULL) > + ipfw_reap_rules(reap); > + vnet_ipfw_iface_destroy(chain); > + IPFW_LOCK_DESTROY(chain); > + ipfw_dyn_uninit(1); /* free the remaining parts */ > + ipfw_destroy_counters(); > + return (0); > +} > + > +/* > + * Module event handler. > + * In general we have the choice of handling most of these events by the > + * event handler or by the (VNET_)SYS(UN)INIT handlers. 
I have chosen to > + * use the SYSINIT handlers as they are more capable of expressing the > + * flow of control during module and vnet operations, so this is just > + * a skeleton. Note there is no SYSINIT equivalent of the module > + * SHUTDOWN handler, but we don't have anything to do in that case anyhow. > + */ > +static int > +ipfw_modevent(module_t mod, int type, void *unused) > +{ > + int err = 0; > + > + switch (type) { > + case MOD_LOAD: > + /* Called once at module load or > + * system boot if compiled in. */ > + break; > + case MOD_QUIESCE: > + /* Called before unload. May veto unloading. */ > + break; > + case MOD_UNLOAD: > + /* Called during unload. */ > + break; > + case MOD_SHUTDOWN: > + /* Called during system shutdown. */ > + break; > + default: > + err = EOPNOTSUPP; > + break; > + } > + return err; > +} > + > +static moduledata_t ipfwmod = { > + "ipfw", > + ipfw_modevent, > + 0 > +}; > + > +/* Define startup order. */ > +#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN > +#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ > +#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ > +#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ > + > +DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); > +FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); > +MODULE_VERSION(ipfw, 3); > +/* should declare some dependencies here */ > + > +/* > + * Starting up. Done in order after ipfwmod() has been called. > + * VNET_SYSINIT is also called for each existing vnet and each new vnet. > + */ > +SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, > + ipfw_init, NULL); > +VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, > + vnet_ipfw_init, NULL); > + > +/* > + * Closing up shop. These are done in REVERSE ORDER, but still > + * after ipfwmod() has been called. Not called on reboot. > + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. 
> + * or when the module is unloaded. > + */ > +SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, > + ipfw_destroy, NULL); > +VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, > + vnet_ipfw_uninit, NULL); > +/* end of file */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c > new file mode 100644 > index 0000000..ba6f579 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c > @@ -0,0 +1,1604 @@ > +/*- > + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_dynamic.c 272840 2014-10-09 19:32:35Z melifaro $"); > + > +#define DEB(x) > +#define DDB(x) x > + > +/* > + * Dynamic rule support for ipfw > + */ > + > +#include "opt_ipfw.h" > +#include "opt_inet.h" > +#ifndef INET > +#error IPFIREWALL requires INET. > +#endif /* INET */ > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> > +#include <sys/kernel.h> > +#include <sys/ktr.h> > +#include <sys/lock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/sysctl.h> > +#include <sys/syslog.h> > +#include <net/ethernet.h> /* for ETHERTYPE_IP */ > +#include <net/if.h> > +#include <net/if_var.h> > +#include <net/vnet.h> > + > +#include <netinet/in.h> > +#include <netinet/ip.h> > +#include <netinet/ip_var.h> /* ip_defttl */ > +#include <netinet/ip_fw.h> > +#include <netinet/tcp_var.h> > +#include <netinet/udp.h> > + > +#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */ > +#ifdef INET6 > +#include <netinet6/in6_var.h> > +#include <netinet6/ip6_var.h> > +#endif > + > +#include <netpfil/ipfw/ip_fw_private.h> > + > +#include <machine/in_cksum.h> /* XXX for in_cksum */ > + > +#ifdef MAC > +#include <security/mac/mac_framework.h> > +#endif > + > +/* > + * Description of dynamic rules. > + * > + * Dynamic rules are stored in lists accessed through a hash table > + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can > + * be modified through the sysctl variable dyn_buckets which is > + * updated when the table becomes empty. > + * > + * XXX currently there is only one list, ipfw_dyn. > + * > + * When a packet is received, its address fields are first masked > + * with the mask defined for the rule, then hashed, then matched > + * against the entries in the corresponding list. 
> + * Dynamic rules can be used for different purposes: > + * + stateful rules; > + * + enforcing limits on the number of sessions; > + * + in-kernel NAT (not implemented yet) > + * > + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, > + * measured in seconds and depending on the flags. > + * > + * The total number of dynamic rules is equal to UMA zone items count. > + * The max number of dynamic rules is dyn_max. When we reach > + * the maximum number of rules we do not create anymore. This is > + * done to avoid consuming too much memory, but also too much > + * time when searching on each packet (ideally, we should try instead > + * to put a limit on the length of the list on each bucket...). > + * > + * Each dynamic rule holds a pointer to the parent ipfw rule so > + * we know what action to perform. Dynamic rules are removed when > + * the parent rule is deleted. This can be changed by dyn_keep_states > + * sysctl. > + * > + * There are some limitations with dynamic rules -- we do not > + * obey the 'randomized match', and we do not do multiple > + * passes through the firewall. XXX check the latter!!! 
> + */ > + > +struct ipfw_dyn_bucket { > + struct mtx mtx; /* Bucket protecting lock */ > + ipfw_dyn_rule *head; /* Pointer to first rule */ > +}; > + > +/* > + * Static variables followed by global ones > + */ > +static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v); > +static VNET_DEFINE(u_int32_t, dyn_buckets_max); > +static VNET_DEFINE(u_int32_t, curr_dyn_buckets); > +static VNET_DEFINE(struct callout, ipfw_timeout); > +#define V_ipfw_dyn_v VNET(ipfw_dyn_v) > +#define V_dyn_buckets_max VNET(dyn_buckets_max) > +#define V_curr_dyn_buckets VNET(curr_dyn_buckets) > +#define V_ipfw_timeout VNET(ipfw_timeout) > + > +static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone); > +#define V_ipfw_dyn_rule_zone VNET(ipfw_dyn_rule_zone) > + > +#define IPFW_BUCK_LOCK_INIT(b) \ > + mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF) > +#define IPFW_BUCK_LOCK_DESTROY(b) \ > + mtx_destroy(&(b)->mtx) > +#define IPFW_BUCK_LOCK(i) mtx_lock(&V_ipfw_dyn_v[(i)].mtx) > +#define IPFW_BUCK_UNLOCK(i) mtx_unlock(&V_ipfw_dyn_v[(i)].mtx) > +#define IPFW_BUCK_ASSERT(i) mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED) > + > + > +static VNET_DEFINE(int, dyn_keep_states); > +#define V_dyn_keep_states VNET(dyn_keep_states) > + > +/* > + * Timeouts for various events in handing dynamic rules. > + */ > +static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); > +static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); > +static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); > +static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); > +static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); > +static VNET_DEFINE(u_int32_t, dyn_short_lifetime); > + > +#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) > +#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) > +#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) > +#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) > +#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) > +#define V_dyn_short_lifetime VNET(dyn_short_lifetime) > + > +/* > + * Keepalives are sent if dyn_keepalive is set. 
They are sent every > + * dyn_keepalive_period seconds, in the last dyn_keepalive_interval > + * seconds of lifetime of a rule. > + * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower > + * than dyn_keepalive_period. > + */ > + > +static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); > +static VNET_DEFINE(u_int32_t, dyn_keepalive_period); > +static VNET_DEFINE(u_int32_t, dyn_keepalive); > +static VNET_DEFINE(time_t, dyn_keepalive_last); > + > +#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) > +#define V_dyn_keepalive_period VNET(dyn_keepalive_period) > +#define V_dyn_keepalive VNET(dyn_keepalive) > +#define V_dyn_keepalive_last VNET(dyn_keepalive_last) > + > +static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ > + > +#define DYN_COUNT uma_zone_get_cur(V_ipfw_dyn_rule_zone) > +#define V_dyn_max VNET(dyn_max) > + > +/* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */ > +static int ipfw_dyn_count; /* number of objects */ > + > +#ifdef USERSPACE /* emulation of UMA object counters for userspace */ > +#define uma_zone_get_cur(x) ipfw_dyn_count > +#endif /* USERSPACE */ > + > +static int last_log; /* Log ratelimiting */ > + > +static void ipfw_dyn_tick(void *vnetx); > +static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int); > +#ifdef SYSCTL_NODE > + > +static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS); > +static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS); > + > +SYSBEGIN(f2) > + > +SYSCTL_DECL(_net_inet_ip_fw); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, > + CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0, > + "Max number of dyn. buckets"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, > + CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, > + "Current Number of dyn. buckets"); > +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, > + CTLTYPE_UINT|CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU", > + "Number of dyn. 
rules"); > +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, > + CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU", > + "Max number of dyn. rules"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, > + CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, > + "Lifetime of dyn. rules for acks"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, > + CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, > + "Lifetime of dyn. rules for syn"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, > + CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, > + "Lifetime of dyn. rules for fin"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, > + CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, > + "Lifetime of dyn. rules for rst"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, > + CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, > + "Lifetime of dyn. rules for UDP"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, > + CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, > + "Lifetime of dyn. rules for other situations"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, > + CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, > + "Enable keepalives for dyn. 
rules"); > +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, > + CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, > + "Do not flush dynamic states on rule deletion"); > + > +SYSEND > + > +#endif /* SYSCTL_NODE */ > + > + > +#ifdef INET6 > +static __inline int > +hash_packet6(struct ipfw_flow_id *id) > +{ > + u_int32_t i; > + i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ > + (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ > + (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ > + (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ > + (id->dst_port) ^ (id->src_port); > + return i; > +} > +#endif > + > +/* > + * IMPORTANT: the hash function for dynamic rules must be commutative > + * in source and destination (ip,port), because rules are bidirectional > + * and we want to find both in the same bucket. > + */ > +static __inline int > +hash_packet(struct ipfw_flow_id *id, int buckets) > +{ > + u_int32_t i; > + > +#ifdef INET6 > + if (IS_IP6_FLOW_ID(id)) > + i = hash_packet6(id); > + else > +#endif /* INET6 */ > + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); > + i &= (buckets - 1); > + return i; > +} > + > +/** > + * Print customizable flow id description via log(9) facility. 
> + */ > +static void > +print_dyn_rule_flags(struct ipfw_flow_id *id, int dyn_type, int log_flags, > + char *prefix, char *postfix) > +{ > + struct in_addr da; > +#ifdef INET6 > + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; > +#else > + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; > +#endif > + > +#ifdef INET6 > + if (IS_IP6_FLOW_ID(id)) { > + ip6_sprintf(src, &id->src_ip6); > + ip6_sprintf(dst, &id->dst_ip6); > + } else > +#endif > + { > + da.s_addr = htonl(id->src_ip); > + inet_ntop(AF_INET, &da, src, sizeof(src)); > + da.s_addr = htonl(id->dst_ip); > + inet_ntop(AF_INET, &da, dst, sizeof(dst)); > + } > + log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n", > + prefix, dyn_type, src, id->src_port, dst, > + id->dst_port, DYN_COUNT, postfix); > +} > + > +#define print_dyn_rule(id, dtype, prefix, postfix) \ > + print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix) > + > +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) > +#define TIME_LE(a,b) ((int)((a)-(b)) < 0) > + > +/* > + * Lookup a dynamic rule, locked version. > + */ > +static ipfw_dyn_rule * > +lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction, > + struct tcphdr *tcp) > +{ > + /* > + * Stateful ipfw extensions. > + * Lookup into dynamic session queue. 
> + */ > +#define MATCH_REVERSE 0 > +#define MATCH_FORWARD 1 > +#define MATCH_NONE 2 > +#define MATCH_UNKNOWN 3 > + int dir = MATCH_NONE; > + ipfw_dyn_rule *prev, *q = NULL; > + > + IPFW_BUCK_ASSERT(i); > + > + for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) { > + if (q->dyn_type == O_LIMIT_PARENT && q->count) > + continue; > + > + if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT) > + continue; > + > + if (IS_IP6_FLOW_ID(pkt)) { > + if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) && > + IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) && > + pkt->src_port == q->id.src_port && > + pkt->dst_port == q->id.dst_port) { > + dir = MATCH_FORWARD; > + break; > + } > + if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) && > + IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) && > + pkt->src_port == q->id.dst_port && > + pkt->dst_port == q->id.src_port) { > + dir = MATCH_REVERSE; > + break; > + } > + } else { > + if (pkt->src_ip == q->id.src_ip && > + pkt->dst_ip == q->id.dst_ip && > + pkt->src_port == q->id.src_port && > + pkt->dst_port == q->id.dst_port) { > + dir = MATCH_FORWARD; > + break; > + } > + if (pkt->src_ip == q->id.dst_ip && > + pkt->dst_ip == q->id.src_ip && > + pkt->src_port == q->id.dst_port && > + pkt->dst_port == q->id.src_port) { > + dir = MATCH_REVERSE; > + break; > + } > + } > + } > + if (q == NULL) > + goto done; /* q = NULL, not found */ > + > + if (prev != NULL) { /* found and not in front */ > + prev->next = q->next; > + q->next = V_ipfw_dyn_v[i].head; > + V_ipfw_dyn_v[i].head = q; > + } > + if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ > + uint32_t ack; > + u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST); > + > +#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) > +#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) > +#define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8)) > +#define ACK_FWD 0x10000 /* fwd ack seen */ > +#define ACK_REV 0x20000 /* rev ack seen */ > + > + q->state |= (dir == 
MATCH_FORWARD) ? flags : (flags << 8); > + switch (q->state & TCP_FLAGS) { > + case TH_SYN: /* opening */ > + q->expire = time_uptime + V_dyn_syn_lifetime; > + break; > + > + case BOTH_SYN: /* move to established */ > + case BOTH_SYN | TH_FIN: /* one side tries to close */ > + case BOTH_SYN | (TH_FIN << 8): > +#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) > + if (tcp == NULL) > + break; > + > + ack = ntohl(tcp->th_ack); > + if (dir == MATCH_FORWARD) { > + if (q->ack_fwd == 0 || > + _SEQ_GE(ack, q->ack_fwd)) { > + q->ack_fwd = ack; > + q->state |= ACK_FWD; > + } > + } else { > + if (q->ack_rev == 0 || > + _SEQ_GE(ack, q->ack_rev)) { > + q->ack_rev = ack; > + q->state |= ACK_REV; > + } > + } > + if ((q->state & (ACK_FWD | ACK_REV)) == > + (ACK_FWD | ACK_REV)) { > + q->expire = time_uptime + V_dyn_ack_lifetime; > + q->state &= ~(ACK_FWD | ACK_REV); > + } > + break; > + > + case BOTH_SYN | BOTH_FIN: /* both sides closed */ > + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) > + V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; > + q->expire = time_uptime + V_dyn_fin_lifetime; > + break; > + > + default: > +#if 0 > + /* > + * reset or some invalid combination, but can also > + * occur if we use keep-state the wrong way. 
> + */ > + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) > + printf("invalid state: 0x%x\n", q->state); > +#endif > + if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) > + V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; > + q->expire = time_uptime + V_dyn_rst_lifetime; > + break; > + } > + } else if (pkt->proto == IPPROTO_UDP) { > + q->expire = time_uptime + V_dyn_udp_lifetime; > + } else { > + /* other protocols */ > + q->expire = time_uptime + V_dyn_short_lifetime; > + } > +done: > + if (match_direction != NULL) > + *match_direction = dir; > + return (q); > +} > + > +ipfw_dyn_rule * > +ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, > + struct tcphdr *tcp) > +{ > + ipfw_dyn_rule *q; > + int i; > + > + i = hash_packet(pkt, V_curr_dyn_buckets); > + > + IPFW_BUCK_LOCK(i); > + q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp); > + if (q == NULL) > + IPFW_BUCK_UNLOCK(i); > + /* NB: return table locked when q is not NULL */ > + return q; > +} > + > +/* > + * Unlock bucket mtx > + * @p - pointer to dynamic rule > + */ > +void > +ipfw_dyn_unlock(ipfw_dyn_rule *q) > +{ > + > + IPFW_BUCK_UNLOCK(q->bucket); > +} > + > +static int > +resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets) > +{ > + int i, k, nbuckets_old; > + ipfw_dyn_rule *q; > + struct ipfw_dyn_bucket *dyn_v, *dyn_v_old; > + > + /* Check if given number is power of 2 and less than 64k */ > + if ((nbuckets > 65536) || (!powerof2(nbuckets))) > + return 1; > + > + CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__, > + V_curr_dyn_buckets, nbuckets); > + > + /* Allocate and initialize new hash */ > + dyn_v = malloc(nbuckets * sizeof(ipfw_dyn_rule), M_IPFW, > + M_WAITOK | M_ZERO); > + > + for (i = 0 ; i < nbuckets; i++) > + IPFW_BUCK_LOCK_INIT(&dyn_v[i]); > + > + /* > + * Call upper half lock, as get_map() do to ease > + * read-only access to dynamic rules hash from sysctl > + */ > + IPFW_UH_WLOCK(chain); > + > + /* > + * Acquire chain write lock to permit hash access 
> + * for main traffic path without additional locks > + */ > + IPFW_WLOCK(chain); > + > + /* Save old values */ > + nbuckets_old = V_curr_dyn_buckets; > + dyn_v_old = V_ipfw_dyn_v; > + > + /* Skip relinking if array is not set up */ > + if (V_ipfw_dyn_v == NULL) > + V_curr_dyn_buckets = 0; > + > + /* Re-link all dynamic states */ > + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { > + while (V_ipfw_dyn_v[i].head != NULL) { > + /* Remove from current chain */ > + q = V_ipfw_dyn_v[i].head; > + V_ipfw_dyn_v[i].head = q->next; > + > + /* Get new hash value */ > + k = hash_packet(&q->id, nbuckets); > + q->bucket = k; > + /* Add to the new head */ > + q->next = dyn_v[k].head; > + dyn_v[k].head = q; > + } > + } > + > + /* Update current pointers/buckets values */ > + V_curr_dyn_buckets = nbuckets; > + V_ipfw_dyn_v = dyn_v; > + > + IPFW_WUNLOCK(chain); > + > + IPFW_UH_WUNLOCK(chain); > + > + /* Start periodic callout on initial creation */ > + if (dyn_v_old == NULL) { > + callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0); > + return (0); > + } > + > + /* Destroy all mutexes */ > + for (i = 0 ; i < nbuckets_old ; i++) > + IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]); > + > + /* Free old hash */ > + free(dyn_v_old, M_IPFW); > + > + return 0; > +} > + > +/** > + * Install state of type 'type' for a dynamic session. > + * The hash table contains two type of rules: > + * - regular rules (O_KEEP_STATE) > + * - rules for sessions with limited number of sess per user > + * (O_LIMIT). When they are created, the parent is > + * increased by 1, and decreased on delete. In this case, > + * the third parameter is the parent rule and not the chain. > + * - "parent" rules for the above (O_LIMIT_PARENT). 
> + */ > +static ipfw_dyn_rule * > +add_dyn_rule(struct ipfw_flow_id *id, int i, u_int8_t dyn_type, struct ip_fw *rule) > +{ > + ipfw_dyn_rule *r; > + > + IPFW_BUCK_ASSERT(i); > + > + r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); > + if (r == NULL) { > + if (last_log != time_uptime) { > + last_log = time_uptime; > + log(LOG_DEBUG, "ipfw: %s: Cannot allocate rule\n", > + __func__); > + } > + return NULL; > + } > + ipfw_dyn_count++; > + > + /* > + * refcount on parent is already incremented, so > + * it is safe to use parent unlocked. > + */ > + if (dyn_type == O_LIMIT) { > + ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; > + if ( parent->dyn_type != O_LIMIT_PARENT) > + panic("invalid parent"); > + r->parent = parent; > + rule = parent->rule; > + } > + > + r->id = *id; > + r->expire = time_uptime + V_dyn_syn_lifetime; > + r->rule = rule; > + r->dyn_type = dyn_type; > + IPFW_ZERO_DYN_COUNTER(r); > + r->count = 0; > + > + r->bucket = i; > + r->next = V_ipfw_dyn_v[i].head; > + V_ipfw_dyn_v[i].head = r; > + DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");) > + return r; > +} > + > +/** > + * lookup dynamic parent rule using pkt and rule as search keys. > + * If the lookup fails, then install one. 
> + */ > +static ipfw_dyn_rule * > +lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule) > +{ > + ipfw_dyn_rule *q; > + int i, is_v6; > + > + is_v6 = IS_IP6_FLOW_ID(pkt); > + i = hash_packet( pkt, V_curr_dyn_buckets ); > + *pindex = i; > + IPFW_BUCK_LOCK(i); /* NOTE: bucket i (*pindex) is still locked on every return below, including a NULL return; the caller unlocks it */ > + for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next) > + if (q->dyn_type == O_LIMIT_PARENT && > + rule== q->rule && > + pkt->proto == q->id.proto && > + pkt->src_port == q->id.src_port && > + pkt->dst_port == q->id.dst_port && > + ( > + (is_v6 && > + IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), > + &(q->id.src_ip6)) && > + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), > + &(q->id.dst_ip6))) || > + (!is_v6 && > + pkt->src_ip == q->id.src_ip && > + pkt->dst_ip == q->id.dst_ip) > + ) > + ) { > + q->expire = time_uptime + V_dyn_short_lifetime; /* a hit refreshes the parent with the short lifetime */ > + DEB(print_dyn_rule(pkt, q->dyn_type, > + "lookup_dyn_parent found", "");) > + return q; > + } > + > + /* Add virtual limiting rule */ > + return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule); > +} > + > +/** > + * Install dynamic state for rule type cmd->o.opcode > + * > + * Returns 1 (failure) if state is not installed because of errors or because > + * session limitations are enforced. > + * Returns 0 when the state was installed, and also when a matching > + * state is already present. O_KEEP_STATE and O_LIMIT are handled; > + * any other opcode is reported and treated as a failure. > + * NOTE(review): tablearg is not named in this body; presumably the > + * IP_FW_ARG_TABLEARG macro refers to it - confirm. > + */ > +int > +ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, > + ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) > +{ > + ipfw_dyn_rule *q; > + int i; > + > + DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state", "");) > + > + i = hash_packet(&args->f_id, V_curr_dyn_buckets); > + > + IPFW_BUCK_LOCK(i); > + > + q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL); > + > + if (q != NULL) { /* should never occur */ > + DEB( > + if (last_log != time_uptime) { > + last_log = time_uptime; > + printf("ipfw: %s: entry already present, done\n", > + __func__); > + }) > + IPFW_BUCK_UNLOCK(i); > + return (0); > + } > + > + /* > + * State limiting is done via uma(9) zone limiting.
> + * Save pointer to newly-installed rule and reject > + * packet if add_dyn_rule() returned NULL. > + * Note q is currently set to NULL. > + */ > + > + switch (cmd->o.opcode) { > + case O_KEEP_STATE: /* bidir rule */ > + q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule); > + break; > + > + case O_LIMIT: { /* limit number of sessions */ > + struct ipfw_flow_id id; > + ipfw_dyn_rule *parent; > + uint32_t conn_limit; > + uint16_t limit_mask = cmd->limit_mask; > + int pindex; > + > + conn_limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit); > + > + DEB( > + if (cmd->conn_limit == IP_FW_TARG) > + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " > + "(tablearg)\n", __func__, conn_limit); > + else > + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", > + __func__, conn_limit); > + ) > + > + id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; /* NOTE(review): id is an uninitialized local and only these four fields are cleared. For IPv6 flows id.src_ip6 and id.dst_ip6 are written below only when the matching limit_mask bit is set, so an unmasked IPv6 address appears to reach the IN6_ARE_ADDR_EQUAL() compare in lookup_dyn_parent() unset, unless struct ipfw_flow_id overlays it with the fields cleared here. Consider zeroing both first; confirm against the struct definition. */ > + id.proto = args->f_id.proto; > + id.addr_type = args->f_id.addr_type; > + id.fib = M_GETFIB(args->m); > + > + if (IS_IP6_FLOW_ID (&(args->f_id))) { > + if (limit_mask & DYN_SRC_ADDR) > + id.src_ip6 = args->f_id.src_ip6; > + if (limit_mask & DYN_DST_ADDR) > + id.dst_ip6 = args->f_id.dst_ip6; > + } else { > + if (limit_mask & DYN_SRC_ADDR) > + id.src_ip = args->f_id.src_ip; > + if (limit_mask & DYN_DST_ADDR) > + id.dst_ip = args->f_id.dst_ip; > + } > + if (limit_mask & DYN_SRC_PORT) > + id.src_port = args->f_id.src_port; > + if (limit_mask & DYN_DST_PORT) > + id.dst_port = args->f_id.dst_port; > + > + /* > + * We have to release lock for previous bucket to > + * avoid possible deadlock > + */ > + IPFW_BUCK_UNLOCK(i); > + > + if ((parent = lookup_dyn_parent(&id, &pindex, rule)) == NULL) { > + printf("ipfw: %s: add parent failed\n", __func__); > + IPFW_BUCK_UNLOCK(pindex); /* lookup_dyn_parent() returns with bucket pindex locked, even on failure */ > + return (1); > + } > + > + if (parent->count >= conn_limit) { > + if (V_fw_verbose && last_log != time_uptime) { > + last_log = time_uptime; > + char sbuf[24]; > + last_log = time_uptime; /* NOTE(review): redundant, same assignment as two statements above */ > + snprintf(sbuf, sizeof(sbuf), > + "%d drop session", > +
parent->rule->rulenum); > + print_dyn_rule_flags(&args->f_id, > + cmd->o.opcode, > + LOG_SECURITY | LOG_DEBUG, > + sbuf, "too many entries"); > + } > + IPFW_BUCK_UNLOCK(pindex); > + return (1); > + } > + /* Increment counter on parent */ > + parent->count++; > + IPFW_BUCK_UNLOCK(pindex); > + > + IPFW_BUCK_LOCK(i); > + q = add_dyn_rule(&args->f_id, i, O_LIMIT, (struct ip_fw *)parent); > + if (q == NULL) { > + /* Roll back the parent counter and notify caller */ > + IPFW_BUCK_UNLOCK(i); > + IPFW_BUCK_LOCK(pindex); > + parent->count--; > + IPFW_BUCK_UNLOCK(pindex); > + return (1); > + } > + break; > + } > + default: > + printf("ipfw: %s: unknown dynamic rule type %u\n", > + __func__, cmd->o.opcode); /* q is still NULL, so the check below fails the call */ > + } > + > + if (q == NULL) { > + IPFW_BUCK_UNLOCK(i); > + return (1); /* Notify caller about failure */ > + } > + > + /* XXX just set lifetime (the return value is not used) */ > + lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL); > + > + IPFW_BUCK_UNLOCK(i); > + return (0); > +} > + > +/* > + * Generate a TCP packet, containing either a RST or a keepalive. > + * When flags & TH_RST, we are sending a RST packet, because of a > + * "reset" action matched the packet. > + * Otherwise we are sending a keepalive, and flags & TH_ > + * The 'replyto' mbuf is the mbuf being replied to, if any, and is required > + * so that MAC can label the reply appropriately.
> + */ > +struct mbuf * > +ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, > + u_int32_t ack, int flags) > +{ > + struct mbuf *m = NULL; /* stupid compiler */ > + int len, dir; > + struct ip *h = NULL; /* stupid compiler */ > +#ifdef INET6 > + struct ip6_hdr *h6 = NULL; > +#endif > + struct tcphdr *th = NULL; > + > + MGETHDR(m, M_NOWAIT, MT_DATA); > + if (m == NULL) > + return (NULL); > + > + M_SETFIB(m, id->fib); > +#ifdef MAC > + if (replyto != NULL) > + mac_netinet_firewall_reply(replyto, m); > + else > + mac_netinet_firewall_send(m); > +#else > + (void)replyto; /* don't warn about unused arg */ > +#endif > + > + switch (id->addr_type) { > + case 4: > + len = sizeof(struct ip) + sizeof(struct tcphdr); > + break; > +#ifdef INET6 > + case 6: > + len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); > + break; > +#endif > + default: > + /* XXX: log me?!? */ > + FREE_PKT(m); > + return (NULL); > + } > + dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); > + > + m->m_data += max_linkhdr; > + m->m_flags |= M_SKIP_FIREWALL; > + m->m_pkthdr.len = m->m_len = len; > + m->m_pkthdr.rcvif = NULL; > + bzero(m->m_data, len); > + > + switch (id->addr_type) { > + case 4: > + h = mtod(m, struct ip *); > + > + /* prepare for checksum */ > + h->ip_p = IPPROTO_TCP; > + h->ip_len = htons(sizeof(struct tcphdr)); > + if (dir) { > + h->ip_src.s_addr = htonl(id->src_ip); > + h->ip_dst.s_addr = htonl(id->dst_ip); > + } else { > + h->ip_src.s_addr = htonl(id->dst_ip); > + h->ip_dst.s_addr = htonl(id->src_ip); > + } > + > + th = (struct tcphdr *)(h + 1); > + break; > +#ifdef INET6 > + case 6: > + h6 = mtod(m, struct ip6_hdr *); > + > + /* prepare for checksum */ > + h6->ip6_nxt = IPPROTO_TCP; > + h6->ip6_plen = htons(sizeof(struct tcphdr)); > + if (dir) { > + h6->ip6_src = id->src_ip6; > + h6->ip6_dst = id->dst_ip6; > + } else { > + h6->ip6_src = id->dst_ip6; > + h6->ip6_dst = id->src_ip6; > + } > + > + th = (struct tcphdr *)(h6 + 1); > + break; > +#endif > + } > + > 
+ if (dir) { > + th->th_sport = htons(id->src_port); > + th->th_dport = htons(id->dst_port); > + } else { > + th->th_sport = htons(id->dst_port); > + th->th_dport = htons(id->src_port); > + } > + th->th_off = sizeof(struct tcphdr) >> 2; > + > + if (flags & TH_RST) { > + if (flags & TH_ACK) { > + th->th_seq = htonl(ack); > + th->th_flags = TH_RST; > + } else { > + if (flags & TH_SYN) > + seq++; > + th->th_ack = htonl(seq); > + th->th_flags = TH_RST | TH_ACK; > + } > + } else { > + /* > + * Keepalive - use caller provided sequence numbers > + */ > + th->th_seq = htonl(seq); > + th->th_ack = htonl(ack); > + th->th_flags = TH_ACK; > + } > + > + switch (id->addr_type) { > + case 4: > + th->th_sum = in_cksum(m, len); > + > + /* finish the ip header */ > + h->ip_v = 4; > + h->ip_hl = sizeof(*h) >> 2; > + h->ip_tos = IPTOS_LOWDELAY; > + h->ip_off = htons(0); > + h->ip_len = htons(len); > + h->ip_ttl = V_ip_defttl; > + h->ip_sum = 0; > + break; > +#ifdef INET6 > + case 6: > + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), > + sizeof(struct tcphdr)); > + > + /* finish the ip6 header */ > + h6->ip6_vfc |= IPV6_VERSION; > + h6->ip6_hlim = IPV6_DEFHLIM; > + break; > +#endif > + } > + > + return (m); > +} > + > +/* > + * Queue keepalive packets for given dynamic rule > + */ > +static struct mbuf ** > +ipfw_dyn_send_ka(struct mbuf **mtailp, ipfw_dyn_rule *q) > +{ > + struct mbuf *m_rev, *m_fwd; > + > + m_rev = (q->state & ACK_REV) ? NULL : > + ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); > + m_fwd = (q->state & ACK_FWD) ? NULL : > + ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); > + > + if (m_rev != NULL) { > + *mtailp = m_rev; > + mtailp = &(*mtailp)->m_nextpkt; > + } > + if (m_fwd != NULL) { > + *mtailp = m_fwd; > + mtailp = &(*mtailp)->m_nextpkt; > + } > + > + return (mtailp); > +} > + > +/* > + * This procedure is used to perform various maintance > + * on dynamic hash list. Currently it is called every second. 
> + */ > +static void > +ipfw_dyn_tick(void * vnetx) > +{ > + struct ip_fw_chain *chain; > + int check_ka = 0; > +#ifdef VIMAGE > + struct vnet *vp = vnetx; > +#endif > + > + CURVNET_SET(vp); > + > + chain = &V_layer3_chain; > + > + /* Run keepalive checks every keepalive_period iff ka is enabled */ > + if ((V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) && > + (V_dyn_keepalive != 0)) { > + V_dyn_keepalive_last = time_uptime; > + check_ka = 1; > + } > + > + check_dyn_rules(chain, NULL, check_ka, 1); /* timer call: no deletion range, timer = 1 */ > + > + callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0); /* re-arm this handler with a delay of hz */ > + > + CURVNET_RESTORE(); > +} > + > + > +/* > + * Walk through all dynamic states doing generic maintenance: > + * 1) free expired states > + * 2) free all states based on deleted rule / set > + * 3) send keepalives for states if needed > + * > + * @chain - pointer to current ipfw rules chain > + * @rt - delete all states originated by rules matching this range if != NULL > + * @check_ka - perform checking/sending keepalives > + * @timer - indicate call from timer routine. > + * > + * Timer routine must call this function unlocked to permit > + * sending keepalives/resizing table. > + * > + * Other callers have to call this function with IPFW_UH_WLOCK held. > + * Additionally, the function assumes that dynamic rule/set is > + * ALREADY deleted so no new states can be generated by > + * 'deleted' rules.
> + * > + * Write lock is needed to ensure that unused parent rules > + * are not freed by other instance (see stage 2, 3) > + */ > +static void > +check_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt, > + int check_ka, int timer) > +{ > + struct mbuf *m0, *m, *mnext, **mtailp; > + struct ip *h; > + int i, dyn_count, new_buckets = 0, max_buckets; > + int expired = 0, expired_limits = 0, parents = 0, total = 0; > + ipfw_dyn_rule *q, *q_prev, *q_next; > + ipfw_dyn_rule *exp_head, **exptailp; > + ipfw_dyn_rule *exp_lhead, **expltailp; > + > + KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated", > + __func__)); > + > + /* Avoid possible LOR */ > + KASSERT(!check_ka || timer, ("%s: keepalive check with lock held", > + __func__)); > + > + /* > + * Do not perform any checks if we currently have no dynamic states > + */ > + if (DYN_COUNT == 0) > + return; > + > + /* Expired states */ > + exp_head = NULL; > + exptailp = &exp_head; > + > + /* Expired limit states */ > + exp_lhead = NULL; > + expltailp = &exp_lhead; > + > + /* > + * We make a chain of packets to go out here -- not deferring > + * until after we drop the IPFW dynamic rule lock would result > + * in a lock order reversal with the normal packet input -> ipfw > + * call stack.
> + */ > + m0 = NULL; > + mtailp = &m0; > + > + /* Protect from hash resizing */ > + if (timer != 0) > + IPFW_UH_WLOCK(chain); > + else > + IPFW_UH_WLOCK_ASSERT(chain); > + > +#define NEXT_RULE() { q_prev = q; q = q->next ; continue; } > + > + /* Stage 1: perform requested deletion */ > + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { > + IPFW_BUCK_LOCK(i); > + for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) { > + /* account every rule (NOTE(review): total is not read anywhere in this function) */ > + total++; > + > + /* Skip parent rules entirely (unused parents are removed in stage 3) */ > + if (q->dyn_type == O_LIMIT_PARENT) { > + parents++; > + NEXT_RULE(); > + } > + > + /* > + * Remove rules which are: > + * 1) expired > + * 2) matches deletion range > + */ > + if ((TIME_LEQ(q->expire, time_uptime)) || > + (rt != NULL && ipfw_match_range(q->rule, rt))) { > + if (TIME_LE(time_uptime, q->expire) && > + q->dyn_type == O_KEEP_STATE && > + V_dyn_keep_states != 0) { > + /* > + * Do not delete state if > + * it is not expired and > + * dyn_keep_states is ON. > + * However we need to re-link it > + * to any other stable rule > + */ > + q->rule = chain->default_rule; > + NEXT_RULE(); > + } > + > + /* Unlink q from current list */ > + q_next = q->next; > + if (q == V_ipfw_dyn_v[i].head) > + V_ipfw_dyn_v[i].head = q_next; > + else > + q_prev->next = q_next; > + > + q->next = NULL; > + > + /* queue q to expire list */ > + if (q->dyn_type != O_LIMIT) { > + *exptailp = q; > + exptailp = &(*exptailp)->next; > + DEB(print_dyn_rule(&q->id, q->dyn_type, > + "unlink entry", "left"); > + ) > + } else { > + /* Separate list for limit rules */ > + *expltailp = q; > + expltailp = &(*expltailp)->next; > + expired_limits++; > + DEB(print_dyn_rule(&q->id, q->dyn_type, > + "unlink limit entry", "left"); > + ) > + } > + > + q = q_next; > + expired++; > + continue; > + } > + > + /* > + * Check if we need to send keepalive: > + * we need to ensure it is time to do KA, > + * this is established TCP session, and > + * expire time is within keepalive interval > + */ > + if ((check_ka != 0) &&
(q->id.proto == IPPROTO_TCP) && > + ((q->state & BOTH_SYN) == BOTH_SYN) && > + (TIME_LEQ(q->expire, time_uptime + > + V_dyn_keepalive_interval))) > + mtailp = ipfw_dyn_send_ka(mtailp, q); > + > + NEXT_RULE(); > + } > + IPFW_BUCK_UNLOCK(i); > + } > + > + /* Stage 2: decrement counters from O_LIMIT parents */ > + if (expired_limits != 0) { > + /* > + * XXX: Note that deleting set with more than one > + * heavily-used LIMIT rules can result in overwhelming > + * locking due to lack of per-hash value sorting > + * > + * We should probably think about: > + * 1) pre-allocating hash of size, say, > + * MAX(16, V_curr_dyn_buckets / 1024) > + * 2) checking if expired_limits is large enough > + * 3) If yes, init hash (or its part), re-link > + * current list and start decrementing procedure in > + * each bucket separately > + */ > + > + /* > + * Small optimization: do not unlock bucket until > + * we see the next item resides in different bucket > + */ > + if (exp_lhead != NULL) { > + i = exp_lhead->parent->bucket; > + IPFW_BUCK_LOCK(i); > + } > + for (q = exp_lhead; q != NULL; q = q->next) { > + if (i != q->parent->bucket) { > + IPFW_BUCK_UNLOCK(i); > + i = q->parent->bucket; > + IPFW_BUCK_LOCK(i); > + } > + > + /* Decrease parent refcount */ > + q->parent->count--; > + } > + if (exp_lhead != NULL) > + IPFW_BUCK_UNLOCK(i); > + } > + > + /* > + * We protect ourselves from unused parent deletion > + * (from the timer function) by holding UH write lock.
> + */ > + > + /* Stage 3: remove unused parent rules */ > + if ((parents != 0) && (expired != 0)) { > + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { > + IPFW_BUCK_LOCK(i); > + for (q = V_ipfw_dyn_v[i].head, q_prev = q ; q ; ) { > + if (q->dyn_type != O_LIMIT_PARENT) > + NEXT_RULE(); > + > + if (q->count != 0) > + NEXT_RULE(); > + > + /* Parent rule without consumers */ > + > + /* Unlink q from current list */ > + q_next = q->next; > + if (q == V_ipfw_dyn_v[i].head) > + V_ipfw_dyn_v[i].head = q_next; > + else > + q_prev->next = q_next; > + > + q->next = NULL; > + > + /* Add to expired list */ > + *exptailp = q; > + exptailp = &(*exptailp)->next; > + > + DEB(print_dyn_rule(&q->id, q->dyn_type, > + "unlink parent entry", "left"); > + ) > + > + expired++; > + > + q = q_next; > + } > + IPFW_BUCK_UNLOCK(i); > + } > + } > + > +#undef NEXT_RULE > + > + if (timer != 0) { > + /* > + * Check if we need to resize hash: > + * if current number of states exceeds number of buckets in hash, > + * grow hash size to the minimum power of 2 which is bigger than > + * current states count. Limit hash size by 64k. > + * NOTE(review): the condition below only triggers once the > + * state count exceeds twice the current bucket count. > + */ > + max_buckets = (V_dyn_buckets_max > 65536) ?
> + 65536 : V_dyn_buckets_max; > + > + dyn_count = DYN_COUNT; > + > + if ((dyn_count > V_curr_dyn_buckets * 2) && > + (dyn_count < max_buckets)) { > + new_buckets = V_curr_dyn_buckets; > + while (new_buckets < dyn_count) { > + new_buckets *= 2; > + > + if (new_buckets >= max_buckets) > + break; > + } > + } > + > + IPFW_UH_WUNLOCK(chain); > + } > + > + /* Finally delete old states and limits if any */ > + for (q = exp_head; q != NULL; q = q_next) { > + q_next = q->next; > + uma_zfree(V_ipfw_dyn_rule_zone, q); > + ipfw_dyn_count--; > + } > + > + for (q = exp_lhead; q != NULL; q = q_next) { > + q_next = q->next; > + uma_zfree(V_ipfw_dyn_rule_zone, q); > + ipfw_dyn_count--; > + } > + > + /* > + * The rest of the code MUST be called from timer routine only > + * without holding any locks > + */ > + if (timer == 0) > + return; > + > + /* Send keepalive packets if any */ > + for (m = m0; m != NULL; m = mnext) { > + mnext = m->m_nextpkt; > + m->m_nextpkt = NULL; > + h = mtod(m, struct ip *); > + if (h->ip_v == 4) > + ip_output(m, NULL, NULL, 0, NULL, NULL); > +#ifdef INET6 > + else > + ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); > +#endif > + } > + > + /* Run table resize without holding any locks */ > + if (new_buckets != 0) > + resize_dynamic_table(chain, new_buckets); > +} > + > +/* > + * Deletes all dynamic rules originated by given rule or all rules in > + * given set. Specify RESVD_SET to indicate set should not be used. > + * @chain - pointer to current ipfw rules chain > + * @rt - delete all states originated by rules in matched range. > + * > + * Function has to be called with IPFW_UH_WLOCK held. > + * Additionally, the function assumes that dynamic rule/set is > + * ALREADY deleted so no new states can be generated by > + * 'deleted' rules. > + */ > +void > +ipfw_expire_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt) > +{ > + > + check_dyn_rules(chain, rt, 0, 0); > +} > + > +/* > + * Check if rule contains at least one dynamic opcode.
> + * > + * Returns 1 if such opcode is found, 0 otherwise. > + */ > +int > +ipfw_is_dyn_rule(struct ip_fw *rule) > +{ > + int cmdlen, l; > + ipfw_insn *cmd; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + switch (cmd->opcode) { > + case O_LIMIT: > + case O_KEEP_STATE: > + case O_PROBE_STATE: > + case O_CHECK_STATE: > + return (1); > + } > + } > + > + return (0); > +} > + > +void > +ipfw_dyn_init(struct ip_fw_chain *chain) > +{ > + > + V_ipfw_dyn_v = NULL; > + V_dyn_buckets_max = 256; /* must be power of 2 */ > + V_curr_dyn_buckets = 256; /* must be power of 2 */ > + > + V_dyn_ack_lifetime = 300; > + V_dyn_syn_lifetime = 20; > + V_dyn_fin_lifetime = 1; > + V_dyn_rst_lifetime = 1; > + V_dyn_udp_lifetime = 10; > + V_dyn_short_lifetime = 5; > + > + V_dyn_keepalive_interval = 20; > + V_dyn_keepalive_period = 5; > + V_dyn_keepalive = 1; /* do send keepalives */ > + V_dyn_keepalive_last = time_uptime; > + > + V_dyn_max = 4096; /* max # of dynamic rules */ > + > + V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", > + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, > + UMA_ALIGN_PTR, 0); > + > + /* Enforce limit on dynamic rules */ > + uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); > + > + callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); > + > + /* > + * This can potentially be done on first dynamic rule > + * being added to chain. 
> + */ > + resize_dynamic_table(chain, V_curr_dyn_buckets); > +} > + > +void > +ipfw_dyn_uninit(int pass) > +{ > + int i; > + > + if (pass == 0) { > + callout_drain(&V_ipfw_timeout); > + return; > + } > + > + if (V_ipfw_dyn_v != NULL) { > + /* > + * Skip deleting all dynamic states - > + * uma_zdestroy() does this more efficiently; > + */ > + > + /* Destroy all mutexes */ > + for (i = 0 ; i < V_curr_dyn_buckets ; i++) > + IPFW_BUCK_LOCK_DESTROY(&V_ipfw_dyn_v[i]); > + free(V_ipfw_dyn_v, M_IPFW); > + V_ipfw_dyn_v = NULL; > + } > + > + uma_zdestroy(V_ipfw_dyn_rule_zone); > +} > + > +#ifdef SYSCTL_NODE > +/* > + * Get/set maximum number of dynamic states in given VNET instance. > + */ > +static int > +sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS) > +{ > + int error; > + unsigned int nstates; > + > + nstates = V_dyn_max; > + > + error = sysctl_handle_int(oidp, &nstates, 0, req); > + /* Read operation or some error */ > + if ((error != 0) || (req->newptr == NULL)) > + return (error); > + > + V_dyn_max = nstates; > + uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); > + > + return (0); > +} > + > +/* > + * Get current number of dynamic states in given VNET instance. > + */ > +static int > +sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS) > +{ > + int error; > + unsigned int nstates; > + > + nstates = DYN_COUNT; > + > + error = sysctl_handle_int(oidp, &nstates, 0, req); > + > + return (error); > +} > +#endif > + > +/* > + * Returns size of dynamic states in legacy format > + */ > +int > +ipfw_dyn_len(void) > +{ > + > + return (V_ipfw_dyn_v == NULL) ? 0 : > + (DYN_COUNT * sizeof(ipfw_dyn_rule)); > +} > + > +/* > + * Returns number of dynamic states. > + * Used by dump format v1 (current). > + */ > +int > +ipfw_dyn_get_count(void) > +{ > + > + return (V_ipfw_dyn_v == NULL) ? 
0 : DYN_COUNT; > +} > + > +static void > +export_dyn_rule(ipfw_dyn_rule *src, ipfw_dyn_rule *dst) > +{ > + > + memcpy(dst, src, sizeof(*src)); > + memcpy(&(dst->rule), &(src->rule->rulenum), sizeof(src->rule->rulenum)); > + /* > + * store set number into high word of > + * dst->rule pointer. > + */ > + memcpy((char *)&dst->rule + sizeof(src->rule->rulenum), > + &(src->rule->set), sizeof(src->rule->set)); > + /* > + * store a non-null value in "next". > + * The userland code will interpret a > + * NULL here as a marker > + * for the last dynamic rule. > + */ > + memcpy(&dst->next, &dst, sizeof(dst)); > + dst->expire = > + TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime; > +} > + > +/* > + * Fills int buffer given by @sd with dynamic states. > + * Used by dump format v1 (current). > + * > + * Returns 0 on success. > + */ > +int > +ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd) > +{ > + ipfw_dyn_rule *p; > + ipfw_obj_dyntlv *dst, *last; > + ipfw_obj_ctlv *ctlv; > + int i; > + size_t sz; > + > + if (V_ipfw_dyn_v == NULL) > + return (0); > + > + IPFW_UH_RLOCK_ASSERT(chain); > + > + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); > + if (ctlv == NULL) > + return (ENOMEM); > + sz = sizeof(ipfw_obj_dyntlv); > + ctlv->head.type = IPFW_TLV_DYNSTATE_LIST; > + ctlv->objsize = sz; > + last = NULL; > + > + for (i = 0 ; i < V_curr_dyn_buckets; i++) { > + IPFW_BUCK_LOCK(i); > + for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) { > + dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, sz); > + if (dst == NULL) { > + IPFW_BUCK_UNLOCK(i); > + return (ENOMEM); > + } > + > + export_dyn_rule(p, &dst->state); > + dst->head.length = sz; > + dst->head.type = IPFW_TLV_DYN_ENT; > + last = dst; > + } > + IPFW_BUCK_UNLOCK(i); > + } > + > + if (last != NULL) /* mark last dynamic rule */ > + last->head.flags = IPFW_DF_LAST; > + > + return (0); > +} > + > +/* > + * Fill given buffer with dynamic states (legacy format). 
> + * IPFW_UH_RLOCK has to be held while calling. > + */ > +void > +ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep) > +{ > + ipfw_dyn_rule *p, *last = NULL; > + char *bp; > + int i; > + > + if (V_ipfw_dyn_v == NULL) > + return; > + bp = *pbp; > + > + IPFW_UH_RLOCK_ASSERT(chain); > + > + for (i = 0 ; i < V_curr_dyn_buckets; i++) { > + IPFW_BUCK_LOCK(i); > + for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) { > + if (bp + sizeof *p <= ep) { > + ipfw_dyn_rule *dst = > + (ipfw_dyn_rule *)bp; > + > + export_dyn_rule(p, dst); > + last = dst; > + bp += sizeof(ipfw_dyn_rule); > + } > + } > + IPFW_BUCK_UNLOCK(i); > + } > + > + if (last != NULL) /* mark last dynamic rule */ > + bzero(&last->next, sizeof(last)); > + *pbp = bp; > +} > +/* end of file */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c > new file mode 100644 > index 0000000..7e9c992 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c > @@ -0,0 +1,537 @@ > +/*- > + * Copyright (c) 2014 Yandex LLC. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: projects/ipfw/sys/netpfil/ipfw/ip_fw_iface.c 267384 2014-06-12 09:59:11Z melifaro $"); > + > +/* > + * Kernel interface tracking API. > + * > + */ > + > +#include "opt_ipfw.h" > +#include "opt_inet.h" > +#ifndef INET > +#error IPFIREWALL requires INET. > +#endif /* INET */ > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/rwlock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/queue.h> > +#include <sys/eventhandler.h> > +#include <net/if.h> > +#include <net/if_var.h> > +#include <net/vnet.h> > + > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ > +#include <netinet/ip_fw.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > + > +#define CHAIN_TO_II(ch) ((struct namedobj_instance *)ch->ifcfg) > + > +#define DEFAULT_IFACES 128 > + > +static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, > + uint16_t ifindex); > +static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, > + uint16_t ifindex); > +static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > + > +static struct ipfw_sopt_handler scodes[] = { > + { IP_FW_XIFLIST, 0, HDIR_GET, list_ifaces }, > +}; > + > +/* > + * FreeBSD Kernel interface. 
> + */ > +static void ipfw_kifhandler(void *arg, struct ifnet *ifp); > +static int ipfw_kiflookup(char *name); > +static void iface_khandler_register(void); > +static void iface_khandler_deregister(void); > + > +static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event; > +static int num_vnets = 0; > +static struct mtx vnet_mtx; > + > +/* > + * Checks if kernel interface is contained in our tracked > + * interface list and calls attach/detach handler. > + */ > +static void > +ipfw_kifhandler(void *arg, struct ifnet *ifp) > +{ > + struct ip_fw_chain *ch; > + struct ipfw_iface *iif; > + struct namedobj_instance *ii; > + uintptr_t htype; > + > + if (V_ipfw_vnet_ready == 0) > + return; > + > + ch = &V_layer3_chain; > + htype = (uintptr_t)arg; > + > + IPFW_UH_WLOCK(ch); > + ii = CHAIN_TO_II(ch); > + if (ii == NULL) { > + IPFW_UH_WUNLOCK(ch); > + return; > + } > + iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0, > + if_name(ifp)); > + if (iif != NULL) { > + if (htype == 1) > + handle_ifattach(ch, iif, ifp->if_index); > + else > + handle_ifdetach(ch, iif, ifp->if_index); > + } > + IPFW_UH_WUNLOCK(ch); > +} > + > +/* > + * Reference current VNET as iface tracking API user. > + * Registers interface tracking handlers for first VNET. > + */ > +static void > +iface_khandler_register() > +{ > + int create; > + > + create = 0; > + > + mtx_lock(&vnet_mtx); > + if (num_vnets == 0) > + create = 1; > + num_vnets++; > + mtx_unlock(&vnet_mtx); > + > + if (create == 0) > + return; > + > + printf("IPFW: starting up interface tracker\n"); > + > + ipfw_ifdetach_event = EVENTHANDLER_REGISTER( > + ifnet_departure_event, ipfw_kifhandler, NULL, > + EVENTHANDLER_PRI_ANY); > + ipfw_ifattach_event = EVENTHANDLER_REGISTER( > + ifnet_arrival_event, ipfw_kifhandler, (void*)((uintptr_t)1), > + EVENTHANDLER_PRI_ANY); > +} > + > +/* > + * > + * Detach interface event handlers on last VNET instance > + * detach. 
> + */ > +static void > +iface_khandler_deregister() > +{ > + int destroy; > + > + destroy = 0; > + mtx_lock(&vnet_mtx); > + if (num_vnets == 1) > + destroy = 1; > + num_vnets--; > + mtx_unlock(&vnet_mtx); > + > + if (destroy == 0) > + return; > + > + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, > + ipfw_ifattach_event); > + EVENTHANDLER_DEREGISTER(ifnet_departure_event, > + ipfw_ifdetach_event); > +} > + > +/* > + * Retrieves ifindex for given @name. > + * > + * Returns ifindex or 0. > + */ > +static int > +ipfw_kiflookup(char *name) > +{ > + struct ifnet *ifp; > + int ifindex; > + > + ifindex = 0; > + > + if ((ifp = ifunit_ref(name)) != NULL) { > + ifindex = ifp->if_index; > + if_rele(ifp); > + } > + > + return (ifindex); > +} > + > +/* > + * Global ipfw startup hook. > + * Since we perform lazy initialization, do nothing except > + * mutex init. > + */ > +int > +ipfw_iface_init() > +{ > + > + mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF); > + IPFW_ADD_SOPT_HANDLER(1, scodes); > + return (0); > +} > + > +/* > + * Global ipfw destroy hook. > + * Unregister khandlers iff init has been done. > + */ > +void > +ipfw_iface_destroy() > +{ > + > + IPFW_DEL_SOPT_HANDLER(1, scodes); > + mtx_destroy(&vnet_mtx); > +} > + > +/* > + * Perform actual init on internal request. > + * Inits both namehash and global khandler. > + */ > +static void > +vnet_ipfw_iface_init(struct ip_fw_chain *ch) > +{ > + struct namedobj_instance *ii; > + > + ii = ipfw_objhash_create(DEFAULT_IFACES); > + IPFW_UH_WLOCK(ch); > + if (ch->ifcfg == NULL) { > + ch->ifcfg = ii; > + ii = NULL; > + } > + IPFW_UH_WUNLOCK(ch); > + > + if (ii != NULL) { > + /* Already initialized. Free namehash. */ > + ipfw_objhash_destroy(ii); > + } else { > + /* We're the first ones. Init kernel hooks. 
*/ > + iface_khandler_register(); > + } > +} > + > +static void > +destroy_iface(struct namedobj_instance *ii, struct named_object *no, > + void *arg) > +{ > + > + /* Assume all consumers have been already detached */ > + free(no, M_IPFW); > +} > + > +/* > + * Per-VNET ipfw detach hook. > + * > + */ > +void > +vnet_ipfw_iface_destroy(struct ip_fw_chain *ch) > +{ > + struct namedobj_instance *ii; > + > + IPFW_UH_WLOCK(ch); > + ii = CHAIN_TO_II(ch); > + ch->ifcfg = NULL; > + IPFW_UH_WUNLOCK(ch); > + > + if (ii != NULL) { > + ipfw_objhash_foreach(ii, destroy_iface, ch); > + ipfw_objhash_destroy(ii); > + iface_khandler_deregister(); > + } > +} > + > +/* > + * Notify the subsystem that we are interested in tracking > + * interface @name. This function has to be called without > + * holding any locks to permit allocating the necessary states > + * for proper interface tracking. > + * > + * Returns 0 on success. > + */ > +int > +ipfw_iface_ref(struct ip_fw_chain *ch, char *name, > + struct ipfw_ifc *ic) > +{ > + struct namedobj_instance *ii; > + struct ipfw_iface *iif, *tmp; > + > + if (strlen(name) >= sizeof(iif->ifname)) > + return (EINVAL); > + > + IPFW_UH_WLOCK(ch); > + > + ii = CHAIN_TO_II(ch); > + if (ii == NULL) { > + > + /* > + * First request to subsystem. > + * Let's perform init. > + */ > + IPFW_UH_WUNLOCK(ch); > + vnet_ipfw_iface_init(ch); > + IPFW_UH_WLOCK(ch); > + ii = CHAIN_TO_II(ch); > + } > + > + iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); > + > + if (iif != NULL) { > + iif->no.refcnt++; > + ic->iface = iif; > + IPFW_UH_WUNLOCK(ch); > + return (0); > + } > + > + IPFW_UH_WUNLOCK(ch); > + > + /* Not found. Let's create one */ > + iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO); > + TAILQ_INIT(&iif->consumers); > + iif->no.name = iif->ifname; > + strlcpy(iif->ifname, name, sizeof(iif->ifname)); > + > + /* > + * Ref & link to the list. 
> + * > + * We assume ifnet_arrival_event / ifnet_departure_event > + * are not holding any locks. > + */ > + iif->no.refcnt = 1; > + IPFW_UH_WLOCK(ch); > + > + tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); > + if (tmp != NULL) { > + /* Interface has been created since unlock. Ref and return */ > + tmp->no.refcnt++; > + ic->iface = tmp; > + IPFW_UH_WUNLOCK(ch); > + free(iif, M_IPFW); > + return (0); > + } > + > + iif->ifindex = ipfw_kiflookup(name); > + if (iif->ifindex != 0) > + iif->resolved = 1; > + > + ipfw_objhash_add(ii, &iif->no); > + ic->iface = iif; > + > + IPFW_UH_WUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Adds @ic to the list of iif interface consumers. > + * Must be called with holding both UH+WLOCK. > + * Callback may be immediately called (if interface exists). > + */ > +void > +ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) > +{ > + struct ipfw_iface *iif; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + IPFW_WLOCK_ASSERT(ch); > + > + iif = ic->iface; > + > + TAILQ_INSERT_TAIL(&iif->consumers, ic, next); > + if (iif->resolved != 0) > + ic->cb(ch, ic->cbdata, iif->ifindex); > +} > + > +/* > + * Unlinks interface tracker object @ic from interface. > + * Must be called while holding UH lock. > + */ > +void > +ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) > +{ > + struct ipfw_iface *iif; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + iif = ic->iface; > + TAILQ_REMOVE(&iif->consumers, ic, next); > +} > + > +/* > + * Unreference interface specified by @ic. > + * Must be called without holding any locks. > + */ > +void > +ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic) > +{ > + struct ipfw_iface *iif; > + > + iif = ic->iface; > + ic->iface = NULL; > + > + IPFW_UH_WLOCK(ch); > + iif->no.refcnt--; > + /* TODO: check for references & delete */ > + IPFW_UH_WUNLOCK(ch); > +} > + > +/* > + * Interface arrival handler. 
> + */ > +static void > +handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, > + uint16_t ifindex) > +{ > + struct ipfw_ifc *ic; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + iif->gencnt++; > + iif->resolved = 1; > + iif->ifindex = ifindex; > + > + IPFW_WLOCK(ch); > + TAILQ_FOREACH(ic, &iif->consumers, next) > + ic->cb(ch, ic->cbdata, iif->ifindex); > + IPFW_WUNLOCK(ch); > +} > + > +/* > + * Interface departure handler. > + */ > +static void > +handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, > + uint16_t ifindex) > +{ > + struct ipfw_ifc *ic; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + IPFW_WLOCK(ch); > + TAILQ_FOREACH(ic, &iif->consumers, next) > + ic->cb(ch, ic->cbdata, 0); > + IPFW_WUNLOCK(ch); > + > + iif->gencnt++; > + iif->resolved = 0; > + iif->ifindex = 0; > +} > + > +struct dump_iface_args { > + struct ip_fw_chain *ch; > + struct sockopt_data *sd; > +}; > + > +static void > +export_iface_internal(struct namedobj_instance *ii, struct named_object *no, > + void *arg) > +{ > + ipfw_iface_info *i; > + struct dump_iface_args *da; > + struct ipfw_iface *iif; > + > + da = (struct dump_iface_args *)arg; > + > + i = (ipfw_iface_info *)ipfw_get_sopt_space(da->sd, sizeof(*i)); > + KASSERT(i != 0, ("previously checked buffer is not enough")); > + > + iif = (struct ipfw_iface *)no; > + > + strlcpy(i->ifname, iif->ifname, sizeof(i->ifname)); > + if (iif->resolved) > + i->flags |= IPFW_IFFLAG_RESOLVED; > + i->ifindex = iif->ifindex; > + i->refcnt = iif->no.refcnt; > + i->gencnt = iif->gencnt; > +} > + > +/* > + * Lists all interface currently tracked by ipfw. 
> + * Data layout (v0)(current): > + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size > + * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ] > + * > + * Returns 0 on success > + */ > +static int > +list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct namedobj_instance *ii; > + struct _ipfw_obj_lheader *olh; > + struct dump_iface_args da; > + uint32_t count, size; > + > + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); > + if (olh == NULL) > + return (EINVAL); > + if (sd->valsize < olh->size) > + return (EINVAL); > + > + IPFW_UH_RLOCK(ch); > + ii = CHAIN_TO_II(ch); > + if (ii != NULL) > + count = ipfw_objhash_count(ii); > + else > + count = 0; > + size = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader); > + > + /* Fill in header regadless of buffer size */ > + olh->count = count; > + olh->objsize = sizeof(ipfw_iface_info); > + > + if (size > olh->size) { > + olh->size = size; > + IPFW_UH_RUNLOCK(ch); > + return (ENOMEM); > + } > + olh->size = size; > + > + da.ch = ch; > + da.sd = sd; > + > + if (ii != NULL) > + ipfw_objhash_foreach(ii, export_iface_internal, &da); > + IPFW_UH_RUNLOCK(ch); > + > + return (0); > +} > + > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_log.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_log.c > new file mode 100644 > index 0000000..cbbd875 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_log.c > @@ -0,0 +1,567 @@ > +/*- > + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_log.c 272840 2014-10-09 19:32:35Z melifaro $"); > + > +/* > + * Logging support for ipfw > + */ > + > +#include "opt_ipfw.h" > +#include "opt_inet.h" > +#ifndef INET > +#error IPFIREWALL requires INET. 
> +#endif /* INET */ > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/mbuf.h> > +#include <sys/kernel.h> > +#include <sys/socket.h> > +#include <sys/sysctl.h> > +#include <sys/syslog.h> > +#include <sys/lock.h> > +#include <sys/rwlock.h> > +#include <net/ethernet.h> /* for ETHERTYPE_IP */ > +#include <net/if.h> > +#include <net/if_var.h> > +#include <net/if_clone.h> > +#include <net/vnet.h> > +#include <net/if_types.h> /* for IFT_PFLOG */ > +#include <net/bpf.h> /* for BPF */ > + > +#include <netinet/in.h> > +#include <netinet/ip.h> > +#include <netinet/ip_icmp.h> > +#include <netinet/ip_var.h> > +#include <netinet/ip_fw.h> > +#include <netinet/tcp_var.h> > +#include <netinet/udp.h> > + > +#include <netinet/ip6.h> > +#include <netinet/icmp6.h> > +#ifdef INET6 > +#include <netinet6/in6_var.h> /* ip6_sprintf() */ > +#endif > + > +#include <netpfil/ipfw/ip_fw_private.h> > + > +#ifdef MAC > +#include <security/mac/mac_framework.h> > +#endif > + > +/* > + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T > + * Other macros just cast void * into the appropriate type > + */ > +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) > +#define TCP(p) ((struct tcphdr *)(p)) > +#define SCTP(p) ((struct sctphdr *)(p)) > +#define UDP(p) ((struct udphdr *)(p)) > +#define ICMP(p) ((struct icmphdr *)(p)) > +#define ICMP6(p) ((struct icmp6_hdr *)(p)) > + > +#ifdef __APPLE__ > +#undef snprintf > +#define snprintf sprintf > +#define SNPARGS(buf, len) buf + len > +#define SNP(buf) buf > +#else /* !__APPLE__ */ > +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 > +#define SNP(buf) buf, sizeof(buf) > +#endif /* !__APPLE__ */ > + > +#ifdef WITHOUT_BPF > +void > +ipfw_log_bpf(int onoff) > +{ > +} > +#else /* !WITHOUT_BPF */ > +static struct ifnet *log_if; /* hook to attach to bpf */ > +static struct rwlock log_if_lock; > +#define LOGIF_LOCK_INIT(x) rw_init(&log_if_lock, "ipfw log_if lock") > +#define LOGIF_LOCK_DESTROY(x) rw_destroy(&log_if_lock) > +#define LOGIF_RLOCK(x) rw_rlock(&log_if_lock) > +#define LOGIF_RUNLOCK(x) rw_runlock(&log_if_lock) > +#define LOGIF_WLOCK(x) rw_wlock(&log_if_lock) > +#define LOGIF_WUNLOCK(x) rw_wunlock(&log_if_lock) > + > +static const char ipfwname[] = "ipfw"; > + > +/* we use this dummy function for all ifnet callbacks */ > +static int > +log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) > +{ > + return EINVAL; > +} > + > +static int > +ipfw_log_output(struct ifnet *ifp, struct mbuf *m, > + const struct sockaddr *dst, struct route *ro) > +{ > + if (m != NULL) > + FREE_PKT(m); > + return EINVAL; > +} > + > +static void > +ipfw_log_start(struct ifnet* ifp) > +{ > + panic("ipfw_log_start() must not be called"); > +} > + > +static const u_char ipfwbroadcastaddr[6] = > + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; > + > +static int > +ipfw_log_clone_match(struct if_clone *ifc, const char *name) > +{ > + > + return (strncmp(name, ipfwname, sizeof(ipfwname) - 1) == 0); > +} > + > +static int > +ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len, > + caddr_t params) > +{ > + int error; > + int unit; > + struct ifnet *ifp; > + > + error = ifc_name2unit(name, &unit); > + if (error) > + return (error); > + > + error = ifc_alloc_unit(ifc, &unit); > + if (error) > + return (error); > + > + ifp = if_alloc(IFT_PFLOG); > + if (ifp == NULL) { > + ifc_free_unit(ifc, unit); > + return (ENOSPC); > + } > + ifp->if_dname = ipfwname; > + ifp->if_dunit = unit; > + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", ipfwname, unit); > + strlcpy(name, ifp->if_xname, len); > + ifp->if_mtu = 
65536; > + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; > + ifp->if_init = (void *)log_dummy; > + ifp->if_ioctl = log_dummy; > + ifp->if_start = ipfw_log_start; > + ifp->if_output = ipfw_log_output; > + ifp->if_addrlen = 6; > + ifp->if_hdrlen = 14; > + ifp->if_broadcastaddr = ipfwbroadcastaddr; > + ifp->if_baudrate = IF_Mbps(10); > + > + LOGIF_WLOCK(); > + if (log_if == NULL) > + log_if = ifp; > + else { > + LOGIF_WUNLOCK(); > + if_free(ifp); > + ifc_free_unit(ifc, unit); > + return (EEXIST); > + } > + LOGIF_WUNLOCK(); > + if_attach(ifp); > + bpfattach(ifp, DLT_EN10MB, 14); > + > + return (0); > +} > + > +static int > +ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) > +{ > + int unit; > + > + if (ifp == NULL) > + return (0); > + > + LOGIF_WLOCK(); > + if (log_if != NULL && ifp == log_if) > + log_if = NULL; > + else { > + LOGIF_WUNLOCK(); > + return (EINVAL); > + } > + LOGIF_WUNLOCK(); > + > + unit = ifp->if_dunit; > + bpfdetach(ifp); > + if_detach(ifp); > + if_free(ifp); > + ifc_free_unit(ifc, unit); > + > + return (0); > +} > + > +static struct if_clone *ipfw_log_cloner; > + > +void > +ipfw_log_bpf(int onoff) > +{ > + > + if (onoff) { > + LOGIF_LOCK_INIT(); > + ipfw_log_cloner = if_clone_advanced(ipfwname, 0, > + ipfw_log_clone_match, ipfw_log_clone_create, > + ipfw_log_clone_destroy); > + } else { > + if_clone_detach(ipfw_log_cloner); > + LOGIF_LOCK_DESTROY(); > + } > +} > +#endif /* !WITHOUT_BPF */ > + > +#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) > +/* > + * We enter here when we have a rule with O_LOG. > + * XXX this function alone takes about 2Kbytes of code! 
> + */ > +void > +ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, > + struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, > + u_short offset, uint32_t tablearg, struct ip *ip) > +{ > + char *action; > + int limit_reached = 0; > + char action2[92], proto[128], fragment[32]; > + > + if (V_fw_verbose == 0) { > +#ifndef WITHOUT_BPF > + LOGIF_RLOCK(); > + if (log_if == NULL || log_if->if_bpf == NULL) { > + LOGIF_RUNLOCK(); > + return; > + } > + > + if (args->eh) /* layer2, use orig hdr */ > + BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); > + else { > + /* Add fake header. Later we will store > + * more info in the header. > + */ > + if (ip->ip_v == 4) > + BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); > + else if (ip->ip_v == 6) > + BPF_MTAP2(log_if, "DDDDDDSSSSSS\x86\xdd", ETHER_HDR_LEN, m); > + else > + /* Obviously bogus EtherType. */ > + BPF_MTAP2(log_if, "DDDDDDSSSSSS\xff\xff", ETHER_HDR_LEN, m); > + } > + LOGIF_RUNLOCK(); > +#endif /* !WITHOUT_BPF */ > + return; > + } > + /* the old 'log' function */ > + fragment[0] = '\0'; > + proto[0] = '\0'; > + > + if (f == NULL) { /* bogus pkt */ > + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) > + return; > + V_norule_counter++; > + if (V_norule_counter == V_verbose_limit) > + limit_reached = V_verbose_limit; > + action = "Refuse"; > + } else { /* O_LOG is the first action, find the real one */ > + ipfw_insn *cmd = ACTION_PTR(f); > + ipfw_insn_log *l = (ipfw_insn_log *)cmd; > + > + if (l->max_log != 0 && l->log_left == 0) > + return; > + l->log_left--; > + if (l->log_left == 0) > + limit_reached = l->max_log; > + cmd += F_LEN(cmd); /* point to first action */ > + if (cmd->opcode == O_ALTQ) { > + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; > + > + snprintf(SNPARGS(action2, 0), "Altq %d", > + altq->qid); > + cmd += F_LEN(cmd); > + } > + if (cmd->opcode == O_PROB || cmd->opcode == O_TAG || > + cmd->opcode == O_SETDSCP) > + cmd += F_LEN(cmd); > + > + action = action2; 
> + switch (cmd->opcode) { > + case O_DENY: > + action = "Deny"; > + break; > + > + case O_REJECT: > + if (cmd->arg1==ICMP_REJECT_RST) > + action = "Reset"; > + else if (cmd->arg1==ICMP_UNREACH_HOST) > + action = "Reject"; > + else > + snprintf(SNPARGS(action2, 0), "Unreach %d", > + cmd->arg1); > + break; > + > + case O_UNREACH6: > + if (cmd->arg1==ICMP6_UNREACH_RST) > + action = "Reset"; > + else > + snprintf(SNPARGS(action2, 0), "Unreach %d", > + cmd->arg1); > + break; > + > + case O_ACCEPT: > + action = "Accept"; > + break; > + case O_COUNT: > + action = "Count"; > + break; > + case O_DIVERT: > + snprintf(SNPARGS(action2, 0), "Divert %d", > + TARG(cmd->arg1, divert)); > + break; > + case O_TEE: > + snprintf(SNPARGS(action2, 0), "Tee %d", > + TARG(cmd->arg1, divert)); > + break; > + case O_SETFIB: > + snprintf(SNPARGS(action2, 0), "SetFib %d", > + TARG(cmd->arg1, fib)); > + break; > + case O_SKIPTO: > + snprintf(SNPARGS(action2, 0), "SkipTo %d", > + TARG(cmd->arg1, skipto)); > + break; > + case O_PIPE: > + snprintf(SNPARGS(action2, 0), "Pipe %d", > + TARG(cmd->arg1, pipe)); > + break; > + case O_QUEUE: > + snprintf(SNPARGS(action2, 0), "Queue %d", > + TARG(cmd->arg1, pipe)); > + break; > + case O_FORWARD_IP: { > + ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; > + int len; > + struct in_addr dummyaddr; > + if (sa->sa.sin_addr.s_addr == INADDR_ANY) > + dummyaddr.s_addr = htonl(tablearg); > + else > + dummyaddr.s_addr = sa->sa.sin_addr.s_addr; > + > + len = snprintf(SNPARGS(action2, 0), "Forward to %s", > + inet_ntoa(dummyaddr)); > + > + if (sa->sa.sin_port) > + snprintf(SNPARGS(action2, len), ":%d", > + sa->sa.sin_port); > + } > + break; > +#ifdef INET6 > + case O_FORWARD_IP6: { > + char buf[INET6_ADDRSTRLEN]; > + ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; > + int len; > + > + len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", > + ip6_sprintf(buf, &sa->sa.sin6_addr)); > + > + if (sa->sa.sin6_port) > + snprintf(SNPARGS(action2, len), ":%u", > + sa->sa.sin6_port); > + } 
> + break; > +#endif > + case O_NETGRAPH: > + snprintf(SNPARGS(action2, 0), "Netgraph %d", > + cmd->arg1); > + break; > + case O_NGTEE: > + snprintf(SNPARGS(action2, 0), "Ngtee %d", > + cmd->arg1); > + break; > + case O_NAT: > + action = "Nat"; > + break; > + case O_REASS: > + action = "Reass"; > + break; > + case O_CALLRETURN: > + if (cmd->len & F_NOT) > + action = "Return"; > + else > + snprintf(SNPARGS(action2, 0), "Call %d", > + cmd->arg1); > + break; > + default: > + action = "UNKNOWN"; > + break; > + } > + } > + > + if (hlen == 0) { /* non-ip */ > + snprintf(SNPARGS(proto, 0), "MAC"); > + > + } else { > + int len; > +#ifdef INET6 > + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; > +#else > + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; > +#endif > + struct icmphdr *icmp; > + struct tcphdr *tcp; > + struct udphdr *udp; > +#ifdef INET6 > + struct ip6_hdr *ip6 = NULL; > + struct icmp6_hdr *icmp6; > + u_short ip6f_mf; > +#endif > + src[0] = '\0'; > + dst[0] = '\0'; > +#ifdef INET6 > + ip6f_mf = offset & IP6F_MORE_FRAG; > + offset &= IP6F_OFF_MASK; > + > + if (IS_IP6_FLOW_ID(&(args->f_id))) { > + char ip6buf[INET6_ADDRSTRLEN]; > + snprintf(src, sizeof(src), "[%s]", > + ip6_sprintf(ip6buf, &args->f_id.src_ip6)); > + snprintf(dst, sizeof(dst), "[%s]", > + ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); > + > + ip6 = (struct ip6_hdr *)ip; > + tcp = (struct tcphdr *)(((char *)ip) + hlen); > + udp = (struct udphdr *)(((char *)ip) + hlen); > + } else > +#endif > + { > + tcp = L3HDR(struct tcphdr, ip); > + udp = L3HDR(struct udphdr, ip); > + > + inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); > + inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); > + } > + > + switch (args->f_id.proto) { > + case IPPROTO_TCP: > + len = snprintf(SNPARGS(proto, 0), "TCP %s", src); > + if (offset == 0) > + snprintf(SNPARGS(proto, len), ":%d %s:%d", > + ntohs(tcp->th_sport), > + dst, > + ntohs(tcp->th_dport)); > + else > + snprintf(SNPARGS(proto, len), " %s", dst); > + break; 
> + > + case IPPROTO_UDP: > + len = snprintf(SNPARGS(proto, 0), "UDP %s", src); > + if (offset == 0) > + snprintf(SNPARGS(proto, len), ":%d %s:%d", > + ntohs(udp->uh_sport), > + dst, > + ntohs(udp->uh_dport)); > + else > + snprintf(SNPARGS(proto, len), " %s", dst); > + break; > + > + case IPPROTO_ICMP: > + icmp = L3HDR(struct icmphdr, ip); > + if (offset == 0) > + len = snprintf(SNPARGS(proto, 0), > + "ICMP:%u.%u ", > + icmp->icmp_type, icmp->icmp_code); > + else > + len = snprintf(SNPARGS(proto, 0), "ICMP "); > + len += snprintf(SNPARGS(proto, len), "%s", src); > + snprintf(SNPARGS(proto, len), " %s", dst); > + break; > +#ifdef INET6 > + case IPPROTO_ICMPV6: > + icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); > + if (offset == 0) > + len = snprintf(SNPARGS(proto, 0), > + "ICMPv6:%u.%u ", > + icmp6->icmp6_type, icmp6->icmp6_code); > + else > + len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); > + len += snprintf(SNPARGS(proto, len), "%s", src); > + snprintf(SNPARGS(proto, len), " %s", dst); > + break; > +#endif > + default: > + len = snprintf(SNPARGS(proto, 0), "P:%d %s", > + args->f_id.proto, src); > + snprintf(SNPARGS(proto, len), " %s", dst); > + break; > + } > + > +#ifdef INET6 > + if (IS_IP6_FLOW_ID(&(args->f_id))) { > + if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) > + snprintf(SNPARGS(fragment, 0), > + " (frag %08x:%d@%d%s)", > + args->f_id.extra, > + ntohs(ip6->ip6_plen) - hlen, > + ntohs(offset) << 3, ip6f_mf ? "+" : ""); > + } else > +#endif > + { > + int ipoff, iplen; > + ipoff = ntohs(ip->ip_off); > + iplen = ntohs(ip->ip_len); > + if (ipoff & (IP_MF | IP_OFFMASK)) > + snprintf(SNPARGS(fragment, 0), > + " (frag %d:%d@%d%s)", > + ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), > + offset << 3, > + (ipoff & IP_MF) ? "+" : ""); > + } > + } > +#ifdef __FreeBSD__ > + if (oif || m->m_pkthdr.rcvif) > + log(LOG_SECURITY | LOG_INFO, > + "ipfw: %d %s %s %s via %s%s\n", > + f ? f->rulenum : -1, > + action, proto, oif ? "out" : "in", > + oif ? 
oif->if_xname : m->m_pkthdr.rcvif->if_xname, > + fragment); > + else > +#endif > + log(LOG_SECURITY | LOG_INFO, > + "ipfw: %d %s %s [no if info]%s\n", > + f ? f->rulenum : -1, > + action, proto, fragment); > + if (limit_reached) > + log(LOG_SECURITY | LOG_NOTICE, > + "ipfw: limit %d reached on entry %d\n", > + limit_reached, f ? f->rulenum : -1); > +} > +/* end of file */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c > new file mode 100644 > index 0000000..f41b607 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c > @@ -0,0 +1,587 @@ > +/*- > + * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_pfil.c 264540 2014-04-16 14:37:11Z ae $"); > + > +#include "opt_ipfw.h" > +#include "opt_inet.h" > +#include "opt_inet6.h" > +#ifndef INET > +#error IPFIREWALL requires INET. > +#endif /* INET */ > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> > +#include <sys/module.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/rwlock.h> > +#include <sys/socket.h> > +#include <sys/sysctl.h> > + > +#include <net/if.h> > +#include <net/route.h> > +#include <net/ethernet.h> > +#include <net/pfil.h> > +#include <net/vnet.h> > + > +#include <netinet/in.h> > +#include <netinet/in_systm.h> > +#include <netinet/ip.h> > +#include <netinet/ip_var.h> > +#include <netinet/ip_fw.h> > +#ifdef INET6 > +#include <netinet/ip6.h> > +#include <netinet6/ip6_var.h> > +#endif > + > +#include <netgraph/ng_ipfw.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > + > +#include <machine/in_cksum.h> > + > +static VNET_DEFINE(int, fw_enable) = 1; > +#define V_fw_enable VNET(fw_enable) > + > +#ifdef INET6 > +static VNET_DEFINE(int, fw6_enable) = 1; > +#define V_fw6_enable VNET(fw6_enable) > +#endif > + > +static VNET_DEFINE(int, fwlink_enable) = 0; > +#define V_fwlink_enable VNET(fwlink_enable) > + > +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); > + > +/* Forward declarations. 
*/ > +static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); > +int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int, > + struct inpcb *); > +int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int, > + struct inpcb *); > + > +#ifdef SYSCTL_NODE > + > +SYSBEGIN(f1) > + > +SYSCTL_DECL(_net_inet_ip_fw); > +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, > + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, > + ipfw_chg_hook, "I", "Enable ipfw"); > +#ifdef INET6 > +SYSCTL_DECL(_net_inet6_ip6_fw); > +SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, > + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, > + ipfw_chg_hook, "I", "Enable ipfw+6"); > +#endif /* INET6 */ > + > +SYSCTL_DECL(_net_link_ether); > +SYSCTL_VNET_PROC(_net_link_ether, OID_AUTO, ipfw, > + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0, > + ipfw_chg_hook, "I", "Pass ether pkts through firewall"); > + > +SYSEND > + > +#endif /* SYSCTL_NODE */ > + > +/* > + * The pfilter hook to pass packets to ipfw_chk and then to > + * dummynet, divert, netgraph or other modules. > + * The packet may be consumed. > + */ > +int > +ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, > + struct inpcb *inp) > +{ > + struct ip_fw_args args; > + struct m_tag *tag; > + int ipfw; > + int ret; > + > + /* convert dir to IPFW values */ > + dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; > + bzero(&args, sizeof(args)); > + > +again: > + /* > + * extract and remove the tag if present. If we are left > + * with onepass, optimize the outgoing path. > + */ > + tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); > + if (tag != NULL) { > + args.rule = *((struct ipfw_rule_ref *)(tag+1)); > + m_tag_delete(*m0, tag); > + if (args.rule.info & IPFW_ONEPASS) > + return (0); > + } > + > + args.m = *m0; > + args.oif = dir == DIR_OUT ? 
ifp : NULL; > + args.inp = inp; > + > + ipfw = ipfw_chk(&args); > + *m0 = args.m; > + > + KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", > + __func__)); > + > + /* breaking out of the switch means drop */ > + ret = 0; /* default return value for pass */ > + switch (ipfw) { > + case IP_FW_PASS: > + /* next_hop may be set by ipfw_chk */ > + if (args.next_hop == NULL && args.next_hop6 == NULL) > + break; /* pass */ > +#if (!defined(INET6) && !defined(INET)) > + ret = EACCES; > +#else > + { > + struct m_tag *fwd_tag; > + size_t len; > + > + KASSERT(args.next_hop == NULL || args.next_hop6 == NULL, > + ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__, > + args.next_hop, args.next_hop6)); > +#ifdef INET6 > + if (args.next_hop6 != NULL) > + len = sizeof(struct sockaddr_in6); > +#endif > +#ifdef INET > + if (args.next_hop != NULL) > + len = sizeof(struct sockaddr_in); > +#endif > + > + /* Incoming packets should not be tagged so we do not > + * m_tag_find. Outgoing packets may be tagged, so we > + * reuse the tag if present. > + */ > + fwd_tag = (dir == DIR_IN) ? NULL : > + m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); > + if (fwd_tag != NULL) { > + m_tag_unlink(*m0, fwd_tag); > + } else { > + fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len, > + M_NOWAIT); > + if (fwd_tag == NULL) { > + ret = EACCES; > + break; /* i.e. drop */ > + } > + } > +#ifdef INET6 > + if (args.next_hop6 != NULL) { > + bcopy(args.next_hop6, (fwd_tag+1), len); > + if (in6_localip(&args.next_hop6->sin6_addr)) > + (*m0)->m_flags |= M_FASTFWD_OURS; > + (*m0)->m_flags |= M_IP6_NEXTHOP; > + } > +#endif > +#ifdef INET > + if (args.next_hop != NULL) { > + bcopy(args.next_hop, (fwd_tag+1), len); > + if (in_localip(args.next_hop->sin_addr)) > + (*m0)->m_flags |= M_FASTFWD_OURS; > + (*m0)->m_flags |= M_IP_NEXTHOP; > + } > +#endif > + m_tag_prepend(*m0, fwd_tag); > + } > +#endif /* INET || INET6 */ > + break; > + > + case IP_FW_DENY: > + ret = EACCES; > + break; /* i.e. 
drop */ > + > + case IP_FW_DUMMYNET: > + ret = EACCES; > + if (ip_dn_io_ptr == NULL) > + break; /* i.e. drop */ > + if (mtod(*m0, struct ip *)->ip_v == 4) > + ret = ip_dn_io_ptr(m0, dir, &args); > + else if (mtod(*m0, struct ip *)->ip_v == 6) > + ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); > + else > + break; /* drop it */ > + /* > + * XXX should read the return value. > + * dummynet normally eats the packet and sets *m0=NULL > + * unless the packet can be sent immediately. In this > + * case args is updated and we should re-run the > + * check without clearing args. > + */ > + if (*m0 != NULL) > + goto again; > + break; > + > + case IP_FW_TEE: > + case IP_FW_DIVERT: > + if (ip_divert_ptr == NULL) { > + ret = EACCES; > + break; /* i.e. drop */ > + } > + ret = ipfw_divert(m0, dir, &args.rule, > + (ipfw == IP_FW_TEE) ? 1 : 0); > + /* continue processing for the original packet (tee). */ > + if (*m0) > + goto again; > + break; > + > + case IP_FW_NGTEE: > + case IP_FW_NETGRAPH: > + if (ng_ipfw_input_p == NULL) { > + ret = EACCES; > + break; /* i.e. drop */ > + } > + ret = ng_ipfw_input_p(m0, dir, &args, > + (ipfw == IP_FW_NGTEE) ? 1 : 0); > + if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ > + goto again; /* continue with packet */ > + break; > + > + case IP_FW_NAT: > + /* honor one-pass in case of successful nat */ > + if (V_fw_one_pass) > + break; /* ret is already 0 */ > + goto again; > + > + case IP_FW_REASS: > + goto again; /* continue with packet */ > + > + default: > + KASSERT(0, ("%s: unknown retval", __func__)); > + } > + > + if (ret != 0) { > + if (*m0) > + FREE_PKT(*m0); > + *m0 = NULL; > + } > + > + return ret; > +} > + > +/* > + * ipfw processing for ethernet packets (in and out). > + * Interface is NULL from ether_demux, and ifp from > + * ether_output_frame.
> + */ > +int > +ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir, > + struct inpcb *inp) > +{ > + struct ether_header *eh; > + struct ether_header save_eh; > + struct mbuf *m; > + int i, ret; > + struct ip_fw_args args; > + struct m_tag *mtag; > + > + /* fetch start point from rule, if any */ > + mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); > + if (mtag == NULL) { > + args.rule.slot = 0; > + } else { > + /* dummynet packet, already partially processed */ > + struct ipfw_rule_ref *r; > + > + /* XXX can we free it after use ? */ > + mtag->m_tag_id = PACKET_TAG_NONE; > + r = (struct ipfw_rule_ref *)(mtag + 1); > + if (r->info & IPFW_ONEPASS) > + return (0); > + args.rule = *r; > + } > + > + /* I need some amt of data to be contiguous */ > + m = *m0; > + i = min(m->m_pkthdr.len, max_protohdr); > + if (m->m_len < i) { > + m = m_pullup(m, i); > + if (m == NULL) { > + *m0 = m; > + return (0); > + } > + } > + eh = mtod(m, struct ether_header *); > +#if defined(USERSPACE) > + args.eh = eh; > +#else > + save_eh = *eh; /* save copy for restore below */ > + args.eh = &save_eh; /* MAC header for bridged/MAC packets */ > +#endif > + m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ > + > + args.m = m; /* the packet we are looking at */ > + args.oif = dir == PFIL_OUT ? dst: NULL; /* destination, if any */ > + args.next_hop = NULL; /* we do not support forward yet */ > + args.next_hop6 = NULL; /* we do not support forward yet */ > + args.inp = NULL; /* used by ipfw uid/gid/jail rules */ > + i = ipfw_chk(&args); > + m = args.m; > + if (m != NULL) { > + /* > + * Restore Ethernet header, as needed, in case the > + * mbuf chain was replaced by ipfw. 
> + */ > + M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); > + if (m == NULL) { > + *m0 = NULL; > + return (0); > + } > + if (eh != mtod(m, struct ether_header *)) > + bcopy(&save_eh, mtod(m, struct ether_header *), > + ETHER_HDR_LEN); > + } > + *m0 = m; > + > + ret = 0; > + /* Check result of ipfw_chk() */ > +#if defined(USERSPACE) > + /* fwd 1.1.1.1 causes the packet to be bounced back. > + * This is signalled by setting the low bit of the peer > + */ > + if (args.next_hop) { > + uintptr_t *p = (void *)&(m->__m_peer); > + *p |= 1; > + } > +#endif > + switch (i) { > + case IP_FW_PASS: > + break; > + > + case IP_FW_DENY: > + ret = EACCES; > + break; /* i.e. drop */ > + > + case IP_FW_DUMMYNET: > + ret = EACCES; > + int dir; > + > + if (ip_dn_io_ptr == NULL) > + break; /* i.e. drop */ > + > + *m0 = NULL; > + dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); > + ip_dn_io_ptr(&m, dir, &args); > + return 0; > + > + default: > + KASSERT(0, ("%s: unknown retval", __func__)); > + } > + > + if (ret != 0) { > + if (*m0) > + FREE_PKT(*m0); > + *m0 = NULL; > + } > + > + return ret; > +} > + > +/* do the divert, return 1 on error 0 on success */ > +static int > +ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, > + int tee) > +{ > + /* > + * ipfw_chk() has already tagged the packet with the divert tag. > + * If tee is set, copy packet and return original. > + * If not tee, consume packet and send it to divert socket. > + */ > + struct mbuf *clone; > + struct ip *ip = mtod(*m0, struct ip *); > + struct m_tag *tag; > + > + /* Cloning needed for tee? */ > + if (tee == 0) { > + clone = *m0; /* use the original mbuf */ > + *m0 = NULL; > + } else { > + clone = m_dup(*m0, M_NOWAIT); > + /* If we cannot duplicate the mbuf, we sacrifice the divert > + * chain and continue with the tee-ed packet. > + */ > + if (clone == NULL) > + return 1; > + } > + > + /* > + * Divert listeners can normally handle non-fragmented packets, > + * but we can only reass in the non-tee case. 
> + * This means that listeners on a tee rule may get fragments, > + * and have to live with that. > + * Note that we now have the 'reass' ipfw option so if we care > + * we can do it before a 'tee'. > + */ > + if (!tee) switch (ip->ip_v) { > + case IPVERSION: > + if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { > + int hlen; > + struct mbuf *reass; > + > + reass = ip_reass(clone); /* Reassemble packet. */ > + if (reass == NULL) > + return 0; /* not an error */ > + /* if reass = NULL then it was consumed by ip_reass */ > + /* > + * IP header checksum fixup after reassembly and leave header > + * in network byte order. > + */ > + ip = mtod(reass, struct ip *); > + hlen = ip->ip_hl << 2; > + ip->ip_sum = 0; > + if (hlen == sizeof(struct ip)) > + ip->ip_sum = in_cksum_hdr(ip); > + else > + ip->ip_sum = in_cksum(reass, hlen); > + clone = reass; > + } > + break; > +#ifdef INET6 > + case IPV6_VERSION >> 4: > + { > + struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *); > + > + if (ip6->ip6_nxt == IPPROTO_FRAGMENT) { > + int nxt, off; > + > + off = sizeof(struct ip6_hdr); > + nxt = frag6_input(&clone, &off, 0); > + if (nxt == IPPROTO_DONE) > + return (0); > + } > + break; > + } > +#endif > + } > + > + /* attach a tag to the packet with the reinject info */ > + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, > + sizeof(struct ipfw_rule_ref), M_NOWAIT); > + if (tag == NULL) { > + FREE_PKT(clone); > + return 1; > + } > + *((struct ipfw_rule_ref *)(tag+1)) = *rule; > + m_tag_prepend(clone, tag); > + > + /* Do the dirty job... */ > + ip_divert_ptr(clone, incoming); > + return 0; > +} > + > +/* > + * attach or detach hooks for a given protocol family > + */ > +static int > +ipfw_hook(int onoff, int pf) > +{ > + struct pfil_head *pfh; > + pfil_func_t hook_func; > + > + pfh = pfil_head_get(PFIL_TYPE_AF, pf); > + if (pfh == NULL) > + return ENOENT; > + > + hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet; > + > + (void) (onoff ? 
pfil_add_hook : pfil_remove_hook) > + (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); > + > + return 0; > +} > + > +int > +ipfw_attach_hooks(int arg) > +{ > + int error = 0; > + > + if (arg == 0) /* detach */ > + ipfw_hook(0, AF_INET); > + else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { > + error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ > + printf("ipfw_hook() error\n"); > + } > +#ifdef INET6 > + if (arg == 0) /* detach */ > + ipfw_hook(0, AF_INET6); > + else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { > + error = ENOENT; > + printf("ipfw6_hook() error\n"); > + } > +#endif > + if (arg == 0) /* detach */ > + ipfw_hook(0, AF_LINK); > + else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) { > + error = ENOENT; > + printf("ipfw_link_hook() error\n"); > + } > + return error; > +} > + > +int > +ipfw_chg_hook(SYSCTL_HANDLER_ARGS) > +{ > + int newval; > + int error; > + int af; > + > + if (arg1 == &V_fw_enable) > + af = AF_INET; > +#ifdef INET6 > + else if (arg1 == &V_fw6_enable) > + af = AF_INET6; > +#endif > + else if (arg1 == &V_fwlink_enable) > + af = AF_LINK; > + else > + return (EINVAL); > + > + newval = *(int *)arg1; > + /* Handle sysctl change */ > + error = sysctl_handle_int(oidp, &newval, 0, req); > + > + if (error) > + return (error); > + > + /* Formalize new value */ > + newval = (newval) ? 
1 : 0; > + > + if (*(int *)arg1 == newval) > + return (0); > + > + error = ipfw_hook(newval, af); > + if (error) > + return (error); > + *(int *)arg1 = newval; > + > + return (0); > +} > +/* end of file */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_private.h b/example/ipfw/sys/netpfil/ipfw/ip_fw_private.h > new file mode 100644 > index 0000000..e7ad538 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_private.h > @@ -0,0 +1,625 @@ > +/*- > + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. 
> + * > + * $FreeBSD: head/sys/netpfil/ipfw/ip_fw_private.h 272840 2014-10-09 19:32:35Z melifaro $ > + */ > + > +#ifndef _IPFW2_PRIVATE_H > +#define _IPFW2_PRIVATE_H > + > +/* > + * Internal constants and data structures used by ipfw components > + * and not meant to be exported outside the kernel. > + */ > + > +#ifdef _KERNEL > + > +/* > + * For platforms that do not have SYSCTL support, we wrap the > + * SYSCTL_* into a function (one per file) to collect the values > + * into an array at module initialization. The wrapping macros, > + * SYSBEGIN() and SYSEND, are empty in the default case. > + */ > +#ifndef SYSBEGIN > +#define SYSBEGIN(x) > +#endif > +#ifndef SYSEND > +#define SYSEND > +#endif > + > +/* Return values from ipfw_chk() */ > +enum { > + IP_FW_PASS = 0, > + IP_FW_DENY, > + IP_FW_DIVERT, > + IP_FW_TEE, > + IP_FW_DUMMYNET, > + IP_FW_NETGRAPH, > + IP_FW_NGTEE, > + IP_FW_NAT, > + IP_FW_REASS, > +}; > + > +/* > + * Structure for collecting parameters to dummynet for ip6_output forwarding > + */ > +struct _ip6dn_args { > + struct ip6_pktopts *opt_or; > + struct route_in6 ro_or; > + int flags_or; > + struct ip6_moptions *im6o_or; > + struct ifnet *origifp_or; > + struct ifnet *ifp_or; > + struct sockaddr_in6 dst_or; > + u_long mtu_or; > + struct route_in6 ro_pmtu_or; > +}; > + > + > +/* > + * Arguments for calling ipfw_chk() and dummynet_io(). We put them > + * all into a structure because this way it is easier and more > + * efficient to pass variables around and extend the interface. > + */ > +struct ip_fw_args { > + struct mbuf *m; /* the mbuf chain */ > + struct ifnet *oif; /* output interface */ > + struct sockaddr_in *next_hop; /* forward address */ > + struct sockaddr_in6 *next_hop6; /* ipv6 forward address */ > + > + /* > + * On return, it points to the matching rule. > + * On entry, rule.slot > 0 means the info is valid and > + * contains the starting rule for an ipfw search. > + * If chain_id == chain->id && slot >0 then jump to that slot. 
> + * Otherwise, we locate the first rule >= rulenum:rule_id > + */ > + struct ipfw_rule_ref rule; /* match/restart info */ > + > + struct ether_header *eh; /* for bridged packets */ > + > + struct ipfw_flow_id f_id; /* grabbed from IP header */ > + //uint32_t cookie; /* a cookie depending on rule action */ > + struct inpcb *inp; > + > + struct _ip6dn_args dummypar; /* dummynet->ip6_output */ > + struct sockaddr_in hopstore; /* store here if cannot use a pointer */ > +}; > + > +MALLOC_DECLARE(M_IPFW); > + > +/* > + * Hooks sometimes need to know the direction of the packet > + * (divert, dummynet, netgraph, ...) > + * We use a generic definition here, with bit0-1 indicating the > + * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the > + * specific protocol (if necessary) > + */ > +enum { > + DIR_MASK = 0x3, /* mask for the direction bits (bit0-1) */ > + DIR_OUT = 0, > + DIR_IN = 1, > + DIR_FWD = 2, > + DIR_DROP = 3, > + PROTO_LAYER2 = 0x4, /* set for layer 2 */ > + /* PROTO_DEFAULT = 0, */ > + PROTO_IPV4 = 0x08, /* bit 3 */ > + PROTO_IPV6 = 0x10, /* bit 4 */ > + PROTO_IFB = 0x0c, /* layer2 + ifbridge */ > + /* PROTO_OLDBDG = 0x14, unused, old bridge */ > +}; > + > +/* wrapper for freeing a packet, in case we need to do more work */ > +#ifndef FREE_PKT > +#if defined(__linux__) || defined(_WIN32) > +#define FREE_PKT(m) netisr_dispatch(-1, m) > +#else > +#define FREE_PKT(m) m_freem(m) > +#endif > +#endif /* !FREE_PKT */ > + > +/* > + * Function definitions.
> + */ > + > +/* attach (arg = 1) or detach (arg = 0) hooks */ > +int ipfw_attach_hooks(int); > +#ifdef NOTYET > +void ipfw_nat_destroy(void); > +#endif > + > +/* In ip_fw_log.c */ > +struct ip; > +struct ip_fw_chain; > +void ipfw_log_bpf(int); > +void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, > + struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, > + u_short offset, uint32_t tablearg, struct ip *ip); > +VNET_DECLARE(u_int64_t, norule_counter); > +#define V_norule_counter VNET(norule_counter) > +VNET_DECLARE(int, verbose_limit); > +#define V_verbose_limit VNET(verbose_limit) > + > +/* In ip_fw_dynamic.c */ > + > +enum { /* result for matching dynamic rules */ > + MATCH_REVERSE = 0, > + MATCH_FORWARD, > + MATCH_NONE, > + MATCH_UNKNOWN, > +}; > + > +/* > + * The lock for dynamic rules is only used once outside the file, > + * and only to release the result of lookup_dyn_rule(). > + * Eventually we may implement it with a callback on the function. > + */ > +struct ip_fw_chain; > +struct sockopt_data; > +int ipfw_is_dyn_rule(struct ip_fw *rule); > +void ipfw_expire_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *); > +void ipfw_dyn_unlock(ipfw_dyn_rule *q); > + > +struct tcphdr; > +struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, > + u_int32_t, u_int32_t, int); > +int ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, > + ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg); > +ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, > + int *match_direction, struct tcphdr *tcp); > +void ipfw_remove_dyn_children(struct ip_fw *rule); > +void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep); > +int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd); > + > +void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */ > +void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ > +int ipfw_dyn_len(void); > +int ipfw_dyn_get_count(void); > + > 
+/* common variables */ > +VNET_DECLARE(int, fw_one_pass); > +#define V_fw_one_pass VNET(fw_one_pass) > + > +VNET_DECLARE(int, fw_verbose); > +#define V_fw_verbose VNET(fw_verbose) > + > +VNET_DECLARE(struct ip_fw_chain, layer3_chain); > +#define V_layer3_chain VNET(layer3_chain) > + > +VNET_DECLARE(int, ipfw_vnet_ready); > +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) > + > +VNET_DECLARE(u_int32_t, set_disable); > +#define V_set_disable VNET(set_disable) > + > +VNET_DECLARE(int, autoinc_step); > +#define V_autoinc_step VNET(autoinc_step) > + > +VNET_DECLARE(unsigned int, fw_tables_max); > +#define V_fw_tables_max VNET(fw_tables_max) > + > +VNET_DECLARE(unsigned int, fw_tables_sets); > +#define V_fw_tables_sets VNET(fw_tables_sets) > + > +struct tables_config; > + > +#ifdef _KERNEL > +/* > + * Here we have the structure representing an ipfw rule. > + * > + * It starts with a general area > + * followed by an array of one or more instructions, which the code > + * accesses as an array of 32-bit values. > + * > + * Given a rule pointer r: > + * > + * r->cmd is the start of the first instruction. > + * ACTION_PTR(r) is the start of the first action (things to do > + * once a rule matched). 
> + */ > + > +struct ip_fw { > + uint16_t act_ofs; /* offset of action in 32-bit units */ > + uint16_t cmd_len; /* # of 32-bit words in cmd */ > + uint16_t rulenum; /* rule number */ > + uint8_t set; /* rule set (0..31) */ > + uint8_t flags; /* currently unused */ > + counter_u64_t cntr; /* Pointer to rule counters */ > + uint32_t timestamp; /* tv_sec of last match */ > + uint32_t id; /* rule id */ > + uint32_t cached_id; /* used by jump_fast */ > + uint32_t cached_pos; /* used by jump_fast */ > + > + ipfw_insn cmd[1]; /* storage for commands */ > +}; > + > +#define IPFW_RULE_CNTR_SIZE (2 * sizeof(counter_u64_t)) > + > +#endif > + > +struct ip_fw_chain { > + struct ip_fw **map; /* array of rule ptrs to ease lookup */ > + uint32_t id; /* ruleset id */ > + int n_rules; /* number of static rules */ > + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ > + void *tablestate; /* runtime table info */ > + void *valuestate; /* runtime table value info */ > + int *idxmap; /* skipto array of rules */ > +#if defined( __linux__ ) || defined( _WIN32 ) > + spinlock_t rwmtx; > +#else > + struct rmlock rwmtx; > +#endif > + int static_len; /* total len of static rules (v0) */ > + uint32_t gencnt; /* NAT generation count */ > + struct ip_fw *default_rule; > + struct tables_config *tblcfg; /* tables module data */ > + void *ifcfg; /* interface module data */ > + int *idxmap_back; /* standby skipto array of rules */ > +#if defined( __linux__ ) || defined( _WIN32 ) > + spinlock_t uh_lock; > +#else > + struct rwlock uh_lock; /* lock for upper half */ > +#endif > +}; > + > +/* 64-byte structure representing multi-field table value */ > +struct table_value { > + uint32_t tag; /* O_TAG/O_TAGGED */ > + uint32_t pipe; /* O_PIPE/O_QUEUE */ > + uint16_t divert; /* O_DIVERT/O_TEE */ > + uint16_t skipto; /* skipto, CALLRET */ > + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ > + uint32_t fib; /* O_SETFIB */ > + uint32_t nat; /* O_NAT */ > + uint32_t nh4; > + uint8_t dscp; > + uint8_t 
spare0[3]; > + /* -- 32 bytes -- */ > + struct in6_addr nh6; > + uint32_t limit; /* O_LIMIT */ > + uint32_t spare1; > + uint64_t refcnt; /* Number of references */ > +}; > + > +struct namedobj_instance; > + > +struct named_object { > + TAILQ_ENTRY(named_object) nn_next; /* namehash */ > + TAILQ_ENTRY(named_object) nv_next; /* valuehash */ > + char *name; /* object name */ > + uint8_t type; /* object type */ > + uint8_t compat; /* Object name is number */ > + uint16_t kidx; /* object kernel index */ > + uint16_t uidx; /* userland idx for compat records */ > + uint32_t set; /* set object belongs to */ > + uint32_t refcnt; /* number of references */ > +}; > +TAILQ_HEAD(namedobjects_head, named_object); > + > +struct sockopt; /* used by tcp_var.h */ > +struct sockopt_data { > + caddr_t kbuf; /* allocated buffer */ > + size_t ksize; /* given buffer size */ > + size_t koff; /* data already used */ > + size_t kavail; /* number of bytes available */ > + size_t ktotal; /* total bytes pushed */ > + struct sockopt *sopt; /* socket data */ > + caddr_t sopt_val; /* sopt user buffer */ > + size_t valsize; /* original data size */ > +}; > + > +struct ipfw_ifc; > + > +typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata, > + uint16_t ifindex); > + > +struct ipfw_iface { > + struct named_object no; > + char ifname[64]; > + int resolved; > + uint16_t ifindex; > + uint16_t spare; > + uint64_t gencnt; > + TAILQ_HEAD(, ipfw_ifc) consumers; > +}; > + > +struct ipfw_ifc { > + TAILQ_ENTRY(ipfw_ifc) next; > + struct ipfw_iface *iface; > + ipfw_ifc_cb *cb; > + void *cbdata; > +}; > + > +/* Macro for working with various counters */ > +#define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \ > + counter_u64_add((_cntr)->cntr, 1); \ > + counter_u64_add((_cntr)->cntr + 1, _bytes); \ > + if ((_cntr)->timestamp != time_uptime) \ > + (_cntr)->timestamp = time_uptime; \ > + } while (0) > + > +#define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \ > + (_cntr)->pcnt++; \ > + (_cntr)->bcnt += 
_bytes; \ > + } while (0) > + > +#define IPFW_ZERO_RULE_COUNTER(_cntr) do { \ > + counter_u64_zero((_cntr)->cntr); \ > + counter_u64_zero((_cntr)->cntr + 1); \ > + (_cntr)->timestamp = 0; \ > + } while (0) > + > +#define IPFW_ZERO_DYN_COUNTER(_cntr) do { \ > + (_cntr)->pcnt = 0; \ > + (_cntr)->bcnt = 0; \ > + } while (0) > + > +#define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f > +#define IP_FW_ARG_TABLEARG(ch, a, f) \ > + (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a)) > +/* > + * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c > + * so the variable and the macros must be here. > + */ > + > +#if defined( __linux__ ) || defined( _WIN32 ) > +#define IPFW_LOCK_INIT(_chain) do { \ > + rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ > + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ > + } while (0) > + > +#define IPFW_LOCK_DESTROY(_chain) do { \ > + rw_destroy(&(_chain)->rwmtx); \ > + rw_destroy(&(_chain)->uh_lock); \ > + } while (0) > + > +#define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED) > +#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) > + > +#define IPFW_RLOCK_TRACKER > +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) > +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) > +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) > +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) > +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) > +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) > +#else /* FreeBSD */ > +#define IPFW_LOCK_INIT(_chain) do { \ > + rm_init(&(_chain)->rwmtx, "IPFW static rules"); \ > + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ > + } while (0) > + > +#define IPFW_LOCK_DESTROY(_chain) do { \ > + rm_destroy(&(_chain)->rwmtx); \ > + rw_destroy(&(_chain)->uh_lock); \ > + } while (0) > + > +#define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) > +#define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) > + > +#define IPFW_RLOCK_TRACKER struct 
rm_priotracker _tracker > +#define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) > +#define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) > +#define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) > +#define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) > +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) > +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) > +#endif > + > +#define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) > +#define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) > + > +#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) > +#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) > +#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) > +#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) > + > +struct obj_idx { > + uint16_t uidx; /* internal index supplied by userland */ > + uint16_t kidx; /* kernel object index */ > + uint16_t off; /* tlv offset from rule end in 4-byte words */ > + uint8_t spare; > + uint8_t type; /* object type within its category */ > +}; > + > +struct rule_check_info { > + uint16_t flags; /* rule-specific check flags */ > + uint16_t table_opcodes; /* count of opcodes referencing table */ > + uint16_t urule_numoff; /* offset of rulenum in bytes */ > + uint8_t version; /* rule version */ > + uint8_t spare; > + ipfw_obj_ctlv *ctlv; /* name TLV container */ > + struct ip_fw *krule; /* resulting rule pointer */ > + caddr_t urule; /* original rule pointer */ > + struct obj_idx obuf[8]; /* table references storage */ > +}; > + > +/* Legacy interface support */ > +/* > + * FreeBSD 8 export rule format > + */ > +struct ip_fw_rule0 { > + struct ip_fw *x_next; /* linked list of rules */ > + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ > + /* 'next_rule' is used to pass up 'set_disable' status */ > + > + uint16_t act_ofs; /* offset of action in 32-bit units */ > + uint16_t cmd_len; /* # of 32-bit words in cmd */ > + uint16_t rulenum; /* rule number */ > + uint8_t set; /* rule set (0..31) */ > + uint8_t _pad; /*
padding */ > + uint32_t id; /* rule id */ > + > + /* These fields are present in all rules. */ > + uint64_t pcnt; /* Packet counter */ > + uint64_t bcnt; /* Byte counter */ > + uint32_t timestamp; /* tv_sec of last match */ > + > + ipfw_insn cmd[1]; /* storage for commands */ > +}; > + > +struct ip_fw_bcounter0 { > + uint64_t pcnt; /* Packet counter */ > + uint64_t bcnt; /* Byte counter */ > + uint32_t timestamp; /* tv_sec of last match */ > +}; > + > +/* Kernel rule length */ > +/* > + * RULE _K_ SIZE _V_ -> > + * get kernel size from userland rule version _V_. > + * RULE _U_ SIZE _V_ -> > + * get user size version _V_ from kernel rule > + * RULESIZE _V_ -> > + * get user size rule length > + * > + * NOTE(review): the "- 4" in the macros below presumably discounts the > + * one-element cmd[1] placeholder already counted by sizeof() -- confirm > + * that sizeof(ipfw_insn) is 4. > + */ > +/* FreeBSD8 <> current kernel format */ > +#define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) > +#define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) > +/* FreeBSD11 <> current kernel format */ > +#define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ > + (r)->cmd_len * 4 - 4, 8)) > +#define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) > + > + > +/* In ip_fw_iface.c */ > +int ipfw_iface_init(void); > +void ipfw_iface_destroy(void); > +void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); > +int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, > + struct ipfw_ifc *ic); > +void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); > +void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); > +void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); > + > +/* In ip_fw_sockopt.c */ > +void ipfw_init_skipto_cache(struct ip_fw_chain *chain); > +void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); > +int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); > +int ipfw_ctl3(struct sockopt *sopt); > +int ipfw_chk(struct ip_fw_args *args); > +void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, > + struct ip_fw *rule); >
+void ipfw_reap_rules(struct ip_fw *head); > +void ipfw_init_counters(void); > +void ipfw_destroy_counters(void); > +struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); > +int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); > + > +/* Signature shared by all IP_FW3 sockopt opcode handlers. */ > +typedef int (sopt_handler_f)(struct ip_fw_chain *ch, > + ip_fw3_opheader *op3, struct sockopt_data *sd); > +/* One entry of the opcode -> handler table. */ > +struct ipfw_sopt_handler { > + uint16_t opcode; > + uint8_t version; > + uint8_t dir; /* HDIR_* flags */ > + sopt_handler_f *handler; > + uint64_t refcnt; > +}; > +#define HDIR_SET 0x01 /* Handler is used to set some data */ > +#define HDIR_GET 0x02 /* Handler is used to retrieve data */ > +/* Parenthesized so the value survives use next to '&', '~', etc. */ > +#define HDIR_BOTH (HDIR_GET|HDIR_SET) > + > +void ipfw_init_sopt_handler(void); > +void ipfw_destroy_sopt_handler(void); > +void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); > +int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); > +caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); > +caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); > +/* Register/unregister a whole handler array @c; @c must be a true array. */ > +#define IPFW_ADD_SOPT_HANDLER(f, c) do { \ > + if ((f) != 0) \ > + ipfw_add_sopt_handler((c), \ > + sizeof(c) / sizeof((c)[0])); \ > + } while(0) > +#define IPFW_DEL_SOPT_HANDLER(l, c) do { \ > + if ((l) != 0) \ > + ipfw_del_sopt_handler((c), \ > + sizeof(c) / sizeof((c)[0])); \ > + } while(0) > + > +typedef void (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, > + void *arg); > +typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, void *key, > + uint32_t kopt); > +typedef int (objhash_cmp_f)(struct named_object *no, void *key, uint32_t kopt); > +struct namedobj_instance *ipfw_objhash_create(uint32_t items); > +void ipfw_objhash_destroy(struct namedobj_instance *); > +void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); > +void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, > + void **idx, int *blocks); > +void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, > + void
**idx, int *blocks); > +void ipfw_objhash_bitmap_free(void *idx, int blocks); > +void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); > +struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, > + uint32_t set, char *name); > +struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, > + uint16_t idx); > +int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, > + struct named_object *b); > +void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); > +void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); > +uint32_t ipfw_objhash_count(struct namedobj_instance *ni); > +void ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, > + void *arg); > +int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); > +int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); > +void ipfw_objhash_set_funcs(struct namedobj_instance *ni, > + objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); > + > +/* In ip_fw_table.c */ > +struct table_info; > + > +typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > + > +int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, > + uint32_t *val); > +int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, > + void *paddr, uint32_t *val); > +int ipfw_init_tables(struct ip_fw_chain *ch, int first); > +int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); > +int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); > +void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); > + > +/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ > + > +extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); > + > +typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); > +typedef int ipfw_nat_cfg_t(struct sockopt *); > + > +VNET_DECLARE(int, 
ipfw_nat_ready); > +#define V_ipfw_nat_ready VNET(ipfw_nat_ready) > +#define IPFW_NAT_LOADED (V_ipfw_nat_ready) > + > +extern ipfw_nat_t *ipfw_nat_ptr; > +extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; > +extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; > +extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; > +extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; > + > +#endif /* _KERNEL */ > +#endif /* _IPFW2_PRIVATE_H */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c > new file mode 100644 > index 0000000..bdf2692 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c > @@ -0,0 +1,3469 @@ > +/*- > + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa > + * Copyright (c) 2014 Yandex LLC > + * Copyright (c) 2014 Alexander V. Chernikov > + * > + * Supported by: Valeria Paoli > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_sockopt.c 273035 2014-10-13 13:49:28Z melifaro $"); > + > +/* > + * Control socket and rule management routines for ipfw. > + * Control is currently implemented via IP_FW3 setsockopt() code. > + */ > + > +#include "opt_ipfw.h" > +#include "opt_inet.h" > +#ifndef INET > +#error IPFIREWALL requires INET. > +#endif /* INET */ > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/mbuf.h> /* struct m_tag used by nested headers */ > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/priv.h> > +#include <sys/proc.h> > +#include <sys/rwlock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/sysctl.h> > +#include <sys/syslog.h> > +#include <sys/fnv_hash.h> > +#include <net/if.h> > +#include <net/route.h> > +#include <net/vnet.h> > +#include <vm/vm.h> > +#include <vm/vm_extern.h> > + > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* hooks */ > +#include <netinet/ip_fw.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/ip_fw_table.h> > + > +#ifdef MAC > +#include <security/mac/mac_framework.h> > +#endif > + > +static int ipfw_ctl(struct sockopt *sopt); > +static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, > + struct rule_check_info *ci); > +static int 
check_ipfw_rule1(struct ip_fw_rule *rule, int size, > + struct rule_check_info *ci); > +static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, > + struct rule_check_info *ci); > + > +#define NAMEDOBJ_HASH_SIZE 32 > + > +struct namedobj_instance { > + struct namedobjects_head *names; > + struct namedobjects_head *values; > + uint32_t nn_size; /* names hash size */ > + uint32_t nv_size; /* number hash size */ > + u_long *idx_mask; /* used items bitmask */ > + uint32_t max_blocks; /* number of "long" blocks in bitmask */ > + uint32_t count; /* number of items */ > + uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */ > + objhash_hash_f *hash_f; > + objhash_cmp_f *cmp_f; > +}; > +#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ > + > +static uint32_t objhash_hash_name(struct namedobj_instance *ni, void *key, > + uint32_t kopt); > +static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val); > +static int objhash_cmp_name(struct named_object *no, void *name, uint32_t set); > + > +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); > + > +static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > +static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > +static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > +static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > +static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > +static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > +static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > + > +/* ctl3 handler data */ > +struct mtx ctl3_lock; > +#define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF) > +#define 
CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock) > +#define CTL3_LOCK() mtx_lock(&ctl3_lock) > +#define CTL3_UNLOCK() mtx_unlock(&ctl3_lock) > + > +static struct ipfw_sopt_handler *ctl3_handlers; > +static size_t ctl3_hsize; > +static uint64_t ctl3_refct, ctl3_gencnt; > +#define CTL3_SMALLBUF 4096 /* small page-size write buffer */ > +/* Parenthesized so the product is safe inside larger expressions. */ > +#define CTL3_LARGEBUF (16 * 1024 * 1024) /* handle large rulesets */ > + > +static int ipfw_flush_sopt_data(struct sockopt_data *sd); > + > +/* Built-in opcode handlers implemented in this file. */ > +static struct ipfw_sopt_handler scodes[] = { > + { IP_FW_XGET, 0, HDIR_GET, dump_config }, > + { IP_FW_XADD, 0, HDIR_BOTH, add_rules }, > + { IP_FW_XDEL, 0, HDIR_BOTH, del_rules }, > + { IP_FW_XZERO, 0, HDIR_SET, clear_rules }, > + { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules }, > + { IP_FW_XMOVE, 0, HDIR_SET, move_rules }, > + { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets }, > + { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets }, > + { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets }, > + { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, > +}; > + > +/* > + * static variables followed by global ones > + */ > + > +static VNET_DEFINE(uma_zone_t, ipfw_cntr_zone); > +#define V_ipfw_cntr_zone VNET(ipfw_cntr_zone) > + > +/* > + * Creates the UMA zone that rule counters are allocated from. > + */ > +void > +ipfw_init_counters(void) > +{ > + > + V_ipfw_cntr_zone = uma_zcreate("IPFW counters", > + IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL, > + UMA_ALIGN_PTR, UMA_ZONE_PCPU); > +} > + > +/* > + * Destroys the rule counter zone. > + */ > +void > +ipfw_destroy_counters(void) > +{ > + > + uma_zdestroy(V_ipfw_cntr_zone); > +} > + > +/* > + * Allocates a zeroed rule of @rulesize bytes together with its > + * counters. Both allocations use M_WAITOK. > + */ > +struct ip_fw * > +ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize) > +{ > + struct ip_fw *rule; > + > + rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO); > + rule->cntr = uma_zalloc(V_ipfw_cntr_zone, M_WAITOK | M_ZERO); > + > + return (rule); > +} > + > +/* > + * Releases a rule obtained from ipfw_alloc_rule(), counters first. > + */ > +static void > +free_rule(struct ip_fw *rule) > +{ > + > + uma_zfree(V_ipfw_cntr_zone, rule->cntr); > + free(rule, M_IPFW); > +} > + > + > +/* > + * Find the smallest rule >= key, id.
> + * We could use bsearch but it is so simple that we code it directly > + * > + * Binary search over chain->map; returns an index into that array. > + * When every rule compares lower than (key, id) the last index, > + * n_rules - 1, is returned. > + * NOTE(review): presumably chain->map is kept sorted by (rulenum, id) -- > + * confirm against the code that builds the map. > + */ > +int > +ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) > +{ > + int i, lo, hi; > + struct ip_fw *r; > + > + for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { > + i = (lo + hi) / 2; > + r = chain->map[i]; > + if (r->rulenum < key) > + lo = i + 1; /* continue from the next one */ > + else if (r->rulenum > key) > + hi = i; /* this might be good */ > + else if (r->id < id) > + lo = i + 1; /* continue from the next one */ > + else /* r->id >= id */ > + hi = i; /* this might be good */ > + }; > + return hi; > +} > + > +/* > + * Builds skipto cache on rule set @map. > + * > + * Fills chain->idxmap_back so that entry i holds the index in @map of > + * the first rule whose number is >= i. Does nothing when the backup > + * array has not been allocated. > + * NOTE(review): the scan stops advancing at i == 65535, so it presumably > + * relies on @map ending with a rule numbered 65535 -- confirm. > + */ > +static void > +update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map) > +{ > + int *smap, rulenum; > + int i, mi; > + > + IPFW_UH_WLOCK_ASSERT(chain); > + > + mi = 0; > + rulenum = map[mi]->rulenum; > + smap = chain->idxmap_back; > + > + if (smap == NULL) > + return; > + > + for (i = 0; i < 65536; i++) { > + smap[i] = mi; > + /* Use the same rule index until i < rulenum */ > + if (i != rulenum || i == 65535) > + continue; > + /* Find next rule with num > i */ > + rulenum = map[++mi]->rulenum; > + while (rulenum == i) > + rulenum = map[++mi]->rulenum; > + } > +} > + > +/* > + * Swaps prepared (backup) index with current one. > + * Both the upper-half lock and the chain write lock must be held. > + */ > +static void > +swap_skipto_cache(struct ip_fw_chain *chain) > +{ > + int *map; > + > + IPFW_UH_WLOCK_ASSERT(chain); > + IPFW_WLOCK_ASSERT(chain); > + > + map = chain->idxmap; > + chain->idxmap = chain->idxmap_back; > + chain->idxmap_back = map; > +} > + > +/* > + * Allocate and initialize skipto cache.
> + */ > +void > +ipfw_init_skipto_cache(struct ip_fw_chain *chain) > +{ > + int *idxmap, *idxmap_back; > + > + idxmap = malloc(65536 * sizeof(uint32_t *), M_IPFW, > + M_WAITOK | M_ZERO); > + idxmap_back = malloc(65536 * sizeof(uint32_t *), M_IPFW, > + M_WAITOK | M_ZERO); > + > + /* > + * Note we may be called at any time after initialization, > + * for example, on first skipto rule, so we need to > + * provide valid chain->idxmap on return > + */ > + > + IPFW_UH_WLOCK(chain); > + if (chain->idxmap != NULL) { > + IPFW_UH_WUNLOCK(chain); > + free(idxmap, M_IPFW); > + free(idxmap_back, M_IPFW); > + return; > + } > + > + /* Set backup pointer first to permit building cache */ > + chain->idxmap_back = idxmap_back; > + update_skipto_cache(chain, chain->map); > + IPFW_WLOCK(chain); > + /* It is now safe to set chain->idxmap ptr */ > + chain->idxmap = idxmap; > + swap_skipto_cache(chain); > + IPFW_WUNLOCK(chain); > + IPFW_UH_WUNLOCK(chain); > +} > + > +/* > + * Destroys skipto cache. > + */ > +void > +ipfw_destroy_skipto_cache(struct ip_fw_chain *chain) > +{ > + > + if (chain->idxmap != NULL) > + free(chain->idxmap, M_IPFW); > + if (chain->idxmap != NULL) > + free(chain->idxmap_back, M_IPFW); > +} > + > + > +/* > + * allocate a new map, returns the chain locked. extra is the number > + * of entries to add or delete. > + */ > +static struct ip_fw ** > +get_map(struct ip_fw_chain *chain, int extra, int locked) > +{ > + > + for (;;) { > + struct ip_fw **map; > + int i, mflags; > + > + mflags = M_ZERO | ((locked != 0) ? 
M_NOWAIT : M_WAITOK); > + > + i = chain->n_rules + extra; > + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags); > + if (map == NULL) { > + printf("%s: cannot allocate map\n", __FUNCTION__); > + return NULL; > + } > + if (!locked) > + IPFW_UH_WLOCK(chain); > + if (i >= chain->n_rules + extra) /* good */ > + return map; > + /* otherwise we lost the race, free and retry */ > + if (!locked) > + IPFW_UH_WUNLOCK(chain); > + free(map, M_IPFW); > + } > +} > + > +/* > + * swap the maps. It is supposed to be called with IPFW_UH_WLOCK > + */ > +static struct ip_fw ** > +swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) > +{ > + struct ip_fw **old_map; > + > + IPFW_WLOCK(chain); > + chain->id++; > + chain->n_rules = new_len; > + old_map = chain->map; > + chain->map = new_map; > + swap_skipto_cache(chain); > + IPFW_WUNLOCK(chain); > + return old_map; > +} > + > + > +static void > +export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr) > +{ > + > + cntr->size = sizeof(*cntr); > + > + if (krule->cntr != NULL) { > + cntr->pcnt = counter_u64_fetch(krule->cntr); > + cntr->bcnt = counter_u64_fetch(krule->cntr + 1); > + cntr->timestamp = krule->timestamp; > + } > + if (cntr->timestamp > 0) > + cntr->timestamp += boottime.tv_sec; > +} > + > +static void > +export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr) > +{ > + > + if (krule->cntr != NULL) { > + cntr->pcnt = counter_u64_fetch(krule->cntr); > + cntr->bcnt = counter_u64_fetch(krule->cntr + 1); > + cntr->timestamp = krule->timestamp; > + } > + if (cntr->timestamp > 0) > + cntr->timestamp += boottime.tv_sec; > +} > + > +/* > + * Copies rule @urule from v1 userland format (current). > + * to kernel @krule. > + * Assume @krule is zeroed. 
> + */ > +static void > +import_rule1(struct rule_check_info *ci) > +{ > + struct ip_fw_rule *urule; > + struct ip_fw *krule; > + > + urule = (struct ip_fw_rule *)ci->urule; > + krule = (struct ip_fw *)ci->krule; > + > + /* copy header */ > + krule->act_ofs = urule->act_ofs; > + krule->cmd_len = urule->cmd_len; > + krule->rulenum = urule->rulenum; > + krule->set = urule->set; > + krule->flags = urule->flags; > + > + /* Save rulenum offset */ > + ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); > + > + /* Copy opcodes */ > + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); > +} > + > +/* > + * Export rule into v1 format (Current). > + * Layout: > + * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT) > + * [ ip_fw_rule ] OR > + * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs). > + * ] > + * Assume @data is zeroed. > + */ > +static void > +export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs) > +{ > + struct ip_fw_bcounter *cntr; > + struct ip_fw_rule *urule; > + ipfw_obj_tlv *tlv; > + > + /* Fill in TLV header */ > + tlv = (ipfw_obj_tlv *)data; > + tlv->type = IPFW_TLV_RULE_ENT; > + tlv->length = len; > + > + if (rcntrs != 0) { > + /* Copy counters */ > + cntr = (struct ip_fw_bcounter *)(tlv + 1); > + urule = (struct ip_fw_rule *)(cntr + 1); > + export_cntr1_base(krule, cntr); > + } else > + urule = (struct ip_fw_rule *)(tlv + 1); > + > + /* copy header */ > + urule->act_ofs = krule->act_ofs; > + urule->cmd_len = krule->cmd_len; > + urule->rulenum = krule->rulenum; > + urule->set = krule->set; > + urule->flags = krule->flags; > + urule->id = krule->id; > + > + /* Copy opcodes */ > + memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); > +} > + > + > +/* > + * Copies rule @urule from FreeBSD8 userland format (v0) > + * to kernel @krule. > + * Assume @krule is zeroed. 
> + */ > +static void > +import_rule0(struct rule_check_info *ci) > +{ > + struct ip_fw_rule0 *urule; > + struct ip_fw *krule; > + int cmdlen, l; > + ipfw_insn *cmd; > + ipfw_insn_limit *lcmd; > + ipfw_insn_if *cmdif; > + > + urule = (struct ip_fw_rule0 *)ci->urule; > + krule = (struct ip_fw *)ci->krule; > + > + /* copy header */ > + krule->act_ofs = urule->act_ofs; > + krule->cmd_len = urule->cmd_len; > + krule->rulenum = urule->rulenum; > + krule->set = urule->set; > + if ((urule->_pad & 1) != 0) > + krule->flags |= IPFW_RULE_NOOPT; > + > + /* Save rulenum offset */ > + ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum); > + > + /* Copy opcodes */ > + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); > + > + /* > + * Alter opcodes: > + * 1) convert tablearg value from 65335 to 0 > + * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room for targ). > + * 3) convert table number in iface opcodes to u16 > + */ > + l = krule->cmd_len; > + cmd = krule->cmd; > + cmdlen = 0; > + > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + switch (cmd->opcode) { > + /* Opcodes supporting tablearg */ > + case O_TAG: > + case O_TAGGED: > + case O_PIPE: > + case O_QUEUE: > + case O_DIVERT: > + case O_TEE: > + case O_SKIPTO: > + case O_CALLRETURN: > + case O_NETGRAPH: > + case O_NGTEE: > + case O_NAT: > + if (cmd->arg1 == 65535) > + cmd->arg1 = IP_FW_TARG; > + break; > + case O_SETFIB: > + case O_SETDSCP: > + if (cmd->arg1 == 65535) > + cmd->arg1 = IP_FW_TARG; > + else > + cmd->arg1 |= 0x8000; > + break; > + case O_LIMIT: > + lcmd = (ipfw_insn_limit *)cmd; > + if (lcmd->conn_limit == 65535) > + lcmd->conn_limit = IP_FW_TARG; > + break; > + /* Interface tables */ > + case O_XMIT: > + case O_RECV: > + case O_VIA: > + /* Interface table, possibly */ > + cmdif = (ipfw_insn_if *)cmd; > + if (cmdif->name[0] != '\1') > + break; > + > + cmdif->p.kidx = (uint16_t)cmdif->p.glob; > + break; > + } > + } > +} > + > +/* > + * Copies rule 
@krule from kernel to FreeBSD8 userland format (v0) > + */ > +static void > +export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len) > +{ > + int cmdlen, l; > + ipfw_insn *cmd; > + ipfw_insn_limit *lcmd; > + ipfw_insn_if *cmdif; > + > + /* copy header */ > + memset(urule, 0, len); > + urule->act_ofs = krule->act_ofs; > + urule->cmd_len = krule->cmd_len; > + urule->rulenum = krule->rulenum; > + urule->set = krule->set; > + if ((krule->flags & IPFW_RULE_NOOPT) != 0) > + urule->_pad |= 1; > + > + /* Copy opcodes */ > + memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); > + > + /* Export counters */ > + export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt); > + > + /* > + * Alter opcodes: > + * 1) convert tablearg value from 0 to 65335 > + * 2) Remove highest bit from O_SETFIB/O_SETDSCP values. > + * 3) convert table number in iface opcodes to int > + */ > + l = urule->cmd_len; > + cmd = urule->cmd; > + cmdlen = 0; > + > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + switch (cmd->opcode) { > + /* Opcodes supporting tablearg */ > + case O_TAG: > + case O_TAGGED: > + case O_PIPE: > + case O_QUEUE: > + case O_DIVERT: > + case O_TEE: > + case O_SKIPTO: > + case O_CALLRETURN: > + case O_NETGRAPH: > + case O_NGTEE: > + case O_NAT: > + if (cmd->arg1 == IP_FW_TARG) > + cmd->arg1 = 65535; > + break; > + case O_SETFIB: > + case O_SETDSCP: > + if (cmd->arg1 == IP_FW_TARG) > + cmd->arg1 = 65535; > + else > + cmd->arg1 &= ~0x8000; > + break; > + case O_LIMIT: > + lcmd = (ipfw_insn_limit *)cmd; > + if (lcmd->conn_limit == IP_FW_TARG) > + lcmd->conn_limit = 65535; > + break; > + /* Interface tables */ > + case O_XMIT: > + case O_RECV: > + case O_VIA: > + /* Interface table, possibly */ > + cmdif = (ipfw_insn_if *)cmd; > + if (cmdif->name[0] != '\1') > + break; > + > + cmdif->p.glob = cmdif->p.kidx; > + break; > + } > + } > +} > + > +/* > + * Add new rule(s) to the list possibly creating rule number for each. 
> + * Update the rule_number in the input struct so the caller knows it as well. > + * Must be called without IPFW_UH held > + */ > +static int > +commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) > +{ > + int error, i, insert_before, tcount; > + uint16_t rulenum, *pnum; > + struct rule_check_info *ci; > + struct ip_fw *krule; > + struct ip_fw **map; /* the new array of pointers */ > + > + /* Check if we need to do table remap */ > + tcount = 0; > + for (ci = rci, i = 0; i < count; ci++, i++) { > + if (ci->table_opcodes == 0) > + continue; > + > + /* > + * Rule has some table opcodes. > + * Reference & allocate needed tables/ > + */ > + error = ipfw_rewrite_table_uidx(chain, ci); > + if (error != 0) { > + > + /* > + * rewrite failed, state for current rule > + * has been reverted. Check if we need to > + * revert more. > + */ > + if (tcount > 0) { > + > + /* > + * We have some more table rules > + * we need to rollback. > + */ > + > + IPFW_UH_WLOCK(chain); > + while (ci != rci) { > + ci--; > + if (ci->table_opcodes == 0) > + continue; > + ipfw_unref_rule_tables(chain,ci->krule); > + > + } > + IPFW_UH_WUNLOCK(chain); > + > + } > + > + return (error); > + } > + > + tcount++; > + } > + > + /* get_map returns with IPFW_UH_WLOCK if successful */ > + map = get_map(chain, count, 0 /* not locked */); > + if (map == NULL) { > + if (tcount > 0) { > + /* Unbind tables */ > + IPFW_UH_WLOCK(chain); > + for (ci = rci, i = 0; i < count; ci++, i++) { > + if (ci->table_opcodes == 0) > + continue; > + > + ipfw_unref_rule_tables(chain, ci->krule); > + } > + IPFW_UH_WUNLOCK(chain); > + } > + > + return (ENOSPC); > + } > + > + if (V_autoinc_step < 1) > + V_autoinc_step = 1; > + else if (V_autoinc_step > 1000) > + V_autoinc_step = 1000; > + > + /* FIXME: Handle count > 1 */ > + ci = rci; > + krule = ci->krule; > + rulenum = krule->rulenum; > + > + /* find the insertion point, we will insert before */ > + insert_before = rulenum ? 
rulenum + 1 : IPFW_DEFAULT_RULE; > + i = ipfw_find_rule(chain, insert_before, 0); > + /* duplicate first part */ > + if (i > 0) > + bcopy(chain->map, map, i * sizeof(struct ip_fw *)); > + map[i] = krule; > + /* duplicate remaining part, we always have the default rule */ > + bcopy(chain->map + i, map + i + 1, > + sizeof(struct ip_fw *) *(chain->n_rules - i)); > + if (rulenum == 0) { > + /* Compute rule number and write it back */ > + rulenum = i > 0 ? map[i-1]->rulenum : 0; > + if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) > + rulenum += V_autoinc_step; > + krule->rulenum = rulenum; > + /* Save number to userland rule */ > + pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff); > + *pnum = rulenum; > + } > + > + krule->id = chain->id + 1; > + update_skipto_cache(chain, map); > + map = swap_map(chain, map, chain->n_rules + 1); > + chain->static_len += RULEUSIZE0(krule); > + IPFW_UH_WUNLOCK(chain); > + if (map) > + free(map, M_IPFW); > + return (0); > +} > + > +/* > + * Adds @rule to the list of rules to reap > + */ > +void > +ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, > + struct ip_fw *rule) > +{ > + > + IPFW_UH_WLOCK_ASSERT(chain); > + > + /* Unlink rule from everywhere */ > + ipfw_unref_rule_tables(chain, rule); > + > + *((struct ip_fw **)rule) = *head; > + *head = rule; > +} > + > +/* > + * Reclaim storage associated with a list of rules. This is > + * typically the list created using remove_rule. > + * A NULL pointer on input is handled correctly. 
> + */ > +void > +ipfw_reap_rules(struct ip_fw *head) > +{ > + struct ip_fw *rule; > + > + while ((rule = head) != NULL) { > + head = *((struct ip_fw **)head); > + free_rule(rule); > + } > +} > + > +/* > + * Rules to keep are > + * (default || reserved || !match_set || !match_number) > + * where > + * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) > + * // the default rule is always protected > + * > + * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) > + * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") > + * > + * match_set ::= (cmd == 0 || rule->set == set) > + * // set number is ignored for cmd == 0 > + * > + * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) > + * // number is ignored for cmd == 1 or n == 0 > + * > + */ > +int > +ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt) > +{ > + > + /* Don't match default rule for modification queries */ > + if (rule->rulenum == IPFW_DEFAULT_RULE && > + (rt->flags & IPFW_RCFLAG_DEFAULT) == 0) > + return (0); > + > + /* Don't match rules in reserved set for flush requests */ > + if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET) > + return (0); > + > + /* If we're filtering by set, don't match other sets */ > + if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set) > + return (0); > + > + if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && > + (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule)) > + return (0); > + > + return (1); > +} > + > +/* > + * Delete rules matching range @rt. > + * Saves number of deleted rules in @ndel. > + * > + * Returns 0 on success. > + */ > +static int > +delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel) > +{ > + struct ip_fw *reap, *rule, **map; > + int end, start; > + int i, n, ndyn, ofs; > + > + reap = NULL; > + IPFW_UH_WLOCK(chain); /* arbitrate writers */ > + > + /* > + * Stage 1: Determine range to inspect. > + * Range is half-inclusive, e.g [start, end). 
> + */ > + start = 0; > + end = chain->n_rules - 1; > + > + if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) { > + start = ipfw_find_rule(chain, rt->start_rule, 0); > + > + end = ipfw_find_rule(chain, rt->end_rule, 0); > + if (rt->end_rule != IPFW_DEFAULT_RULE) > + while (chain->map[end]->rulenum == rt->end_rule) > + end++; > + } > + > + /* Allocate new map of the same size */ > + map = get_map(chain, 0, 1 /* locked */); > + if (map == NULL) { > + IPFW_UH_WUNLOCK(chain); > + return (ENOMEM); > + } > + > + n = 0; > + ndyn = 0; > + ofs = start; > + /* 1. bcopy the initial part of the map */ > + if (start > 0) > + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); > + /* 2. copy active rules between start and end */ > + for (i = start; i < end; i++) { > + rule = chain->map[i]; > + if (ipfw_match_range(rule, rt) == 0) { > + map[ofs++] = rule; > + continue; > + } > + > + n++; > + if (ipfw_is_dyn_rule(rule) != 0) > + ndyn++; > + } > + /* 3. copy the final part of the map */ > + bcopy(chain->map + end, map + ofs, > + (chain->n_rules - end) * sizeof(struct ip_fw *)); > + /* 4. recalculate skipto cache */ > + update_skipto_cache(chain, map); > + /* 5. swap the maps (under UH_WLOCK + WHLOCK) */ > + map = swap_map(chain, map, chain->n_rules - n); > + /* 6. Remove all dynamic states originated by deleted rules */ > + if (ndyn > 0) > + ipfw_expire_dyn_rules(chain, rt); > + /* 7. now remove the rules deleted from the old map */ > + for (i = start; i < end; i++) { > + rule = map[i]; > + if (ipfw_match_range(rule, rt) == 0) > + continue; > + chain->static_len -= RULEUSIZE0(rule); > + ipfw_reap_add(chain, &reap, rule); > + } > + IPFW_UH_WUNLOCK(chain); > + > + ipfw_reap_rules(reap); > + if (map != NULL) > + free(map, M_IPFW); > + *ndel = n; > + return (0); > +} > + > +/* > + * Changes set of given rule rannge @rt > + * with each other. > + * > + * Returns 0 on success. 
> + */ > +static int > +move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt) > +{ > + struct ip_fw *rule; > + int i; > + > + IPFW_UH_WLOCK(chain); > + > + /* > + * Move rules with matching paramenerts to a new set. > + * This one is much more complex. We have to ensure > + * that all referenced tables (if any) are referenced > + * by given rule subset only. Otherwise, we can't move > + * them to new set and have to return error. > + */ > + if (V_fw_tables_sets != 0) { > + if (ipfw_move_tables_sets(chain, rt, rt->new_set) != 0) { > + IPFW_UH_WUNLOCK(chain); > + return (EBUSY); > + } > + } > + > + /* XXX: We have to do swap holding WLOCK */ > + for (i = 0; i < chain->n_rules; i++) { > + rule = chain->map[i]; > + if (ipfw_match_range(rule, rt) == 0) > + continue; > + rule->set = rt->new_set; > + } > + > + IPFW_UH_WUNLOCK(chain); > + > + return (0); > +} > + > +/* > + * Clear counters for a specific rule. > + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops > + * so we only care that rules do not disappear. > + */ > +static void > +clear_counters(struct ip_fw *rule, int log_only) > +{ > + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); > + > + if (log_only == 0) > + IPFW_ZERO_RULE_COUNTER(rule); > + if (l->o.opcode == O_LOG) > + l->log_left = l->max_log; > +} > + > +/* > + * Flushes rules counters and/or log values on matching range. > + * > + * Returns number of items cleared. 
> + */ > +static int > +clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only) > +{ > + struct ip_fw *rule; > + int num; > + int i; > + > + num = 0; > + rt->flags |= IPFW_RCFLAG_DEFAULT; > + > + IPFW_UH_WLOCK(chain); /* arbitrate writers */ > + for (i = 0; i < chain->n_rules; i++) { > + rule = chain->map[i]; > + if (ipfw_match_range(rule, rt) == 0) > + continue; > + clear_counters(rule, log_only); > + num++; > + } > + IPFW_UH_WUNLOCK(chain); > + > + return (num); > +} > + > +static int > +check_range_tlv(ipfw_range_tlv *rt) > +{ > + > + if (rt->head.length != sizeof(*rt)) > + return (1); > + if (rt->start_rule > rt->end_rule) > + return (1); > + if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) > + return (1); > + > + if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) > + return (1); > + > + return (0); > +} > + > +/* > + * Delete rules matching specified parameters > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_range_tlv ] > + * Reply: [ ipfw_obj_header ipfw_range_tlv ] > + * > + * Saves number of deleted rules in ipfw_range_tlv->new_set. > + * > + * Returns 0 on success. > + */ > +static int > +del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_range_header *rh; > + int error, ndel; > + > + if (sd->valsize != sizeof(*rh)) > + return (EINVAL); > + > + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); > + > + if (check_range_tlv(&rh->range) != 0) > + return (EINVAL); > + > + ndel = 0; > + if ((error = delete_range(chain, &rh->range, &ndel)) != 0) > + return (error); > + > + /* Save number of rules deleted */ > + rh->range.new_set = ndel; > + return (0); > +} > + > +/* > + * Move rules/sets matching specified parameters > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_range_tlv ] > + * > + * Returns 0 on success. 
> + */ > +static int > +move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_range_header *rh; > + > + if (sd->valsize != sizeof(*rh)) > + return (EINVAL); > + > + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); > + > + if (check_range_tlv(&rh->range) != 0) > + return (EINVAL); > + > + return (move_range(chain, &rh->range)); > +} > + > +/* > + * Clear rule accounting data matching specified parameters > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_range_tlv ] > + * Reply: [ ipfw_obj_header ipfw_range_tlv ] > + * > + * Saves number of cleared rules in ipfw_range_tlv->new_set. > + * > + * Returns 0 on success. > + */ > +static int > +clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_range_header *rh; > + int log_only, num; > + char *msg; > + > + if (sd->valsize != sizeof(*rh)) > + return (EINVAL); > + > + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); > + > + if (check_range_tlv(&rh->range) != 0) > + return (EINVAL); > + > + log_only = (op3->opcode == IP_FW_XRESETLOG); > + > + num = clear_range(chain, &rh->range, log_only); > + > + if (rh->range.flags & IPFW_RCFLAG_ALL) > + msg = log_only ? "All logging counts reset" : > + "Accounting cleared"; > + else > + msg = log_only ? 
"logging count reset" : "cleared"; > + > + if (V_fw_verbose) { > + int lev = LOG_SECURITY | LOG_NOTICE; > + log(lev, "ipfw: %s.\n", msg); > + } > + > + /* Save number of rules cleared */ > + rh->range.new_set = num; > + return (0); > +} > + > +static void > +enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt) > +{ > + uint32_t v_set; > + > + IPFW_UH_WLOCK_ASSERT(chain); > + > + /* Change enabled/disabled sets mask */ > + v_set = (V_set_disable | rt->set) & ~rt->new_set; > + v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */ > + IPFW_WLOCK(chain); > + V_set_disable = v_set; > + IPFW_WUNLOCK(chain); > +} > + > +static void > +swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv) > +{ > + struct ip_fw *rule; > + int i; > + > + IPFW_UH_WLOCK_ASSERT(chain); > + > + /* Swap or move two sets */ > + for (i = 0; i < chain->n_rules - 1; i++) { > + rule = chain->map[i]; > + if (rule->set == rt->set) > + rule->set = rt->new_set; > + else if (rule->set == rt->new_set && mv == 0) > + rule->set = rt->set; > + } > + if (V_fw_tables_sets != 0) > + ipfw_swap_tables_sets(chain, rt->set, rt->new_set, mv); > +} > + > +/* > + * Swaps or moves set > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_range_tlv ] > + * > + * Returns 0 on success. 
> + */ > +static int > +manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_range_header *rh; > + > + if (sd->valsize != sizeof(*rh)) > + return (EINVAL); > + > + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); > + > + if (rh->range.head.length != sizeof(ipfw_range_tlv)) > + return (1); > + > + IPFW_UH_WLOCK(chain); > + switch (op3->opcode) { > + case IP_FW_SET_SWAP: > + case IP_FW_SET_MOVE: > + swap_sets(chain, &rh->range, op3->opcode == IP_FW_SET_MOVE); > + break; > + case IP_FW_SET_ENABLE: > + enable_sets(chain, &rh->range); > + break; > + } > + IPFW_UH_WUNLOCK(chain); > + > + return (0); > +} > + > +/** > + * Remove all rules with given number, or do set manipulation. > + * Assumes chain != NULL && *chain != NULL. > + * > + * The argument is an uint32_t. The low 16 bit are the rule or set number; > + * the next 8 bits are the new set; the top 8 bits indicate the command: > + * > + * 0 delete rules numbered "rulenum" > + * 1 delete rules in set "rulenum" > + * 2 move rules "rulenum" to set "new_set" > + * 3 move rules from set "rulenum" to set "new_set" > + * 4 swap sets "rulenum" and "new_set" > + * 5 delete rules "rulenum" and set "new_set" > + */ > +static int > +del_entry(struct ip_fw_chain *chain, uint32_t arg) > +{ > + uint32_t num; /* rule number or old_set */ > + uint8_t cmd, new_set; > + int do_del, ndel; > + int error = 0; > + ipfw_range_tlv rt; > + > + num = arg & 0xffff; > + cmd = (arg >> 24) & 0xff; > + new_set = (arg >> 16) & 0xff; > + > + if (cmd > 5 || new_set > RESVD_SET) > + return EINVAL; > + if (cmd == 0 || cmd == 2 || cmd == 5) { > + if (num >= IPFW_DEFAULT_RULE) > + return EINVAL; > + } else { > + if (num > RESVD_SET) /* old_set */ > + return EINVAL; > + } > + > + /* Convert old requests into new representation */ > + memset(&rt, 0, sizeof(rt)); > + rt.start_rule = num; > + rt.end_rule = num; > + rt.set = num; > + rt.new_set = new_set; > + do_del = 0; > + > + switch (cmd) { 
> + case 0: /* delete rules numbered "rulenum" */ > + if (num == 0) > + rt.flags |= IPFW_RCFLAG_ALL; > + else > + rt.flags |= IPFW_RCFLAG_RANGE; > + do_del = 1; > + break; > + case 1: /* delete rules in set "rulenum" */ > + rt.flags |= IPFW_RCFLAG_SET; > + do_del = 1; > + break; > + case 5: /* delete rules "rulenum" and set "new_set" */ > + rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET; > + rt.set = new_set; > + rt.new_set = 0; > + do_del = 1; > + break; > + case 2: /* move rules "rulenum" to set "new_set" */ > + rt.flags |= IPFW_RCFLAG_RANGE; > + break; > + case 3: /* move rules from set "rulenum" to set "new_set" */ > + IPFW_UH_WLOCK(chain); > + swap_sets(chain, &rt, 1); > + IPFW_UH_WUNLOCK(chain); > + return (0); > + case 4: /* swap sets "rulenum" and "new_set" */ > + IPFW_UH_WLOCK(chain); > + swap_sets(chain, &rt, 0); > + IPFW_UH_WUNLOCK(chain); > + return (0); > + default: > + return (ENOTSUP); > + } > + > + if (do_del != 0) { > + if ((error = delete_range(chain, &rt, &ndel)) != 0) > + return (error); > + > + if (ndel == 0 && (cmd != 1 && num != 0)) > + return (EINVAL); > + > + return (0); > + } > + > + return (move_range(chain, &rt)); > +} > + > +/** > + * Reset some or all counters on firewall rules. > + * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, > + * the next 8 bits are the set number, the top 8 bits are the command: > + * 0 work with rules from all set's; > + * 1 work with rules only from specified set. > + * Specified rule number is zero if we want to clear all entries. > + * log_only is 1 if we only want to reset logs, zero otherwise. 
> + */ > +static int > +zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) > +{ > + struct ip_fw *rule; > + char *msg; > + int i; > + > + uint16_t rulenum = arg & 0xffff; > + uint8_t set = (arg >> 16) & 0xff; > + uint8_t cmd = (arg >> 24) & 0xff; > + > + if (cmd > 1) > + return (EINVAL); > + if (cmd == 1 && set > RESVD_SET) > + return (EINVAL); > + > + IPFW_UH_RLOCK(chain); > + if (rulenum == 0) { > + V_norule_counter = 0; > + for (i = 0; i < chain->n_rules; i++) { > + rule = chain->map[i]; > + /* Skip rules not in our set. */ > + if (cmd == 1 && rule->set != set) > + continue; > + clear_counters(rule, log_only); > + } > + msg = log_only ? "All logging counts reset" : > + "Accounting cleared"; > + } else { > + int cleared = 0; > + for (i = 0; i < chain->n_rules; i++) { > + rule = chain->map[i]; > + if (rule->rulenum == rulenum) { > + if (cmd == 0 || rule->set == set) > + clear_counters(rule, log_only); > + cleared = 1; > + } > + if (rule->rulenum > rulenum) > + break; > + } > + if (!cleared) { /* we did not find any matching rules */ > + IPFW_UH_RUNLOCK(chain); > + return (EINVAL); > + } > + msg = log_only ? 
"logging count reset" : "cleared"; > + } > + IPFW_UH_RUNLOCK(chain); > + > + if (V_fw_verbose) { > + int lev = LOG_SECURITY | LOG_NOTICE; > + > + if (rulenum) > + log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); > + else > + log(lev, "ipfw: %s.\n", msg); > + } > + return (0); > +} > + > + > +/* > + * Check rule head in FreeBSD11 format > + * > + */ > +static int > +check_ipfw_rule1(struct ip_fw_rule *rule, int size, > + struct rule_check_info *ci) > +{ > + int l; > + > + if (size < sizeof(*rule)) { > + printf("ipfw: rule too short\n"); > + return (EINVAL); > + } > + > + /* Check for valid cmd_len */ > + l = roundup2(RULESIZE(rule), sizeof(uint64_t)); > + if (l != size) { > + printf("ipfw: size mismatch (have %d want %d)\n", size, l); > + return (EINVAL); > + } > + if (rule->act_ofs >= rule->cmd_len) { > + printf("ipfw: bogus action offset (%u > %u)\n", > + rule->act_ofs, rule->cmd_len - 1); > + return (EINVAL); > + } > + > + if (rule->rulenum > IPFW_DEFAULT_RULE - 1) > + return (EINVAL); > + > + return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); > +} > + > +/* > + * Check rule head in FreeBSD8 format > + * > + */ > +static int > +check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, > + struct rule_check_info *ci) > +{ > + int l; > + > + if (size < sizeof(*rule)) { > + printf("ipfw: rule too short\n"); > + return (EINVAL); > + } > + > + /* Check for valid cmd_len */ > + l = sizeof(*rule) + rule->cmd_len * 4 - 4; > + if (l != size) { > + printf("ipfw: size mismatch (have %d want %d)\n", size, l); > + return (EINVAL); > + } > + if (rule->act_ofs >= rule->cmd_len) { > + printf("ipfw: bogus action offset (%u > %u)\n", > + rule->act_ofs, rule->cmd_len - 1); > + return (EINVAL); > + } > + > + if (rule->rulenum > IPFW_DEFAULT_RULE - 1) > + return (EINVAL); > + > + return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); > +} > + > +static int > +check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) > +{ > + int cmdlen, l; > + int 
have_action; > + > + have_action = 0; > + > + /* > + * Now go for the individual checks. Very simple ones, basically only > + * instruction sizes. > + */ > + for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + if (cmdlen > l) { > + printf("ipfw: opcode %d size truncated\n", > + cmd->opcode); > + return EINVAL; > + } > + switch (cmd->opcode) { > + case O_PROBE_STATE: > + case O_KEEP_STATE: > + case O_PROTO: > + case O_IP_SRC_ME: > + case O_IP_DST_ME: > + case O_LAYER2: > + case O_IN: > + case O_FRAG: > + case O_DIVERTED: > + case O_IPOPT: > + case O_IPTOS: > + case O_IPPRECEDENCE: > + case O_IPVER: > + case O_SOCKARG: > + case O_TCPFLAGS: > + case O_TCPOPTS: > + case O_ESTAB: > + case O_VERREVPATH: > + case O_VERSRCREACH: > + case O_ANTISPOOF: > + case O_IPSEC: > +#ifdef INET6 > + case O_IP6_SRC_ME: > + case O_IP6_DST_ME: > + case O_EXT_HDR: > + case O_IP6: > +#endif > + case O_IP4: > + case O_TAG: > + if (cmdlen != F_INSN_SIZE(ipfw_insn)) > + goto bad_size; > + break; > + > + case O_FIB: > + if (cmdlen != F_INSN_SIZE(ipfw_insn)) > + goto bad_size; > + if (cmd->arg1 >= rt_numfibs) { > + printf("ipfw: invalid fib number %d\n", > + cmd->arg1); > + return EINVAL; > + } > + break; > + > + case O_SETFIB: > + if (cmdlen != F_INSN_SIZE(ipfw_insn)) > + goto bad_size; > + if ((cmd->arg1 != IP_FW_TARG) && > + ((cmd->arg1 & 0x7FFFF) >= rt_numfibs)) { > + printf("ipfw: invalid fib number %d\n", > + cmd->arg1 & 0x7FFFF); > + return EINVAL; > + } > + goto check_action; > + > + case O_UID: > + case O_GID: > + case O_JAIL: > + case O_IP_SRC: > + case O_IP_DST: > + case O_TCPSEQ: > + case O_TCPACK: > + case O_PROB: > + case O_ICMPTYPE: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) > + goto bad_size; > + break; > + > + case O_LIMIT: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) > + goto bad_size; > + break; > + > + case O_LOG: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) > + goto bad_size; > + > + ((ipfw_insn_log *)cmd)->log_left = > + 
((ipfw_insn_log *)cmd)->max_log; > + > + break; > + > + case O_IP_SRC_MASK: > + case O_IP_DST_MASK: > + /* only odd command lengths */ > + if ( !(cmdlen & 1) || cmdlen > 31) > + goto bad_size; > + break; > + > + case O_IP_SRC_SET: > + case O_IP_DST_SET: > + if (cmd->arg1 == 0 || cmd->arg1 > 256) { > + printf("ipfw: invalid set size %d\n", > + cmd->arg1); > + return EINVAL; > + } > + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + > + (cmd->arg1+31)/32 ) > + goto bad_size; > + break; > + > + case O_IP_SRC_LOOKUP: > + case O_IP_DST_LOOKUP: > + if (cmd->arg1 >= V_fw_tables_max) { > + printf("ipfw: invalid table number %d\n", > + cmd->arg1); > + return (EINVAL); > + } > + if (cmdlen != F_INSN_SIZE(ipfw_insn) && > + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && > + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) > + goto bad_size; > + ci->table_opcodes++; > + break; > + case O_IP_FLOW_LOOKUP: > + if (cmd->arg1 >= V_fw_tables_max) { > + printf("ipfw: invalid table number %d\n", > + cmd->arg1); > + return (EINVAL); > + } > + if (cmdlen != F_INSN_SIZE(ipfw_insn) && > + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) > + goto bad_size; > + ci->table_opcodes++; > + break; > + case O_MACADDR2: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) > + goto bad_size; > + break; > + > + case O_NOP: > + case O_IPID: > + case O_IPTTL: > + case O_IPLEN: > + case O_TCPDATALEN: > + case O_TCPWIN: > + case O_TAGGED: > + if (cmdlen < 1 || cmdlen > 31) > + goto bad_size; > + break; > + > + case O_DSCP: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1) > + goto bad_size; > + break; > + > + case O_MAC_TYPE: > + case O_IP_SRCPORT: > + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ > + if (cmdlen < 2 || cmdlen > 31) > + goto bad_size; > + break; > + > + case O_RECV: > + case O_XMIT: > + case O_VIA: > + if (((ipfw_insn_if *)cmd)->name[0] == '\1') > + ci->table_opcodes++; > + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) > + goto bad_size; > + break; > + > + case O_ALTQ: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) 
> + goto bad_size; > + break; > + > + case O_PIPE: > + case O_QUEUE: > + if (cmdlen != F_INSN_SIZE(ipfw_insn)) > + goto bad_size; > + goto check_action; > + > + case O_FORWARD_IP: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) > + goto bad_size; > + goto check_action; > +#ifdef INET6 > + case O_FORWARD_IP6: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) > + goto bad_size; > + goto check_action; > +#endif /* INET6 */ > + > + case O_DIVERT: > + case O_TEE: > + if (ip_divert_ptr == NULL) > + return EINVAL; > + else > + goto check_size; > + case O_NETGRAPH: > + case O_NGTEE: > + if (ng_ipfw_input_p == NULL) > + return EINVAL; > + else > + goto check_size; > + case O_NAT: > + if (!IPFW_NAT_LOADED) > + return EINVAL; > + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) > + goto bad_size; > + goto check_action; > + case O_FORWARD_MAC: /* XXX not implemented yet */ > + case O_CHECK_STATE: > + case O_COUNT: > + case O_ACCEPT: > + case O_DENY: > + case O_REJECT: > + case O_SETDSCP: > +#ifdef INET6 > + case O_UNREACH6: > +#endif > + case O_SKIPTO: > + case O_REASS: > + case O_CALLRETURN: > +check_size: > + if (cmdlen != F_INSN_SIZE(ipfw_insn)) > + goto bad_size; > +check_action: > + if (have_action) { > + printf("ipfw: opcode %d, multiple actions" > + " not allowed\n", > + cmd->opcode); > + return (EINVAL); > + } > + have_action = 1; > + if (l != cmdlen) { > + printf("ipfw: opcode %d, action must be" > + " last opcode\n", > + cmd->opcode); > + return (EINVAL); > + } > + break; > +#ifdef INET6 > + case O_IP6_SRC: > + case O_IP6_DST: > + if (cmdlen != F_INSN_SIZE(struct in6_addr) + > + F_INSN_SIZE(ipfw_insn)) > + goto bad_size; > + break; > + > + case O_FLOW6ID: > + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + > + ((ipfw_insn_u32 *)cmd)->o.arg1) > + goto bad_size; > + break; > + > + case O_IP6_SRC_MASK: > + case O_IP6_DST_MASK: > + if ( !(cmdlen & 1) || cmdlen > 127) > + goto bad_size; > + break; > + case O_ICMP6TYPE: > + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) > + goto bad_size; > 
+ break; > +#endif > + > + default: > + switch (cmd->opcode) { > +#ifndef INET6 > + case O_IP6_SRC_ME: > + case O_IP6_DST_ME: > + case O_EXT_HDR: > + case O_IP6: > + case O_UNREACH6: > + case O_IP6_SRC: > + case O_IP6_DST: > + case O_FLOW6ID: > + case O_IP6_SRC_MASK: > + case O_IP6_DST_MASK: > + case O_ICMP6TYPE: > + printf("ipfw: no IPv6 support in kernel\n"); > + return (EPROTONOSUPPORT); > +#endif > + default: > + printf("ipfw: opcode %d, unknown opcode\n", > + cmd->opcode); > + return (EINVAL); > + } > + } > + } > + if (have_action == 0) { > + printf("ipfw: missing action\n"); > + return (EINVAL); > + } > + return 0; > + > +bad_size: > + printf("ipfw: opcode %d size %d wrong\n", > + cmd->opcode, cmdlen); > + return (EINVAL); > +} > + > + > +/* > + * Translation of requests for compatibility with FreeBSD 7.2/8. > + * a static variable tells us if we have an old client from userland, > + * and if necessary we translate requests and responses between the > + * two formats. > + */ > +static int is7 = 0; > + > +struct ip_fw7 { > + struct ip_fw7 *next; /* linked list of rules */ > + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ > + /* 'next_rule' is used to pass up 'set_disable' status */ > + > + uint16_t act_ofs; /* offset of action in 32-bit units */ > + uint16_t cmd_len; /* # of 32-bit words in cmd */ > + uint16_t rulenum; /* rule number */ > + uint8_t set; /* rule set (0..31) */ > + // #define RESVD_SET 31 /* set for default and persistent rules */ > + uint8_t _pad; /* padding */ > + // uint32_t id; /* rule id, only in v.8 */ > + /* These fields are present in all rules. 
*/ > + uint64_t pcnt; /* Packet counter */ > + uint64_t bcnt; /* Byte counter */ > + uint32_t timestamp; /* tv_sec of last match */ > + > + ipfw_insn cmd[1]; /* storage for commands */ > +}; > + > +static int convert_rule_to_7(struct ip_fw_rule0 *rule); > +static int convert_rule_to_8(struct ip_fw_rule0 *rule); > + > +#ifndef RULESIZE7 > +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ > + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) > +#endif > + > + > +/* > + * Copy the static and dynamic rules to the supplied buffer > + * and return the amount of space actually used. > + * Must be run under IPFW_UH_RLOCK > + */ > +static size_t > +ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) > +{ > + char *bp = buf; > + char *ep = bp + space; > + struct ip_fw *rule; > + struct ip_fw_rule0 *dst; > + int error, i, l, warnflag; > + time_t boot_seconds; > + > + warnflag = 0; > + > + boot_seconds = boottime.tv_sec; > + for (i = 0; i < chain->n_rules; i++) { > + rule = chain->map[i]; > + > + if (is7) { > + /* Convert rule to FreeBSd 7.2 format */ > + l = RULESIZE7(rule); > + if (bp + l + sizeof(uint32_t) <= ep) { > + bcopy(rule, bp, l + sizeof(uint32_t)); > + error = ipfw_rewrite_table_kidx(chain, > + (struct ip_fw_rule0 *)bp); > + if (error != 0) > + return (0); > + error = convert_rule_to_7((struct ip_fw_rule0 *) bp); > + if (error) > + return 0; /*XXX correct? */ > + /* > + * XXX HACK. Store the disable mask in the "next" > + * pointer in a wild attempt to keep the ABI the same. > + * Why do we do this on EVERY rule? 
> + */ > + bcopy(&V_set_disable, > + &(((struct ip_fw7 *)bp)->next_rule), > + sizeof(V_set_disable)); > + if (((struct ip_fw7 *)bp)->timestamp) > + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; > + bp += l; > + } > + continue; /* go to next rule */ > + } > + > + l = RULEUSIZE0(rule); > + if (bp + l > ep) { /* should not happen */ > + printf("overflow dumping static rules\n"); > + break; > + } > + dst = (struct ip_fw_rule0 *)bp; > + export_rule0(rule, dst, l); > + error = ipfw_rewrite_table_kidx(chain, dst); > + > + /* > + * XXX HACK. Store the disable mask in the "next" > + * pointer in a wild attempt to keep the ABI the same. > + * Why do we do this on EVERY rule? > + * > + * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask > + * so we need to fail _after_ saving at least one mask. > + */ > + bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); > + if (dst->timestamp) > + dst->timestamp += boot_seconds; > + bp += l; > + > + if (error != 0) { > + if (error == 2) { > + /* Non-fatal table rewrite error. */ > + warnflag = 1; > + continue; > + } > + printf("Stop on rule %d. Fail to convert table\n", > + rule->rulenum); > + break; > + } > + } > + if (warnflag != 0) > + printf("ipfw: process %s is using legacy interfaces," > + " consider rebuilding\n", ""); > + ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */ > + return (bp - (char *)buf); > +} > + > + > +struct dump_args { > + uint32_t b; /* start rule */ > + uint32_t e; /* end rule */ > + uint32_t rcount; /* number of rules */ > + uint32_t rsize; /* rules size */ > + uint32_t tcount; /* number of tables */ > + int rcounters; /* counters */ > +}; > + > +/* > + * Dumps static rules with table TLVs in buffer @sd. > + * > + * Returns 0 on success. 
> + */ > +static int > +dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, > + uint32_t *bmask, struct sockopt_data *sd) > +{ > + int error; > + int i, l; > + uint32_t tcount; > + ipfw_obj_ctlv *ctlv; > + struct ip_fw *krule; > + caddr_t dst; > + > + /* Dump table names first (if any) */ > + if (da->tcount > 0) { > + /* Header first */ > + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); > + if (ctlv == NULL) > + return (ENOMEM); > + ctlv->head.type = IPFW_TLV_TBLNAME_LIST; > + ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) + > + sizeof(*ctlv); > + ctlv->count = da->tcount; > + ctlv->objsize = sizeof(ipfw_obj_ntlv); > + } > + > + i = 0; > + tcount = da->tcount; > + while (tcount > 0) { > + if ((bmask[i / 32] & (1 << (i % 32))) == 0) { > + i++; > + continue; > + } > + > + if ((error = ipfw_export_table_ntlv(chain, i, sd)) != 0) > + return (error); > + > + i++; > + tcount--; > + } > + > + /* Dump rules */ > + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); > + if (ctlv == NULL) > + return (ENOMEM); > + ctlv->head.type = IPFW_TLV_RULE_LIST; > + ctlv->head.length = da->rsize + sizeof(*ctlv); > + ctlv->count = da->rcount; > + > + for (i = da->b; i < da->e; i++) { > + krule = chain->map[i]; > + > + l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv); > + if (da->rcounters != 0) > + l += sizeof(struct ip_fw_bcounter); > + dst = (caddr_t)ipfw_get_sopt_space(sd, l); > + if (dst == NULL) > + return (ENOMEM); > + > + export_rule1(krule, dst, l, da->rcounters); > + } > + > + return (0); > +} > + > +/* > + * Dumps requested objects data > + * Data layout (version 0)(current): > + * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags > + * size = ipfw_cfg_lheader.size > + * Reply: [ ipfw_cfg_lheader > + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) > + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) > + * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ] > + * ] (optional) > + * [ 
ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional) > + * ] > + * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize. > + * The rest (size, count) are set to zero and needs to be ignored. > + * > + * Returns 0 on success. > + */ > +static int > +dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_cfg_lheader *hdr; > + struct ip_fw *rule; > + size_t sz, rnum; > + uint32_t hdr_flags; > + int error, i; > + struct dump_args da; > + uint32_t *bmask; > + > + hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); > + if (hdr == NULL) > + return (EINVAL); > + > + error = 0; > + bmask = NULL; > + /* Allocate needed state */ > + if (hdr->flags & IPFW_CFG_GET_STATIC) > + bmask = malloc(IPFW_TABLES_MAX / 8, M_TEMP, M_WAITOK | M_ZERO); > + > + IPFW_UH_RLOCK(chain); > + > + /* > + * STAGE 1: Determine size/count for objects in range. > + * Prepare used tables bitmask. > + */ > + sz = sizeof(ipfw_cfg_lheader); > + memset(&da, 0, sizeof(da)); > + > + da.b = 0; > + da.e = chain->n_rules; > + > + if (hdr->end_rule != 0) { > + /* Handle custom range */ > + if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE) > + rnum = IPFW_DEFAULT_RULE; > + da.b = ipfw_find_rule(chain, rnum, 0); > + rnum = hdr->end_rule; > + rnum = (rnum < IPFW_DEFAULT_RULE) ? 
rnum+1 : IPFW_DEFAULT_RULE; > + da.e = ipfw_find_rule(chain, rnum, 0) + 1; > + } > + > + if (hdr->flags & IPFW_CFG_GET_STATIC) { > + for (i = da.b; i < da.e; i++) { > + rule = chain->map[i]; > + da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); > + da.rcount++; > + da.tcount += ipfw_mark_table_kidx(chain, rule, bmask); > + } > + /* Add counters if requested */ > + if (hdr->flags & IPFW_CFG_GET_COUNTERS) { > + da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount; > + da.rcounters = 1; > + } > + > + if (da.tcount > 0) > + sz += da.tcount * sizeof(ipfw_obj_ntlv) + > + sizeof(ipfw_obj_ctlv); > + sz += da.rsize + sizeof(ipfw_obj_ctlv); > + } > + > + if (hdr->flags & IPFW_CFG_GET_STATES) > + sz += ipfw_dyn_get_count() * sizeof(ipfw_obj_dyntlv) + > + sizeof(ipfw_obj_ctlv); > + > + > + /* > + * Fill header anyway. > + * Note we have to save header fields to stable storage > + * buffer inside @sd can be flushed after dumping rules > + */ > + hdr->size = sz; > + hdr->set_mask = ~V_set_disable; > + hdr_flags = hdr->flags; > + hdr = NULL; > + > + if (sd->valsize < sz) { > + error = ENOMEM; > + goto cleanup; > + } > + > + /* STAGE2: Store actual data */ > + if (hdr_flags & IPFW_CFG_GET_STATIC) { > + error = dump_static_rules(chain, &da, bmask, sd); > + if (error != 0) > + goto cleanup; > + } > + > + if (hdr_flags & IPFW_CFG_GET_STATES) > + error = ipfw_dump_states(chain, sd); > + > +cleanup: > + IPFW_UH_RUNLOCK(chain); > + > + if (bmask != NULL) > + free(bmask, M_TEMP); > + > + return (error); > +} > + > +static int > +check_object_name(ipfw_obj_ntlv *ntlv) > +{ > + int error; > + > + switch (ntlv->head.type) { > + case IPFW_TLV_TBL_NAME: > + error = ipfw_check_table_name(ntlv->name); > + break; > + default: > + error = ENOTSUP; > + } > + > + return (0); > +} > + > +/* > + * Adds one or more rules to ipfw @chain. 
> + * Data layout (version 0)(current): > + * Request: > + * [ > + * ip_fw3_opheader > + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) > + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3) > + * ] > + * Reply: > + * [ > + * ip_fw3_opheader > + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) > + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] > + * ] > + * > + * Rules in reply are modified to store their actual ruleset number. > + * > + * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending > + * accoring to their idx field and there has to be no duplicates. > + * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. > + * (*3) Each ip_fw structure needs to be aligned to u64 boundary. > + * > + * Returns 0 on success. > + */ > +static int > +add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_obj_ctlv *ctlv, *rtlv, *tstate; > + ipfw_obj_ntlv *ntlv; > + int clen, error, idx; > + uint32_t count, read; > + struct ip_fw_rule *r; > + struct rule_check_info rci, *ci, *cbuf; > + int i, rsize; > + > + op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); > + ctlv = (ipfw_obj_ctlv *)(op3 + 1); > + > + read = sizeof(ip_fw3_opheader); > + rtlv = NULL; > + tstate = NULL; > + cbuf = NULL; > + memset(&rci, 0, sizeof(struct rule_check_info)); > + > + if (read + sizeof(*ctlv) > sd->valsize) > + return (EINVAL); > + > + if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { > + clen = ctlv->head.length; > + /* Check size and alignment */ > + if (clen > sd->valsize || clen < sizeof(*ctlv)) > + return (EINVAL); > + if ((clen % sizeof(uint64_t)) != 0) > + return (EINVAL); > + > + /* > + * Some table names or other named objects. > + * Check for validness. > + */ > + count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); > + if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) > + return (EINVAL); > + > + /* > + * Check each TLV. 
> + * Ensure TLVs are sorted ascending and > + * there are no duplicates. > + */ > + idx = -1; > + ntlv = (ipfw_obj_ntlv *)(ctlv + 1); > + while (count > 0) { > + if (ntlv->head.length != sizeof(ipfw_obj_ntlv)) > + return (EINVAL); > + > + error = check_object_name(ntlv); > + if (error != 0) > + return (error); > + > + if (ntlv->idx <= idx) > + return (EINVAL); > + > + idx = ntlv->idx; > + count--; > + ntlv++; > + } > + > + tstate = ctlv; > + read += ctlv->head.length; > + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); > + } > + > + if (read + sizeof(*ctlv) > sd->valsize) > + return (EINVAL); > + > + if (ctlv->head.type == IPFW_TLV_RULE_LIST) { > + clen = ctlv->head.length; > + if (clen + read > sd->valsize || clen < sizeof(*ctlv)) > + return (EINVAL); > + if ((clen % sizeof(uint64_t)) != 0) > + return (EINVAL); > + > + /* > + * TODO: Permit adding multiple rules at once > + */ > + if (ctlv->count != 1) > + return (ENOTSUP); > + > + clen -= sizeof(*ctlv); > + > + if (ctlv->count > clen / sizeof(struct ip_fw_rule)) > + return (EINVAL); > + > + /* Allocate state for each rule or use stack */ > + if (ctlv->count == 1) { > + memset(&rci, 0, sizeof(struct rule_check_info)); > + cbuf = &rci; > + } else > + cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP, > + M_WAITOK | M_ZERO); > + ci = cbuf; > + > + /* > + * Check each rule for validness. 
> + * Ensure numbered rules are sorted ascending > + * and properly aligned > + */ > + idx = 0; > + r = (struct ip_fw_rule *)(ctlv + 1); > + count = 0; > + error = 0; > + while (clen > 0) { > + rsize = roundup2(RULESIZE(r), sizeof(uint64_t)); > + if (rsize > clen || ctlv->count <= count) { > + error = EINVAL; > + break; > + } > + > + ci->ctlv = tstate; > + error = check_ipfw_rule1(r, rsize, ci); > + if (error != 0) > + break; > + > + /* Check sorting */ > + if (r->rulenum != 0 && r->rulenum < idx) { > + printf("rulenum %d idx %d\n", r->rulenum, idx); > + error = EINVAL; > + break; > + } > + idx = r->rulenum; > + > + ci->urule = (caddr_t)r; > + > + rsize = roundup2(rsize, sizeof(uint64_t)); > + clen -= rsize; > + r = (struct ip_fw_rule *)((caddr_t)r + rsize); > + count++; > + ci++; > + } > + > + if (ctlv->count != count || error != 0) { > + if (cbuf != &rci) > + free(cbuf, M_TEMP); > + return (EINVAL); > + } > + > + rtlv = ctlv; > + read += ctlv->head.length; > + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); > + } > + > + if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) { > + if (cbuf != NULL && cbuf != &rci) > + free(cbuf, M_TEMP); > + return (EINVAL); > + } > + > + /* > + * Passed rules seems to be valid. > + * Allocate storage and try to add them to chain. > + */ > + for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) { > + clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule); > + ci->krule = ipfw_alloc_rule(chain, clen); > + import_rule1(ci); > + } > + > + if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) { > + /* Free allocate krules */ > + for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) > + free(ci->krule, M_IPFW); > + } > + > + if (cbuf != NULL && cbuf != &rci) > + free(cbuf, M_TEMP); > + > + return (error); > +} > + > +/* > + * Lists all sopts currently registered. 
> + * Data layout (v0)(current): > + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size > + * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ] > + * > + * Returns 0 on success > + */ > +static int > +dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_lheader *olh; > + ipfw_sopt_info *i; > + struct ipfw_sopt_handler *sh; > + uint32_t count, n, size; > + > + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); > + if (olh == NULL) > + return (EINVAL); > + if (sd->valsize < olh->size) > + return (EINVAL); > + > + CTL3_LOCK(); > + count = ctl3_hsize; > + size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader); > + > + /* Fill in header regadless of buffer size */ > + olh->count = count; > + olh->objsize = sizeof(ipfw_sopt_info); > + > + if (size > olh->size) { > + olh->size = size; > + CTL3_UNLOCK(); > + return (ENOMEM); > + } > + olh->size = size; > + > + for (n = 1; n <= count; n++) { > + i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i)); > + KASSERT(i != 0, ("previously checked buffer is not enough")); > + sh = &ctl3_handlers[n]; > + i->opcode = sh->opcode; > + i->version = sh->version; > + i->refcnt = sh->refcnt; > + } > + CTL3_UNLOCK(); > + > + return (0); > +} > + > +/* > + * Compares two sopt handlers (code, version and handler ptr). > + * Used both as qsort() and bsearch(). > + * Does not compare handler for latter case. > + * > + * Returns 0 if match is found. 
> + */ > +static int > +compare_sh(const void *_a, const void *_b) > +{ > + const struct ipfw_sopt_handler *a, *b; > + > + a = (const struct ipfw_sopt_handler *)_a; > + b = (const struct ipfw_sopt_handler *)_b; > + > + if (a->opcode < b->opcode) > + return (-1); > + else if (a->opcode > b->opcode) > + return (1); > + > + if (a->version < b->version) > + return (-1); > + else if (a->version > b->version) > + return (1); > + > + /* bsearch helper */ > + if (a->handler == NULL) > + return (0); > + > + if ((uintptr_t)a->handler < (uintptr_t)b->handler) > + return (-1); > + else if ((uintptr_t)b->handler > (uintptr_t)b->handler) > + return (1); > + > + return (0); > +} > + > +/* > + * Finds sopt handler based on @code and @version. > + * > + * Returns pointer to handler or NULL. > + */ > +static struct ipfw_sopt_handler * > +find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler) > +{ > + struct ipfw_sopt_handler *sh, h; > + > + memset(&h, 0, sizeof(h)); > + h.opcode = code; > + h.version = version; > + h.handler = handler; > + > + sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers, > + ctl3_hsize, sizeof(h), compare_sh); > + > + return (sh); > +} > + > +static int > +find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh) > +{ > + struct ipfw_sopt_handler *sh; > + > + CTL3_LOCK(); > + if ((sh = find_sh(opcode, version, NULL)) == NULL) { > + CTL3_UNLOCK(); > + printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n", > + opcode, version); > + return (EINVAL); > + } > + sh->refcnt++; > + ctl3_refct++; > + /* Copy handler data to requested buffer */ > + *psh = *sh; > + CTL3_UNLOCK(); > + > + return (0); > +} > + > +static void > +find_unref_sh(struct ipfw_sopt_handler *psh) > +{ > + struct ipfw_sopt_handler *sh; > + > + CTL3_LOCK(); > + sh = find_sh(psh->opcode, psh->version, NULL); > + KASSERT(sh != NULL, ("ctl3 handler disappeared")); > + sh->refcnt--; > + ctl3_refct--; > + CTL3_UNLOCK(); > +} > + > +void > +ipfw_init_sopt_handler() > +{ 
> + > + CTL3_LOCK_INIT(); > + IPFW_ADD_SOPT_HANDLER(1, scodes); > +} > + > +void > +ipfw_destroy_sopt_handler() > +{ > + > + IPFW_DEL_SOPT_HANDLER(1, scodes); > + CTL3_LOCK_DESTROY(); > +} > + > +/* > + * Adds one or more sockopt handlers to the global array. > + * Function may sleep. > + */ > +void > +ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) > +{ > + size_t sz; > + struct ipfw_sopt_handler *tmp; > + > + CTL3_LOCK(); > + > + for (;;) { > + sz = ctl3_hsize + count; > + CTL3_UNLOCK(); > + tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO); > + CTL3_LOCK(); > + if (ctl3_hsize + count <= sz) > + break; > + > + /* Retry */ > + free(tmp, M_IPFW); > + } > + > + /* Merge old & new arrays */ > + sz = ctl3_hsize + count; > + memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh)); > + memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh)); > + qsort(tmp, sz, sizeof(*sh), compare_sh); > + /* Switch new and free old */ > + if (ctl3_handlers != NULL) > + free(ctl3_handlers, M_IPFW); > + ctl3_handlers = tmp; > + ctl3_hsize = sz; > + ctl3_gencnt++; > + > + CTL3_UNLOCK(); > +} > + > +/* > + * Removes one or more sockopt handlers from the global array. > + */ > +int > +ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) > +{ > + size_t sz; > + struct ipfw_sopt_handler *tmp, *h; > + int i; > + > + CTL3_LOCK(); > + > + for (i = 0; i < count; i++) { > + tmp = &sh[i]; > + h = find_sh(tmp->opcode, tmp->version, tmp->handler); > + if (h == NULL) > + continue; > + > + sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h); > + memmove(h, h + 1, sz); > + ctl3_hsize--; > + } > + > + if (ctl3_hsize == 0) { > + if (ctl3_handlers != NULL) > + free(ctl3_handlers, M_IPFW); > + ctl3_handlers = NULL; > + } > + > + ctl3_gencnt++; > + > + CTL3_UNLOCK(); > + > + return (0); > +} > + > +/* > + * Writes data accumulated in @sd to sockopt buffer. > + * Zeroes internal @sd buffer. 
> + */ > +static int > +ipfw_flush_sopt_data(struct sockopt_data *sd) > +{ > + struct sockopt *sopt; > + int error; > + size_t sz; > + > + sz = sd->koff; > + if (sz == 0) > + return (0); > + > + sopt = sd->sopt; > + > + if (sopt->sopt_dir == SOPT_GET) { > + error = copyout(sd->kbuf, sopt->sopt_val, sz); > + if (error != 0) > + return (error); > + } > + > + memset(sd->kbuf, 0, sd->ksize); > + sd->ktotal += sz; > + sd->koff = 0; > + if (sd->ktotal + sd->ksize < sd->valsize) > + sd->kavail = sd->ksize; > + else > + sd->kavail = sd->valsize - sd->ktotal; > + > + /* Update sopt buffer data */ > + sopt->sopt_valsize = sd->ktotal; > + sopt->sopt_val = sd->sopt_val + sd->ktotal; > + > + return (0); > +} > + > +/* > + * Ensures that @sd buffer has contigious @neeeded number of > + * bytes. > + * > + * Returns pointer to requested space or NULL. > + */ > +caddr_t > +ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed) > +{ > + int error; > + caddr_t addr; > + > + if (sd->kavail < needed) { > + /* > + * Flush data and try another time. > + */ > + error = ipfw_flush_sopt_data(sd); > + > + if (sd->kavail < needed || error != 0) > + return (NULL); > + } > + > + addr = sd->kbuf + sd->koff; > + sd->koff += needed; > + sd->kavail -= needed; > + return (addr); > +} > + > +/* > + * Requests @needed contigious bytes from @sd buffer. > + * Function is used to notify subsystem that we are > + * interesed in first @needed bytes (request header) > + * and the rest buffer can be safely zeroed. > + * > + * Returns pointer to requested space or NULL. > + */ > +caddr_t > +ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed) > +{ > + caddr_t addr; > + > + if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL) > + return (NULL); > + > + if (sd->kavail > 0) > + memset(sd->kbuf + sd->koff, 0, sd->kavail); > + > + return (addr); > +} > + > +/* > + * New sockopt handler. 
> + */ > +int > +ipfw_ctl3(struct sockopt *sopt) > +{ > + int error, locked; > + size_t size, valsize; > + struct ip_fw_chain *chain; > + char xbuf[256]; > + struct sockopt_data sdata; > + struct ipfw_sopt_handler h; > + ip_fw3_opheader *op3 = NULL; > + > + error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); > + if (error != 0) > + return (error); > + > + if (sopt->sopt_name != IP_FW3) > + return (ipfw_ctl(sopt)); > + > + chain = &V_layer3_chain; > + error = 0; > + > + /* Save original valsize before it is altered via sooptcopyin() */ > + valsize = sopt->sopt_valsize; > + memset(&sdata, 0, sizeof(sdata)); > + /* Read op3 header first to determine actual operation */ > + op3 = (ip_fw3_opheader *)xbuf; > + error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3)); > + if (error != 0) > + return (error); > + sopt->sopt_valsize = valsize; > + > + /* > + * Find and reference command. > + */ > + error = find_ref_sh(op3->opcode, op3->version, &h); > + if (error != 0) > + return (error); > + > + /* > + * Disallow modifications in really-really secure mode, but still allow > + * the logging counters to be reset. > + */ > + if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) { > + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); > + if (error != 0) { > + find_unref_sh(&h); > + return (error); > + } > + } > + > + /* > + * Fill in sockopt_data structure that may be useful for > + * IP_FW3 get requests. > + */ > + locked = 0; > + if (valsize <= sizeof(xbuf)) { > + /* use on-stack buffer */ > + sdata.kbuf = xbuf; > + sdata.ksize = sizeof(xbuf); > + sdata.kavail = valsize; > + } else { > + > + /* > + * Determine opcode type/buffer size: > + * allocate sliding-window buf for data export or > + * contigious buffer for special ops. > + */ > + if ((h.dir & HDIR_SET) != 0) { > + /* Set request. Allocate contigous buffer. */ > + if (valsize > CTL3_LARGEBUF) { > + find_unref_sh(&h); > + return (EFBIG); > + } > + > + size = valsize; > + } else { > + /* Get request. 
Allocate sliding window buffer */ > + size = (valsize<CTL3_SMALLBUF) ? valsize:CTL3_SMALLBUF; > + > + if (size < valsize) { > + /* We have to wire user buffer */ > + error = vslock(sopt->sopt_val, valsize); > + if (error != 0) > + return (error); > + locked = 1; > + } > + } > + > + sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); > + sdata.ksize = size; > + sdata.kavail = size; > + } > + > + sdata.sopt = sopt; > + sdata.sopt_val = sopt->sopt_val; > + sdata.valsize = valsize; > + > + /* > + * Copy either all request (if valsize < bsize_max) > + * or first bsize_max bytes to guarantee most consumers > + * that all necessary data has been copied). > + * Anyway, copy not less than sizeof(ip_fw3_opheader). > + */ > + if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize, > + sizeof(ip_fw3_opheader))) != 0) > + return (error); > + op3 = (ip_fw3_opheader *)sdata.kbuf; > + > + /* Finally, run handler */ > + error = h.handler(chain, op3, &sdata); > + find_unref_sh(&h); > + > + /* Flush state and free buffers */ > + if (error == 0) > + error = ipfw_flush_sopt_data(&sdata); > + else > + ipfw_flush_sopt_data(&sdata); > + > + if (locked != 0) > + vsunlock(sdata.sopt_val, valsize); > + > + /* Restore original pointer and set number of bytes written */ > + sopt->sopt_val = sdata.sopt_val; > + sopt->sopt_valsize = sdata.ktotal; > + if (sdata.kbuf != xbuf) > + free(sdata.kbuf, M_TEMP); > + > + return (error); > +} > + > +/** > + * {set|get}sockopt parser. 
> + */ > +int > +ipfw_ctl(struct sockopt *sopt) > +{ > +#define RULE_MAXSIZE (512*sizeof(u_int32_t)) > + int error; > + size_t size, valsize; > + struct ip_fw *buf; > + struct ip_fw_rule0 *rule; > + struct ip_fw_chain *chain; > + u_int32_t rulenum[2]; > + uint32_t opt; > + struct rule_check_info ci; > + IPFW_RLOCK_TRACKER; > + > + chain = &V_layer3_chain; > + error = 0; > + > + /* Save original valsize before it is altered via sooptcopyin() */ > + valsize = sopt->sopt_valsize; > + opt = sopt->sopt_name; > + > + /* > + * Disallow modifications in really-really secure mode, but still allow > + * the logging counters to be reset. > + */ > + if (opt == IP_FW_ADD || > + (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) { > + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); > + if (error != 0) > + return (error); > + } > + > + switch (opt) { > + case IP_FW_GET: > + /* > + * pass up a copy of the current rules. Static rules > + * come first (the last of which has number IPFW_DEFAULT_RULE), > + * followed by a possibly empty list of dynamic rule. > + * The last dynamic rule has NULL in the "next" field. > + * > + * Note that the calculated size is used to bound the > + * amount of data returned to the user. The rule set may > + * change between calculating the size and returning the > + * data in which case we'll just return what fits. 
> + */ > + for (;;) { > + int len = 0, want; > + > + size = chain->static_len; > + size += ipfw_dyn_len(); > + if (size >= sopt->sopt_valsize) > + break; > + buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); > + IPFW_UH_RLOCK(chain); > + /* check again how much space we need */ > + want = chain->static_len + ipfw_dyn_len(); > + if (size >= want) > + len = ipfw_getrules(chain, buf, size); > + IPFW_UH_RUNLOCK(chain); > + if (size >= want) > + error = sooptcopyout(sopt, buf, len); > + free(buf, M_TEMP); > + if (size >= want) > + break; > + } > + break; > + > + case IP_FW_FLUSH: > + /* locking is done within del_entry() */ > + error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ > + break; > + > + case IP_FW_ADD: > + rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); > + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, > + sizeof(struct ip_fw7) ); > + > + memset(&ci, 0, sizeof(struct rule_check_info)); > + > + /* > + * If the size of commands equals RULESIZE7 then we assume > + * a FreeBSD7.2 binary is talking to us (set is7=1). > + * is7 is persistent so the next 'ipfw list' command > + * will use this format. > + * NOTE: If wrong version is guessed (this can happen if > + * the first ipfw command is 'ipfw [pipe] list') > + * the ipfw binary may crash or loop infinitly... 
> + */ > + size = sopt->sopt_valsize; > + if (size == RULESIZE7(rule)) { > + is7 = 1; > + error = convert_rule_to_8(rule); > + if (error) { > + free(rule, M_TEMP); > + return error; > + } > + size = RULESIZE(rule); > + } else > + is7 = 0; > + if (error == 0) > + error = check_ipfw_rule0(rule, size, &ci); > + if (error == 0) { > + /* locking is done within add_rule() */ > + struct ip_fw *krule; > + krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule)); > + ci.urule = (caddr_t)rule; > + ci.krule = krule; > + import_rule0(&ci); > + error = commit_rules(chain, &ci, 1); > + if (!error && sopt->sopt_dir == SOPT_GET) { > + if (is7) { > + error = convert_rule_to_7(rule); > + size = RULESIZE7(rule); > + if (error) { > + free(rule, M_TEMP); > + return error; > + } > + } > + error = sooptcopyout(sopt, rule, size); > + } > + } > + free(rule, M_TEMP); > + break; > + > + case IP_FW_DEL: > + /* > + * IP_FW_DEL is used for deleting single rules or sets, > + * and (ab)used to atomically manipulate sets. Argument size > + * is used to distinguish between the two: > + * sizeof(u_int32_t) > + * delete single rule or set of rules, > + * or reassign rules (or sets) to a different set. > + * 2*sizeof(u_int32_t) > + * atomic disable/enable sets. > + * first u_int32_t contains sets to be disabled, > + * second u_int32_t contains sets to be enabled. 
> + */ > + error = sooptcopyin(sopt, rulenum, > + 2*sizeof(u_int32_t), sizeof(u_int32_t)); > + if (error) > + break; > + size = sopt->sopt_valsize; > + if (size == sizeof(u_int32_t) && rulenum[0] != 0) { > + /* delete or reassign, locking done in del_entry() */ > + error = del_entry(chain, rulenum[0]); > + } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ > + IPFW_UH_WLOCK(chain); > + V_set_disable = > + (V_set_disable | rulenum[0]) & ~rulenum[1] & > + ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ > + IPFW_UH_WUNLOCK(chain); > + } else > + error = EINVAL; > + break; > + > + case IP_FW_ZERO: > + case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */ > + rulenum[0] = 0; > + if (sopt->sopt_val != 0) { > + error = sooptcopyin(sopt, rulenum, > + sizeof(u_int32_t), sizeof(u_int32_t)); > + if (error) > + break; > + } > + error = zero_entry(chain, rulenum[0], > + sopt->sopt_name == IP_FW_RESETLOG); > + break; > + > + /*--- TABLE opcodes ---*/ > + case IP_FW_TABLE_ADD: > + case IP_FW_TABLE_DEL: > + { > + ipfw_table_entry ent; > + struct tentry_info tei; > + struct tid_info ti; > + struct table_value v; > + > + error = sooptcopyin(sopt, &ent, > + sizeof(ent), sizeof(ent)); > + if (error) > + break; > + > + memset(&tei, 0, sizeof(tei)); > + tei.paddr = &ent.addr; > + tei.subtype = AF_INET; > + tei.masklen = ent.masklen; > + ipfw_import_table_value_legacy(ent.value, &v); > + tei.pvalue = &v; > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = ent.tbl; > + ti.type = IPFW_TABLE_CIDR; > + > + error = (opt == IP_FW_TABLE_ADD) ? 
> + add_table_entry(chain, &ti, &tei, 0, 1) : > + del_table_entry(chain, &ti, &tei, 0, 1); > + } > + break; > + > + > + case IP_FW_TABLE_FLUSH: > + { > + u_int16_t tbl; > + struct tid_info ti; > + > + error = sooptcopyin(sopt, &tbl, > + sizeof(tbl), sizeof(tbl)); > + if (error) > + break; > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = tbl; > + error = flush_table(chain, &ti); > + } > + break; > + > + case IP_FW_TABLE_GETSIZE: > + { > + u_int32_t tbl, cnt; > + struct tid_info ti; > + > + if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), > + sizeof(tbl)))) > + break; > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = tbl; > + IPFW_RLOCK(chain); > + error = ipfw_count_table(chain, &ti, &cnt); > + IPFW_RUNLOCK(chain); > + if (error) > + break; > + error = sooptcopyout(sopt, &cnt, sizeof(cnt)); > + } > + break; > + > + case IP_FW_TABLE_LIST: > + { > + ipfw_table *tbl; > + struct tid_info ti; > + > + if (sopt->sopt_valsize < sizeof(*tbl)) { > + error = EINVAL; > + break; > + } > + size = sopt->sopt_valsize; > + tbl = malloc(size, M_TEMP, M_WAITOK); > + error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); > + if (error) { > + free(tbl, M_TEMP); > + break; > + } > + tbl->size = (size - sizeof(*tbl)) / > + sizeof(ipfw_table_entry); > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = tbl->tbl; > + IPFW_RLOCK(chain); > + error = ipfw_dump_table_legacy(chain, &ti, tbl); > + IPFW_RUNLOCK(chain); > + if (error) { > + free(tbl, M_TEMP); > + break; > + } > + error = sooptcopyout(sopt, tbl, size); > + free(tbl, M_TEMP); > + } > + break; > + > + /*--- NAT operations are protected by the IPFW_LOCK ---*/ > + case IP_FW_NAT_CFG: > + if (IPFW_NAT_LOADED) > + error = ipfw_nat_cfg_ptr(sopt); > + else { > + printf("IP_FW_NAT_CFG: %s\n", > + "ipfw_nat not present, please load it"); > + error = EINVAL; > + } > + break; > + > + case IP_FW_NAT_DEL: > + if (IPFW_NAT_LOADED) > + error = ipfw_nat_del_ptr(sopt); > + else { > + printf("IP_FW_NAT_DEL: %s\n", > + "ipfw_nat not present, please load it"); > + error 
= EINVAL; > + } > + break; > + > + case IP_FW_NAT_GET_CONFIG: > + if (IPFW_NAT_LOADED) > + error = ipfw_nat_get_cfg_ptr(sopt); > + else { > + printf("IP_FW_NAT_GET_CFG: %s\n", > + "ipfw_nat not present, please load it"); > + error = EINVAL; > + } > + break; > + > + case IP_FW_NAT_GET_LOG: > + if (IPFW_NAT_LOADED) > + error = ipfw_nat_get_log_ptr(sopt); > + else { > + printf("IP_FW_NAT_GET_LOG: %s\n", > + "ipfw_nat not present, please load it"); > + error = EINVAL; > + } > + break; > + > + default: > + printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); > + error = EINVAL; > + } > + > + return (error); > +#undef RULE_MAXSIZE > +} > +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) > + > +/* Functions to convert rules 7.2 <==> 8.0 */ > +static int > +convert_rule_to_7(struct ip_fw_rule0 *rule) > +{ > + /* Used to modify original rule */ > + struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; > + /* copy of original rule, version 8 */ > + struct ip_fw_rule0 *tmp; > + > + /* Used to copy commands */ > + ipfw_insn *ccmd, *dst; > + int ll = 0, ccmdlen = 0; > + > + tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); > + if (tmp == NULL) { > + return 1; //XXX error > + } > + bcopy(rule, tmp, RULE_MAXSIZE); > + > + /* Copy fields */ > + //rule7->_pad = tmp->_pad; > + rule7->set = tmp->set; > + rule7->rulenum = tmp->rulenum; > + rule7->cmd_len = tmp->cmd_len; > + rule7->act_ofs = tmp->act_ofs; > + rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; > + rule7->cmd_len = tmp->cmd_len; > + rule7->pcnt = tmp->pcnt; > + rule7->bcnt = tmp->bcnt; > + rule7->timestamp = tmp->timestamp; > + > + /* Copy commands */ > + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; > + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { > + ccmdlen = F_LEN(ccmd); > + > + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); > + > + if (dst->opcode > O_NAT) > + /* O_REASS doesn't exists in 7.2 version, so > + * decrement opcode if it is after O_REASS > + */ > + dst->opcode--; > + > + 
if (ccmdlen > ll) { > + printf("ipfw: opcode %d size truncated\n", > + ccmd->opcode); > + return EINVAL; > + } > + } > + free(tmp, M_TEMP); > + > + return 0; > +} > + > +static int > +convert_rule_to_8(struct ip_fw_rule0 *rule) > +{ > + /* Used to modify original rule */ > + struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; > + > + /* Used to copy commands */ > + ipfw_insn *ccmd, *dst; > + int ll = 0, ccmdlen = 0; > + > + /* Copy of original rule */ > + struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); > + if (tmp == NULL) { > + return 1; //XXX error > + } > + > + bcopy(rule7, tmp, RULE_MAXSIZE); > + > + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; > + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { > + ccmdlen = F_LEN(ccmd); > + > + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); > + > + if (dst->opcode > O_NAT) > + /* O_REASS doesn't exists in 7.2 version, so > + * increment opcode if it is after O_REASS > + */ > + dst->opcode++; > + > + if (ccmdlen > ll) { > + printf("ipfw: opcode %d size truncated\n", > + ccmd->opcode); > + return EINVAL; > + } > + } > + > + rule->_pad = tmp->_pad; > + rule->set = tmp->set; > + rule->rulenum = tmp->rulenum; > + rule->cmd_len = tmp->cmd_len; > + rule->act_ofs = tmp->act_ofs; > + rule->next_rule = (struct ip_fw *)tmp->next_rule; > + rule->cmd_len = tmp->cmd_len; > + rule->id = 0; /* XXX see if is ok = 0 */ > + rule->pcnt = tmp->pcnt; > + rule->bcnt = tmp->bcnt; > + rule->timestamp = tmp->timestamp; > + > + free (tmp, M_TEMP); > + return 0; > +} > + > +/* > + * Named object api > + * > + */ > + > +/* > + * Allocate new bitmask which can be used to enlarge/shrink > + * named instance index. 
> + */ > +void > +ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks) > +{ > + size_t size; > + int max_blocks; > + u_long *idx_mask; > + > + KASSERT((items % BLOCK_ITEMS) == 0, > + ("bitmask size needs to power of 2 and greater or equal to %zu", > + BLOCK_ITEMS)); > + > + max_blocks = items / BLOCK_ITEMS; > + size = items / 8; > + idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK); > + /* Mark all as free */ > + memset(idx_mask, 0xFF, size * IPFW_MAX_SETS); > + *idx_mask &= ~(u_long)1; /* Skip index 0 */ > + > + *idx = idx_mask; > + *pblocks = max_blocks; > +} > + > +/* > + * Copy current bitmask index to new one. > + */ > +void > +ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks) > +{ > + int old_blocks, new_blocks; > + u_long *old_idx, *new_idx; > + int i; > + > + old_idx = ni->idx_mask; > + old_blocks = ni->max_blocks; > + new_idx = *idx; > + new_blocks = *blocks; > + > + for (i = 0; i < IPFW_MAX_SETS; i++) { > + memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i], > + old_blocks * sizeof(u_long)); > + } > +} > + > +/* > + * Swaps current @ni index with new one. > + */ > +void > +ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks) > +{ > + int old_blocks; > + u_long *old_idx; > + > + old_idx = ni->idx_mask; > + old_blocks = ni->max_blocks; > + > + ni->idx_mask = *idx; > + ni->max_blocks = *blocks; > + > + /* Save old values */ > + *idx = old_idx; > + *blocks = old_blocks; > +} > + > +void > +ipfw_objhash_bitmap_free(void *idx, int blocks) > +{ > + > + free(idx, M_IPFW); > +} > + > +/* > + * Creates named hash instance. > + * Must be called without holding any locks. > + * Return pointer to new instance. 
> + */ > +struct namedobj_instance * > +ipfw_objhash_create(uint32_t items) > +{ > + struct namedobj_instance *ni; > + int i; > + size_t size; > + > + size = sizeof(struct namedobj_instance) + > + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE + > + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE; > + > + ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO); > + ni->nn_size = NAMEDOBJ_HASH_SIZE; > + ni->nv_size = NAMEDOBJ_HASH_SIZE; > + > + ni->names = (struct namedobjects_head *)(ni +1); > + ni->values = &ni->names[ni->nn_size]; > + > + for (i = 0; i < ni->nn_size; i++) > + TAILQ_INIT(&ni->names[i]); > + > + for (i = 0; i < ni->nv_size; i++) > + TAILQ_INIT(&ni->values[i]); > + > + /* Set default hashing/comparison functions */ > + ni->hash_f = objhash_hash_name; > + ni->cmp_f = objhash_cmp_name; > + > + /* Allocate bitmask separately due to possible resize */ > + ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, (int *)&ni->max_blocks); > + > + return (ni); > +} > + > +void > +ipfw_objhash_destroy(struct namedobj_instance *ni) > +{ > + > + free(ni->idx_mask, M_IPFW); > + free(ni, M_IPFW); > +} > + > +void > +ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, > + objhash_cmp_f *cmp_f) > +{ > + > + ni->hash_f = hash_f; > + ni->cmp_f = cmp_f; > +} > + > +static uint32_t > +objhash_hash_name(struct namedobj_instance *ni, void *name, uint32_t set) > +{ > + > + return (fnv_32_str((char *)name, FNV1_32_INIT)); > +} > + > +static int > +objhash_cmp_name(struct named_object *no, void *name, uint32_t set) > +{ > + > + if ((strcmp(no->name, (char *)name) == 0) && (no->set == set)) > + return (0); > + > + return (1); > +} > + > +static uint32_t > +objhash_hash_idx(struct namedobj_instance *ni, uint32_t val) > +{ > + uint32_t v; > + > + v = val % (ni->nv_size - 1); > + > + return (v); > +} > + > +struct named_object * > +ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name) > +{ > + struct named_object *no; > + 
uint32_t hash; > + > + hash = ni->hash_f(ni, name, set) % ni->nn_size; > + > + TAILQ_FOREACH(no, &ni->names[hash], nn_next) { > + if (ni->cmp_f(no, name, set) == 0) > + return (no); > + } > + > + return (NULL); > +} > + > +struct named_object * > +ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) > +{ > + struct named_object *no; > + uint32_t hash; > + > + hash = objhash_hash_idx(ni, kidx); > + > + TAILQ_FOREACH(no, &ni->values[hash], nv_next) { > + if (no->kidx == kidx) > + return (no); > + } > + > + return (NULL); > +} > + > +int > +ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, > + struct named_object *b) > +{ > + > + if ((strcmp(a->name, b->name) == 0) && a->set == b->set) > + return (1); > + > + return (0); > +} > + > +void > +ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no) > +{ > + uint32_t hash; > + > + hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; > + TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next); > + > + hash = objhash_hash_idx(ni, no->kidx); > + TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next); > + > + ni->count++; > +} > + > +void > +ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no) > +{ > + uint32_t hash; > + > + hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; > + TAILQ_REMOVE(&ni->names[hash], no, nn_next); > + > + hash = objhash_hash_idx(ni, no->kidx); > + TAILQ_REMOVE(&ni->values[hash], no, nv_next); > + > + ni->count--; > +} > + > +uint32_t > +ipfw_objhash_count(struct namedobj_instance *ni) > +{ > + > + return (ni->count); > +} > + > +/* > + * Runs @func for each found named object. 
> + * It is safe to delete objects from callback > + */ > +void > +ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg) > +{ > + struct named_object *no, *no_tmp; > + int i; > + > + for (i = 0; i < ni->nn_size; i++) { > + TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) > + f(ni, no, arg); > + } > +} > + > +/* > + * Removes index from given set. > + * Returns 0 on success. > + */ > +int > +ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx) > +{ > + u_long *mask; > + int i, v; > + > + i = idx / BLOCK_ITEMS; > + v = idx % BLOCK_ITEMS; > + > + if (i >= ni->max_blocks) > + return (1); > + > + mask = &ni->idx_mask[i]; > + > + if ((*mask & ((u_long)1 << v)) != 0) > + return (1); > + > + /* Mark as free */ > + *mask |= (u_long)1 << v; > + > + /* Update free offset */ > + if (ni->free_off[0] > i) > + ni->free_off[0] = i; > + > + return (0); > +} > + > +/* > + * Allocate new index in given instance and stores in in @pidx. > + * Returns 0 on success. > + */ > +int > +ipfw_objhash_alloc_idx(void *n, uint16_t *pidx) > +{ > + struct namedobj_instance *ni; > + u_long *mask; > + int i, off, v; > + > + ni = (struct namedobj_instance *)n; > + > + off = ni->free_off[0]; > + mask = &ni->idx_mask[off]; > + > + for (i = off; i < ni->max_blocks; i++, mask++) { > + if ((v = ffsl(*mask)) == 0) > + continue; > + > + /* Mark as busy */ > + *mask &= ~ ((u_long)1 << (v - 1)); > + > + ni->free_off[0] = i; > + > + v = BLOCK_ITEMS * i + v - 1; > + > + *pidx = v; > + return (0); > + } > + > + return (1); > +} > + > +/* end of file */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.c > new file mode 100644 > index 0000000..2994528 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.c > @@ -0,0 +1,3674 @@ > +/*- > + * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. > + * Copyright (c) 2014 Yandex LLC > + * Copyright (c) 2014 Alexander V. 
Chernikov > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table.c 272840 2014-10-09 19:32:35Z melifaro $"); > + > +/* > + * Lookup table support for ipfw. > + * > + * This file contains handlers for all generic tables' operations: > + * add/del/flush entries, list/dump tables etc.. > + * > + * Table data modification is protected by both UH and runtime lock > + * while reading configuration/data is protected by UH lock. 
> + * > + * Lookup algorithms for all table types are located in ip_fw_table_algo.c > + */ > + > +#include "opt_ipfw.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/rwlock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/queue.h> > +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ > + > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ > +#include <netinet/ip_fw.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/ip_fw_table.h> > + > + /* > + * Table has the following `type` concepts: > + * > + * `no.type` represents lookup key type (addr, ifp, uid, etc..) > + * vmask represents bitmask of table values which are present at the moment. > + * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old > + * single-value-for-all approach. > + */ > +struct table_config { > + struct named_object no; > + uint8_t tflags; /* type flags */ > + uint8_t locked; /* 1 if locked from changes */ > + uint8_t linked; /* 1 if already linked */ > + uint8_t ochanged; /* used by set swapping */ > + uint8_t vshared; /* 1 if using shared value array */ > + uint8_t spare[3]; > + uint32_t count; /* Number of records */ > + uint32_t limit; /* Max number of records */ > + uint32_t vmask; /* bitmask with supported values */ > + uint32_t ocount; /* used by set swapping */ > + uint64_t gencnt; /* generation count */ > + char tablename[64]; /* table name */ > + struct table_algo *ta; /* Callbacks for given algo */ > + void *astate; /* algorithm state */ > + struct table_info ti_copy; /* data to put to table_info */ > + struct namedobj_instance *vi; > +}; > + > +static struct table_config *find_table(struct namedobj_instance *ni, > + struct tid_info *ti); > +static struct table_config *alloc_table_config(struct ip_fw_chain *ch, > + struct tid_info *ti, struct table_algo *ta, char 
*adata, uint8_t tflags); > +static void free_table_config(struct namedobj_instance *ni, > + struct table_config *tc); > +static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, > + char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref); > +static void link_table(struct ip_fw_chain *ch, struct table_config *tc); > +static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc); > +static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, > + struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc); > +#define OP_ADD 1 > +#define OP_DEL 0 > +static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, > + struct sockopt_data *sd); > +static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc, > + ipfw_xtable_info *i); > +static int dump_table_tentry(void *e, void *arg); > +static int dump_table_xentry(void *e, void *arg); > + > +static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a, > + struct tid_info *b); > + > +static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, > + struct table_config *tc, struct table_info *ti, uint32_t count); > +static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti); > + > +static struct table_algo *find_table_algo(struct tables_config *tableconf, > + struct tid_info *ti, char *name); > + > +static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti); > +static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti); > +static int classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); > + > +#define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash) > +#define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k])) > + > +#define TA_BUF_SZ 128 /* On-stack buffer for add/delete state */ > + > +void > +rollback_toperation_state(struct ip_fw_chain *ch, void *object) > +{ > + struct tables_config *tcfg; > + struct op_state *os; > + > + tcfg 
= CHAIN_TO_TCFG(ch); > + TAILQ_FOREACH(os, &tcfg->state_list, next) > + os->func(object, os); > +} > + > +void > +add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) > +{ > + struct tables_config *tcfg; > + > + tcfg = CHAIN_TO_TCFG(ch); > + TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next); > +} > + > +void > +del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) > +{ > + struct tables_config *tcfg; > + > + tcfg = CHAIN_TO_TCFG(ch); > + TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next); > +} > + > +void > +tc_ref(struct table_config *tc) > +{ > + > + tc->no.refcnt++; > +} > + > +void > +tc_unref(struct table_config *tc) > +{ > + > + tc->no.refcnt--; > +} > + > +static struct table_value * > +get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx) > +{ > + struct table_value *pval; > + > + pval = (struct table_value *)ch->valuestate; > + > + return (&pval[kidx]); > +} > + > + > +/* > + * Checks if we're able to insert/update entry @tei into table > + * w.r.t @tc limits. > + * May alter @tei to indicate insertion error / insert > + * options. > + * > + * Returns 0 if operation can be performed/ > + */ > +static int > +check_table_limit(struct table_config *tc, struct tentry_info *tei) > +{ > + > + if (tc->limit == 0 || tc->count < tc->limit) > + return (0); > + > + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) { > + /* Notify userland on error cause */ > + tei->flags |= TEI_FLAGS_LIMIT; > + return (EFBIG); > + } > + > + /* > + * We have UPDATE flag set. > + * Permit updating record (if found), > + * but restrict adding new one since we've > + * already hit the limit. > + */ > + tei->flags |= TEI_FLAGS_DONTADD; > + > + return (0); > +} > + > +/* > + * Convert algorithm callback return code into > + * one of pre-defined states known by userland. 
> + */ > +static void > +store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num) > +{ > + int flag; > + > + flag = 0; > + > + switch (error) { > + case 0: > + if (op == OP_ADD && num != 0) > + flag = TEI_FLAGS_ADDED; > + if (op == OP_DEL) > + flag = TEI_FLAGS_DELETED; > + break; > + case ENOENT: > + flag = TEI_FLAGS_NOTFOUND; > + break; > + case EEXIST: > + flag = TEI_FLAGS_EXISTS; > + break; > + default: > + flag = TEI_FLAGS_ERROR; > + } > + > + tei->flags |= flag; > +} > + > +/* > + * Creates and references table with default parameters. > + * Saves table config, algo and allocated kidx info @ptc, @pta and > + * @pkidx if non-zero. > + * Used for table auto-creation to support old binaries. > + * > + * Returns 0 on success. > + */ > +static int > +create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti, > + uint16_t *pkidx) > +{ > + ipfw_xtable_info xi; > + int error; > + > + memset(&xi, 0, sizeof(xi)); > + /* Set default value mask for legacy clients */ > + xi.vmask = IPFW_VTYPE_LEGACY; > + > + error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1); > + if (error != 0) > + return (error); > + > + return (0); > +} > + > +/* > + * Find and reference existing table optionally > + * creating new one. > + * > + * Saves found table config into @ptc. > + * Note function may drop/acquire UH_WLOCK. > + * Returns 0 if table was found/created and referenced > + * or non-zero return code. 
> + */ > +static int > +find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, > + struct tentry_info *tei, uint32_t count, int op, > + struct table_config **ptc) > +{ > + struct namedobj_instance *ni; > + struct table_config *tc; > + uint16_t kidx; > + int error; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + ni = CHAIN_TO_NI(ch); > + tc = NULL; > + if ((tc = find_table(ni, ti)) != NULL) { > + /* check table type */ > + if (tc->no.type != ti->type) > + return (EINVAL); > + > + if (tc->locked != 0) > + return (EACCES); > + > + /* Try to exit early on limit hit */ > + if (op == OP_ADD && count == 1 && > + check_table_limit(tc, tei) != 0) > + return (EFBIG); > + > + /* Reference and return */ > + tc->no.refcnt++; > + *ptc = tc; > + return (0); > + } > + > + if (op == OP_DEL) > + return (ESRCH); > + > + /* Compability mode: create new table for old clients */ > + if ((tei->flags & TEI_FLAGS_COMPAT) == 0) > + return (ESRCH); > + > + IPFW_UH_WUNLOCK(ch); > + error = create_table_compat(ch, ti, &kidx); > + IPFW_UH_WLOCK(ch); > + > + if (error != 0) > + return (error); > + > + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); > + KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx)); > + > + /* OK, now we've got referenced table. */ > + *ptc = tc; > + return (0); > +} > + > +/* > + * Rolls back already @added to @tc entries using state array @ta_buf_m. > + * Assume the following layout: > + * 1) ADD state (ta_buf_m[0] ... t_buf_m[added - 1]) for handling update cases > + * 2) DEL state (ta_buf_m[count[ ... 
ta_buf_m[count + added - 1])
 *    for storing deleted state
 */
static void
rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc,
    struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m,
    uint32_t count, uint32_t added)
{
	struct table_algo *ta;
	struct tentry_info *ptei;
	caddr_t add_state, del_state;
	size_t ta_buf_sz;
	int error, i;
	uint32_t num;

	IPFW_UH_WLOCK_ASSERT(ch);

	ta = tc->ta;
	ta_buf_sz = ta->ta_buf_size;

	/* ADD state starts at the buffer, DEL state @count slots later */
	add_state = ta_buf_m;
	del_state = ta_buf_m + count * ta_buf_sz;

	i = 0;
	while (i < added) {
		ptei = &tei[i];

		if ((ptei->flags & TEI_FLAGS_UPDATED) == 0) {
			/* Entry was newly inserted: delete it again */
			error = ta->prepare_del(ch, ptei, del_state);
			KASSERT(error == 0, ("pre-rollback INSERT failed"));
			error = ta->del(tc->astate, tinfo, ptei, del_state,
			    &num);
			KASSERT(error == 0, ("rollback INSERT failed"));
			tc->count -= num;
		} else {
			/*
			 * Entry replaced an existing record whose old
			 * value was saved in @ptei->value by the earlier
			 * call: adding once more puts it back.
			 */
			error = ta->add(tc->astate, tinfo, ptei, add_state,
			    &num);
			KASSERT(error == 0, ("rollback UPDATE fail"));
			KASSERT(num == 0, ("rollback UPDATE fail2"));
		}

		i++;
		add_state += ta_buf_sz;
		del_state += ta_buf_sz;
	}
}

/*
 * Prepares add/del state for all @count entries in @tei.
 * Uses either stack buffer (@ta_buf) or allocates a new one.
 * Stores pointer to allocated buffer back to @ta_buf.
 *
 * Returns 0 on success.
> + */ > +static int > +prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, > + struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf) > +{ > + caddr_t ta_buf_m, v; > + size_t ta_buf_sz, sz; > + struct tentry_info *ptei; > + int error, i; > + > + error = 0; > + ta_buf_sz = ta->ta_buf_size; > + if (count == 1) { > + /* Sigle add/delete, use on-stack buffer */ > + memset(*ta_buf, 0, TA_BUF_SZ); > + ta_buf_m = *ta_buf; > + } else { > + > + /* > + * Multiple adds/deletes, allocate larger buffer > + * > + * Note we need 2xcount buffer for add case: > + * we have hold both ADD state > + * and DELETE state (this may be needed > + * if we need to rollback all changes) > + */ > + sz = count * ta_buf_sz; > + ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP, > + M_WAITOK | M_ZERO); > + } > + > + v = ta_buf_m; > + for (i = 0; i < count; i++, v += ta_buf_sz) { > + ptei = &tei[i]; > + error = (op == OP_ADD) ? > + ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v); > + > + /* > + * Some syntax error (incorrect mask, or address, or > + * anything). Return error regardless of atomicity > + * settings. > + */ > + if (error != 0) > + break; > + } > + > + *ta_buf = ta_buf_m; > + return (error); > +} > + > +/* > + * Flushes allocated state for each @count entries in @tei. > + * Frees @ta_buf_m if differs from stack buffer @ta_buf. 
> + */ > +static void > +flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, > + struct tentry_info *tei, uint32_t count, int rollback, > + caddr_t ta_buf_m, caddr_t ta_buf) > +{ > + caddr_t v; > + struct tentry_info *ptei; > + size_t ta_buf_sz; > + int i; > + > + ta_buf_sz = ta->ta_buf_size; > + > + /* Run cleaning callback anyway */ > + v = ta_buf_m; > + for (i = 0; i < count; i++, v += ta_buf_sz) { > + ptei = &tei[i]; > + ta->flush_entry(ch, ptei, v); > + if (ptei->ptv != NULL) { > + free(ptei->ptv, M_IPFW); > + ptei->ptv = NULL; > + } > + } > + > + /* Clean up "deleted" state in case of rollback */ > + if (rollback != 0) { > + v = ta_buf_m + count * ta_buf_sz; > + for (i = 0; i < count; i++, v += ta_buf_sz) > + ta->flush_entry(ch, &tei[i], v); > + } > + > + if (ta_buf_m != ta_buf) > + free(ta_buf_m, M_TEMP); > +} > + > + > +static void > +rollback_add_entry(void *object, struct op_state *_state) > +{ > + struct ip_fw_chain *ch; > + struct tableop_state *ts; > + > + ts = (struct tableop_state *)_state; > + > + if (ts->tc != object && ts->ch != object) > + return; > + > + ch = ts->ch; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + /* Call specifid unlockers */ > + rollback_table_values(ts); > + > + /* Indicate we've called */ > + ts->modified = 1; > +} > + > +/* > + * Adds/updates one or more entries in table @ti. > + * > + * Function may drop/reacquire UH wlock multiple times due to > + * items alloc, algorithm callbacks (check_space), value linkage > + * (new values, value storage realloc), etc.. > + * Other processes like other adds (which may involve storage resize), > + * table swaps (which changes table data and may change algo type), > + * table modify (which may change value mask) may be executed > + * simultaneously so we need to deal with it. > + * > + * The following approach was implemented: > + * we have per-chain linked list, protected with UH lock. 
> + * add_table_entry prepares special on-stack structure wthich is passed > + * to its descendants. Users add this structure to this list before unlock. > + * After performing needed operations and acquiring UH lock back, each user > + * checks if structure has changed. If true, it rolls local state back and > + * returns without error to the caller. > + * add_table_entry() on its own checks if structure has changed and restarts > + * its operation from the beginning (goto restart). > + * > + * Functions which are modifying fields of interest (currently > + * resize_shared_value_storage() and swap_tables() ) > + * traverses given list while holding UH lock immediately before > + * performing their operations calling function provided be list entry > + * ( currently rollback_add_entry ) which performs rollback for all necessary > + * state and sets appropriate values in structure indicating rollback > + * has happened. > + * > + * Algo interaction: > + * Function references @ti first to ensure table won't > + * disappear or change its type. > + * After that, prepare_add callback is called for each @tei entry. > + * Next, we try to add each entry under UH+WHLOCK > + * using add() callback. > + * Finally, we free all state by calling flush_entry callback > + * for each @tei. > + * > + * Returns 0 on success. > + */ > +int > +add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, > + struct tentry_info *tei, uint8_t flags, uint32_t count) > +{ > + struct table_config *tc; > + struct table_algo *ta; > + uint16_t kidx; > + int error, first_error, i, rollback; > + uint32_t num, numadd; > + struct tentry_info *ptei; > + struct tableop_state ts; > + char ta_buf[TA_BUF_SZ]; > + caddr_t ta_buf_m = NULL, v; > + > + memset(&ts, 0, sizeof(ts)); > + ta = NULL; > + IPFW_UH_WLOCK(ch); > + > + /* > + * Find and reference existing table. 
> + */ > +restart: > + if (ts.modified != 0) { > + IPFW_UH_WUNLOCK(ch); > + flush_batch_buffer(ch, ta, tei, count, rollback, > + ta_buf_m, ta_buf); > + memset(&ts, 0, sizeof(ts)); > + ta = NULL; > + IPFW_UH_WLOCK(ch); > + } > + > + error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc); > + if (error != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (error); > + } > + ta = tc->ta; > + > + /* Fill in tablestate */ > + ts.ch = ch; > + ts.opstate.func = rollback_add_entry; > + ts.tc = tc; > + ts.vshared = tc->vshared; > + ts.vmask = tc->vmask; > + ts.ta = ta; > + ts.tei = tei; > + ts.count = count; > + rollback = 0; > + add_toperation_state(ch, &ts); > + IPFW_UH_WUNLOCK(ch); > + > + /* Allocate memory and prepare record(s) */ > + /* Pass stack buffer by default */ > + ta_buf_m = ta_buf; > + error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m); > + if (error != 0) > + goto cleanup; > + > + IPFW_UH_WLOCK(ch); > + /* Drop reference we've used in first search */ > + tc->no.refcnt--; > + > + /* > + * Check if table swap has happened. > + * (so table algo might be changed). > + * Restart operation to achieve consistent behavior. > + */ > + del_toperation_state(ch, &ts); > + if (ts.modified != 0) > + goto restart; > + > + /* > + * Link all values values to shared/per-table value array. > + * > + * May release/reacquire UH_WLOCK. > + */ > + error = ipfw_link_table_values(ch, &ts); > + if (error != 0) > + goto cleanup; > + if (ts.modified != 0) > + goto restart; > + > + /* > + * Ensure we are able to add all entries without additional > + * memory allocations. May release/reacquire UH_WLOCK. > + */ > + kidx = tc->no.kidx; > + error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count); > + if (error != 0) > + goto cleanup; > + if (ts.modified != 0) > + goto restart; > + > + /* We've got valid table in @tc. 
Let's try to add data */ > + kidx = tc->no.kidx; > + ta = tc->ta; > + numadd = 0; > + first_error = 0; > + > + IPFW_WLOCK(ch); > + > + v = ta_buf_m; > + for (i = 0; i < count; i++, v += ta->ta_buf_size) { > + ptei = &tei[i]; > + num = 0; > + /* check limit before adding */ > + if ((error = check_table_limit(tc, ptei)) == 0) { > + error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx), > + ptei, v, &num); > + /* Set status flag to inform userland */ > + store_tei_result(ptei, OP_ADD, error, num); > + } > + if (error == 0) { > + /* Update number of records to ease limit checking */ > + tc->count += num; > + numadd += num; > + continue; > + } > + > + if (first_error == 0) > + first_error = error; > + > + /* > + * Some error have happened. Check our atomicity > + * settings: continue if atomicity is not required, > + * rollback changes otherwise. > + */ > + if ((flags & IPFW_CTF_ATOMIC) == 0) > + continue; > + > + rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx), > + tei, ta_buf_m, count, i); > + > + rollback = 1; > + break; > + } > + > + IPFW_WUNLOCK(ch); > + > + ipfw_garbage_table_values(ch, tc, tei, count, rollback); > + > + /* Permit post-add algorithm grow/rehash. */ > + if (numadd != 0) > + check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); > + > + /* Return first error to user, if any */ > + error = first_error; > + > +cleanup: > + IPFW_UH_WUNLOCK(ch); > + > + flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); > + > + return (error); > +} > + > +/* > + * Deletes one or more entries in table @ti. > + * > + * Returns 0 on success. > + */ > +int > +del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, > + struct tentry_info *tei, uint8_t flags, uint32_t count) > +{ > + struct table_config *tc; > + struct table_algo *ta; > + struct tentry_info *ptei; > + uint16_t kidx; > + int error, first_error, i; > + uint32_t num, numdel; > + char ta_buf[TA_BUF_SZ]; > + caddr_t ta_buf_m, v; > + > + /* > + * Find and reference existing table. 
> + */ > + IPFW_UH_WLOCK(ch); > + error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc); > + if (error != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (error); > + } > + ta = tc->ta; > + IPFW_UH_WUNLOCK(ch); > + > + /* Allocate memory and prepare record(s) */ > + /* Pass stack buffer by default */ > + ta_buf_m = ta_buf; > + error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m); > + if (error != 0) > + goto cleanup; > + > + IPFW_UH_WLOCK(ch); > + > + /* Drop reference we've used in first search */ > + tc->no.refcnt--; > + > + /* > + * Check if table algo is still the same. > + * (changed ta may be the result of table swap). > + */ > + if (ta != tc->ta) { > + IPFW_UH_WUNLOCK(ch); > + error = EINVAL; > + goto cleanup; > + } > + > + kidx = tc->no.kidx; > + numdel = 0; > + first_error = 0; > + > + IPFW_WLOCK(ch); > + v = ta_buf_m; > + for (i = 0; i < count; i++, v += ta->ta_buf_size) { > + ptei = &tei[i]; > + num = 0; > + error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, > + &num); > + /* Save state for userland */ > + store_tei_result(ptei, OP_DEL, error, num); > + if (error != 0 && first_error == 0) > + first_error = error; > + tc->count -= num; > + numdel += num; > + } > + IPFW_WUNLOCK(ch); > + > + /* Unlink non-used values */ > + ipfw_garbage_table_values(ch, tc, tei, count, 0); > + > + if (numdel != 0) { > + /* Run post-del hook to permit shrinking */ > + check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); > + } > + > + IPFW_UH_WUNLOCK(ch); > + > + /* Return first error to user, if any */ > + error = first_error; > + > +cleanup: > + flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf); > + > + return (error); > +} > + > +/* > + * Ensure that table @tc has enough space to add @count entries without > + * need for reallocation. > + * > + * Callbacks order: > + * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize. > + * > + * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags. 
> + * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage > + * 3) modify (UH_WLOCK + WLOCK) - switch pointers > + * 4) flush_modify (UH_WLOCK) - free state, if needed > + * > + * Returns 0 on success. > + */ > +static int > +check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, > + struct table_config *tc, struct table_info *ti, uint32_t count) > +{ > + struct table_algo *ta; > + uint64_t pflags; > + char ta_buf[TA_BUF_SZ]; > + int error; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + error = 0; > + ta = tc->ta; > + if (ta->need_modify == NULL) > + return (0); > + > + /* Acquire reference not to loose @tc between locks/unlocks */ > + tc->no.refcnt++; > + > + /* > + * TODO: think about avoiding race between large add/large delete > + * operation on algorithm which implements shrinking along with > + * growing. > + */ > + while (true) { > + pflags = 0; > + if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { > + error = 0; > + break; > + } > + > + /* We have to shrink/grow table */ > + if (ts != NULL) > + add_toperation_state(ch, ts); > + IPFW_UH_WUNLOCK(ch); > + > + memset(&ta_buf, 0, sizeof(ta_buf)); > + error = ta->prepare_mod(ta_buf, &pflags); > + > + IPFW_UH_WLOCK(ch); > + if (ts != NULL) > + del_toperation_state(ch, ts); > + > + if (error != 0) > + break; > + > + if (ts != NULL && ts->modified != 0) { > + > + /* > + * Swap operation has happened > + * so we're currently operating on other > + * table data. Stop doing this. > + */ > + ta->flush_mod(ta_buf); > + break; > + } > + > + /* Check if we still need to alter table */ > + ti = KIDX_TO_TI(ch, tc->no.kidx); > + if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { > + IPFW_UH_WUNLOCK(ch); > + > + /* > + * Other thread has already performed resize. > + * Flush our state and return. 
> + */ > + ta->flush_mod(ta_buf); > + break; > + } > + > + error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags); > + if (error == 0) { > + /* Do actual modification */ > + IPFW_WLOCK(ch); > + ta->modify(tc->astate, ti, ta_buf, pflags); > + IPFW_WUNLOCK(ch); > + } > + > + /* Anyway, flush data and retry */ > + ta->flush_mod(ta_buf); > + } > + > + tc->no.refcnt--; > + return (error); > +} > + > +/* > + * Adds or deletes record in table. > + * Data layout (v0): > + * Request: [ ip_fw3_opheader ipfw_table_xentry ] > + * > + * Returns 0 on success > + */ > +static int > +manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_table_xentry *xent; > + struct tentry_info tei; > + struct tid_info ti; > + struct table_value v; > + int error, hdrlen, read; > + > + hdrlen = offsetof(ipfw_table_xentry, k); > + > + /* Check minimum header size */ > + if (sd->valsize < (sizeof(*op3) + hdrlen)) > + return (EINVAL); > + > + read = sizeof(ip_fw3_opheader); > + > + /* Check if xentry len field is valid */ > + xent = (ipfw_table_xentry *)(op3 + 1); > + if (xent->len < hdrlen || xent->len + read > sd->valsize) > + return (EINVAL); > + > + memset(&tei, 0, sizeof(tei)); > + tei.paddr = &xent->k; > + tei.masklen = xent->masklen; > + ipfw_import_table_value_legacy(xent->value, &v); > + tei.pvalue = &v; > + /* Old requests compability */ > + tei.flags = TEI_FLAGS_COMPAT; > + if (xent->type == IPFW_TABLE_ADDR) { > + if (xent->len - hdrlen == sizeof(in_addr_t)) > + tei.subtype = AF_INET; > + else > + tei.subtype = AF_INET6; > + } > + > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = xent->tbl; > + ti.type = xent->type; > + > + error = (op3->opcode == IP_FW_TABLE_XADD) ? > + add_table_entry(ch, &ti, &tei, 0, 1) : > + del_table_entry(ch, &ti, &tei, 0, 1); > + > + return (error); > +} > + > +/* > + * Adds or deletes record in table. 
> + * Data layout (v1)(current): > + * Request: [ ipfw_obj_header > + * ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ] > + * ] > + * > + * Returns 0 on success > + */ > +static int > +manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_obj_tentry *tent, *ptent; > + ipfw_obj_ctlv *ctlv; > + ipfw_obj_header *oh; > + struct tentry_info *ptei, tei, *tei_buf; > + struct tid_info ti; > + int error, i, kidx, read; > + > + /* Check minimum header size */ > + if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv))) > + return (EINVAL); > + > + /* Check if passed data is too long */ > + if (sd->valsize != sd->kavail) > + return (EINVAL); > + > + oh = (ipfw_obj_header *)sd->kbuf; > + > + /* Basic length checks for TLVs */ > + if (oh->ntlv.head.length != sizeof(oh->ntlv)) > + return (EINVAL); > + > + read = sizeof(*oh); > + > + ctlv = (ipfw_obj_ctlv *)(oh + 1); > + if (ctlv->head.length + read != sd->valsize) > + return (EINVAL); > + > + read += sizeof(*ctlv); > + tent = (ipfw_obj_tentry *)(ctlv + 1); > + if (ctlv->count * sizeof(*tent) + read != sd->valsize) > + return (EINVAL); > + > + if (ctlv->count == 0) > + return (0); > + > + /* > + * Mark entire buffer as "read". > + * This instructs sopt api write it back > + * after function return. 
> + */ > + ipfw_get_sopt_header(sd, sd->valsize); > + > + /* Perform basic checks for each entry */ > + ptent = tent; > + kidx = tent->idx; > + for (i = 0; i < ctlv->count; i++, ptent++) { > + if (ptent->head.length != sizeof(*ptent)) > + return (EINVAL); > + if (ptent->idx != kidx) > + return (ENOTSUP); > + } > + > + /* Convert data into kernel request objects */ > + objheader_to_ti(oh, &ti); > + ti.type = oh->ntlv.type; > + ti.uidx = kidx; > + > + /* Use on-stack buffer for single add/del */ > + if (ctlv->count == 1) { > + memset(&tei, 0, sizeof(tei)); > + tei_buf = &tei; > + } else > + tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP, > + M_WAITOK | M_ZERO); > + > + ptei = tei_buf; > + ptent = tent; > + for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { > + ptei->paddr = &ptent->k; > + ptei->subtype = ptent->subtype; > + ptei->masklen = ptent->masklen; > + if (ptent->head.flags & IPFW_TF_UPDATE) > + ptei->flags |= TEI_FLAGS_UPDATE; > + > + ipfw_import_table_value_v1(&ptent->v.value); > + ptei->pvalue = (struct table_value *)&ptent->v.value; > + } > + > + error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ? 
> + add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) : > + del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count); > + > + /* Translate result back to userland */ > + ptei = tei_buf; > + ptent = tent; > + for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { > + if (ptei->flags & TEI_FLAGS_ADDED) > + ptent->result = IPFW_TR_ADDED; > + else if (ptei->flags & TEI_FLAGS_DELETED) > + ptent->result = IPFW_TR_DELETED; > + else if (ptei->flags & TEI_FLAGS_UPDATED) > + ptent->result = IPFW_TR_UPDATED; > + else if (ptei->flags & TEI_FLAGS_LIMIT) > + ptent->result = IPFW_TR_LIMIT; > + else if (ptei->flags & TEI_FLAGS_ERROR) > + ptent->result = IPFW_TR_ERROR; > + else if (ptei->flags & TEI_FLAGS_NOTFOUND) > + ptent->result = IPFW_TR_NOTFOUND; > + else if (ptei->flags & TEI_FLAGS_EXISTS) > + ptent->result = IPFW_TR_EXISTS; > + ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value); > + } > + > + if (tei_buf != &tei) > + free(tei_buf, M_TEMP); > + > + return (error); > +} > + > +/* > + * Looks up an entry in given table. 
> + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_obj_tentry ] > + * Reply: [ ipfw_obj_header ipfw_obj_tentry ] > + * > + * Returns 0 on success > + */ > +static int > +find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_obj_tentry *tent; > + ipfw_obj_header *oh; > + struct tid_info ti; > + struct table_config *tc; > + struct table_algo *ta; > + struct table_info *kti; > + struct namedobj_instance *ni; > + int error; > + size_t sz; > + > + /* Check minimum header size */ > + sz = sizeof(*oh) + sizeof(*tent); > + if (sd->valsize != sz) > + return (EINVAL); > + > + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); > + tent = (ipfw_obj_tentry *)(oh + 1); > + > + /* Basic length checks for TLVs */ > + if (oh->ntlv.head.length != sizeof(oh->ntlv)) > + return (EINVAL); > + > + objheader_to_ti(oh, &ti); > + ti.type = oh->ntlv.type; > + ti.uidx = tent->idx; > + > + IPFW_UH_RLOCK(ch); > + ni = CHAIN_TO_NI(ch); > + > + /* > + * Find existing table and check its type . > + */ > + ta = NULL; > + if ((tc = find_table(ni, &ti)) == NULL) { > + IPFW_UH_RUNLOCK(ch); > + return (ESRCH); > + } > + > + /* check table type */ > + if (tc->no.type != ti.type) { > + IPFW_UH_RUNLOCK(ch); > + return (EINVAL); > + } > + > + kti = KIDX_TO_TI(ch, tc->no.kidx); > + ta = tc->ta; > + > + if (ta->find_tentry == NULL) > + return (ENOTSUP); > + > + error = ta->find_tentry(tc->astate, kti, tent); > + > + IPFW_UH_RUNLOCK(ch); > + > + return (error); > +} > + > +/* > + * Flushes all entries or destroys given table. 
> + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ] > + * > + * Returns 0 on success > + */ > +static int > +flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + int error; > + struct _ipfw_obj_header *oh; > + struct tid_info ti; > + > + if (sd->valsize != sizeof(*oh)) > + return (EINVAL); > + > + oh = (struct _ipfw_obj_header *)op3; > + objheader_to_ti(oh, &ti); > + > + if (op3->opcode == IP_FW_TABLE_XDESTROY) > + error = destroy_table(ch, &ti); > + else if (op3->opcode == IP_FW_TABLE_XFLUSH) > + error = flush_table(ch, &ti); > + else > + return (ENOTSUP); > + > + return (error); > +} > + > +static void > +restart_flush(void *object, struct op_state *_state) > +{ > + struct tableop_state *ts; > + > + ts = (struct tableop_state *)_state; > + > + if (ts->tc != object) > + return; > + > + /* Indicate we've called */ > + ts->modified = 1; > +} > + > +/* > + * Flushes given table. > + * > + * Function create new table instance with the same > + * parameters, swaps it with old one and > + * flushes state without holding runtime WLOCK. > + * > + * Returns 0 on success. > + */ > +int > +flush_table(struct ip_fw_chain *ch, struct tid_info *ti) > +{ > + struct namedobj_instance *ni; > + struct table_config *tc; > + struct table_algo *ta; > + struct table_info ti_old, ti_new, *tablestate; > + void *astate_old, *astate_new; > + char algostate[64], *pstate; > + struct tableop_state ts; > + int error; > + uint16_t kidx; > + uint8_t tflags; > + > + /* > + * Stage 1: save table algoritm. > + * Reference found table to ensure it won't disappear. 
> + */ > + IPFW_UH_WLOCK(ch); > + ni = CHAIN_TO_NI(ch); > + if ((tc = find_table(ni, ti)) == NULL) { > + IPFW_UH_WUNLOCK(ch); > + return (ESRCH); > + } > +restart: > + /* Set up swap handler */ > + memset(&ts, 0, sizeof(ts)); > + ts.opstate.func = restart_flush; > + ts.tc = tc; > + > + ta = tc->ta; > + /* Do not flush readonly tables */ > + if ((ta->flags & TA_FLAG_READONLY) != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (EACCES); > + } > + /* Save startup algo parameters */ > + if (ta->print_config != NULL) { > + ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx), > + algostate, sizeof(algostate)); > + pstate = algostate; > + } else > + pstate = NULL; > + tflags = tc->tflags; > + tc->no.refcnt++; > + add_toperation_state(ch, &ts); > + IPFW_UH_WUNLOCK(ch); > + > + /* > + * Stage 2: allocate new table instance using same algo. > + */ > + memset(&ti_new, 0, sizeof(struct table_info)); > + error = ta->init(ch, &astate_new, &ti_new, pstate, tflags); > + > + /* > + * Stage 3: swap old state pointers with newly-allocated ones. > + * Decrease refcount. > + */ > + IPFW_UH_WLOCK(ch); > + tc->no.refcnt--; > + del_toperation_state(ch, &ts); > + > + if (error != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (error); > + } > + > + /* > + * Restart operation if table swap has happened: > + * even if algo may be the same, algo init parameters > + * may change. Restart operation instead of doing > + * complex checks. 
> + */ > + if (ts.modified != 0) { > + ta->destroy(astate_new, &ti_new); > + goto restart; > + } > + > + ni = CHAIN_TO_NI(ch); > + kidx = tc->no.kidx; > + tablestate = (struct table_info *)ch->tablestate; > + > + IPFW_WLOCK(ch); > + ti_old = tablestate[kidx]; > + tablestate[kidx] = ti_new; > + IPFW_WUNLOCK(ch); > + > + astate_old = tc->astate; > + tc->astate = astate_new; > + tc->ti_copy = ti_new; > + tc->count = 0; > + > + /* Notify algo on real @ti address */ > + if (ta->change_ti != NULL) > + ta->change_ti(tc->astate, &tablestate[kidx]); > + > + /* > + * Stage 4: unref values. > + */ > + ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old); > + IPFW_UH_WUNLOCK(ch); > + > + /* > + * Stage 5: perform real flush/destroy. > + */ > + ta->destroy(astate_old, &ti_old); > + > + return (0); > +} > + > +/* > + * Swaps two tables. > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_obj_ntlv ] > + * > + * Returns 0 on success > + */ > +static int > +swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + int error; > + struct _ipfw_obj_header *oh; > + struct tid_info ti_a, ti_b; > + > + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv)) > + return (EINVAL); > + > + oh = (struct _ipfw_obj_header *)op3; > + ntlv_to_ti(&oh->ntlv, &ti_a); > + ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b); > + > + error = swap_tables(ch, &ti_a, &ti_b); > + > + return (error); > +} > + > +/* > + * Swaps two tables of the same type/valtype. > + * > + * Checks if tables are compatible and limits > + * permits swap, than actually perform swap. > + * > + * Each table consists of 2 different parts: > + * config: > + * @tc (with name, set, kidx) and rule bindings, which is "stable". 
> + * number of items > + * table algo > + * runtime: > + * runtime data @ti (ch->tablestate) > + * runtime cache in @tc > + * algo-specific data (@tc->astate) > + * > + * So we switch: > + * all runtime data > + * number of items > + * table algo > + * > + * After that we call @ti change handler for each table. > + * > + * Note that referencing @tc won't protect tc->ta from change. > + * XXX: Do we need to restrict swap between locked tables? > + * XXX: Do we need to exchange ftype? > + * > + * Returns 0 on success. > + */ > +static int > +swap_tables(struct ip_fw_chain *ch, struct tid_info *a, > + struct tid_info *b) > +{ > + struct namedobj_instance *ni; > + struct table_config *tc_a, *tc_b; > + struct table_algo *ta; > + struct table_info ti, *tablestate; > + void *astate; > + uint32_t count; > + > + /* > + * Stage 1: find both tables and ensure they are of > + * the same type. > + */ > + IPFW_UH_WLOCK(ch); > + ni = CHAIN_TO_NI(ch); > + if ((tc_a = find_table(ni, a)) == NULL) { > + IPFW_UH_WUNLOCK(ch); > + return (ESRCH); > + } > + if ((tc_b = find_table(ni, b)) == NULL) { > + IPFW_UH_WUNLOCK(ch); > + return (ESRCH); > + } > + > + /* It is very easy to swap between the same table */ > + if (tc_a == tc_b) { > + IPFW_UH_WUNLOCK(ch); > + return (0); > + } > + > + /* Check type and value are the same */ > + if (tc_a->no.type != tc_b->no.type || tc_a->tflags != tc_b->tflags) { > + IPFW_UH_WUNLOCK(ch); > + return (EINVAL); > + } > + > + /* Check limits before swap */ > + if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) || > + (tc_b->limit != 0 && tc_a->count > tc_b->limit)) { > + IPFW_UH_WUNLOCK(ch); > + return (EFBIG); > + } > + > + /* Check if one of the tables is readonly */ > + if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (EACCES); > + } > + > + /* Notify we're going to swap */ > + rollback_toperation_state(ch, tc_a); > + rollback_toperation_state(ch, tc_b); > + > + /* Everything is fine, prepare to 
swap */ > + tablestate = (struct table_info *)ch->tablestate; > + ti = tablestate[tc_a->no.kidx]; > + ta = tc_a->ta; > + astate = tc_a->astate; > + count = tc_a->count; > + > + IPFW_WLOCK(ch); > + /* a <- b */ > + tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx]; > + tc_a->ta = tc_b->ta; > + tc_a->astate = tc_b->astate; > + tc_a->count = tc_b->count; > + /* b <- a */ > + tablestate[tc_b->no.kidx] = ti; > + tc_b->ta = ta; > + tc_b->astate = astate; > + tc_b->count = count; > + IPFW_WUNLOCK(ch); > + > + /* Ensure tc.ti copies are in sync */ > + tc_a->ti_copy = tablestate[tc_a->no.kidx]; > + tc_b->ti_copy = tablestate[tc_b->no.kidx]; > + > + /* Notify both tables on @ti change */ > + if (tc_a->ta->change_ti != NULL) > + tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]); > + if (tc_b->ta->change_ti != NULL) > + tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]); > + > + IPFW_UH_WUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Destroys table specified by @ti. 
> + * Data layout (v0)(current): > + * Request: [ ip_fw3_opheader ] > + * > + * Returns 0 on success > + */ > +static int > +destroy_table(struct ip_fw_chain *ch, struct tid_info *ti) > +{ > + struct namedobj_instance *ni; > + struct table_config *tc; > + > + IPFW_UH_WLOCK(ch); > + > + ni = CHAIN_TO_NI(ch); > + if ((tc = find_table(ni, ti)) == NULL) { > + IPFW_UH_WUNLOCK(ch); > + return (ESRCH); > + } > + > + /* Do not permit destroying referenced tables */ > + if (tc->no.refcnt > 0) { > + IPFW_UH_WUNLOCK(ch); > + return (EBUSY); > + } > + > + IPFW_WLOCK(ch); > + unlink_table(ch, tc); > + IPFW_WUNLOCK(ch); > + > + /* Free obj index */ > + if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0) > + printf("Error unlinking kidx %d from table %s\n", > + tc->no.kidx, tc->tablename); > + > + /* Unref values used in tables while holding UH lock */ > + ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy); > + IPFW_UH_WUNLOCK(ch); > + > + free_table_config(ni, tc); > + > + return (0); > +} > + > +static uint32_t > +roundup2p(uint32_t v) > +{ > + > + v--; > + v |= v >> 1; > + v |= v >> 2; > + v |= v >> 4; > + v |= v >> 8; > + v |= v >> 16; > + v++; > + > + return (v); > +} > + > +/* > + * Grow tables index. > + * > + * Returns 0 on success. 
> + */ > +int > +ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables) > +{ > + unsigned int ntables_old, tbl; > + struct namedobj_instance *ni; > + void *new_idx, *old_tablestate, *tablestate; > + struct table_info *ti; > + struct table_config *tc; > + int i, new_blocks; > + > + /* Check new value for validity */ > + if (ntables == 0) > + return (EINVAL); > + if (ntables > IPFW_TABLES_MAX) > + ntables = IPFW_TABLES_MAX; > + /* Alight to nearest power of 2 */ > + ntables = (unsigned int)roundup2p(ntables); > + > + /* Allocate new pointers */ > + tablestate = malloc(ntables * sizeof(struct table_info), > + M_IPFW, M_WAITOK | M_ZERO); > + > + ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks); > + > + IPFW_UH_WLOCK(ch); > + > + tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables; > + ni = CHAIN_TO_NI(ch); > + > + /* Temporary restrict decreasing max_tables */ > + if (ntables < V_fw_tables_max) { > + > + /* > + * FIXME: Check if we really can shrink > + */ > + IPFW_UH_WUNLOCK(ch); > + return (EINVAL); > + } > + > + /* Copy table info/indices */ > + memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl); > + ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks); > + > + IPFW_WLOCK(ch); > + > + /* Change pointers */ > + old_tablestate = ch->tablestate; > + ch->tablestate = tablestate; > + ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks); > + > + ntables_old = V_fw_tables_max; > + V_fw_tables_max = ntables; > + > + IPFW_WUNLOCK(ch); > + > + /* Notify all consumers that their @ti pointer has changed */ > + ti = (struct table_info *)ch->tablestate; > + for (i = 0; i < tbl; i++, ti++) { > + if (ti->lookup == NULL) > + continue; > + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i); > + if (tc == NULL || tc->ta->change_ti == NULL) > + continue; > + > + tc->ta->change_ti(tc->astate, ti); > + } > + > + IPFW_UH_WUNLOCK(ch); > + > + /* Free old pointers */ > + free(old_tablestate, M_IPFW); > + 
ipfw_objhash_bitmap_free(new_idx, new_blocks); > + > + return (0); > +} > + > +/* > + * Switch between "set 0" and "rule's set" table binding, > + * Check all ruleset bindings and permits changing > + * IFF each binding has both rule AND table in default set (set 0). > + * > + * Returns 0 on success. > + */ > +int > +ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) > +{ > + struct namedobj_instance *ni; > + struct named_object *no; > + struct ip_fw *rule; > + ipfw_insn *cmd; > + int cmdlen, i, l; > + uint16_t kidx; > + uint8_t type; > + > + IPFW_UH_WLOCK(ch); > + > + if (V_fw_tables_sets == sets) { > + IPFW_UH_WUNLOCK(ch); > + return (0); > + } > + > + ni = CHAIN_TO_NI(ch); > + > + /* > + * Scan all rules and examine tables opcodes. > + */ > + for (i = 0; i < ch->n_rules; i++) { > + rule = ch->map[i]; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + > + no = ipfw_objhash_lookup_kidx(ni, kidx); > + > + /* Check if both table object and rule has the set 0 */ > + if (no->set != 0 || rule->set != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (EBUSY); > + } > + > + } > + } > + V_fw_tables_sets = sets; > + > + IPFW_UH_WUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Lookup an IP @addr in table @tbl. > + * Stores found value in @val. > + * > + * Returns 1 if @addr was found. > + */ > +int > +ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, > + uint32_t *val) > +{ > + struct table_info *ti; > + > + ti = KIDX_TO_TI(ch, tbl); > + > + return (ti->lookup(ti, &addr, sizeof(in_addr_t), val)); > +} > + > +/* > + * Lookup an arbtrary key @paddr of legth @plen in table @tbl. > + * Stores found value in @val. > + * > + * Returns 1 if key was found. 
> + */ > +int > +ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, > + void *paddr, uint32_t *val) > +{ > + struct table_info *ti; > + > + ti = KIDX_TO_TI(ch, tbl); > + > + return (ti->lookup(ti, paddr, plen, val)); > +} > + > +/* > + * Info/List/dump support for tables. > + * > + */ > + > +/* > + * High-level 'get' cmds sysctl handlers > + */ > + > +/* > + * Lists all tables currently available in kernel. > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size > + * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ] > + * > + * Returns 0 on success > + */ > +static int > +list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_lheader *olh; > + int error; > + > + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); > + if (olh == NULL) > + return (EINVAL); > + if (sd->valsize < olh->size) > + return (EINVAL); > + > + IPFW_UH_RLOCK(ch); > + error = export_tables(ch, olh, sd); > + IPFW_UH_RUNLOCK(ch); > + > + return (error); > +} > + > +/* > + * Store table info to buffer provided by @sd. > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_xtable_info(empty)] > + * Reply: [ ipfw_obj_header ipfw_xtable_info ] > + * > + * Returns 0 on success. 
> + */ > +static int > +describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_header *oh; > + struct table_config *tc; > + struct tid_info ti; > + size_t sz; > + > + sz = sizeof(*oh) + sizeof(ipfw_xtable_info); > + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); > + if (oh == NULL) > + return (EINVAL); > + > + objheader_to_ti(oh, &ti); > + > + IPFW_UH_RLOCK(ch); > + if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { > + IPFW_UH_RUNLOCK(ch); > + return (ESRCH); > + } > + > + export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1)); > + IPFW_UH_RUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Modifies existing table. > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_xtable_info ] > + * > + * Returns 0 on success > + */ > +static int > +modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_header *oh; > + ipfw_xtable_info *i; > + char *tname; > + struct tid_info ti; > + struct namedobj_instance *ni; > + struct table_config *tc; > + > + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) > + return (EINVAL); > + > + oh = (struct _ipfw_obj_header *)sd->kbuf; > + i = (ipfw_xtable_info *)(oh + 1); > + > + /* > + * Verify user-supplied strings. 
> + * Check for null-terminated/zero-length strings/ > + */ > + tname = oh->ntlv.name; > + if (ipfw_check_table_name(tname) != 0) > + return (EINVAL); > + > + objheader_to_ti(oh, &ti); > + ti.type = i->type; > + > + IPFW_UH_WLOCK(ch); > + ni = CHAIN_TO_NI(ch); > + if ((tc = find_table(ni, &ti)) == NULL) { > + IPFW_UH_WUNLOCK(ch); > + return (ESRCH); > + } > + > + /* Do not support any modifications for readonly tables */ > + if ((tc->ta->flags & TA_FLAG_READONLY) != 0) { > + IPFW_UH_WUNLOCK(ch); > + return (EACCES); > + } > + > + if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0) > + tc->limit = i->limit; > + if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0) > + tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0); > + IPFW_UH_WUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Creates new table. > + * Data layout (v0)(current): > + * Request: [ ipfw_obj_header ipfw_xtable_info ] > + * > + * Returns 0 on success > + */ > +static int > +create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_header *oh; > + ipfw_xtable_info *i; > + char *tname, *aname; > + struct tid_info ti; > + struct namedobj_instance *ni; > + struct table_config *tc; > + > + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) > + return (EINVAL); > + > + oh = (struct _ipfw_obj_header *)sd->kbuf; > + i = (ipfw_xtable_info *)(oh + 1); > + > + /* > + * Verify user-supplied strings. 
> + * Check for null-terminated/zero-length strings/ > + */ > + tname = oh->ntlv.name; > + aname = i->algoname; > + if (ipfw_check_table_name(tname) != 0 || > + strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname)) > + return (EINVAL); > + > + if (aname[0] == '\0') { > + /* Use default algorithm */ > + aname = NULL; > + } > + > + objheader_to_ti(oh, &ti); > + ti.type = i->type; > + > + ni = CHAIN_TO_NI(ch); > + > + IPFW_UH_RLOCK(ch); > + if ((tc = find_table(ni, &ti)) != NULL) { > + IPFW_UH_RUNLOCK(ch); > + return (EEXIST); > + } > + IPFW_UH_RUNLOCK(ch); > + > + return (create_table_internal(ch, &ti, aname, i, NULL, 0)); > +} > + > +/* > + * Creates new table based on @ti and @aname. > + * > + * Relies on table name checking inside find_name_tlv() > + * Assume @aname to be checked and valid. > + * Stores allocated table kidx inside @pkidx (if non-NULL). > + * Reference created table if @compat is non-zero. > + * > + * Returns 0 on success. > + */ > +static int > +create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, > + char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat) > +{ > + struct namedobj_instance *ni; > + struct table_config *tc, *tc_new, *tmp; > + struct table_algo *ta; > + uint16_t kidx; > + > + ni = CHAIN_TO_NI(ch); > + > + ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname); > + if (ta == NULL) > + return (ENOTSUP); > + > + tc = alloc_table_config(ch, ti, ta, aname, i->tflags); > + if (tc == NULL) > + return (ENOMEM); > + > + tc->vmask = i->vmask; > + tc->limit = i->limit; > + if (ta->flags & TA_FLAG_READONLY) > + tc->locked = 1; > + else > + tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0; > + > + IPFW_UH_WLOCK(ch); > + > + /* Check if table has been already created */ > + tc_new = find_table(ni, ti); > + if (tc_new != NULL) { > + > + /* > + * Compat: do not fail if we're > + * requesting to create existing table > + * which has the same type > + */ > + if (compat == 0 || tc_new->no.type != tc->no.type) { > + 
IPFW_UH_WUNLOCK(ch); > + free_table_config(ni, tc); > + return (EEXIST); > + } > + > + /* Exchange tc and tc_new for proper refcounting & freeing */ > + tmp = tc; > + tc = tc_new; > + tc_new = tmp; > + } else { > + /* New table */ > + if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) { > + IPFW_UH_WUNLOCK(ch); > + printf("Unable to allocate table index." > + " Consider increasing net.inet.ip.fw.tables_max"); > + free_table_config(ni, tc); > + return (EBUSY); > + } > + tc->no.kidx = kidx; > + > + IPFW_WLOCK(ch); > + link_table(ch, tc); > + IPFW_WUNLOCK(ch); > + } > + > + if (compat != 0) > + tc->no.refcnt++; > + if (pkidx != NULL) > + *pkidx = tc->no.kidx; > + > + IPFW_UH_WUNLOCK(ch); > + > + if (tc_new != NULL) > + free_table_config(ni, tc_new); > + > + return (0); > +} > + > +static void > +ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti) > +{ > + > + memset(ti, 0, sizeof(struct tid_info)); > + ti->set = ntlv->set; > + ti->uidx = ntlv->idx; > + ti->tlvs = ntlv; > + ti->tlen = ntlv->head.length; > +} > + > +static void > +objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti) > +{ > + > + ntlv_to_ti(&oh->ntlv, ti); > +} > + > +/* > + * Exports basic table info as name TLV. > + * Used inside dump_static_rules() to provide info > + * about all tables referenced by current ruleset. > + * > + * Returns 0 on success. 
> + */ > +int > +ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, > + struct sockopt_data *sd) > +{ > + struct namedobj_instance *ni; > + struct named_object *no; > + ipfw_obj_ntlv *ntlv; > + > + ni = CHAIN_TO_NI(ch); > + > + no = ipfw_objhash_lookup_kidx(ni, kidx); > + KASSERT(no != NULL, ("invalid table kidx passed")); > + > + ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); > + if (ntlv == NULL) > + return (ENOMEM); > + > + ntlv->head.type = IPFW_TLV_TBL_NAME; > + ntlv->head.length = sizeof(*ntlv); > + ntlv->idx = no->kidx; > + strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); > + > + return (0); > +} > + > +/* > + * Marks every table kidx used in @rule with bit in @bmask. > + * Used to generate bitmask of referenced tables for given ruleset. > + * > + * Returns number of newly-referenced tables. > + */ > +int > +ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, > + uint32_t *bmask) > +{ > + int cmdlen, l, count; > + ipfw_insn *cmd; > + uint16_t kidx; > + uint8_t type; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + count = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + > + if ((bmask[kidx / 32] & (1 << (kidx % 32))) == 0) > + count++; > + > + bmask[kidx / 32] |= 1 << (kidx % 32); > + } > + > + return (count); > +} > + > +struct dump_args { > + struct ip_fw_chain *ch; > + struct table_info *ti; > + struct table_config *tc; > + struct sockopt_data *sd; > + uint32_t cnt; > + uint16_t uidx; > + int error; > + uint32_t size; > + ipfw_table_entry *ent; > + ta_foreach_f *f; > + void *farg; > + ipfw_obj_tentry tent; > +}; > + > +static int > +count_ext_entries(void *e, void *arg) > +{ > + struct dump_args *da; > + > + da = (struct dump_args *)arg; > + da->cnt++; > + > + return (0); > +} > + > +/* > + * Gets number of items from table either using > + * internal counter or calling algo callback for 
> + * externally-managed tables. > + * > + * Returns number of records. > + */ > +static uint32_t > +table_get_count(struct ip_fw_chain *ch, struct table_config *tc) > +{ > + struct table_info *ti; > + struct table_algo *ta; > + struct dump_args da; > + > + ti = KIDX_TO_TI(ch, tc->no.kidx); > + ta = tc->ta; > + > + /* Use internal counter for self-managed tables */ > + if ((ta->flags & TA_FLAG_READONLY) == 0) > + return (tc->count); > + > + /* Use callback to quickly get number of items */ > + if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0) > + return (ta->get_count(tc->astate, ti)); > + > + /* Count number of iterms ourselves */ > + memset(&da, 0, sizeof(da)); > + ta->foreach(tc->astate, ti, count_ext_entries, &da); > + > + return (da.cnt); > +} > + > +/* > + * Exports table @tc info into standard ipfw_xtable_info format. > + */ > +static void > +export_table_info(struct ip_fw_chain *ch, struct table_config *tc, > + ipfw_xtable_info *i) > +{ > + struct table_info *ti; > + struct table_algo *ta; > + > + i->type = tc->no.type; > + i->tflags = tc->tflags; > + i->vmask = tc->vmask; > + i->set = tc->no.set; > + i->kidx = tc->no.kidx; > + i->refcnt = tc->no.refcnt; > + i->count = table_get_count(ch, tc); > + i->limit = tc->limit; > + i->flags |= (tc->locked != 0) ? 
IPFW_TGFLAGS_LOCKED : 0; > + i->size = tc->count * sizeof(ipfw_obj_tentry); > + i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); > + strlcpy(i->tablename, tc->tablename, sizeof(i->tablename)); > + ti = KIDX_TO_TI(ch, tc->no.kidx); > + ta = tc->ta; > + if (ta->print_config != NULL) { > + /* Use algo function to print table config to string */ > + ta->print_config(tc->astate, ti, i->algoname, > + sizeof(i->algoname)); > + } else > + strlcpy(i->algoname, ta->name, sizeof(i->algoname)); > + /* Dump algo-specific data, if possible */ > + if (ta->dump_tinfo != NULL) { > + ta->dump_tinfo(tc->astate, ti, &i->ta_info); > + i->ta_info.flags |= IPFW_TATFLAGS_DATA; > + } > +} > + > +struct dump_table_args { > + struct ip_fw_chain *ch; > + struct sockopt_data *sd; > +}; > + > +static void > +export_table_internal(struct namedobj_instance *ni, struct named_object *no, > + void *arg) > +{ > + ipfw_xtable_info *i; > + struct dump_table_args *dta; > + > + dta = (struct dump_table_args *)arg; > + > + i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i)); > + KASSERT(i != 0, ("previously checked buffer is not enough")); > + > + export_table_info(dta->ch, (struct table_config *)no, i); > +} > + > +/* > + * Export all tables as ipfw_xtable_info structures to > + * storage provided by @sd. > + * > + * If supplied buffer is too small, fills in required size > + * and returns ENOMEM. > + * Returns 0 on success. 
> + */ > +static int > +export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, > + struct sockopt_data *sd) > +{ > + uint32_t size; > + uint32_t count; > + struct dump_table_args dta; > + > + count = ipfw_objhash_count(CHAIN_TO_NI(ch)); > + size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader); > + > + /* Fill in header regadless of buffer size */ > + olh->count = count; > + olh->objsize = sizeof(ipfw_xtable_info); > + > + if (size > olh->size) { > + olh->size = size; > + return (ENOMEM); > + } > + > + olh->size = size; > + > + dta.ch = ch; > + dta.sd = sd; > + > + ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta); > + > + return (0); > +} > + > +/* > + * Dumps all table data > + * Data layout (v1)(current): > + * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size > + * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ] > + * > + * Returns 0 on success > + */ > +static int > +dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_header *oh; > + ipfw_xtable_info *i; > + struct tid_info ti; > + struct table_config *tc; > + struct table_algo *ta; > + struct dump_args da; > + uint32_t sz; > + > + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); > + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); > + if (oh == NULL) > + return (EINVAL); > + > + i = (ipfw_xtable_info *)(oh + 1); > + objheader_to_ti(oh, &ti); > + > + IPFW_UH_RLOCK(ch); > + if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { > + IPFW_UH_RUNLOCK(ch); > + return (ESRCH); > + } > + export_table_info(ch, tc, i); > + > + if (sd->valsize < i->size) { > + > + /* > + * Submitted buffer size is not enough. > + * WE've already filled in @i structure with > + * relevant table info including size, so we > + * can return. Buffer will be flushed automatically. 
> + */ > + IPFW_UH_RUNLOCK(ch); > + return (ENOMEM); > + } > + > + /* > + * Do the actual dump in eXtended format > + */ > + memset(&da, 0, sizeof(da)); > + da.ch = ch; > + da.ti = KIDX_TO_TI(ch, tc->no.kidx); > + da.tc = tc; > + da.sd = sd; > + > + ta = tc->ta; > + > + ta->foreach(tc->astate, da.ti, dump_table_tentry, &da); > + IPFW_UH_RUNLOCK(ch); > + > + return (da.error); > +} > + > +/* > + * Dumps all table data > + * Data layout (version 0)(legacy): > + * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE() > + * Reply: [ ipfw_xtable ipfw_table_xentry x N ] > + * > + * Returns 0 on success > + */ > +static int > +dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + ipfw_xtable *xtbl; > + struct tid_info ti; > + struct table_config *tc; > + struct table_algo *ta; > + struct dump_args da; > + size_t sz, count; > + > + xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable)); > + if (xtbl == NULL) > + return (EINVAL); > + > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = xtbl->tbl; > + > + IPFW_UH_RLOCK(ch); > + if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { > + IPFW_UH_RUNLOCK(ch); > + return (0); > + } > + count = table_get_count(ch, tc); > + sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable); > + > + xtbl->cnt = count; > + xtbl->size = sz; > + xtbl->type = tc->no.type; > + xtbl->tbl = ti.uidx; > + > + if (sd->valsize < sz) { > + > + /* > + * Submitted buffer size is not enough. > + * WE've already filled in @i structure with > + * relevant table info including size, so we > + * can return. Buffer will be flushed automatically. 
> + */ > + IPFW_UH_RUNLOCK(ch); > + return (ENOMEM); > + } > + > + /* Do the actual dump in eXtended format */ > + memset(&da, 0, sizeof(da)); > + da.ch = ch; > + da.ti = KIDX_TO_TI(ch, tc->no.kidx); > + da.tc = tc; > + da.sd = sd; > + > + ta = tc->ta; > + > + ta->foreach(tc->astate, da.ti, dump_table_xentry, &da); > + IPFW_UH_RUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Legacy function to retrieve number of items in table. > + */ > +static int > +get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + uint32_t *tbl; > + struct tid_info ti; > + size_t sz; > + int error; > + > + sz = sizeof(*op3) + sizeof(uint32_t); > + op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz); > + if (op3 == NULL) > + return (EINVAL); > + > + tbl = (uint32_t *)(op3 + 1); > + memset(&ti, 0, sizeof(ti)); > + ti.uidx = *tbl; > + IPFW_UH_RLOCK(ch); > + error = ipfw_count_xtable(ch, &ti, tbl); > + IPFW_UH_RUNLOCK(ch); > + return (error); > +} > + > +/* > + * Legacy IP_FW_TABLE_GETSIZE handler > + */ > +int > +ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) > +{ > + struct table_config *tc; > + > + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) > + return (ESRCH); > + *cnt = table_get_count(ch, tc); > + return (0); > +} > + > +/* > + * Legacy IP_FW_TABLE_XGETSIZE handler > + */ > +int > +ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) > +{ > + struct table_config *tc; > + uint32_t count; > + > + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) { > + *cnt = 0; > + return (0); /* 'table all list' requires success */ > + } > + > + count = table_get_count(ch, tc); > + *cnt = count * sizeof(ipfw_table_xentry); > + if (count > 0) > + *cnt += sizeof(ipfw_xtable); > + return (0); > +} > + > +static int > +dump_table_entry(void *e, void *arg) > +{ > + struct dump_args *da; > + struct table_config *tc; > + struct table_algo *ta; > + ipfw_table_entry *ent; > + struct table_value *pval; 
> + int error; > + > + da = (struct dump_args *)arg; > + > + tc = da->tc; > + ta = tc->ta; > + > + /* Out of memory, returning */ > + if (da->cnt == da->size) > + return (1); > + ent = da->ent++; > + ent->tbl = da->uidx; > + da->cnt++; > + > + error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); > + if (error != 0) > + return (error); > + > + ent->addr = da->tent.k.addr.s_addr; > + ent->masklen = da->tent.masklen; > + pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); > + ent->value = ipfw_export_table_value_legacy(pval); > + > + return (0); > +} > + > +/* > + * Dumps table in pre-8.1 legacy format. > + */ > +int > +ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, > + ipfw_table *tbl) > +{ > + struct table_config *tc; > + struct table_algo *ta; > + struct dump_args da; > + > + tbl->cnt = 0; > + > + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) > + return (0); /* XXX: We should return ESRCH */ > + > + ta = tc->ta; > + > + /* This dump format supports IPv4 only */ > + if (tc->no.type != IPFW_TABLE_ADDR) > + return (0); > + > + memset(&da, 0, sizeof(da)); > + da.ch = ch; > + da.ti = KIDX_TO_TI(ch, tc->no.kidx); > + da.tc = tc; > + da.ent = &tbl->ent[0]; > + da.size = tbl->size; > + > + tbl->cnt = 0; > + ta->foreach(tc->astate, da.ti, dump_table_entry, &da); > + tbl->cnt = da.cnt; > + > + return (0); > +} > + > +/* > + * Dumps table entry in eXtended format (v1)(current). 
> + */ > +static int > +dump_table_tentry(void *e, void *arg) > +{ > + struct dump_args *da; > + struct table_config *tc; > + struct table_algo *ta; > + struct table_value *pval; > + ipfw_obj_tentry *tent; > + int error; > + > + da = (struct dump_args *)arg; > + > + tc = da->tc; > + ta = tc->ta; > + > + tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent)); > + /* Out of memory, returning */ > + if (tent == NULL) { > + da->error = ENOMEM; > + return (1); > + } > + tent->head.length = sizeof(ipfw_obj_tentry); > + tent->idx = da->uidx; > + > + error = ta->dump_tentry(tc->astate, da->ti, e, tent); > + if (error != 0) > + return (error); > + > + pval = get_table_value(da->ch, da->tc, tent->v.kidx); > + ipfw_export_table_value_v1(pval, &tent->v.value); > + > + return (0); > +} > + > +/* > + * Dumps table entry in eXtended format (v0). > + */ > +static int > +dump_table_xentry(void *e, void *arg) > +{ > + struct dump_args *da; > + struct table_config *tc; > + struct table_algo *ta; > + ipfw_table_xentry *xent; > + ipfw_obj_tentry *tent; > + struct table_value *pval; > + int error; > + > + da = (struct dump_args *)arg; > + > + tc = da->tc; > + ta = tc->ta; > + > + xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent)); > + /* Out of memory, returning */ > + if (xent == NULL) > + return (1); > + xent->len = sizeof(ipfw_table_xentry); > + xent->tbl = da->uidx; > + > + memset(&da->tent, 0, sizeof(da->tent)); > + tent = &da->tent; > + error = ta->dump_tentry(tc->astate, da->ti, e, tent); > + if (error != 0) > + return (error); > + > + /* Convert current format to previous one */ > + xent->masklen = tent->masklen; > + pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); > + xent->value = ipfw_export_table_value_legacy(pval); > + /* Apply some hacks */ > + if (tc->no.type == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { > + xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr; > + xent->flags = IPFW_TCF_INET; > + } else > + memcpy(&xent->k, 
&tent->k, sizeof(xent->k)); > + > + return (0); > +} > + > +/* > + * Helper function to export table algo data > + * to tentry format before calling user function. > + * > + * Returns 0 on success. > + */ > +static int > +prepare_table_tentry(void *e, void *arg) > +{ > + struct dump_args *da; > + struct table_config *tc; > + struct table_algo *ta; > + int error; > + > + da = (struct dump_args *)arg; > + > + tc = da->tc; > + ta = tc->ta; > + > + error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); > + if (error != 0) > + return (error); > + > + da->f(&da->tent, da->farg); > + > + return (0); > +} > + > +/* > + * Allow external consumers to read table entries in standard format. > + */ > +int > +ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, > + ta_foreach_f *f, void *arg) > +{ > + struct namedobj_instance *ni; > + struct table_config *tc; > + struct table_algo *ta; > + struct dump_args da; > + > + ni = CHAIN_TO_NI(ch); > + > + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); > + if (tc == NULL) > + return (ESRCH); > + > + ta = tc->ta; > + > + memset(&da, 0, sizeof(da)); > + da.ch = ch; > + da.ti = KIDX_TO_TI(ch, tc->no.kidx); > + da.tc = tc; > + da.f = f; > + da.farg = arg; > + > + ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da); > + > + return (0); > +} > + > +/* > + * Table algorithms > + */ > + > +/* > + * Finds algoritm by index, table type or supplied name. > + * > + * Returns pointer to algo or NULL. 
> + */ > +static struct table_algo * > +find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name) > +{ > + int i, l; > + struct table_algo *ta; > + > + if (ti->type > IPFW_TABLE_MAXTYPE) > + return (NULL); > + > + /* Search by index */ > + if (ti->atype != 0) { > + if (ti->atype > tcfg->algo_count) > + return (NULL); > + return (tcfg->algo[ti->atype]); > + } > + > + if (name == NULL) { > + /* Return default algorithm for given type if set */ > + return (tcfg->def_algo[ti->type]); > + } > + > + /* Search by name */ > + /* TODO: better search */ > + for (i = 1; i <= tcfg->algo_count; i++) { > + ta = tcfg->algo[i]; > + > + /* > + * One can supply additional algorithm > + * parameters so we compare only the first word > + * of supplied name: > + * 'addr:chash hsize=32' > + * '^^^^^^^^^' > + * > + */ > + l = strlen(ta->name); > + if (strncmp(name, ta->name, l) != 0) > + continue; > + if (name[l] != '\0' && name[l] != ' ') > + continue; > + /* Check if we're requesting proper table type */ > + if (ti->type != 0 && ti->type != ta->type) > + return (NULL); > + return (ta); > + } > + > + return (NULL); > +} > + > +/* > + * Register new table algo @ta. > + * Stores algo id inside @idx. > + * > + * Returns 0 on success. > + */ > +int > +ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size, > + uint32_t *idx) > +{ > + struct tables_config *tcfg; > + struct table_algo *ta_new; > + size_t sz; > + > + if (size > sizeof(struct table_algo)) > + return (EINVAL); > + > + /* Check for the required on-stack size for add/del */ > + sz = roundup2(ta->ta_buf_size, sizeof(void *)); > + if (sz > TA_BUF_SZ) > + return (EINVAL); > + > + KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE")); > + > + /* Copy algorithm data to stable storage. 
*/ > + ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO); > + memcpy(ta_new, ta, size); > + > + tcfg = CHAIN_TO_TCFG(ch); > + > + KASSERT(tcfg->algo_count < 255, ("Increase algo array size")); > + > + tcfg->algo[++tcfg->algo_count] = ta_new; > + ta_new->idx = tcfg->algo_count; > + > + /* Set algorithm as default one for given type */ > + if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 && > + tcfg->def_algo[ta_new->type] == NULL) > + tcfg->def_algo[ta_new->type] = ta_new; > + > + *idx = ta_new->idx; > + > + return (0); > +} > + > +/* > + * Unregisters table algo using @idx as id. > + * XXX: It is NOT safe to call this function in any place > + * other than ipfw instance destroy handler. > + */ > +void > +ipfw_del_table_algo(struct ip_fw_chain *ch, int idx) > +{ > + struct tables_config *tcfg; > + struct table_algo *ta; > + > + tcfg = CHAIN_TO_TCFG(ch); > + > + KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d", > + idx, tcfg->algo_count)); > + > + ta = tcfg->algo[idx]; > + KASSERT(ta != NULL, ("algo idx %d is NULL", idx)); > + > + if (tcfg->def_algo[ta->type] == ta) > + tcfg->def_algo[ta->type] = NULL; > + > + free(ta, M_IPFW); > +} > + > +/* > + * Lists all table algorithms currently available. 
> + * Data layout (v0)(current): > + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size > + * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ] > + * > + * Returns 0 on success > + */ > +static int > +list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_lheader *olh; > + struct tables_config *tcfg; > + ipfw_ta_info *i; > + struct table_algo *ta; > + uint32_t count, n, size; > + > + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); > + if (olh == NULL) > + return (EINVAL); > + if (sd->valsize < olh->size) > + return (EINVAL); > + > + IPFW_UH_RLOCK(ch); > + tcfg = CHAIN_TO_TCFG(ch); > + count = tcfg->algo_count; > + size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader); > + > + /* Fill in header regadless of buffer size */ > + olh->count = count; > + olh->objsize = sizeof(ipfw_ta_info); > + > + if (size > olh->size) { > + olh->size = size; > + IPFW_UH_RUNLOCK(ch); > + return (ENOMEM); > + } > + olh->size = size; > + > + for (n = 1; n <= count; n++) { > + i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i)); > + KASSERT(i != 0, ("previously checked buffer is not enough")); > + ta = tcfg->algo[n]; > + strlcpy(i->algoname, ta->name, sizeof(i->algoname)); > + i->type = ta->type; > + i->refcnt = ta->refcnt; > + } > + > + IPFW_UH_RUNLOCK(ch); > + > + return (0); > +} > + > +/* > + * Tables rewriting code > + */ > + > +/* > + * Determine table number and lookup type for @cmd. > + * Fill @tbl and @type with appropriate values. > + * Returns 0 for relevant opcodes, 1 otherwise. 
> + */ > +static int > +classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) > +{ > + ipfw_insn_if *cmdif; > + int skip; > + uint16_t v; > + > + skip = 1; > + > + switch (cmd->opcode) { > + case O_IP_SRC_LOOKUP: > + case O_IP_DST_LOOKUP: > + /* Basic IPv4/IPv6 or u32 lookups */ > + *puidx = cmd->arg1; > + /* Assume ADDR by default */ > + *ptype = IPFW_TABLE_ADDR; > + skip = 0; > + > + if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) { > + /* > + * generic lookup. The key must be > + * in 32bit big-endian format. > + */ > + v = ((ipfw_insn_u32 *)cmd)->d[1]; > + switch (v) { > + case 0: > + case 1: > + /* IPv4 src/dst */ > + break; > + case 2: > + case 3: > + /* src/dst port */ > + *ptype = IPFW_TABLE_NUMBER; > + break; > + case 4: > + /* uid/gid */ > + *ptype = IPFW_TABLE_NUMBER; > + break; > + case 5: > + /* jid */ > + *ptype = IPFW_TABLE_NUMBER; > + break; > + case 6: > + /* dscp */ > + *ptype = IPFW_TABLE_NUMBER; > + break; > + } > + } > + break; > + case O_XMIT: > + case O_RECV: > + case O_VIA: > + /* Interface table, possibly */ > + cmdif = (ipfw_insn_if *)cmd; > + if (cmdif->name[0] != '\1') > + break; > + > + *ptype = IPFW_TABLE_INTERFACE; > + *puidx = cmdif->p.kidx; > + skip = 0; > + break; > + case O_IP_FLOW_LOOKUP: > + *puidx = cmd->arg1; > + *ptype = IPFW_TABLE_FLOW; > + skip = 0; > + break; > + } > + > + return (skip); > +} > + > +/* > + * Sets new table value for given opcode. 
> + * Assume the same opcodes as classify_table_opcode() > + */ > +static void > +update_table_opcode(ipfw_insn *cmd, uint16_t idx) > +{ > + ipfw_insn_if *cmdif; > + > + switch (cmd->opcode) { > + case O_IP_SRC_LOOKUP: > + case O_IP_DST_LOOKUP: > + /* Basic IPv4/IPv6 or u32 lookups */ > + cmd->arg1 = idx; > + break; > + case O_XMIT: > + case O_RECV: > + case O_VIA: > + /* Interface table, possibly */ > + cmdif = (ipfw_insn_if *)cmd; > + cmdif->p.kidx = idx; > + break; > + case O_IP_FLOW_LOOKUP: > + cmd->arg1 = idx; > + break; > + } > +} > + > +/* > + * Checks table name for validity. > + * Enforce basic length checks, the rest > + * should be done in userland. > + * > + * Returns 0 if name is considered valid. > + */ > +int > +ipfw_check_table_name(char *name) > +{ > + int nsize; > + ipfw_obj_ntlv *ntlv = NULL; > + > + nsize = sizeof(ntlv->name); > + > + if (strnlen(name, nsize) == nsize) > + return (EINVAL); > + > + if (name[0] == '\0') > + return (EINVAL); > + > + /* > + * TODO: do some more complicated checks > + */ > + > + return (0); > +} > + > +/* > + * Find tablename TLV by @uid. > + * Check @tlvs for valid data inside. > + * > + * Returns pointer to found TLV or NULL. > + */ > +static ipfw_obj_ntlv * > +find_name_tlv(void *tlvs, int len, uint16_t uidx) > +{ > + ipfw_obj_ntlv *ntlv; > + uintptr_t pa, pe; > + int l; > + > + pa = (uintptr_t)tlvs; > + pe = pa + len; > + l = 0; > + for (; pa < pe; pa += l) { > + ntlv = (ipfw_obj_ntlv *)pa; > + l = ntlv->head.length; > + > + if (l != sizeof(*ntlv)) > + return (NULL); > + > + if (ntlv->head.type != IPFW_TLV_TBL_NAME) > + continue; > + > + if (ntlv->idx != uidx) > + continue; > + > + if (ipfw_check_table_name(ntlv->name) != 0) > + return (NULL); > + > + return (ntlv); > + } > + > + return (NULL); > +} > + > +/* > + * Finds table config based on either legacy index > + * or name in ntlv. > + * Note @ti structure contains unchecked data from userland. > + * > + * Returns pointer to table_config or NULL. 
> + */ > +static struct table_config * > +find_table(struct namedobj_instance *ni, struct tid_info *ti) > +{ > + char *name, bname[16]; > + struct named_object *no; > + ipfw_obj_ntlv *ntlv; > + uint32_t set; > + > + if (ti->tlvs != NULL) { > + ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); > + if (ntlv == NULL) > + return (NULL); > + name = ntlv->name; > + > + /* > + * Use set provided by @ti instead of @ntlv one. > + * This is needed due to different sets behavior > + * controlled by V_fw_tables_sets. > + */ > + set = ti->set; > + } else { > + snprintf(bname, sizeof(bname), "%d", ti->uidx); > + name = bname; > + set = 0; > + } > + > + no = ipfw_objhash_lookup_name(ni, set, name); > + > + return ((struct table_config *)no); > +} > + > +/* > + * Allocate new table config structure using > + * specified @algo and @aname. > + * > + * Returns pointer to config or NULL. > + */ > +static struct table_config * > +alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, > + struct table_algo *ta, char *aname, uint8_t tflags) > +{ > + char *name, bname[16]; > + struct table_config *tc; > + int error; > + ipfw_obj_ntlv *ntlv; > + uint32_t set; > + > + if (ti->tlvs != NULL) { > + ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); > + if (ntlv == NULL) > + return (NULL); > + name = ntlv->name; > + set = ntlv->set; > + } else { > + snprintf(bname, sizeof(bname), "%d", ti->uidx); > + name = bname; > + set = 0; > + } > + > + tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO); > + tc->no.name = tc->tablename; > + tc->no.type = ta->type; > + tc->no.set = set; > + tc->tflags = tflags; > + tc->ta = ta; > + strlcpy(tc->tablename, name, sizeof(tc->tablename)); > + /* Set "shared" value type by default */ > + tc->vshared = 1; > + > + if (ti->tlvs == NULL) { > + tc->no.compat = 1; > + tc->no.uidx = ti->uidx; > + } > + > + /* Preallocate data structures for new tables */ > + error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags); > + if (error != 0) { 
> + free(tc, M_IPFW); > + return (NULL); > + } > + > + return (tc); > +} > + > +/* > + * Destroys table state and config. > + */ > +static void > +free_table_config(struct namedobj_instance *ni, struct table_config *tc) > +{ > + > + KASSERT(tc->linked == 0, ("free() on linked config")); > + > + /* > + * We're using ta without any locking/referencing. > + * TODO: fix this if we're going to use unloadable algos. > + */ > + tc->ta->destroy(tc->astate, &tc->ti_copy); > + free(tc, M_IPFW); > +} > + > +/* > + * Links @tc to @chain table named instance. > + * Sets appropriate type/states in @chain table info. > + */ > +static void > +link_table(struct ip_fw_chain *ch, struct table_config *tc) > +{ > + struct namedobj_instance *ni; > + struct table_info *ti; > + uint16_t kidx; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + IPFW_WLOCK_ASSERT(ch); > + > + ni = CHAIN_TO_NI(ch); > + kidx = tc->no.kidx; > + > + ipfw_objhash_add(ni, &tc->no); > + > + ti = KIDX_TO_TI(ch, kidx); > + *ti = tc->ti_copy; > + > + /* Notify algo on real @ti address */ > + if (tc->ta->change_ti != NULL) > + tc->ta->change_ti(tc->astate, ti); > + > + tc->linked = 1; > + tc->ta->refcnt++; > +} > + > +/* > + * Unlinks @tc from @chain table named instance. > + * Zeroes states in @chain and stores them in @tc. > + */ > +static void > +unlink_table(struct ip_fw_chain *ch, struct table_config *tc) > +{ > + struct namedobj_instance *ni; > + struct table_info *ti; > + uint16_t kidx; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + IPFW_WLOCK_ASSERT(ch); > + > + ni = CHAIN_TO_NI(ch); > + kidx = tc->no.kidx; > + > + /* Clear state. 
@ti copy is already saved inside @tc */ > + ipfw_objhash_del(ni, &tc->no); > + ti = KIDX_TO_TI(ch, kidx); > + memset(ti, 0, sizeof(struct table_info)); > + tc->linked = 0; > + tc->ta->refcnt--; > + > + /* Notify algo on real @ti address */ > + if (tc->ta->change_ti != NULL) > + tc->ta->change_ti(tc->astate, NULL); > +} > + > +struct swap_table_args { > + int set; > + int new_set; > + int mv; > +}; > + > +/* > + * Change set for each matching table. > + * > + * Ensure we dispatch each table once by setting/checking ochange > + * fields. > + */ > +static void > +swap_table_set(struct namedobj_instance *ni, struct named_object *no, > + void *arg) > +{ > + struct table_config *tc; > + struct swap_table_args *sta; > + > + tc = (struct table_config *)no; > + sta = (struct swap_table_args *)arg; > + > + if (no->set != sta->set && (no->set != sta->new_set || sta->mv != 0)) > + return; > + > + if (tc->ochanged != 0) > + return; > + > + tc->ochanged = 1; > + ipfw_objhash_del(ni, no); > + if (no->set == sta->set) > + no->set = sta->new_set; > + else > + no->set = sta->set; > + ipfw_objhash_add(ni, no); > +} > + > +/* > + * Cleans up ochange field for all tables. > + */ > +static void > +clean_table_set_data(struct namedobj_instance *ni, struct named_object *no, > + void *arg) > +{ > + struct table_config *tc; > + struct swap_table_args *sta; > + > + tc = (struct table_config *)no; > + sta = (struct swap_table_args *)arg; > + > + tc->ochanged = 0; > +} > + > +/* > + * Swaps tables within two sets. > + */ > +void > +ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t set, > + uint32_t new_set, int mv) > +{ > + struct swap_table_args sta; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + sta.set = set; > + sta.new_set = new_set; > + sta.mv = mv; > + > + ipfw_objhash_foreach(CHAIN_TO_NI(ch), swap_table_set, &sta); > + ipfw_objhash_foreach(CHAIN_TO_NI(ch), clean_table_set_data, &sta); > +} > + > +/* > + * Move all tables which are reference by rules in @rr to set @new_set. 
> + * Makes sure that all relevant tables are referenced ONLY by given rules. > + * > + * Returns 0 on success, > + */ > +int > +ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, > + uint32_t new_set) > +{ > + struct ip_fw *rule; > + struct table_config *tc; > + struct named_object *no; > + struct namedobj_instance *ni; > + int bad, i, l, cmdlen; > + uint16_t kidx; > + uint8_t type; > + ipfw_insn *cmd; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + ni = CHAIN_TO_NI(ch); > + > + /* Stage 1: count number of references by given rules */ > + for (i = 0; i < ch->n_rules - 1; i++) { > + rule = ch->map[i]; > + if (ipfw_match_range(rule, rt) == 0) > + continue; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + no = ipfw_objhash_lookup_kidx(ni, kidx); > + KASSERT(no != NULL, > + ("objhash lookup failed on index %d", kidx)); > + tc = (struct table_config *)no; > + tc->ocount++; > + } > + > + } > + > + /* Stage 2: verify "ownership" */ > + bad = 0; > + for (i = 0; i < ch->n_rules - 1; i++) { > + rule = ch->map[i]; > + if (ipfw_match_range(rule, rt) == 0) > + continue; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + no = ipfw_objhash_lookup_kidx(ni, kidx); > + KASSERT(no != NULL, > + ("objhash lookup failed on index %d", kidx)); > + tc = (struct table_config *)no; > + if (tc->no.refcnt != tc->ocount) { > + > + /* > + * Number of references differ: > + * Other rule(s) are holding reference to given > + * table, so it is not possible to change its set. > + * > + * Note that refcnt may account > + * references to some going-to-be-added rules. 
> + * Since we don't know their numbers (and event > + * if they will be added) it is perfectly OK > + * to return error here. > + */ > + bad = 1; > + break; > + } > + } > + > + if (bad != 0) > + break; > + } > + > + /* Stage 3: change set or cleanup */ > + for (i = 0; i < ch->n_rules - 1; i++) { > + rule = ch->map[i]; > + if (ipfw_match_range(rule, rt) == 0) > + continue; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + no = ipfw_objhash_lookup_kidx(ni, kidx); > + KASSERT(no != NULL, > + ("objhash lookup failed on index %d", kidx)); > + tc = (struct table_config *)no; > + > + tc->ocount = 0; > + if (bad != 0) > + continue; > + > + /* Actually change set. */ > + ipfw_objhash_del(ni, no); > + no->set = new_set; > + ipfw_objhash_add(ni, no); > + } > + } > + > + return (bad); > +} > + > +/* > + * Finds and bumps refcount for tables referenced by given @rule. > + * Auto-creates non-existing tables. > + * Fills in @oib array with userland/kernel indexes. > + * First free oidx pointer is saved back in @oib. > + * > + * Returns 0 on success. > + */ > +static int > +find_ref_rule_tables(struct ip_fw_chain *ch, struct ip_fw *rule, > + struct rule_check_info *ci, struct obj_idx **oib, struct tid_info *ti) > +{ > + struct table_config *tc; > + struct namedobj_instance *ni; > + struct named_object *no; > + int cmdlen, error, l, numnew; > + uint16_t kidx; > + ipfw_insn *cmd; > + struct obj_idx *pidx, *pidx_first, *p; > + > + pidx_first = *oib; > + pidx = pidx_first; > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + error = 0; > + numnew = 0; > + > + IPFW_UH_WLOCK(ch); > + ni = CHAIN_TO_NI(ch); > + > + /* Increase refcount on each existing referenced table. 
*/ > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + if (classify_table_opcode(cmd, &ti->uidx, &ti->type) != 0) > + continue; > + > + pidx->uidx = ti->uidx; > + pidx->type = ti->type; > + > + if ((tc = find_table(ni, ti)) != NULL) { > + if (tc->no.type != ti->type) { > + /* Incompatible types */ > + error = EINVAL; > + break; > + } > + > + /* Reference found table and save kidx */ > + tc->no.refcnt++; > + pidx->kidx = tc->no.kidx; > + pidx++; > + continue; > + } > + > + /* > + * Compability stuff for old clients: > + * prepare to manually create non-existing tables. > + */ > + pidx++; > + numnew++; > + } > + > + if (error != 0) { > + /* Unref everything we have already done */ > + for (p = *oib; p < pidx; p++) { > + if (p->kidx == 0) > + continue; > + > + /* Find & unref by existing idx */ > + no = ipfw_objhash_lookup_kidx(ni, p->kidx); > + KASSERT(no != NULL, ("Ref'd table %d disappeared", > + p->kidx)); > + > + no->refcnt--; > + } > + } > + > + IPFW_UH_WUNLOCK(ch); > + > + if (numnew == 0) { > + *oib = pidx; > + return (error); > + } > + > + /* > + * Compatibility stuff: do actual creation for non-existing, > + * but referenced tables. > + */ > + for (p = pidx_first; p < pidx; p++) { > + if (p->kidx != 0) > + continue; > + > + ti->uidx = p->uidx; > + ti->type = p->type; > + ti->atype = 0; > + > + error = create_table_compat(ch, ti, &kidx); > + if (error == 0) { > + p->kidx = kidx; > + continue; > + } > + > + /* Error. We have to drop references */ > + IPFW_UH_WLOCK(ch); > + for (p = pidx_first; p < pidx; p++) { > + if (p->kidx == 0) > + continue; > + > + /* Find & unref by existing idx */ > + no = ipfw_objhash_lookup_kidx(ni, p->kidx); > + KASSERT(no != NULL, ("Ref'd table %d disappeared", > + p->kidx)); > + > + no->refcnt--; > + } > + IPFW_UH_WUNLOCK(ch); > + > + return (error); > + } > + > + *oib = pidx; > + > + return (error); > +} > + > +/* > + * Remove references from every table used in @rule. 
> + */ > +void > +ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule) > +{ > + int cmdlen, l; > + ipfw_insn *cmd; > + struct namedobj_instance *ni; > + struct named_object *no; > + uint16_t kidx; > + uint8_t type; > + > + IPFW_UH_WLOCK_ASSERT(chain); > + ni = CHAIN_TO_NI(chain); > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + > + no = ipfw_objhash_lookup_kidx(ni, kidx); > + > + KASSERT(no != NULL, ("table id %d not found", kidx)); > + KASSERT(no->type == type, ("wrong type %d (%d) for table id %d", > + no->type, type, kidx)); > + KASSERT(no->refcnt > 0, ("refcount for table %d is %d", > + kidx, no->refcnt)); > + > + no->refcnt--; > + } > +} > + > +/* > + * Compatibility function for old ipfw(8) binaries. > + * Rewrites table kernel indices with userland ones. > + * Convert tables matching '/^\d+$/' to their atoi() value. > + * Use number 65535 for other tables. > + * > + * Returns 0 on success. > + */ > +int > +ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, struct ip_fw_rule0 *rule) > +{ > + int cmdlen, error, l; > + ipfw_insn *cmd; > + uint16_t kidx, uidx; > + uint8_t type; > + struct named_object *no; > + struct namedobj_instance *ni; > + > + ni = CHAIN_TO_NI(chain); > + error = 0; > + > + l = rule->cmd_len; > + cmd = rule->cmd; > + cmdlen = 0; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + > + if (classify_table_opcode(cmd, &kidx, &type) != 0) > + continue; > + > + if ((no = ipfw_objhash_lookup_kidx(ni, kidx)) == NULL) > + return (1); > + > + uidx = no->uidx; > + if (no->compat == 0) { > + > + /* > + * We are called via legacy opcode. > + * Save error and show table as fake number > + * not to make ipfw(8) hang. 
> + */ > + uidx = 65535; > + error = 2; > + } > + > + update_table_opcode(cmd, uidx); > + } > + > + return (error); > +} > + > +/* > + * Checks is opcode is referencing table of appropriate type. > + * Adds reference count for found table if true. > + * Rewrites user-supplied opcode values with kernel ones. > + * > + * Returns 0 on success and appropriate error code otherwise. > + */ > +int > +ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, > + struct rule_check_info *ci) > +{ > + int cmdlen, error, l; > + ipfw_insn *cmd; > + uint16_t uidx; > + uint8_t type; > + struct namedobj_instance *ni; > + struct obj_idx *p, *pidx_first, *pidx_last; > + struct tid_info ti; > + > + ni = CHAIN_TO_NI(chain); > + > + /* > + * Prepare an array for storing opcode indices. > + * Use stack allocation by default. > + */ > + if (ci->table_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { > + /* Stack */ > + pidx_first = ci->obuf; > + } else > + pidx_first = malloc(ci->table_opcodes * sizeof(struct obj_idx), > + M_IPFW, M_WAITOK | M_ZERO); > + > + pidx_last = pidx_first; > + error = 0; > + type = 0; > + memset(&ti, 0, sizeof(ti)); > + > + /* > + * Use default set for looking up tables (old way) or > + * use set rule is assigned to (new way). > + */ > + ti.set = (V_fw_tables_sets != 0) ? 
ci->krule->set : 0; > + if (ci->ctlv != NULL) { > + ti.tlvs = (void *)(ci->ctlv + 1); > + ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); > + } > + > + /* Reference all used tables */ > + error = find_ref_rule_tables(chain, ci->krule, ci, &pidx_last, &ti); > + if (error != 0) > + goto free; > + > + IPFW_UH_WLOCK(chain); > + > + /* Perform rule rewrite */ > + l = ci->krule->cmd_len; > + cmd = ci->krule->cmd; > + cmdlen = 0; > + p = pidx_first; > + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { > + cmdlen = F_LEN(cmd); > + if (classify_table_opcode(cmd, &uidx, &type) != 0) > + continue; > + update_table_opcode(cmd, p->kidx); > + p++; > + } > + > + IPFW_UH_WUNLOCK(chain); > + > +free: > + if (pidx_first != ci->obuf) > + free(pidx_first, M_IPFW); > + > + return (error); > +} > + > +static struct ipfw_sopt_handler scodes[] = { > + { IP_FW_TABLE_XCREATE, 0, HDIR_SET, create_table }, > + { IP_FW_TABLE_XDESTROY, 0, HDIR_SET, flush_table_v0 }, > + { IP_FW_TABLE_XFLUSH, 0, HDIR_SET, flush_table_v0 }, > + { IP_FW_TABLE_XMODIFY, 0, HDIR_BOTH, modify_table }, > + { IP_FW_TABLE_XINFO, 0, HDIR_GET, describe_table }, > + { IP_FW_TABLES_XLIST, 0, HDIR_GET, list_tables }, > + { IP_FW_TABLE_XLIST, 0, HDIR_GET, dump_table_v0 }, > + { IP_FW_TABLE_XLIST, 1, HDIR_GET, dump_table_v1 }, > + { IP_FW_TABLE_XADD, 0, HDIR_BOTH, manage_table_ent_v0 }, > + { IP_FW_TABLE_XADD, 1, HDIR_BOTH, manage_table_ent_v1 }, > + { IP_FW_TABLE_XDEL, 0, HDIR_BOTH, manage_table_ent_v0 }, > + { IP_FW_TABLE_XDEL, 1, HDIR_BOTH, manage_table_ent_v1 }, > + { IP_FW_TABLE_XFIND, 0, HDIR_GET, find_table_entry }, > + { IP_FW_TABLE_XSWAP, 0, HDIR_SET, swap_table }, > + { IP_FW_TABLES_ALIST, 0, HDIR_GET, list_table_algo }, > + { IP_FW_TABLE_XGETSIZE, 0, HDIR_GET, get_table_size }, > +}; > + > +static void > +destroy_table_locked(struct namedobj_instance *ni, struct named_object *no, > + void *arg) > +{ > + > + unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no); > + if (ipfw_objhash_free_idx(ni, 
no->kidx) != 0) > + printf("Error unlinking kidx %d from table %s\n", > + no->kidx, no->name); > + free_table_config(ni, (struct table_config *)no); > +} > + > +/* > + * Shuts tables module down. > + */ > +void > +ipfw_destroy_tables(struct ip_fw_chain *ch, int last) > +{ > + > + IPFW_DEL_SOPT_HANDLER(last, scodes); > + > + /* Remove all tables from working set */ > + IPFW_UH_WLOCK(ch); > + IPFW_WLOCK(ch); > + ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch); > + IPFW_WUNLOCK(ch); > + IPFW_UH_WUNLOCK(ch); > + > + /* Free pointers itself */ > + free(ch->tablestate, M_IPFW); > + > + ipfw_table_value_destroy(ch, last); > + ipfw_table_algo_destroy(ch); > + > + ipfw_objhash_destroy(CHAIN_TO_NI(ch)); > + free(CHAIN_TO_TCFG(ch), M_IPFW); > +} > + > +/* > + * Starts tables module. > + */ > +int > +ipfw_init_tables(struct ip_fw_chain *ch, int first) > +{ > + struct tables_config *tcfg; > + > + /* Allocate pointers */ > + ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info), > + M_IPFW, M_WAITOK | M_ZERO); > + > + tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO); > + tcfg->namehash = ipfw_objhash_create(V_fw_tables_max); > + ch->tblcfg = tcfg; > + > + ipfw_table_value_init(ch, first); > + ipfw_table_algo_init(ch); > + > + IPFW_ADD_SOPT_HANDLER(first, scodes); > + return (0); > +} > + > + > + > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table.h b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.h > new file mode 100644 > index 0000000..216d713 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.h > @@ -0,0 +1,246 @@ > +/*- > + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. 
Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + * > + * $FreeBSD: head/sys/netpfil/ipfw/ip_fw_table.h 272840 2014-10-09 19:32:35Z melifaro $ > + */ > + > +#ifndef _IPFW2_TABLE_H > +#define _IPFW2_TABLE_H > + > +/* > + * Internal constants and data structures used by ipfw tables > + * not meant to be exported outside the kernel. 
> + */ > +#ifdef _KERNEL > + > +struct table_algo; > +struct tables_config { > + struct namedobj_instance *namehash; > + struct namedobj_instance *valhash; > + uint32_t val_size; > + uint32_t algo_count; > + struct table_algo *algo[256]; > + struct table_algo *def_algo[IPFW_TABLE_MAXTYPE + 1]; > + TAILQ_HEAD(op_state_l,op_state) state_list; > +}; > +#define CHAIN_TO_TCFG(chain) ((struct tables_config *)(chain)->tblcfg) > + > +struct table_info { > + table_lookup_t *lookup; /* Lookup function */ > + void *state; /* Lookup radix/other structure */ > + void *xstate; /* eXtended state */ > + u_long data; /* Hints for given func */ > +}; > + > +/* Internal structures for handling sockopt data */ > +struct tid_info { > + uint32_t set; /* table set */ > + uint16_t uidx; /* table index */ > + uint8_t type; /* table type */ > + uint8_t atype; > + void *tlvs; /* Pointer to first TLV */ > + int tlen; /* Total TLV size block */ > +}; > + > +struct table_value; > +struct tentry_info { > + void *paddr; > + struct table_value *pvalue; > + void *ptv; /* Temporary field to hold obj */ > + uint8_t masklen; /* mask length */ > + uint8_t subtype; > + uint16_t flags; /* record flags */ > + uint32_t value; /* value index */ > +}; > +#define TEI_FLAGS_UPDATE 0x0001 /* Add or update rec if exists */ > +#define TEI_FLAGS_UPDATED 0x0002 /* Entry has been updated */ > +#define TEI_FLAGS_COMPAT 0x0004 /* Called from old ABI */ > +#define TEI_FLAGS_DONTADD 0x0008 /* Do not create new rec */ > +#define TEI_FLAGS_ADDED 0x0010 /* Entry was added */ > +#define TEI_FLAGS_DELETED 0x0020 /* Entry was deleted */ > +#define TEI_FLAGS_LIMIT 0x0040 /* Limit was hit */ > +#define TEI_FLAGS_ERROR 0x0080 /* Unknown request error */ > +#define TEI_FLAGS_NOTFOUND 0x0100 /* Entry was not found */ > +#define TEI_FLAGS_EXISTS 0x0200 /* Entry already exists */ > + > +typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, > + struct table_info *ti, char *data, uint8_t tflags); > +typedef void 
(ta_destroy)(void *ta_state, struct table_info *ti); > +typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +typedef int (ta_add)(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +typedef int (ta_del)(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +typedef void (ta_flush_entry)(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > + > +typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, > + uint32_t count, uint64_t *pflags); > +typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); > +typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, > + void *ta_buf, uint64_t *pflags); > +typedef void (ta_modify)(void *ta_state, struct table_info *ti, > + void *ta_buf, uint64_t pflags); > +typedef void (ta_flush_mod)(void *ta_buf); > + > +typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); > +typedef void (ta_print_config)(void *ta_state, struct table_info *ti, char *buf, > + size_t bufsize); > + > +typedef int ta_foreach_f(void *node, void *arg); > +typedef void ta_foreach(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg); > +typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent); > +typedef int ta_find_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent); > +typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +typedef uint32_t ta_get_count(void *ta_state, struct table_info *ti); > + > +struct table_algo { > + char name[16]; > + uint32_t idx; > + uint32_t type; > + uint32_t refcnt; > + uint32_t flags; > + uint32_t vlimit; > + size_t ta_buf_size; > + ta_init *init; > + ta_destroy *destroy; > + ta_prepare_add *prepare_add; > + 
ta_prepare_del *prepare_del; > + ta_add *add; > + ta_del *del; > + ta_flush_entry *flush_entry; > + ta_find_tentry *find_tentry; > + ta_need_modify *need_modify; > + ta_prepare_mod *prepare_mod; > + ta_fill_mod *fill_mod; > + ta_modify *modify; > + ta_flush_mod *flush_mod; > + ta_change_ti *change_ti; > + ta_foreach *foreach; > + ta_dump_tentry *dump_tentry; > + ta_print_config *print_config; > + ta_dump_tinfo *dump_tinfo; > + ta_get_count *get_count; > +}; > +#define TA_FLAG_DEFAULT 0x01 /* Algo is default for given type */ > +#define TA_FLAG_READONLY 0x02 /* Algo does not support modifications*/ > +#define TA_FLAG_EXTCOUNTER 0x04 /* Algo has external counter available*/ > + > +int ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, > + size_t size, uint32_t *idx); > +void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx); > + > +void ipfw_table_algo_init(struct ip_fw_chain *chain); > +void ipfw_table_algo_destroy(struct ip_fw_chain *chain); > + > +MALLOC_DECLARE(M_IPFW_TBL); > +/* Exported to support legacy opcodes */ > +int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, > + struct tentry_info *tei, uint8_t flags, uint32_t count); > +int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, > + struct tentry_info *tei, uint8_t flags, uint32_t count); > +int flush_table(struct ip_fw_chain *ch, struct tid_info *ti); > +void ipfw_import_table_value_legacy(uint32_t value, struct table_value *v); > +uint32_t ipfw_export_table_value_legacy(struct table_value *v); > +int ipfw_get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > + > +/* ipfw_table_value.c functions */ > +struct table_config; > +struct tableop_state; > +void ipfw_table_value_init(struct ip_fw_chain *ch, int first); > +void ipfw_table_value_destroy(struct ip_fw_chain *ch, int last); > +int ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts); > +void ipfw_garbage_table_values(struct ip_fw_chain *ch, struct 
table_config *tc, > + struct tentry_info *tei, uint32_t count, int rollback); > +void ipfw_import_table_value_v1(ipfw_table_value *iv); > +void ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *iv); > +void ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, > + struct table_algo *ta, void *astate, struct table_info *ti); > +void rollback_table_values(struct tableop_state *ts); > + > +int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, > + struct rule_check_info *ci); > +int ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, > + struct ip_fw_rule0 *rule); > +int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, > + uint32_t *bmask); > +int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, > + struct sockopt_data *sd); > +void ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule); > + > +/* utility functions */ > +int ipfw_check_table_name(char *name); > +int ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, > + uint32_t new_set); > +void ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t old_set, > + uint32_t new_set, int mv); > +int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, > + ta_foreach_f f, void *arg); > + > +/* internal functions */ > +void tc_ref(struct table_config *tc); > +void tc_unref(struct table_config *tc); > + > +struct op_state; > +typedef void (op_rollback_f)(void *object, struct op_state *state); > +struct op_state { > + TAILQ_ENTRY(op_state) next; /* chain link */ > + op_rollback_f *func; > +}; > + > +struct tableop_state { > + struct op_state opstate; > + struct ip_fw_chain *ch; > + struct table_config *tc; > + struct table_algo *ta; > + struct tentry_info *tei; > + uint32_t count; > + uint32_t vmask; > + int vshared; > + int modified; > +}; > + > +void add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); > +void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); > +void 
rollback_toperation_state(struct ip_fw_chain *ch, void *object); > + > +/* Legacy interfaces */ > +int ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, > + uint32_t *cnt); > +int ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, > + uint32_t *cnt); > +int ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, > + ipfw_table *tbl); > + > + > +#endif /* _KERNEL */ > +#endif /* _IPFW2_TABLE_H */ > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c > new file mode 100644 > index 0000000..d9d0547 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c > @@ -0,0 +1,4081 @@ > +/*- > + * Copyright (c) 2014 Yandex LLC > + * Copyright (c) 2014 Alexander V. Chernikov > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table_algo.c 272912 2014-10-10 20:37:06Z melifaro $"); > + > +/* > + * Lookup table algorithms. > + * > + */ > + > +#include "opt_ipfw.h" > +#include "opt_inet.h" > +#ifndef INET > +#error IPFIREWALL requires INET. > +#endif /* INET */ > +#include "opt_inet6.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/kernel.h> > +#include <sys/lock.h> > +#include <sys/rwlock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/queue.h> > +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ > +#include <net/radix.h> > +#include <net/route.h> > + > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ > +#include <netinet/ip_fw.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/ip_fw_table.h> > + > + > +/* > + * IPFW table lookup algorithms. > + * > + * What is needed to add another table algo? > + * > + * Algo init: > + * * struct table_algo has to be filled with: > + * name: "type:algoname" format, e.g. "addr:radix". Currently > + * there are the following types: "addr", "iface", "number" and "flow". > + * type: one of IPFW_TABLE_* types > + * flags: one or more TA_FLAGS_* > + * ta_buf_size: size of structure used to store add/del item state. > + * Needs to be less than TA_BUF_SZ. > + * callbacks: see below for description. 
> + * * ipfw_add_table_algo / ipfw_del_table_algo has to be called > + * > + * Callbacks description: > + * > + * -init: request to initialize new table instance. > + * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, > + * struct table_info *ti, char *data, uint8_t tflags); > + * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. > + * > + * Allocate all structures needed for normal operations. > + * * Caller may want to parse @data for some algo-specific > + * options provided by userland. > + * * Caller may want to save configuration state pointer to @ta_state > + * * Caller needs to save desired runtime structure pointer(s) > + * inside @ti fields. Note that it is not correct to save > + * @ti pointer at this moment. Use -change_ti hook for that. > + * * Caller has to fill in ti->lookup to appropriate function > + * pointer. > + * > + * > + * > + * -destroy: request to destroy table instance. > + * typedef void (ta_destroy)(void *ta_state, struct table_info *ti); > + * MANDATORY, may be locked (UH+WLOCK). (M_NOWAIT). > + * > + * Frees all table entries and all table structures allocated by -init. > + * > + * > + * > + * -prepare_add: request to allocate state for adding new entry. > + * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, > + * void *ta_buf); > + * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. > + * > + * Allocates state and fills it in with all necessary data (EXCEPT value) > + * from @tei to minimize operations needed to be done under WLOCK. > + * "value" field has to be copied to new entry in -add callback. > + * Buffer ta_buf of size ta->ta_buf_size may be used to store > + * allocated state. > + * > + * > + * > + * -prepare_del: request to set state for deleting existing entry. > + * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, > + * void *ta_buf); > + * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success. 
> + * > + * Buffer ta_buf of size ta->ta_buf_size may be used to store > + * allocated state. Caller should use on-stack ta_buf allocation > + * instead of doing malloc(). > + * > + * > + * > + * -add: request to insert new entry into runtime/config structures. > + * typedef int (ta_add)(void *ta_state, struct table_info *ti, > + * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > + * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. > + * > + * Insert new entry using previously-allocated state in @ta_buf. > + * * @tei may have the following flags: > + * TEI_FLAGS_UPDATE: request to add or update entry. > + * TEI_FLAGS_DONTADD: request to update (but not add) entry. > + * * Caller is required to do the following: > + * copy real entry value from @tei > + * entry added: return 0, set 1 to @pnum > + * entry updated: return 0, store 0 to @pnum, store old value in @tei, > + * add TEI_FLAGS_UPDATED flag to @tei. > + * entry exists: return EEXIST > + * entry not found: return ENOENT > + * other error: return non-zero error code. > + * > + * > + * > + * -del: request to delete existing entry from runtime/config structures. > + * typedef int (ta_del)(void *ta_state, struct table_info *ti, > + * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > + * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. > + * > + * Delete entry using state previously set up in @ta_buf. > + * * Caller is required to do the following: > + * entry deleted: return 0, set 1 to @pnum, store old value in @tei. > + * entry not found: return ENOENT > + * other error: return non-zero error code. > + * > + * > + * > + * -flush_entry: flush entry state created by -prepare_add / -del / others > + * typedef void (ta_flush_entry)(struct ip_fw_chain *ch, > + * struct tentry_info *tei, void *ta_buf); > + * MANDATORY, may be locked. (M_NOWAIT). 
> + * > + * Delete state allocated by: > + * -prepare_add (-add returned EEXIST|UPDATED) > + * -prepare_del (if any) > + * -del > + * * Caller is required to handle empty @ta_buf correctly. > + * > + * > + * -find_tentry: finds entry specified by key @tent > + * typedef int ta_find_tentry(void *ta_state, struct table_info *ti, > + * ipfw_obj_tentry *tent); > + * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success. > + * > + * Finds entry specified by given key. > + * * Caller is required to do the following: > + * entry found: returns 0, export entry to @tent > + * entry not found: returns ENOENT > + * > + * > + * -need_modify: checks if @ti has enough space to hold another @count items. > + * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, > + * uint32_t count, uint64_t *pflags); > + * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if it has. > + * > + * Checks if given table has enough space to add @count items without > + * resize. Caller may use @pflags to store desired modification data. > + * > + * > + * > + * -prepare_mod: allocate structures for table modification. > + * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); > + * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success. > + * > + * Allocate all needed state for table modification. Caller > + * should use `struct mod_item` to store new state in @ta_buf. > + * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf. > + * > + * > + * > + * -fill_mod: copy some data to new state. > + * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, > + * void *ta_buf, uint64_t *pflags); > + * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success. > + * > + * Copy as much data as we can to minimize changes under WLOCK. > + * For example, array can be merged inside this callback. > + * > + * > + * > + * -modify: perform final modification. 
> + * typedef void (ta_modify)(void *ta_state, struct table_info *ti, > + * void *ta_buf, uint64_t pflags); > + * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). > + * > + * Performs all changes necessary to switch to new structures. > + * * Caller should save old pointers to @ta_buf storage. > + * > + * > + * > + * -flush_mod: flush table modification state. > + * typedef void (ta_flush_mod)(void *ta_buf); > + * OPTIONAL(need_modify), unlocked. (M_WAITOK). > + * > + * Performs flush for the following: > + * - prepare_mod (modification was not necessary) > + * - modify (for the old state) > + * > + * > + * > + * -change_ti: monitor table info pointer changes > + * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); > + * OPTIONAL, locked (UH). (M_NOWAIT). > + * > + * Called when @ti pointer is changed. Called immediately after -init > + * to set initial state. > + * > + * > + * > + * -foreach: calls @f for each table entry > + * typedef void ta_foreach(void *ta_state, struct table_info *ti, > + * ta_foreach_f *f, void *arg); > + * MANDATORY, locked(UH). (M_NOWAIT). > + * > + * Runs callback with specified argument for each table entry. > + * Typically used for dumping table entries. > + * > + * > + * > + * -dump_tentry: dump table entry in current @tentry format. > + * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, > + * ipfw_obj_tentry *tent); > + * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success. > + * > + * Dumps entry @e to @tent. > + * > + * > + * -print_config: prints custom algorithm options into buffer. > + * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, > + * char *buf, size_t bufsize); > + * OPTIONAL. locked(UH). (M_NOWAIT). > + * > + * Prints custom algorithm options in the format suitable to pass > + * back to -init callback. > + * > + * > + * > + * -dump_tinfo: dumps algo-specific info. 
> + * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, > + * ipfw_ta_tinfo *tinfo); > + * OPTIONAL. locked(UH). (M_NOWAIT). > + * > + * Dumps options like items size/hash size, etc. > + */ > + > +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); > + > +/* > + * Utility structures/functions common to more than one algo > + */ > + > +struct mod_item { > + void *main_ptr; > + size_t size; > + void *main_ptr6; > + size_t size6; > +}; > + > +static int badd(const void *key, void *item, void *base, size_t nmemb, > + size_t size, int (*compar) (const void *, const void *)); > +static int bdel(const void *key, void *base, size_t nmemb, size_t size, > + int (*compar) (const void *, const void *)); > + > + > +/* > + * ADDR implementation using radix > + * > + */ > + > +/* > + * The radix code expects addr and mask to be array of bytes, > + * with the first byte being the length of the array. rn_inithead > + * is called with the offset in bits of the lookup key within the > + * array. If we use a sockaddr_in as the underlying type, > + * sin_len is conveniently located at offset 0, sin_addr is at > + * offset 4 and normally aligned. 
> + * But for portability, let's avoid assumption and make the code explicit > + */ > +#define KEY_LEN(v) *((uint8_t *)&(v)) > +/* > + * Do not require radix to compare more than actual IPv4/IPv6 address > + */ > +#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) > +#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) > + > +#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) > +#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) > + > +struct radix_addr_entry { > + struct radix_node rn[2]; > + struct sockaddr_in addr; > + uint32_t value; > + uint8_t masklen; > +}; > + > +struct sa_in6 { > + uint8_t sin6_len; > + uint8_t sin6_family; > + uint8_t pad[2]; > + struct in6_addr sin6_addr; > +}; > + > +struct radix_addr_xentry { > + struct radix_node rn[2]; > + struct sa_in6 addr6; > + uint32_t value; > + uint8_t masklen; > +}; > + > +struct radix_cfg { > + struct radix_node_head *head4; > + struct radix_node_head *head6; > + size_t count4; > + size_t count6; > +}; > + > +struct ta_buf_radix > +{ > + void *ent_ptr; > + struct sockaddr *addr_ptr; > + struct sockaddr *mask_ptr; > + union { > + struct { > + struct sockaddr_in sa; > + struct sockaddr_in ma; > + } a4; > + struct { > + struct sa_in6 sa; > + struct sa_in6 ma; > + } a6; > + } addr; > +}; > + > +static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > +static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, > + struct table_info *ti, char *data, uint8_t tflags); > +static int flush_radix_entry(struct radix_node *rn, void *arg); > +static void ta_destroy_radix(void *ta_state, struct table_info *ti); > +static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, > + void *e, ipfw_obj_tentry *tent); > +static int ta_find_radix_tentry(void *ta_state, struct table_info 
*ti, > + ipfw_obj_tentry *tent); > +static void ta_foreach_radix(void *ta_state, struct table_info *ti, > + ta_foreach_f *f, void *arg); > +static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, > + struct sockaddr *ma, int *set_mask); > +static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_add_radix(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_del_radix(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_need_modify_radix(void *ta_state, struct table_info *ti, > + uint32_t count, uint64_t *pflags); > + > +static int > +ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct radix_node_head *rnh; > + > + if (keylen == sizeof(in_addr_t)) { > + struct radix_addr_entry *ent; > + struct sockaddr_in sa; > + KEY_LEN(sa) = KEY_LEN_INET; > + sa.sin_addr.s_addr = *((in_addr_t *)key); > + rnh = (struct radix_node_head *)ti->state; > + ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, rnh)); > + if (ent != NULL) { > + *val = ent->value; > + return (1); > + } > + } else { > + struct radix_addr_xentry *xent; > + struct sa_in6 sa6; > + KEY_LEN(sa6) = KEY_LEN_INET6; > + memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); > + rnh = (struct radix_node_head *)ti->xstate; > + xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, rnh)); > + if (xent != NULL) { > + *val = xent->value; > + return (1); > + } > + } > + > + return (0); > +} > + > +/* > + * New table > + */ > +static int > +ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, > + char *data, uint8_t tflags) > +{ > + 
struct radix_cfg *cfg; > + > + if (!rn_inithead(&ti->state, OFF_LEN_INET)) > + return (ENOMEM); > + if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { > + rn_detachhead(&ti->state); > + return (ENOMEM); > + } > + > + cfg = malloc(sizeof(struct radix_cfg), M_IPFW, M_WAITOK | M_ZERO); > + > + *ta_state = cfg; > + ti->lookup = ta_lookup_radix; > + > + return (0); > +} > + > +static int > +flush_radix_entry(struct radix_node *rn, void *arg) > +{ > + struct radix_node_head * const rnh = arg; > + struct radix_addr_entry *ent; > + > + ent = (struct radix_addr_entry *) > + rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); > + if (ent != NULL) > + free(ent, M_IPFW_TBL); > + return (0); > +} > + > +static void > +ta_destroy_radix(void *ta_state, struct table_info *ti) > +{ > + struct radix_cfg *cfg; > + struct radix_node_head *rnh; > + > + cfg = (struct radix_cfg *)ta_state; > + > + rnh = (struct radix_node_head *)(ti->state); > + rnh->rnh_walktree(rnh, flush_radix_entry, rnh); > + rn_detachhead(&ti->state); > + > + rnh = (struct radix_node_head *)(ti->xstate); > + rnh->rnh_walktree(rnh, flush_radix_entry, rnh); > + rn_detachhead(&ti->xstate); > + > + free(cfg, M_IPFW); > +} > + > +/* > + * Provide algo-specific table info > + */ > +static void > +ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) > +{ > + struct radix_cfg *cfg; > + > + cfg = (struct radix_cfg *)ta_state; > + > + tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; > + tinfo->taclass4 = IPFW_TACLASS_RADIX; > + tinfo->count4 = cfg->count4; > + tinfo->itemsize4 = sizeof(struct radix_addr_entry); > + tinfo->taclass6 = IPFW_TACLASS_RADIX; > + tinfo->count6 = cfg->count6; > + tinfo->itemsize6 = sizeof(struct radix_addr_xentry); > +} > + > +static int > +ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent) > +{ > + struct radix_addr_entry *n; > +#ifdef INET6 > + struct radix_addr_xentry *xn; > +#endif > + > + n = (struct radix_addr_entry 
*)e; > + > + /* Guess IPv4/IPv6 radix by sockaddr family */ > + if (n->addr.sin_family == AF_INET) { > + tent->k.addr.s_addr = n->addr.sin_addr.s_addr; > + tent->masklen = n->masklen; > + tent->subtype = AF_INET; > + tent->v.kidx = n->value; > +#ifdef INET6 > + } else { > + xn = (struct radix_addr_xentry *)e; > + memcpy(&tent->k, &xn->addr6.sin6_addr, sizeof(struct in6_addr)); > + tent->masklen = xn->masklen; > + tent->subtype = AF_INET6; > + tent->v.kidx = xn->value; > +#endif > + } > + > + return (0); > +} > + > +static int > +ta_find_radix_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent) > +{ > + struct radix_node_head *rnh; > + void *e; > + > + e = NULL; > + if (tent->subtype == AF_INET) { > + struct sockaddr_in sa; > + KEY_LEN(sa) = KEY_LEN_INET; > + sa.sin_addr.s_addr = tent->k.addr.s_addr; > + rnh = (struct radix_node_head *)ti->state; > + e = rnh->rnh_matchaddr(&sa, rnh); > + } else { > + struct sa_in6 sa6; > + KEY_LEN(sa6) = KEY_LEN_INET6; > + memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr)); > + rnh = (struct radix_node_head *)ti->xstate; > + e = rnh->rnh_matchaddr(&sa6, rnh); > + } > + > + if (e != NULL) { > + ta_dump_radix_tentry(ta_state, ti, e, tent); > + return (0); > + } > + > + return (ENOENT); > +} > + > +static void > +ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg) > +{ > + struct radix_node_head *rnh; > + > + rnh = (struct radix_node_head *)(ti->state); > + rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); > + > + rnh = (struct radix_node_head *)(ti->xstate); > + rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); > +} > + > + > +#ifdef INET6 > +static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask); > + > +static inline void > +ipv6_writemask(struct in6_addr *addr6, uint8_t mask) > +{ > + uint32_t *cp; > + > + for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) > + *cp++ = 0xFFFFFFFF; > + *cp = htonl(mask ? 
~((1 << (32 - mask)) - 1) : 0); > +} > +#endif > + > +static void > +tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, > + struct sockaddr *ma, int *set_mask) > +{ > + int mlen; > +#ifdef INET > + struct sockaddr_in *addr, *mask; > +#endif > +#ifdef INET6 > + struct sa_in6 *addr6, *mask6; > +#endif > + in_addr_t a4; > + > + mlen = tei->masklen; > + > + if (tei->subtype == AF_INET) { > +#ifdef INET > + addr = (struct sockaddr_in *)sa; > + mask = (struct sockaddr_in *)ma; > + /* Set 'total' structure length */ > + KEY_LEN(*addr) = KEY_LEN_INET; > + KEY_LEN(*mask) = KEY_LEN_INET; > + addr->sin_family = AF_INET; > + mask->sin_addr.s_addr = > + htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); > + a4 = *((in_addr_t *)tei->paddr); > + addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr; > + if (mlen != 32) > + *set_mask = 1; > + else > + *set_mask = 0; > +#endif > +#ifdef INET6 > + } else if (tei->subtype == AF_INET6) { > + /* IPv6 case */ > + addr6 = (struct sa_in6 *)sa; > + mask6 = (struct sa_in6 *)ma; > + /* Set 'total' structure length */ > + KEY_LEN(*addr6) = KEY_LEN_INET6; > + KEY_LEN(*mask6) = KEY_LEN_INET6; > + addr6->sin6_family = AF_INET6; > + ipv6_writemask(&mask6->sin6_addr, mlen); > + memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr)); > + APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr); > + if (mlen != 128) > + *set_mask = 1; > + else > + *set_mask = 0; > +#endif > + } > +} > + > +static int > +ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_radix *tb; > + struct radix_addr_entry *ent; > +#ifdef INET6 > + struct radix_addr_xentry *xent; > +#endif > + struct sockaddr *addr, *mask; > + int mlen, set_mask; > + > + tb = (struct ta_buf_radix *)ta_buf; > + > + mlen = tei->masklen; > + set_mask = 0; > + > + if (tei->subtype == AF_INET) { > +#ifdef INET > + if (mlen > 32) > + return (EINVAL); > + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); > + ent->masklen = mlen; > 
+ > + addr = (struct sockaddr *)&ent->addr; > + mask = (struct sockaddr *)&tb->addr.a4.ma; > + tb->ent_ptr = ent; > +#endif > +#ifdef INET6 > + } else if (tei->subtype == AF_INET6) { > + /* IPv6 case */ > + if (mlen > 128) > + return (EINVAL); > + xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); > + xent->masklen = mlen; > + > + addr = (struct sockaddr *)&xent->addr6; > + mask = (struct sockaddr *)&tb->addr.a6.ma; > + tb->ent_ptr = xent; > +#endif > + } else { > + /* Unknown CIDR type */ > + return (EINVAL); > + } > + > + tei_to_sockaddr_ent(tei, addr, mask, &set_mask); > + /* Set pointers */ > + tb->addr_ptr = addr; > + if (set_mask != 0) > + tb->mask_ptr = mask; > + > + return (0); > +} > + > +static int > +ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct radix_cfg *cfg; > + struct radix_node_head *rnh; > + struct radix_node *rn; > + struct ta_buf_radix *tb; > + uint32_t *old_value, value; > + > + cfg = (struct radix_cfg *)ta_state; > + tb = (struct ta_buf_radix *)ta_buf; > + > + /* Save current entry value from @tei */ > + if (tei->subtype == AF_INET) { > + rnh = ti->state; > + ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value; > + } else { > + rnh = ti->xstate; > + ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value; > + } > + > + /* Search for an entry first */ > + rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, rnh); > + if (rn != NULL) { > + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) > + return (EEXIST); > + /* Record already exists. 
Update value if we're asked to */ > + if (tei->subtype == AF_INET) > + old_value = &((struct radix_addr_entry *)rn)->value; > + else > + old_value = &((struct radix_addr_xentry *)rn)->value; > + > + value = *old_value; > + *old_value = tei->value; > + tei->value = value; > + > + /* Indicate that update has happened instead of addition */ > + tei->flags |= TEI_FLAGS_UPDATED; > + *pnum = 0; > + > + return (0); > + } > + > + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) > + return (EFBIG); > + > + rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, rnh, tb->ent_ptr); > + if (rn == NULL) { > + /* Unknown error */ > + return (EINVAL); > + } > + > + if (tei->subtype == AF_INET) > + cfg->count4++; > + else > + cfg->count6++; > + tb->ent_ptr = NULL; > + *pnum = 1; > + > + return (0); > +} > + > +static int > +ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_radix *tb; > + struct sockaddr *addr, *mask; > + int mlen, set_mask; > + > + tb = (struct ta_buf_radix *)ta_buf; > + > + mlen = tei->masklen; > + set_mask = 0; > + > + if (tei->subtype == AF_INET) { > + if (mlen > 32) > + return (EINVAL); > + > + addr = (struct sockaddr *)&tb->addr.a4.sa; > + mask = (struct sockaddr *)&tb->addr.a4.ma; > +#ifdef INET6 > + } else if (tei->subtype == AF_INET6) { > + if (mlen > 128) > + return (EINVAL); > + > + addr = (struct sockaddr *)&tb->addr.a6.sa; > + mask = (struct sockaddr *)&tb->addr.a6.ma; > +#endif > + } else > + return (EINVAL); > + > + tei_to_sockaddr_ent(tei, addr, mask, &set_mask); > + tb->addr_ptr = addr; > + if (set_mask != 0) > + tb->mask_ptr = mask; > + > + return (0); > +} > + > +static int > +ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct radix_cfg *cfg; > + struct radix_node_head *rnh; > + struct radix_node *rn; > + struct ta_buf_radix *tb; > + > + cfg = (struct radix_cfg *)ta_state; > + tb = (struct ta_buf_radix *)ta_buf; > + > + if 
(tei->subtype == AF_INET) > + rnh = ti->state; > + else > + rnh = ti->xstate; > + > + rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, rnh); > + > + if (rn == NULL) > + return (ENOENT); > + > + /* Save entry value to @tei */ > + if (tei->subtype == AF_INET) > + tei->value = ((struct radix_addr_entry *)rn)->value; > + else > + tei->value = ((struct radix_addr_xentry *)rn)->value; > + > + tb->ent_ptr = rn; > + > + if (tei->subtype == AF_INET) > + cfg->count4--; > + else > + cfg->count6--; > + *pnum = 1; > + > + return (0); > +} > + > +static void > +ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_radix *tb; > + > + tb = (struct ta_buf_radix *)ta_buf; > + > + if (tb->ent_ptr != NULL) > + free(tb->ent_ptr, M_IPFW_TBL); > +} > + > +static int > +ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, > + uint64_t *pflags) > +{ > + > + /* > + * radix does not require additional memory allocations > + * other than nodes itself. Adding new masks to the tree do > + * but we don't have any API to call (and we don't known which > + * sizes do we need). 
> + */ > + return (0); > +} > + > +struct table_algo addr_radix = { > + .name = "addr:radix", > + .type = IPFW_TABLE_ADDR, > + .flags = TA_FLAG_DEFAULT, > + .ta_buf_size = sizeof(struct ta_buf_radix), > + .init = ta_init_radix, > + .destroy = ta_destroy_radix, > + .prepare_add = ta_prepare_add_radix, > + .prepare_del = ta_prepare_del_radix, > + .add = ta_add_radix, > + .del = ta_del_radix, > + .flush_entry = ta_flush_radix_entry, > + .foreach = ta_foreach_radix, > + .dump_tentry = ta_dump_radix_tentry, > + .find_tentry = ta_find_radix_tentry, > + .dump_tinfo = ta_dump_radix_tinfo, > + .need_modify = ta_need_modify_radix, > +}; > + > + > +/* > + * addr:hash cmds > + * > + * > + * ti->data: > + * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] > + * [ 8][ 8[ 8][ 8] > + * > + * inv.mask4: 32 - mask > + * inv.mask6: > + * 1) _slow lookup: mask > + * 2) _aligned: (128 - mask) / 8 > + * 3) _64: 8 > + * > + * > + * pflags: > + * [v4=1/v6=0][hsize] > + * [ 32][ 32] > + */ > + > +struct chashentry; > + > +SLIST_HEAD(chashbhead, chashentry); > + > +struct chash_cfg { > + struct chashbhead *head4; > + struct chashbhead *head6; > + size_t size4; > + size_t size6; > + size_t items4; > + size_t items6; > + uint8_t mask4; > + uint8_t mask6; > +}; > + > +struct chashentry { > + SLIST_ENTRY(chashentry) next; > + uint32_t value; > + uint32_t type; > + union { > + uint32_t a4; /* Host format */ > + struct in6_addr a6; /* Network format */ > + } a; > +}; > + > +struct ta_buf_chash > +{ > + void *ent_ptr; > + struct chashentry ent; > +}; > + > +#ifdef INET > +static __inline uint32_t hash_ip(uint32_t addr, int hsize); > +#endif > +#ifdef INET6 > +static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize); > +static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize); > +static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, > + int mask, int hsize); > +static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, > + int 
hsize); > +#endif > +static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > +static int ta_lookup_chash_aligned(struct table_info *ti, void *key, > + uint32_t keylen, uint32_t *val); > +static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > +static int chash_parse_opts(struct chash_cfg *cfg, char *data); > +static void ta_print_chash_config(void *ta_state, struct table_info *ti, > + char *buf, size_t bufsize); > +static int ta_log2(uint32_t v); > +static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, > + struct table_info *ti, char *data, uint8_t tflags); > +static void ta_destroy_chash(void *ta_state, struct table_info *ti); > +static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, > + void *e, ipfw_obj_tentry *tent); > +static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, > + uint32_t size); > +static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent); > +static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent); > +static void ta_foreach_chash(void *ta_state, struct table_info *ti, > + ta_foreach_f *f, void *arg); > +static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_add_chash(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_del_chash(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_need_modify_chash(void *ta_state, struct table_info *ti, > + uint32_t count, uint64_t 
*pflags); > +static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags); > +static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t *pflags); > +static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags); > +static void ta_flush_mod_chash(void *ta_buf); > + > + > +#ifdef INET > +static __inline uint32_t > +hash_ip(uint32_t addr, int hsize) > +{ > + > + return (addr % (hsize - 1)); > +} > +#endif > + > +#ifdef INET6 > +static __inline uint32_t > +hash_ip6(struct in6_addr *addr6, int hsize) > +{ > + uint32_t i; > + > + i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^ > + addr6->s6_addr32[2] ^ addr6->s6_addr32[3]; > + > + return (i % (hsize - 1)); > +} > + > + > +static __inline uint16_t > +hash_ip64(struct in6_addr *addr6, int hsize) > +{ > + uint32_t i; > + > + i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1]; > + > + return (i % (hsize - 1)); > +} > + > + > +static __inline uint32_t > +hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize) > +{ > + struct in6_addr mask6; > + > + ipv6_writemask(&mask6, mask); > + memcpy(addr6, key, sizeof(struct in6_addr)); > + APPLY_MASK(addr6, &mask6); > + return (hash_ip6(addr6, hsize)); > +} > + > +static __inline uint32_t > +hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize) > +{ > + uint64_t *paddr; > + > + paddr = (uint64_t *)addr6; > + *paddr = 0; > + *(paddr + 1) = 0; > + memcpy(addr6, key, mask); > + return (hash_ip6(addr6, hsize)); > +} > +#endif > + > +static int > +ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct chashbhead *head; > + struct chashentry *ent; > + uint16_t hash, hsize; > + uint8_t imask; > + > + if (keylen == sizeof(in_addr_t)) { > +#ifdef INET > + head = (struct chashbhead *)ti->state; > + imask = ti->data >> 24; > + hsize = 1 << ((ti->data & 0xFFFF) >> 8); > + uint32_t a; > + a = ntohl(*((in_addr_t *)key)); > + a = a >> imask; > + 
hash = hash_ip(a, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + if (ent->a.a4 == a) { > + *val = ent->value; > + return (1); > + } > + } > +#endif > + } else { > +#ifdef INET6 > + /* IPv6: worst scenario: non-round mask */ > + struct in6_addr addr6; > + head = (struct chashbhead *)ti->xstate; > + imask = (ti->data & 0xFF0000) >> 16; > + hsize = 1 << (ti->data & 0xFF); > + hash = hash_ip6_slow(&addr6, key, imask, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + if (memcmp(&ent->a.a6, &addr6, 16) == 0) { > + *val = ent->value; > + return (1); > + } > + } > +#endif > + } > + > + return (0); > +} > + > +static int > +ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct chashbhead *head; > + struct chashentry *ent; > + uint16_t hash, hsize; > + uint8_t imask; > + > + if (keylen == sizeof(in_addr_t)) { > +#ifdef INET > + head = (struct chashbhead *)ti->state; > + imask = ti->data >> 24; > + hsize = 1 << ((ti->data & 0xFFFF) >> 8); > + uint32_t a; > + a = ntohl(*((in_addr_t *)key)); > + a = a >> imask; > + hash = hash_ip(a, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + if (ent->a.a4 == a) { > + *val = ent->value; > + return (1); > + } > + } > +#endif > + } else { > +#ifdef INET6 > + /* IPv6: aligned to 8bit mask */ > + struct in6_addr addr6; > + uint64_t *paddr, *ptmp; > + head = (struct chashbhead *)ti->xstate; > + imask = (ti->data & 0xFF0000) >> 16; > + hsize = 1 << (ti->data & 0xFF); > + > + hash = hash_ip6_al(&addr6, key, imask, hsize); > + paddr = (uint64_t *)&addr6; > + SLIST_FOREACH(ent, &head[hash], next) { > + ptmp = (uint64_t *)&ent->a.a6; > + if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) { > + *val = ent->value; > + return (1); > + } > + } > +#endif > + } > + > + return (0); > +} > + > +static int > +ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct chashbhead *head; > + struct chashentry *ent; > + uint16_t hash, hsize; > 
+ uint8_t imask; > + > + if (keylen == sizeof(in_addr_t)) { > +#ifdef INET > + head = (struct chashbhead *)ti->state; > + imask = ti->data >> 24; > + hsize = 1 << ((ti->data & 0xFFFF) >> 8); > + uint32_t a; > + a = ntohl(*((in_addr_t *)key)); > + a = a >> imask; > + hash = hash_ip(a, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + if (ent->a.a4 == a) { > + *val = ent->value; > + return (1); > + } > + } > +#endif > + } else { > +#ifdef INET6 > + /* IPv6: /64 */ > + uint64_t a6, *paddr; > + head = (struct chashbhead *)ti->xstate; > + paddr = (uint64_t *)key; > + hsize = 1 << (ti->data & 0xFF); > + a6 = *paddr; > + hash = hash_ip64((struct in6_addr *)key, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + paddr = (uint64_t *)&ent->a.a6; > + if (a6 == *paddr) { > + *val = ent->value; > + return (1); > + } > + } > +#endif > + } > + > + return (0); > +} > + > +static int > +chash_parse_opts(struct chash_cfg *cfg, char *data) > +{ > + char *pdel, *pend, *s; > + int mask4, mask6; > + > + mask4 = cfg->mask4; > + mask6 = cfg->mask6; > + > + if (data == NULL) > + return (0); > + if ((pdel = strchr(data, ' ')) == NULL) > + return (0); > + while (*pdel == ' ') > + pdel++; > + if (strncmp(pdel, "masks=", 6) != 0) > + return (EINVAL); > + if ((s = strchr(pdel, ' ')) != NULL) > + *s++ = '\0'; > + > + pdel += 6; > + /* Need /XX[,/YY] */ > + if (*pdel++ != '/') > + return (EINVAL); > + mask4 = strtol(pdel, &pend, 10); > + if (*pend == ',') { > + /* ,/YY */ > + pdel = pend + 1; > + if (*pdel++ != '/') > + return (EINVAL); > + mask6 = strtol(pdel, &pend, 10); > + if (*pend != '\0') > + return (EINVAL); > + } else if (*pend != '\0') > + return (EINVAL); > + > + if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128) > + return (EINVAL); > + > + cfg->mask4 = mask4; > + cfg->mask6 = mask6; > + > + return (0); > +} > + > +static void > +ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, > + size_t bufsize) > +{ > + struct chash_cfg *cfg; > + > + cfg = 
(struct chash_cfg *)ta_state; > + > + if (cfg->mask4 != 32 || cfg->mask6 != 128) > + snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash", > + cfg->mask4, cfg->mask6); > + else > + snprintf(buf, bufsize, "%s", "addr:hash"); > +} > + > +static int > +ta_log2(uint32_t v) > +{ > + uint32_t r; > + > + r = 0; > + while (v >>= 1) > + r++; > + > + return (r); > +} > + > +/* > + * New table. > + * We assume 'data' to be either NULL or the following format: > + * 'addr:hash [masks=/32[,/128]]' > + */ > +static int > +ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, > + char *data, uint8_t tflags) > +{ > + int error, i; > + uint32_t hsize; > + struct chash_cfg *cfg; > + > + cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO); > + > + cfg->mask4 = 32; > + cfg->mask6 = 128; > + > + if ((error = chash_parse_opts(cfg, data)) != 0) { > + free(cfg, M_IPFW); > + return (error); > + } > + > + cfg->size4 = 128; > + cfg->size6 = 128; > + > + cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW, > + M_WAITOK | M_ZERO); > + cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW, > + M_WAITOK | M_ZERO); > + for (i = 0; i < cfg->size4; i++) > + SLIST_INIT(&cfg->head4[i]); > + for (i = 0; i < cfg->size6; i++) > + SLIST_INIT(&cfg->head6[i]); > + > + > + *ta_state = cfg; > + ti->state = cfg->head4; > + ti->xstate = cfg->head6; > + > + /* Store data depending on v6 mask length */ > + hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); > + if (cfg->mask6 == 64) { > + ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16| > + hsize; > + ti->lookup = ta_lookup_chash_64; > + } else if ((cfg->mask6 % 8) == 0) { > + ti->data = (32 - cfg->mask4) << 24 | > + cfg->mask6 << 13 | hsize; > + ti->lookup = ta_lookup_chash_aligned; > + } else { > + /* don't do that! 
*/ > + ti->data = (32 - cfg->mask4) << 24 | > + cfg->mask6 << 16 | hsize; > + ti->lookup = ta_lookup_chash_slow; > + } > + > + return (0); > +} > + > +static void > +ta_destroy_chash(void *ta_state, struct table_info *ti) > +{ > + struct chash_cfg *cfg; > + struct chashentry *ent, *ent_next; > + int i; > + > + cfg = (struct chash_cfg *)ta_state; > + > + for (i = 0; i < cfg->size4; i++) > + SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) > + free(ent, M_IPFW_TBL); > + > + for (i = 0; i < cfg->size6; i++) > + SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) > + free(ent, M_IPFW_TBL); > + > + free(cfg->head4, M_IPFW); > + free(cfg->head6, M_IPFW); > + > + free(cfg, M_IPFW); > +} > + > +static void > +ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) > +{ > + struct chash_cfg *cfg; > + > + cfg = (struct chash_cfg *)ta_state; > + > + tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; > + tinfo->taclass4 = IPFW_TACLASS_HASH; > + tinfo->size4 = cfg->size4; > + tinfo->count4 = cfg->items4; > + tinfo->itemsize4 = sizeof(struct chashentry); > + tinfo->taclass6 = IPFW_TACLASS_HASH; > + tinfo->size6 = cfg->size6; > + tinfo->count6 = cfg->items6; > + tinfo->itemsize6 = sizeof(struct chashentry); > +} > + > +static int > +ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent) > +{ > + struct chash_cfg *cfg; > + struct chashentry *ent; > + > + cfg = (struct chash_cfg *)ta_state; > + ent = (struct chashentry *)e; > + > + if (ent->type == AF_INET) { > + tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4)); > + tent->masklen = cfg->mask4; > + tent->subtype = AF_INET; > + tent->v.kidx = ent->value; > +#ifdef INET6 > + } else { > + memcpy(&tent->k, &ent->a.a6, sizeof(struct in6_addr)); > + tent->masklen = cfg->mask6; > + tent->subtype = AF_INET6; > + tent->v.kidx = ent->value; > +#endif > + } > + > + return (0); > +} > + > +static uint32_t > +hash_ent(struct chashentry *ent, int af, 
int mlen, uint32_t size) > +{ > + uint32_t hash; > + > + hash = 0; > + > + if (af == AF_INET) { > +#ifdef INET > + hash = hash_ip(ent->a.a4, size); > +#endif > + } else { > +#ifdef INET6 > + if (mlen == 64) > + hash = hash_ip64(&ent->a.a6, size); > + else > + hash = hash_ip6(&ent->a.a6, size); > +#endif > + } > + > + return (hash); > +} > + > +static int > +tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent) > +{ > + int mlen; > +#ifdef INET6 > + struct in6_addr mask6; > +#endif > + > + > + mlen = tei->masklen; > + > + if (tei->subtype == AF_INET) { > +#ifdef INET > + if (mlen > 32) > + return (EINVAL); > + ent->type = AF_INET; > + > + /* Calculate masked address */ > + ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen); > +#endif > +#ifdef INET6 > + } else if (tei->subtype == AF_INET6) { > + /* IPv6 case */ > + if (mlen > 128) > + return (EINVAL); > + ent->type = AF_INET6; > + > + ipv6_writemask(&mask6, mlen); > + memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr)); > + APPLY_MASK(&ent->a.a6, &mask6); > +#endif > + } else { > + /* Unknown CIDR type */ > + return (EINVAL); > + } > + > + return (0); > +} > + > +static int > +ta_find_chash_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent) > +{ > + struct chash_cfg *cfg; > + struct chashbhead *head; > + struct chashentry ent, *tmp; > + struct tentry_info tei; > + int error; > + uint32_t hash; > + > + cfg = (struct chash_cfg *)ta_state; > + > + memset(&ent, 0, sizeof(ent)); > + memset(&tei, 0, sizeof(tei)); > + > + if (tent->subtype == AF_INET) { > + tei.paddr = &tent->k.addr; > + tei.masklen = cfg->mask4; > + tei.subtype = AF_INET; > + > + if ((error = tei_to_chash_ent(&tei, &ent)) != 0) > + return (error); > + > + head = cfg->head4; > + hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4); > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (tmp->a.a4 != ent.a.a4) > + continue; > + > + ta_dump_chash_tentry(ta_state, ti, tmp, tent); > + 
return (0); > + } > + } else { > + tei.paddr = &tent->k.addr6; > + tei.masklen = cfg->mask6; > + tei.subtype = AF_INET6; > + > + if ((error = tei_to_chash_ent(&tei, &ent)) != 0) > + return (error); > + > + head = cfg->head6; > + hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6); > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0) > + continue; > + ta_dump_chash_tentry(ta_state, ti, tmp, tent); > + return (0); > + } > + } > + > + return (ENOENT); > +} > + > +static void > +ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg) > +{ > + struct chash_cfg *cfg; > + struct chashentry *ent, *ent_next; > + int i; > + > + cfg = (struct chash_cfg *)ta_state; > + > + for (i = 0; i < cfg->size4; i++) > + SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) > + f(ent, arg); > + > + for (i = 0; i < cfg->size6; i++) > + SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) > + f(ent, arg); > +} > + > +static int > +ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_chash *tb; > + struct chashentry *ent; > + int error; > + > + tb = (struct ta_buf_chash *)ta_buf; > + > + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); > + > + error = tei_to_chash_ent(tei, ent); > + if (error != 0) { > + free(ent, M_IPFW_TBL); > + return (error); > + } > + tb->ent_ptr = ent; > + > + return (0); > +} > + > +static int > +ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct chash_cfg *cfg; > + struct chashbhead *head; > + struct chashentry *ent, *tmp; > + struct ta_buf_chash *tb; > + int exists; > + uint32_t hash, value; > + > + cfg = (struct chash_cfg *)ta_state; > + tb = (struct ta_buf_chash *)ta_buf; > + ent = (struct chashentry *)tb->ent_ptr; > + hash = 0; > + exists = 0; > + > + /* Read current value from @tei */ > + ent->value = tei->value; 
> + > + /* Read cuurrent value */ > + if (tei->subtype == AF_INET) { > + if (tei->masklen != cfg->mask4) > + return (EINVAL); > + head = cfg->head4; > + hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); > + > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (tmp->a.a4 == ent->a.a4) { > + exists = 1; > + break; > + } > + } > + } else { > + if (tei->masklen != cfg->mask6) > + return (EINVAL); > + head = cfg->head6; > + hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) { > + exists = 1; > + break; > + } > + } > + } > + > + if (exists == 1) { > + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) > + return (EEXIST); > + /* Record already exists. Update value if we're asked to */ > + value = tmp->value; > + tmp->value = tei->value; > + tei->value = value; > + /* Indicate that update has happened instead of addition */ > + tei->flags |= TEI_FLAGS_UPDATED; > + *pnum = 0; > + } else { > + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) > + return (EFBIG); > + SLIST_INSERT_HEAD(&head[hash], ent, next); > + tb->ent_ptr = NULL; > + *pnum = 1; > + > + /* Update counters */ > + if (tei->subtype == AF_INET) > + cfg->items4++; > + else > + cfg->items6++; > + } > + > + return (0); > +} > + > +static int > +ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_chash *tb; > + > + tb = (struct ta_buf_chash *)ta_buf; > + > + return (tei_to_chash_ent(tei, &tb->ent)); > +} > + > +static int > +ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct chash_cfg *cfg; > + struct chashbhead *head; > + struct chashentry *tmp, *tmp_next, *ent; > + struct ta_buf_chash *tb; > + uint32_t hash; > + > + cfg = (struct chash_cfg *)ta_state; > + tb = (struct ta_buf_chash *)ta_buf; > + ent = &tb->ent; > + > + if (tei->subtype == 
AF_INET) { > + if (tei->masklen != cfg->mask4) > + return (EINVAL); > + head = cfg->head4; > + hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); > + > + SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { > + if (tmp->a.a4 != ent->a.a4) > + continue; > + > + SLIST_REMOVE(&head[hash], tmp, chashentry, next); > + cfg->items4--; > + tb->ent_ptr = tmp; > + tei->value = tmp->value; > + *pnum = 1; > + return (0); > + } > + } else { > + if (tei->masklen != cfg->mask6) > + return (EINVAL); > + head = cfg->head6; > + hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); > + SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { > + if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0) > + continue; > + > + SLIST_REMOVE(&head[hash], tmp, chashentry, next); > + cfg->items6--; > + tb->ent_ptr = tmp; > + tei->value = tmp->value; > + *pnum = 1; > + return (0); > + } > + } > + > + return (ENOENT); > +} > + > +static void > +ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_chash *tb; > + > + tb = (struct ta_buf_chash *)ta_buf; > + > + if (tb->ent_ptr != NULL) > + free(tb->ent_ptr, M_IPFW_TBL); > +} > + > +/* > + * Hash growing callbacks. > + */ > + > +static int > +ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, > + uint64_t *pflags) > +{ > + struct chash_cfg *cfg; > + uint64_t data; > + > + /* > + * Since we don't know exact number of IPv4/IPv6 records in @count, > + * ignore non-zero @count value at all. Check current hash sizes > + * and return appropriate data. > + */ > + > + cfg = (struct chash_cfg *)ta_state; > + > + data = 0; > + if (cfg->items4 > cfg->size4 && cfg->size4 < 65536) > + data |= (cfg->size4 * 2) << 16; > + if (cfg->items6 > cfg->size6 && cfg->size6 < 65536) > + data |= cfg->size6 * 2; > + > + if (data != 0) { > + *pflags = data; > + return (1); > + } > + > + return (0); > +} > + > +/* > + * Allocate new, larger chash. 
> + */ > +static int > +ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags) > +{ > + struct mod_item *mi; > + struct chashbhead *head; > + int i; > + > + mi = (struct mod_item *)ta_buf; > + > + memset(mi, 0, sizeof(struct mod_item)); > + mi->size = (*pflags >> 16) & 0xFFFF; > + mi->size6 = *pflags & 0xFFFF; > + if (mi->size > 0) { > + head = malloc(sizeof(struct chashbhead) * mi->size, > + M_IPFW, M_WAITOK | M_ZERO); > + for (i = 0; i < mi->size; i++) > + SLIST_INIT(&head[i]); > + mi->main_ptr = head; > + } > + > + if (mi->size6 > 0) { > + head = malloc(sizeof(struct chashbhead) * mi->size6, > + M_IPFW, M_WAITOK | M_ZERO); > + for (i = 0; i < mi->size6; i++) > + SLIST_INIT(&head[i]); > + mi->main_ptr6 = head; > + } > + > + return (0); > +} > + > +/* > + * Copy data from old runtime array to new one. > + */ > +static int > +ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t *pflags) > +{ > + > + /* In is not possible to do rehash if we're not holidng WLOCK. */ > + return (0); > +} > + > +/* > + * Switch old & new arrays. 
> + */ > +static void > +ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags) > +{ > + struct mod_item *mi; > + struct chash_cfg *cfg; > + struct chashbhead *old_head, *new_head; > + struct chashentry *ent, *ent_next; > + int af, i, mlen; > + uint32_t nhash; > + size_t old_size, new_size; > + > + mi = (struct mod_item *)ta_buf; > + cfg = (struct chash_cfg *)ta_state; > + > + /* Check which hash we need to grow and do we still need that */ > + if (mi->size > 0 && cfg->size4 < mi->size) { > + new_head = (struct chashbhead *)mi->main_ptr; > + new_size = mi->size; > + old_size = cfg->size4; > + old_head = ti->state; > + mlen = cfg->mask4; > + af = AF_INET; > + > + for (i = 0; i < old_size; i++) { > + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { > + nhash = hash_ent(ent, af, mlen, new_size); > + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); > + } > + } > + > + ti->state = new_head; > + cfg->head4 = new_head; > + cfg->size4 = mi->size; > + mi->main_ptr = old_head; > + } > + > + if (mi->size6 > 0 && cfg->size6 < mi->size6) { > + new_head = (struct chashbhead *)mi->main_ptr6; > + new_size = mi->size6; > + old_size = cfg->size6; > + old_head = ti->xstate; > + mlen = cfg->mask6; > + af = AF_INET6; > + > + for (i = 0; i < old_size; i++) { > + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { > + nhash = hash_ent(ent, af, mlen, new_size); > + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); > + } > + } > + > + ti->xstate = new_head; > + cfg->head6 = new_head; > + cfg->size6 = mi->size6; > + mi->main_ptr6 = old_head; > + } > + > + /* Update lower 32 bits with new values */ > + ti->data &= 0xFFFFFFFF00000000; > + ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); > +} > + > +/* > + * Free unneded array. 
> + */ > +static void > +ta_flush_mod_chash(void *ta_buf) > +{ > + struct mod_item *mi; > + > + mi = (struct mod_item *)ta_buf; > + if (mi->main_ptr != NULL) > + free(mi->main_ptr, M_IPFW); > + if (mi->main_ptr6 != NULL) > + free(mi->main_ptr6, M_IPFW); > +} > + > +struct table_algo addr_hash = { > + .name = "addr:hash", > + .type = IPFW_TABLE_ADDR, > + .ta_buf_size = sizeof(struct ta_buf_chash), > + .init = ta_init_chash, > + .destroy = ta_destroy_chash, > + .prepare_add = ta_prepare_add_chash, > + .prepare_del = ta_prepare_del_chash, > + .add = ta_add_chash, > + .del = ta_del_chash, > + .flush_entry = ta_flush_chash_entry, > + .foreach = ta_foreach_chash, > + .dump_tentry = ta_dump_chash_tentry, > + .find_tentry = ta_find_chash_tentry, > + .print_config = ta_print_chash_config, > + .dump_tinfo = ta_dump_chash_tinfo, > + .need_modify = ta_need_modify_chash, > + .prepare_mod = ta_prepare_mod_chash, > + .fill_mod = ta_fill_mod_chash, > + .modify = ta_modify_chash, > + .flush_mod = ta_flush_mod_chash, > +}; > + > + > +/* > + * Iface table cmds. > + * > + * Implementation: > + * > + * Runtime part: > + * - sorted array of "struct ifidx" pointed by ti->state. > + * Array is allocated with rounding up to IFIDX_CHUNK. Only existing > + * interfaces are stored in array, however its allocated size is > + * sufficient to hold all table records if needed. > + * - current array size is stored in ti->data > + * > + * Table data: > + * - "struct iftable_cfg" is allocated to store table state (ta_state). > + * - All table records are stored inside namedobj instance. 
> + * > + */ > + > +struct ifidx { > + uint16_t kidx; > + uint16_t spare; > + uint32_t value; > +}; > +#define DEFAULT_IFIDX_SIZE 64 > + > +struct iftable_cfg; > + > +struct ifentry { > + struct named_object no; > + struct ipfw_ifc ic; > + struct iftable_cfg *icfg; > + uint32_t value; > + int linked; > +}; > + > +struct iftable_cfg { > + struct namedobj_instance *ii; > + struct ip_fw_chain *ch; > + struct table_info *ti; > + void *main_ptr; > + size_t size; /* Number of items allocated in array */ > + size_t count; /* Number of all items */ > + size_t used; /* Number of items _active_ now */ > +}; > + > +struct ta_buf_ifidx > +{ > + struct ifentry *ife; > + uint32_t value; > +}; > + > +int compare_ifidx(const void *k, const void *v); > +static struct ifidx * ifidx_find(struct table_info *ti, void *key); > +static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > +static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, > + struct table_info *ti, char *data, uint8_t tflags); > +static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti); > +static void destroy_ifidx_locked(struct namedobj_instance *ii, > + struct named_object *no, void *arg); > +static void ta_destroy_ifidx(void *ta_state, struct table_info *ti); > +static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_add_ifidx(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_del_ifidx(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, > + struct tentry_info *tei, void *ta_buf); > +static void if_notifier(struct 
ip_fw_chain *ch, void *cbdata, uint16_t ifindex); > +static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, > + uint32_t count, uint64_t *pflags); > +static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags); > +static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, > + void *ta_buf, uint64_t *pflags); > +static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags); > +static void ta_flush_mod_ifidx(void *ta_buf); > +static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent); > +static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent); > +static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, > + void *arg); > +static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, > + ta_foreach_f *f, void *arg); > + > +int > +compare_ifidx(const void *k, const void *v) > +{ > + const struct ifidx *ifidx; > + uint16_t key; > + > + key = *((const uint16_t *)k); > + ifidx = (const struct ifidx *)v; > + > + if (key < ifidx->kidx) > + return (-1); > + else if (key > ifidx->kidx) > + return (1); > + > + return (0); > +} > + > +/* > + * Adds item @item with key @key into ascending-sorted array @base. > + * Assumes @base has enough additional storage. > + * > + * Returns 1 on success, 0 on duplicate key. 
/*
 * Insert @item (identified by @key) into the ascending-sorted array @base
 * holding @nmemb elements of @size bytes each, keeping the order intact.
 * The caller must guarantee room for one more element behind the last one.
 *
 * Returns 1 if the item was stored, 0 if an element comparing equal to
 * @key is already present (the array is left untouched in that case).
 */
static int
badd(const void *key, void *item, void *base, size_t nmemb,
    size_t size, int (*compar) (const void *, const void *))
{
	char *first = base;
	char *slot;
	int lo, hi, probe, pos, order;

	/* Empty array: the new item simply becomes element 0. */
	if (nmemb == 0) {
		memcpy(base, item, size);
		return (1);
	}

	/* Binary search for the key; bail out on an exact match. */
	lo = 0;
	hi = nmemb - 1;
	probe = 0;
	for (;;) {
		if (lo > hi)
			break;

		probe = (lo + hi) / 2;
		order = compar(key, (const void *)(first + probe * size));
		if (order == 0)
			return (0);

		if (order > 0)
			lo = probe + 1;
		else
			hi = probe - 1;
	}

	/*
	 * Not found: the element probed last is the neighbour of the
	 * insertion point; decide on which side of it the key belongs.
	 */
	order = compar(key, (const void *)(first + probe * size));
	pos = (order > 0) ? probe + 1 : probe;

	/* Open a gap by shifting the tail up by one element. */
	slot = first + pos * size;
	if (nmemb > pos)
		memmove(slot + size, slot, (nmemb - pos) * size);

	memcpy(slot, item, size);

	return (1);
}

/*
 * Deletes item with key @key from ascending-sorted array @base.
 *
 * Returns 1 on success, 0 for non-existent key.
 */
> + */ > +static int > +bdel(const void *key, void *base, size_t nmemb, size_t size, > + int (*compar) (const void *, const void *)) > +{ > + caddr_t item; > + size_t sz; > + > + item = (caddr_t)bsearch(key, base, nmemb, size, compar); > + > + if (item == NULL) > + return (0); > + > + sz = (caddr_t)base + nmemb * size - item; > + > + if (sz > 0) > + memmove(item, item + size, sz); > + > + return (1); > +} > + > +static struct ifidx * > +ifidx_find(struct table_info *ti, void *key) > +{ > + struct ifidx *ifi; > + > + ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), > + compare_ifidx); > + > + return (ifi); > +} > + > +static int > +ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct ifidx *ifi; > + > + ifi = ifidx_find(ti, key); > + > + if (ifi != NULL) { > + *val = ifi->value; > + return (1); > + } > + > + return (0); > +} > + > +static int > +ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, > + char *data, uint8_t tflags) > +{ > + struct iftable_cfg *icfg; > + > + icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); > + > + icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); > + icfg->size = DEFAULT_IFIDX_SIZE; > + icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, > + M_WAITOK | M_ZERO); > + icfg->ch = ch; > + > + *ta_state = icfg; > + ti->state = icfg->main_ptr; > + ti->lookup = ta_lookup_ifidx; > + > + return (0); > +} > + > +/* > + * Handle tableinfo @ti pointer change (on table array resize). 
> + */ > +static void > +ta_change_ti_ifidx(void *ta_state, struct table_info *ti) > +{ > + struct iftable_cfg *icfg; > + > + icfg = (struct iftable_cfg *)ta_state; > + icfg->ti = ti; > +} > + > +static void > +destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, > + void *arg) > +{ > + struct ifentry *ife; > + struct ip_fw_chain *ch; > + > + ch = (struct ip_fw_chain *)arg; > + ife = (struct ifentry *)no; > + > + ipfw_iface_del_notify(ch, &ife->ic); > + free(ife, M_IPFW_TBL); > +} > + > + > +/* > + * Destroys table @ti > + */ > +static void > +ta_destroy_ifidx(void *ta_state, struct table_info *ti) > +{ > + struct iftable_cfg *icfg; > + struct ip_fw_chain *ch; > + > + icfg = (struct iftable_cfg *)ta_state; > + ch = icfg->ch; > + > + if (icfg->main_ptr != NULL) > + free(icfg->main_ptr, M_IPFW); > + > + ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); > + > + ipfw_objhash_destroy(icfg->ii); > + > + free(icfg, M_IPFW); > +} > + > +/* > + * Provide algo-specific table info > + */ > +static void > +ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) > +{ > + struct iftable_cfg *cfg; > + > + cfg = (struct iftable_cfg *)ta_state; > + > + tinfo->taclass4 = IPFW_TACLASS_ARRAY; > + tinfo->size4 = cfg->size; > + tinfo->count4 = cfg->used; > + tinfo->itemsize4 = sizeof(struct ifidx); > +} > + > +/* > + * Prepare state to add to the table: > + * allocate ifentry and reference needed interface. 
> + */ > +static int > +ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_ifidx *tb; > + char *ifname; > + struct ifentry *ife; > + > + tb = (struct ta_buf_ifidx *)ta_buf; > + > + /* Check if string is terminated */ > + ifname = (char *)tei->paddr; > + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) > + return (EINVAL); > + > + ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); > + ife->ic.cb = if_notifier; > + ife->ic.cbdata = ife; > + > + if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { > + free(ife, M_IPFW_TBL); > + return (EINVAL); > + } > + > + /* Use ipfw_iface 'ifname' field as stable storage */ > + ife->no.name = ife->ic.iface->ifname; > + > + tb->ife = ife; > + > + return (0); > +} > + > +static int > +ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct iftable_cfg *icfg; > + struct ifentry *ife, *tmp; > + struct ta_buf_ifidx *tb; > + struct ipfw_iface *iif; > + struct ifidx *ifi; > + char *ifname; > + uint32_t value; > + > + tb = (struct ta_buf_ifidx *)ta_buf; > + ifname = (char *)tei->paddr; > + icfg = (struct iftable_cfg *)ta_state; > + ife = tb->ife; > + > + ife->icfg = icfg; > + ife->value = tei->value; > + > + tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); > + > + if (tmp != NULL) { > + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) > + return (EEXIST); > + > + /* Exchange values in @tmp and @tei */ > + value = tmp->value; > + tmp->value = tei->value; > + tei->value = value; > + > + iif = tmp->ic.iface; > + if (iif->resolved != 0) { > + /* We have to update runtime value, too */ > + ifi = ifidx_find(ti, &iif->ifindex); > + ifi->value = ife->value; > + } > + > + /* Indicate that update has happened instead of addition */ > + tei->flags |= TEI_FLAGS_UPDATED; > + *pnum = 0; > + return (0); > + } > + > + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) > + return (EFBIG); > + > + /* Link 
to internal list */ > + ipfw_objhash_add(icfg->ii, &ife->no); > + > + /* Link notifier (possible running its callback) */ > + ipfw_iface_add_notify(icfg->ch, &ife->ic); > + icfg->count++; > + > + tb->ife = NULL; > + *pnum = 1; > + > + return (0); > +} > + > +/* > + * Prepare to delete key from table. > + * Do basic interface name checks. > + */ > +static int > +ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_ifidx *tb; > + char *ifname; > + > + tb = (struct ta_buf_ifidx *)ta_buf; > + > + /* Check if string is terminated */ > + ifname = (char *)tei->paddr; > + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) > + return (EINVAL); > + > + return (0); > +} > + > +/* > + * Remove key from both configuration list and > + * runtime array. Removed interface notification. > + */ > +static int > +ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct iftable_cfg *icfg; > + struct ifentry *ife; > + struct ta_buf_ifidx *tb; > + char *ifname; > + uint16_t ifindex; > + int res; > + > + tb = (struct ta_buf_ifidx *)ta_buf; > + ifname = (char *)tei->paddr; > + icfg = (struct iftable_cfg *)ta_state; > + ife = tb->ife; > + > + ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); > + > + if (ife == NULL) > + return (ENOENT); > + > + if (ife->linked != 0) { > + /* We have to remove item from runtime */ > + ifindex = ife->ic.iface->ifindex; > + > + res = bdel(&ifindex, icfg->main_ptr, icfg->used, > + sizeof(struct ifidx), compare_ifidx); > + > + KASSERT(res == 1, ("index %d does not exist", ifindex)); > + icfg->used--; > + ti->data = icfg->used; > + ife->linked = 0; > + } > + > + /* Unlink from local list */ > + ipfw_objhash_del(icfg->ii, &ife->no); > + /* Unlink notifier */ > + ipfw_iface_del_notify(icfg->ch, &ife->ic); > + > + icfg->count--; > + tei->value = ife->value; > + > + tb->ife = ife; > + *pnum = 1; > + > + return (0); > +} > 
+ > +/* > + * Flush deleted entry. > + * Drops interface reference and frees entry. > + */ > +static void > +ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_ifidx *tb; > + > + tb = (struct ta_buf_ifidx *)ta_buf; > + > + if (tb->ife != NULL) { > + /* Unlink first */ > + ipfw_iface_unref(ch, &tb->ife->ic); > + free(tb->ife, M_IPFW_TBL); > + } > +} > + > + > +/* > + * Handle interface announce/withdrawal for particular table. > + * Every real runtime array modification happens here. > + */ > +static void > +if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex) > +{ > + struct ifentry *ife; > + struct ifidx ifi; > + struct iftable_cfg *icfg; > + struct table_info *ti; > + int res; > + > + ife = (struct ifentry *)cbdata; > + icfg = ife->icfg; > + ti = icfg->ti; > + > + KASSERT(ti != NULL, ("ti=NULL, check change_ti handler")); > + > + if (ife->linked == 0 && ifindex != 0) { > + /* Interface announce */ > + ifi.kidx = ifindex; > + ifi.spare = 0; > + ifi.value = ife->value; > + res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used, > + sizeof(struct ifidx), compare_ifidx); > + KASSERT(res == 1, ("index %d already exists", ifindex)); > + icfg->used++; > + ti->data = icfg->used; > + ife->linked = 1; > + } else if (ife->linked != 0 && ifindex == 0) { > + /* Interface withdrawal */ > + ifindex = ife->ic.iface->ifindex; > + > + res = bdel(&ifindex, icfg->main_ptr, icfg->used, > + sizeof(struct ifidx), compare_ifidx); > + > + KASSERT(res == 1, ("index %d does not exist", ifindex)); > + icfg->used--; > + ti->data = icfg->used; > + ife->linked = 0; > + } > +} > + > + > +/* > + * Table growing callbacks. 
> + */ > + > +static int > +ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, > + uint64_t *pflags) > +{ > + struct iftable_cfg *cfg; > + uint32_t size; > + > + cfg = (struct iftable_cfg *)ta_state; > + > + size = cfg->size; > + while (size < cfg->count + count) > + size *= 2; > + > + if (size != cfg->size) { > + *pflags = size; > + return (1); > + } > + > + return (0); > +} > + > +/* > + * Allocate ned, larger runtime ifidx array. > + */ > +static int > +ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags) > +{ > + struct mod_item *mi; > + > + mi = (struct mod_item *)ta_buf; > + > + memset(mi, 0, sizeof(struct mod_item)); > + mi->size = *pflags; > + mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW, > + M_WAITOK | M_ZERO); > + > + return (0); > +} > + > +/* > + * Copy data from old runtime array to new one. > + */ > +static int > +ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t *pflags) > +{ > + struct mod_item *mi; > + struct iftable_cfg *icfg; > + > + mi = (struct mod_item *)ta_buf; > + icfg = (struct iftable_cfg *)ta_state; > + > + /* Check if we still need to grow array */ > + if (icfg->size >= mi->size) { > + *pflags = 0; > + return (0); > + } > + > + memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); > + > + return (0); > +} > + > +/* > + * Switch old & new arrays. > + */ > +static void > +ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags) > +{ > + struct mod_item *mi; > + struct iftable_cfg *icfg; > + void *old_ptr; > + > + mi = (struct mod_item *)ta_buf; > + icfg = (struct iftable_cfg *)ta_state; > + > + old_ptr = icfg->main_ptr; > + icfg->main_ptr = mi->main_ptr; > + icfg->size = mi->size; > + ti->state = icfg->main_ptr; > + > + mi->main_ptr = old_ptr; > +} > + > +/* > + * Free unneded array. 
> + */ > +static void > +ta_flush_mod_ifidx(void *ta_buf) > +{ > + struct mod_item *mi; > + > + mi = (struct mod_item *)ta_buf; > + if (mi->main_ptr != NULL) > + free(mi->main_ptr, M_IPFW); > +} > + > +static int > +ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent) > +{ > + struct ifentry *ife; > + > + ife = (struct ifentry *)e; > + > + tent->masklen = 8 * IF_NAMESIZE; > + memcpy(&tent->k, ife->no.name, IF_NAMESIZE); > + tent->v.kidx = ife->value; > + > + return (0); > +} > + > +static int > +ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent) > +{ > + struct iftable_cfg *icfg; > + struct ifentry *ife; > + char *ifname; > + > + icfg = (struct iftable_cfg *)ta_state; > + ifname = tent->k.iface; > + > + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) > + return (EINVAL); > + > + ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); > + > + if (ife != NULL) { > + ta_dump_ifidx_tentry(ta_state, ti, ife, tent); > + return (0); > + } > + > + return (ENOENT); > +} > + > +struct wa_ifidx { > + ta_foreach_f *f; > + void *arg; > +}; > + > +static void > +foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, > + void *arg) > +{ > + struct ifentry *ife; > + struct wa_ifidx *wa; > + > + ife = (struct ifentry *)no; > + wa = (struct wa_ifidx *)arg; > + > + wa->f(ife, wa->arg); > +} > + > +static void > +ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg) > +{ > + struct iftable_cfg *icfg; > + struct wa_ifidx wa; > + > + icfg = (struct iftable_cfg *)ta_state; > + > + wa.f = f; > + wa.arg = arg; > + > + ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); > +} > + > +struct table_algo iface_idx = { > + .name = "iface:array", > + .type = IPFW_TABLE_INTERFACE, > + .flags = TA_FLAG_DEFAULT, > + .ta_buf_size = sizeof(struct ta_buf_ifidx), > + .init = ta_init_ifidx, > + .destroy = ta_destroy_ifidx, > + .prepare_add = ta_prepare_add_ifidx, > 
+ .prepare_del = ta_prepare_del_ifidx, > + .add = ta_add_ifidx, > + .del = ta_del_ifidx, > + .flush_entry = ta_flush_ifidx_entry, > + .foreach = ta_foreach_ifidx, > + .dump_tentry = ta_dump_ifidx_tentry, > + .find_tentry = ta_find_ifidx_tentry, > + .dump_tinfo = ta_dump_ifidx_tinfo, > + .need_modify = ta_need_modify_ifidx, > + .prepare_mod = ta_prepare_mod_ifidx, > + .fill_mod = ta_fill_mod_ifidx, > + .modify = ta_modify_ifidx, > + .flush_mod = ta_flush_mod_ifidx, > + .change_ti = ta_change_ti_ifidx, > +}; > + > +/* > + * Number array cmds. > + * > + * Implementation: > + * > + * Runtime part: > + * - sorted array of "struct numarray" pointed by ti->state. > + * Array is allocated with rounding up to NUMARRAY_CHUNK. > + * - current array size is stored in ti->data > + * > + */ > + > +struct numarray { > + uint32_t number; > + uint32_t value; > +}; > + > +struct numarray_cfg { > + void *main_ptr; > + size_t size; /* Number of items allocated in array */ > + size_t used; /* Number of items _active_ now */ > +}; > + > +struct ta_buf_numarray > +{ > + struct numarray na; > +}; > + > +int compare_numarray(const void *k, const void *v); > +static struct numarray *numarray_find(struct table_info *ti, void *key); > +static int ta_lookup_numarray(struct table_info *ti, void *key, > + uint32_t keylen, uint32_t *val); > +static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, > + struct table_info *ti, char *data, uint8_t tflags); > +static void ta_destroy_numarray(void *ta_state, struct table_info *ti); > +static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +static int ta_prepare_add_numarray(struct ip_fw_chain *ch, > + struct tentry_info *tei, void *ta_buf); > +static int ta_add_numarray(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static int ta_del_numarray(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); 
> +static void ta_flush_numarray_entry(struct ip_fw_chain *ch, > + struct tentry_info *tei, void *ta_buf); > +static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, > + uint32_t count, uint64_t *pflags); > +static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); > +static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, > + void *ta_buf, uint64_t *pflags); > +static void ta_modify_numarray(void *ta_state, struct table_info *ti, > + void *ta_buf, uint64_t pflags); > +static void ta_flush_mod_numarray(void *ta_buf); > +static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, > + void *e, ipfw_obj_tentry *tent); > +static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent); > +static void ta_foreach_numarray(void *ta_state, struct table_info *ti, > + ta_foreach_f *f, void *arg); > + > +int > +compare_numarray(const void *k, const void *v) > +{ > + const struct numarray *na; > + uint32_t key; > + > + key = *((const uint32_t *)k); > + na = (const struct numarray *)v; > + > + if (key < na->number) > + return (-1); > + else if (key > na->number) > + return (1); > + > + return (0); > +} > + > +static struct numarray * > +numarray_find(struct table_info *ti, void *key) > +{ > + struct numarray *ri; > + > + ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), > + compare_ifidx); > + > + return (ri); > +} > + > +static int > +ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct numarray *ri; > + > + ri = numarray_find(ti, key); > + > + if (ri != NULL) { > + *val = ri->value; > + return (1); > + } > + > + return (0); > +} > + > +static int > +ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, > + char *data, uint8_t tflags) > +{ > + struct numarray_cfg *cfg; > + > + cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); > + > + cfg->size = 16; > + cfg->main_ptr = 
malloc(sizeof(struct numarray) * cfg->size, M_IPFW, > + M_WAITOK | M_ZERO); > + > + *ta_state = cfg; > + ti->state = cfg->main_ptr; > + ti->lookup = ta_lookup_numarray; > + > + return (0); > +} > + > +/* > + * Destroys table @ti > + */ > +static void > +ta_destroy_numarray(void *ta_state, struct table_info *ti) > +{ > + struct numarray_cfg *cfg; > + > + cfg = (struct numarray_cfg *)ta_state; > + > + if (cfg->main_ptr != NULL) > + free(cfg->main_ptr, M_IPFW); > + > + free(cfg, M_IPFW); > +} > + > +/* > + * Provide algo-specific table info > + */ > +static void > +ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) > +{ > + struct numarray_cfg *cfg; > + > + cfg = (struct numarray_cfg *)ta_state; > + > + tinfo->taclass4 = IPFW_TACLASS_ARRAY; > + tinfo->size4 = cfg->size; > + tinfo->count4 = cfg->used; > + tinfo->itemsize4 = sizeof(struct numarray); > +} > + > +/* > + * Prepare for addition/deletion to an array. > + */ > +static int > +ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_numarray *tb; > + > + tb = (struct ta_buf_numarray *)ta_buf; > + > + tb->na.number = *((uint32_t *)tei->paddr); > + > + return (0); > +} > + > +static int > +ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct numarray_cfg *cfg; > + struct ta_buf_numarray *tb; > + struct numarray *ri; > + int res; > + uint32_t value; > + > + tb = (struct ta_buf_numarray *)ta_buf; > + cfg = (struct numarray_cfg *)ta_state; > + > + /* Read current value from @tei */ > + tb->na.value = tei->value; > + > + ri = numarray_find(ti, &tb->na.number); > + > + if (ri != NULL) { > + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) > + return (EEXIST); > + > + /* Exchange values between ri and @tei */ > + value = ri->value; > + ri->value = tei->value; > + tei->value = value; > + /* Indicate that update has happened instead of addition */ > + tei->flags 
|= TEI_FLAGS_UPDATED; > + *pnum = 0; > + return (0); > + } > + > + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) > + return (EFBIG); > + > + res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, > + sizeof(struct numarray), compare_numarray); > + > + KASSERT(res == 1, ("number %d already exists", tb->na.number)); > + cfg->used++; > + ti->data = cfg->used; > + *pnum = 1; > + > + return (0); > +} > + > +/* > + * Remove key from both configuration list and > + * runtime array. Removed interface notification. > + */ > +static int > +ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct numarray_cfg *cfg; > + struct ta_buf_numarray *tb; > + struct numarray *ri; > + int res; > + > + tb = (struct ta_buf_numarray *)ta_buf; > + cfg = (struct numarray_cfg *)ta_state; > + > + ri = numarray_find(ti, &tb->na.number); > + if (ri == NULL) > + return (ENOENT); > + > + tei->value = ri->value; > + > + res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, > + sizeof(struct numarray), compare_numarray); > + > + KASSERT(res == 1, ("number %u does not exist", tb->na.number)); > + cfg->used--; > + ti->data = cfg->used; > + *pnum = 1; > + > + return (0); > +} > + > +static void > +ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + > + /* We don't have any state, do nothing */ > +} > + > + > +/* > + * Table growing callbacks. > + */ > + > +static int > +ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, > + uint64_t *pflags) > +{ > + struct numarray_cfg *cfg; > + size_t size; > + > + cfg = (struct numarray_cfg *)ta_state; > + > + size = cfg->size; > + while (size < cfg->used + count) > + size *= 2; > + > + if (size != cfg->size) { > + *pflags = size; > + return (1); > + } > + > + return (0); > +} > + > +/* > + * Allocate new, larger runtime array. 
> + */ > +static int > +ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) > +{ > + struct mod_item *mi; > + > + mi = (struct mod_item *)ta_buf; > + > + memset(mi, 0, sizeof(struct mod_item)); > + mi->size = *pflags; > + mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, > + M_WAITOK | M_ZERO); > + > + return (0); > +} > + > +/* > + * Copy data from old runtime array to new one. > + */ > +static int > +ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t *pflags) > +{ > + struct mod_item *mi; > + struct numarray_cfg *cfg; > + > + mi = (struct mod_item *)ta_buf; > + cfg = (struct numarray_cfg *)ta_state; > + > + /* Check if we still need to grow array */ > + if (cfg->size >= mi->size) { > + *pflags = 0; > + return (0); > + } > + > + memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); > + > + return (0); > +} > + > +/* > + * Switch old & new arrays. > + */ > +static void > +ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags) > +{ > + struct mod_item *mi; > + struct numarray_cfg *cfg; > + void *old_ptr; > + > + mi = (struct mod_item *)ta_buf; > + cfg = (struct numarray_cfg *)ta_state; > + > + old_ptr = cfg->main_ptr; > + cfg->main_ptr = mi->main_ptr; > + cfg->size = mi->size; > + ti->state = cfg->main_ptr; > + > + mi->main_ptr = old_ptr; > +} > + > +/* > + * Free unneded array. 
> + */ > +static void > +ta_flush_mod_numarray(void *ta_buf) > +{ > + struct mod_item *mi; > + > + mi = (struct mod_item *)ta_buf; > + if (mi->main_ptr != NULL) > + free(mi->main_ptr, M_IPFW); > +} > + > +static int > +ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent) > +{ > + struct numarray *na; > + > + na = (struct numarray *)e; > + > + tent->k.key = na->number; > + tent->v.kidx = na->value; > + > + return (0); > +} > + > +static int > +ta_find_numarray_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent) > +{ > + struct numarray_cfg *cfg; > + struct numarray *ri; > + > + cfg = (struct numarray_cfg *)ta_state; > + > + ri = numarray_find(ti, &tent->k.key); > + > + if (ri != NULL) { > + ta_dump_numarray_tentry(ta_state, ti, ri, tent); > + return (0); > + } > + > + return (ENOENT); > +} > + > +static void > +ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg) > +{ > + struct numarray_cfg *cfg; > + struct numarray *array; > + int i; > + > + cfg = (struct numarray_cfg *)ta_state; > + array = cfg->main_ptr; > + > + for (i = 0; i < cfg->used; i++) > + f(&array[i], arg); > +} > + > +struct table_algo number_array = { > + .name = "number:array", > + .type = IPFW_TABLE_NUMBER, > + .ta_buf_size = sizeof(struct ta_buf_numarray), > + .init = ta_init_numarray, > + .destroy = ta_destroy_numarray, > + .prepare_add = ta_prepare_add_numarray, > + .prepare_del = ta_prepare_add_numarray, > + .add = ta_add_numarray, > + .del = ta_del_numarray, > + .flush_entry = ta_flush_numarray_entry, > + .foreach = ta_foreach_numarray, > + .dump_tentry = ta_dump_numarray_tentry, > + .find_tentry = ta_find_numarray_tentry, > + .dump_tinfo = ta_dump_numarray_tinfo, > + .need_modify = ta_need_modify_numarray, > + .prepare_mod = ta_prepare_mod_numarray, > + .fill_mod = ta_fill_mod_numarray, > + .modify = ta_modify_numarray, > + .flush_mod = ta_flush_mod_numarray, > +}; > + > +/* > + * flow:hash 
cmds > + * > + * > + * ti->data: > + * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] > + * [ 8][ 8[ 8][ 8] > + * > + * inv.mask4: 32 - mask > + * inv.mask6: > + * 1) _slow lookup: mask > + * 2) _aligned: (128 - mask) / 8 > + * 3) _64: 8 > + * > + * > + * pflags: > + * [hsize4][hsize6] > + * [ 16][ 16] > + */ > + > +struct fhashentry; > + > +SLIST_HEAD(fhashbhead, fhashentry); > + > +struct fhashentry { > + SLIST_ENTRY(fhashentry) next; > + uint8_t af; > + uint8_t proto; > + uint16_t spare0; > + uint16_t dport; > + uint16_t sport; > + uint32_t value; > + uint32_t spare1; > +}; > + > +struct fhashentry4 { > + struct fhashentry e; > + struct in_addr dip; > + struct in_addr sip; > +}; > + > +struct fhashentry6 { > + struct fhashentry e; > + struct in6_addr dip6; > + struct in6_addr sip6; > +}; > + > +struct fhash_cfg { > + struct fhashbhead *head; > + size_t size; > + size_t items; > + struct fhashentry4 fe4; > + struct fhashentry6 fe6; > +}; > + > +struct ta_buf_fhash { > + void *ent_ptr; > + struct fhashentry6 fe6; > +}; > + > +static __inline int cmp_flow_ent(struct fhashentry *a, > + struct fhashentry *b, size_t sz); > +static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); > +static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); > +static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); > +static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > +static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, > +struct table_info *ti, char *data, uint8_t tflags); > +static void ta_destroy_fhash(void *ta_state, struct table_info *ti); > +static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, > + void *e, ipfw_obj_tentry *tent); > +static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); > +static int ta_find_fhash_tentry(void *ta_state, 
struct table_info *ti, > + ipfw_obj_tentry *tent); > +static void ta_foreach_fhash(void *ta_state, struct table_info *ti, > + ta_foreach_f *f, void *arg); > +static int ta_prepare_add_fhash(struct ip_fw_chain *ch, > + struct tentry_info *tei, void *ta_buf); > +static int ta_add_fhash(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_del_fhash(void *ta_state, struct table_info *ti, > + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); > +static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf); > +static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, > + uint32_t count, uint64_t *pflags); > +static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); > +static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, > + void *ta_buf, uint64_t *pflags); > +static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags); > +static void ta_flush_mod_fhash(void *ta_buf); > + > +static __inline int > +cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) > +{ > + uint64_t *ka, *kb; > + > + ka = (uint64_t *)(&a->next + 1); > + kb = (uint64_t *)(&b->next + 1); > + > + if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) > + return (1); > + > + return (0); > +} > + > +static __inline uint32_t > +hash_flow4(struct fhashentry4 *f, int hsize) > +{ > + uint32_t i; > + > + i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); > + > + return (i % (hsize - 1)); > +} > + > +static __inline uint32_t > +hash_flow6(struct fhashentry6 *f, int hsize) > +{ > + uint32_t i; > + > + i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ > + (f->dip6.__u6_addr.__u6_addr32[3]) ^ > + (f->sip6.__u6_addr.__u6_addr32[2]) ^ > + (f->sip6.__u6_addr.__u6_addr32[3]) ^ > + (f->e.dport) ^ (f->e.sport); > + > + 
return (i % (hsize - 1)); > +} > + > +static uint32_t > +hash_flow_ent(struct fhashentry *ent, uint32_t size) > +{ > + uint32_t hash; > + > + if (ent->af == AF_INET) { > + hash = hash_flow4((struct fhashentry4 *)ent, size); > + } else { > + hash = hash_flow6((struct fhashentry6 *)ent, size); > + } > + > + return (hash); > +} > + > +static int > +ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct fhashbhead *head; > + struct fhashentry *ent; > + struct fhashentry4 *m4; > + struct ipfw_flow_id *id; > + uint16_t hash, hsize; > + > + id = (struct ipfw_flow_id *)key; > + head = (struct fhashbhead *)ti->state; > + hsize = ti->data; > + m4 = (struct fhashentry4 *)ti->xstate; > + > + if (id->addr_type == 4) { > + struct fhashentry4 f; > + > + /* Copy hash mask */ > + f = *m4; > + > + f.dip.s_addr &= id->dst_ip; > + f.sip.s_addr &= id->src_ip; > + f.e.dport &= id->dst_port; > + f.e.sport &= id->src_port; > + f.e.proto &= id->proto; > + hash = hash_flow4(&f, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { > + *val = ent->value; > + return (1); > + } > + } > + } else if (id->addr_type == 6) { > + struct fhashentry6 f; > + uint64_t *fp, *idp; > + > + /* Copy hash mask */ > + f = *((struct fhashentry6 *)(m4 + 1)); > + > + /* Handle lack of __u6_addr.__u6_addr64 */ > + fp = (uint64_t *)&f.dip6; > + idp = (uint64_t *)&id->dst_ip6; > + /* src IPv6 is stored after dst IPv6 */ > + *fp++ &= *idp++; > + *fp++ &= *idp++; > + *fp++ &= *idp++; > + *fp &= *idp; > + f.e.dport &= id->dst_port; > + f.e.sport &= id->src_port; > + f.e.proto &= id->proto; > + hash = hash_flow6(&f, hsize); > + SLIST_FOREACH(ent, &head[hash], next) { > + if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { > + *val = ent->value; > + return (1); > + } > + } > + } > + > + return (0); > +} > + > +/* > + * New table. 
> + */ > +static int > +ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, > + char *data, uint8_t tflags) > +{ > + int i; > + struct fhash_cfg *cfg; > + struct fhashentry4 *fe4; > + struct fhashentry6 *fe6; > + > + cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); > + > + cfg->size = 512; > + > + cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, > + M_WAITOK | M_ZERO); > + for (i = 0; i < cfg->size; i++) > + SLIST_INIT(&cfg->head[i]); > + > + /* Fill in fe masks based on @tflags */ > + fe4 = &cfg->fe4; > + fe6 = &cfg->fe6; > + if (tflags & IPFW_TFFLAG_SRCIP) { > + memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); > + memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); > + } > + if (tflags & IPFW_TFFLAG_DSTIP) { > + memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); > + memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); > + } > + if (tflags & IPFW_TFFLAG_SRCPORT) { > + memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); > + memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); > + } > + if (tflags & IPFW_TFFLAG_DSTPORT) { > + memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); > + memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); > + } > + if (tflags & IPFW_TFFLAG_PROTO) { > + memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); > + memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); > + } > + > + fe4->e.af = AF_INET; > + fe6->e.af = AF_INET6; > + > + *ta_state = cfg; > + ti->state = cfg->head; > + ti->xstate = &cfg->fe4; > + ti->data = cfg->size; > + ti->lookup = ta_lookup_fhash; > + > + return (0); > +} > + > +static void > +ta_destroy_fhash(void *ta_state, struct table_info *ti) > +{ > + struct fhash_cfg *cfg; > + struct fhashentry *ent, *ent_next; > + int i; > + > + cfg = (struct fhash_cfg *)ta_state; > + > + for (i = 0; i < cfg->size; i++) > + SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) > + free(ent, M_IPFW_TBL); > + > + free(cfg->head, M_IPFW); > + free(cfg, M_IPFW); > +} > + > +/* > + * Provide algo-specific table info > + 
*/ > +static void > +ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) > +{ > + struct fhash_cfg *cfg; > + > + cfg = (struct fhash_cfg *)ta_state; > + > + tinfo->flags = IPFW_TATFLAGS_AFITEM; > + tinfo->taclass4 = IPFW_TACLASS_HASH; > + tinfo->size4 = cfg->size; > + tinfo->count4 = cfg->items; > + tinfo->itemsize4 = sizeof(struct fhashentry4); > + tinfo->itemsize6 = sizeof(struct fhashentry6); > +} > + > +static int > +ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent) > +{ > + struct fhash_cfg *cfg; > + struct fhashentry *ent; > + struct fhashentry4 *fe4; > +#ifdef INET6 > + struct fhashentry6 *fe6; > +#endif > + struct tflow_entry *tfe; > + > + cfg = (struct fhash_cfg *)ta_state; > + ent = (struct fhashentry *)e; > + tfe = &tent->k.flow; > + > + tfe->af = ent->af; > + tfe->proto = ent->proto; > + tfe->dport = htons(ent->dport); > + tfe->sport = htons(ent->sport); > + tent->v.kidx = ent->value; > + tent->subtype = ent->af; > + > + if (ent->af == AF_INET) { > + fe4 = (struct fhashentry4 *)ent; > + tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); > + tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); > + tent->masklen = 32; > +#ifdef INET6 > + } else { > + fe6 = (struct fhashentry6 *)ent; > + tfe->a.a6.sip6 = fe6->sip6; > + tfe->a.a6.dip6 = fe6->dip6; > + tent->masklen = 128; > +#endif > + } > + > + return (0); > +} > + > +static int > +tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) > +{ > +#ifdef INET > + struct fhashentry4 *fe4; > +#endif > +#ifdef INET6 > + struct fhashentry6 *fe6; > +#endif > + struct tflow_entry *tfe; > + > + tfe = (struct tflow_entry *)tei->paddr; > + > + ent->af = tei->subtype; > + ent->proto = tfe->proto; > + ent->dport = ntohs(tfe->dport); > + ent->sport = ntohs(tfe->sport); > + > + if (tei->subtype == AF_INET) { > +#ifdef INET > + fe4 = (struct fhashentry4 *)ent; > + fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); > + fe4->dip.s_addr = 
ntohl(tfe->a.a4.dip.s_addr); > +#endif > +#ifdef INET6 > + } else if (tei->subtype == AF_INET6) { > + fe6 = (struct fhashentry6 *)ent; > + fe6->sip6 = tfe->a.a6.sip6; > + fe6->dip6 = tfe->a.a6.dip6; > +#endif > + } else { > + /* Unknown CIDR type */ > + return (EINVAL); > + } > + > + return (0); > +} > + > + > +static int > +ta_find_fhash_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent) > +{ > + struct fhash_cfg *cfg; > + struct fhashbhead *head; > + struct fhashentry *ent, *tmp; > + struct fhashentry6 fe6; > + struct tentry_info tei; > + int error; > + uint32_t hash; > + size_t sz; > + > + cfg = (struct fhash_cfg *)ta_state; > + > + ent = &fe6.e; > + > + memset(&fe6, 0, sizeof(fe6)); > + memset(&tei, 0, sizeof(tei)); > + > + tei.paddr = &tent->k.flow; > + tei.subtype = tent->subtype; > + > + if ((error = tei_to_fhash_ent(&tei, ent)) != 0) > + return (error); > + > + head = cfg->head; > + hash = hash_flow_ent(ent, cfg->size); > + > + if (tei.subtype == AF_INET) > + sz = 2 * sizeof(struct in_addr); > + else > + sz = 2 * sizeof(struct in6_addr); > + > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (cmp_flow_ent(tmp, ent, sz) != 0) { > + ta_dump_fhash_tentry(ta_state, ti, tmp, tent); > + return (0); > + } > + } > + > + return (ENOENT); > +} > + > +static void > +ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg) > +{ > + struct fhash_cfg *cfg; > + struct fhashentry *ent, *ent_next; > + int i; > + > + cfg = (struct fhash_cfg *)ta_state; > + > + for (i = 0; i < cfg->size; i++) > + SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) > + f(ent, arg); > +} > + > +static int > +ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_fhash *tb; > + struct fhashentry *ent; > + size_t sz; > + int error; > + > + tb = (struct ta_buf_fhash *)ta_buf; > + > + if (tei->subtype == AF_INET) > + sz = sizeof(struct fhashentry4); > + else 
if (tei->subtype == AF_INET6) > + sz = sizeof(struct fhashentry6); > + else > + return (EINVAL); > + > + ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO); > + > + error = tei_to_fhash_ent(tei, ent); > + if (error != 0) { > + free(ent, M_IPFW_TBL); > + return (error); > + } > + tb->ent_ptr = ent; > + > + return (0); > +} > + > +static int > +ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct fhash_cfg *cfg; > + struct fhashbhead *head; > + struct fhashentry *ent, *tmp; > + struct ta_buf_fhash *tb; > + int exists; > + uint32_t hash, value; > + size_t sz; > + > + cfg = (struct fhash_cfg *)ta_state; > + tb = (struct ta_buf_fhash *)ta_buf; > + ent = (struct fhashentry *)tb->ent_ptr; > + exists = 0; > + > + /* Read current value from @tei */ > + ent->value = tei->value; > + > + head = cfg->head; > + hash = hash_flow_ent(ent, cfg->size); > + > + if (tei->subtype == AF_INET) > + sz = 2 * sizeof(struct in_addr); > + else > + sz = 2 * sizeof(struct in6_addr); > + > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (cmp_flow_ent(tmp, ent, sz) != 0) { > + exists = 1; > + break; > + } > + } > + > + if (exists == 1) { > + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) > + return (EEXIST); > + /* Record already exists. 
Update value if we're asked to */ > + /* Exchange values between tmp and @tei */ > + value = tmp->value; > + tmp->value = tei->value; > + tei->value = value; > + /* Indicate that update has happened instead of addition */ > + tei->flags |= TEI_FLAGS_UPDATED; > + *pnum = 0; > + } else { > + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) > + return (EFBIG); > + > + SLIST_INSERT_HEAD(&head[hash], ent, next); > + tb->ent_ptr = NULL; > + *pnum = 1; > + > + /* Update counters and check if we need to grow hash */ > + cfg->items++; > + } > + > + return (0); > +} > + > +static int > +ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_fhash *tb; > + > + tb = (struct ta_buf_fhash *)ta_buf; > + > + return (tei_to_fhash_ent(tei, &tb->fe6.e)); > +} > + > +static int > +ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, > + void *ta_buf, uint32_t *pnum) > +{ > + struct fhash_cfg *cfg; > + struct fhashbhead *head; > + struct fhashentry *ent, *tmp; > + struct ta_buf_fhash *tb; > + uint32_t hash; > + size_t sz; > + > + cfg = (struct fhash_cfg *)ta_state; > + tb = (struct ta_buf_fhash *)ta_buf; > + ent = &tb->fe6.e; > + > + head = cfg->head; > + hash = hash_flow_ent(ent, cfg->size); > + > + if (tei->subtype == AF_INET) > + sz = 2 * sizeof(struct in_addr); > + else > + sz = 2 * sizeof(struct in6_addr); > + > + /* Check for existence */ > + SLIST_FOREACH(tmp, &head[hash], next) { > + if (cmp_flow_ent(tmp, ent, sz) == 0) > + continue; > + > + SLIST_REMOVE(&head[hash], tmp, fhashentry, next); > + tei->value = tmp->value; > + *pnum = 1; > + cfg->items--; > + tb->ent_ptr = tmp; > + return (0); > + } > + > + return (ENOENT); > +} > + > +static void > +ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, > + void *ta_buf) > +{ > + struct ta_buf_fhash *tb; > + > + tb = (struct ta_buf_fhash *)ta_buf; > + > + if (tb->ent_ptr != NULL) > + free(tb->ent_ptr, M_IPFW_TBL); > +} > + > +/* > + * Hash 
growing callbacks. > + */ > + > +static int > +ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, > + uint64_t *pflags) > +{ > + struct fhash_cfg *cfg; > + > + cfg = (struct fhash_cfg *)ta_state; > + > + if (cfg->items > cfg->size && cfg->size < 65536) { > + *pflags = cfg->size * 2; > + return (1); > + } > + > + return (0); > +} > + > +/* > + * Allocate new, larger fhash. > + */ > +static int > +ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) > +{ > + struct mod_item *mi; > + struct fhashbhead *head; > + int i; > + > + mi = (struct mod_item *)ta_buf; > + > + memset(mi, 0, sizeof(struct mod_item)); > + mi->size = *pflags; > + head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, > + M_WAITOK | M_ZERO); > + for (i = 0; i < mi->size; i++) > + SLIST_INIT(&head[i]); > + > + mi->main_ptr = head; > + > + return (0); > +} > + > +/* > + * Copy data from old runtime array to new one. > + */ > +static int > +ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t *pflags) > +{ > + > + /* In is not possible to do rehash if we're not holidng WLOCK. */ > + return (0); > +} > + > +/* > + * Switch old & new arrays. 
> + */ > +static void > +ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, > + uint64_t pflags) > +{ > + struct mod_item *mi; > + struct fhash_cfg *cfg; > + struct fhashbhead *old_head, *new_head; > + struct fhashentry *ent, *ent_next; > + int i; > + uint32_t nhash; > + size_t old_size; > + > + mi = (struct mod_item *)ta_buf; > + cfg = (struct fhash_cfg *)ta_state; > + > + old_size = cfg->size; > + old_head = ti->state; > + > + new_head = (struct fhashbhead *)mi->main_ptr; > + for (i = 0; i < old_size; i++) { > + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { > + nhash = hash_flow_ent(ent, mi->size); > + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); > + } > + } > + > + ti->state = new_head; > + ti->data = mi->size; > + cfg->head = new_head; > + cfg->size = mi->size; > + > + mi->main_ptr = old_head; > +} > + > +/* > + * Free unneded array. > + */ > +static void > +ta_flush_mod_fhash(void *ta_buf) > +{ > + struct mod_item *mi; > + > + mi = (struct mod_item *)ta_buf; > + if (mi->main_ptr != NULL) > + free(mi->main_ptr, M_IPFW); > +} > + > +struct table_algo flow_hash = { > + .name = "flow:hash", > + .type = IPFW_TABLE_FLOW, > + .flags = TA_FLAG_DEFAULT, > + .ta_buf_size = sizeof(struct ta_buf_fhash), > + .init = ta_init_fhash, > + .destroy = ta_destroy_fhash, > + .prepare_add = ta_prepare_add_fhash, > + .prepare_del = ta_prepare_del_fhash, > + .add = ta_add_fhash, > + .del = ta_del_fhash, > + .flush_entry = ta_flush_fhash_entry, > + .foreach = ta_foreach_fhash, > + .dump_tentry = ta_dump_fhash_tentry, > + .find_tentry = ta_find_fhash_tentry, > + .dump_tinfo = ta_dump_fhash_tinfo, > + .need_modify = ta_need_modify_fhash, > + .prepare_mod = ta_prepare_mod_fhash, > + .fill_mod = ta_fill_mod_fhash, > + .modify = ta_modify_fhash, > + .flush_mod = ta_flush_mod_fhash, > +}; > + > +/* > + * Kernel fibs bindings. 
> + * > + * Implementation: > + * > + * Runtime part: > + * - fully relies on route API > + * - fib number is stored in ti->data > + * > + */ > + > +static struct rtentry *lookup_kfib(void *key, int keylen, int fib); > +static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val); > +static int kfib_parse_opts(int *pfib, char *data); > +static void ta_print_kfib_config(void *ta_state, struct table_info *ti, > + char *buf, size_t bufsize); > +static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, > + struct table_info *ti, char *data, uint8_t tflags); > +static void ta_destroy_kfib(void *ta_state, struct table_info *ti); > +static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, > + ipfw_ta_tinfo *tinfo); > +static int contigmask(uint8_t *p, int len); > +static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent); > +static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent); > +static void ta_foreach_kfib(void *ta_state, struct table_info *ti, > + ta_foreach_f *f, void *arg); > + > +static struct rtentry * > +lookup_kfib(void *key, int keylen, int fib) > +{ > + struct sockaddr *s; > + > + if (keylen == 4) { > + struct sockaddr_in sin; > + bzero(&sin, sizeof(sin)); > + sin.sin_len = sizeof(struct sockaddr_in); > + sin.sin_family = AF_INET; > + sin.sin_addr.s_addr = *(in_addr_t *)key; > + s = (struct sockaddr *)&sin; > + } else { > + struct sockaddr_in6 sin6; > + bzero(&sin6, sizeof(sin6)); > + sin6.sin6_len = sizeof(struct sockaddr_in6); > + sin6.sin6_family = AF_INET6; > + sin6.sin6_addr = *(struct in6_addr *)key; > + s = (struct sockaddr *)&sin6; > + } > + > + return (rtalloc1_fib(s, 0, 0, fib)); > +} > + > +static int > +ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, > + uint32_t *val) > +{ > + struct rtentry *rte; > + > + if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) > + return (0); > + > + 
*val = 0; > + RTFREE_LOCKED(rte); > + > + return (1); > +} > + > +/* Parse 'fib=%d' */ > +static int > +kfib_parse_opts(int *pfib, char *data) > +{ > + char *pdel, *pend, *s; > + int fibnum; > + > + if (data == NULL) > + return (0); > + if ((pdel = strchr(data, ' ')) == NULL) > + return (0); > + while (*pdel == ' ') > + pdel++; > + if (strncmp(pdel, "fib=", 4) != 0) > + return (EINVAL); > + if ((s = strchr(pdel, ' ')) != NULL) > + *s++ = '\0'; > + > + pdel += 4; > + /* Need \d+ */ > + fibnum = strtol(pdel, &pend, 10); > + if (*pend != '\0') > + return (EINVAL); > + > + *pfib = fibnum; > + > + return (0); > +} > + > +static void > +ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, > + size_t bufsize) > +{ > + > + if (ti->data != 0) > + snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); > + else > + snprintf(buf, bufsize, "%s", "addr:kfib"); > +} > + > +static int > +ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, > + char *data, uint8_t tflags) > +{ > + int error, fibnum; > + > + fibnum = 0; > + if ((error = kfib_parse_opts(&fibnum, data)) != 0) > + return (error); > + > + if (fibnum >= rt_numfibs) > + return (E2BIG); > + > + ti->data = fibnum; > + ti->lookup = ta_lookup_kfib; > + > + return (0); > +} > + > +/* > + * Destroys table @ti > + */ > +static void > +ta_destroy_kfib(void *ta_state, struct table_info *ti) > +{ > + > +} > + > +/* > + * Provide algo-specific table info > + */ > +static void > +ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) > +{ > + > + tinfo->flags = IPFW_TATFLAGS_AFDATA; > + tinfo->taclass4 = IPFW_TACLASS_RADIX; > + tinfo->count4 = 0; > + tinfo->itemsize4 = sizeof(struct rtentry); > + tinfo->taclass6 = IPFW_TACLASS_RADIX; > + tinfo->count6 = 0; > + tinfo->itemsize6 = sizeof(struct rtentry); > +} > + > +static int > +contigmask(uint8_t *p, int len) > +{ > + int i, n; > + > + for (i = 0; i < len ; i++) > + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* 
first bit unset */ > + break; > + for (n= i + 1; n < len; n++) > + if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) > + return (-1); /* mask not contiguous */ > + return (i); > +} > + > + > +static int > +ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, > + ipfw_obj_tentry *tent) > +{ > + struct rtentry *rte; > +#ifdef INET > + struct sockaddr_in *addr, *mask; > +#endif > +#ifdef INET6 > + struct sockaddr_in6 *addr6, *mask6; > +#endif > + int len; > + > + rte = (struct rtentry *)e; > + addr = (struct sockaddr_in *)rt_key(rte); > + mask = (struct sockaddr_in *)rt_mask(rte); > + len = 0; > + > + /* Guess IPv4/IPv6 radix by sockaddr family */ > +#ifdef INET > + if (addr->sin_family == AF_INET) { > + tent->k.addr.s_addr = addr->sin_addr.s_addr; > + len = 32; > + if (mask != NULL) > + len = contigmask((uint8_t *)&mask->sin_addr, 32); > + if (len == -1) > + len = 0; > + tent->masklen = len; > + tent->subtype = AF_INET; > + tent->v.kidx = 0; /* Do we need to put GW here? */ > + } > +#endif > +#ifdef INET6 > + if (addr->sin_family == AF_INET6) { > + addr6 = (struct sockaddr_in6 *)addr; > + mask6 = (struct sockaddr_in6 *)mask; > + memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr)); > + len = 128; > + if (mask6 != NULL) > + len = contigmask((uint8_t *)&mask6->sin6_addr, 128); > + if (len == -1) > + len = 0; > + tent->masklen = len; > + tent->subtype = AF_INET6; > + tent->v.kidx = 0; > + } > +#endif > + > + return (0); > +} > + > +static int > +ta_find_kfib_tentry(void *ta_state, struct table_info *ti, > + ipfw_obj_tentry *tent) > +{ > + struct rtentry *rte; > + void *key; > + int keylen; > + > + if (tent->subtype == AF_INET) { > + key = &tent->k.addr; > + keylen = sizeof(struct in_addr); > + } else { > + key = &tent->k.addr6; > + keylen = sizeof(struct in6_addr); > + } > + > + if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) > + return (0); > + > + if (rte != NULL) { > + ta_dump_kfib_tentry(ta_state, ti, rte, tent); > + RTFREE_LOCKED(rte); > + 
return (0); > + } > + > + return (ENOENT); > +} > + > +static void > +ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, > + void *arg) > +{ > + struct radix_node_head *rnh; > + int error; > + > + rnh = rt_tables_get_rnh(ti->data, AF_INET); > + if (rnh != NULL) { > + RADIX_NODE_HEAD_RLOCK(rnh); > + error = rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); > + RADIX_NODE_HEAD_RUNLOCK(rnh); > + } > + > + rnh = rt_tables_get_rnh(ti->data, AF_INET6); > + if (rnh != NULL) { > + RADIX_NODE_HEAD_RLOCK(rnh); > + error = rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); > + RADIX_NODE_HEAD_RUNLOCK(rnh); > + } > +} > + > +struct table_algo addr_kfib = { > + .name = "addr:kfib", > + .type = IPFW_TABLE_ADDR, > + .flags = TA_FLAG_READONLY, > + .ta_buf_size = 0, > + .init = ta_init_kfib, > + .destroy = ta_destroy_kfib, > + .foreach = ta_foreach_kfib, > + .dump_tentry = ta_dump_kfib_tentry, > + .find_tentry = ta_find_kfib_tentry, > + .dump_tinfo = ta_dump_kfib_tinfo, > + .print_config = ta_print_kfib_config, > +}; > + > +void > +ipfw_table_algo_init(struct ip_fw_chain *ch) > +{ > + size_t sz; > + > + /* > + * Register all algorithms presented here. 
> + */ > + sz = sizeof(struct table_algo); > + ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); > + ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); > + ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); > + ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); > + ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); > + ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); > +} > + > +void > +ipfw_table_algo_destroy(struct ip_fw_chain *ch) > +{ > + > + ipfw_del_table_algo(ch, addr_radix.idx); > + ipfw_del_table_algo(ch, addr_hash.idx); > + ipfw_del_table_algo(ch, iface_idx.idx); > + ipfw_del_table_algo(ch, number_array.idx); > + ipfw_del_table_algo(ch, flow_hash.idx); > + ipfw_del_table_algo(ch, addr_kfib.idx); > +} > + > + > diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c > new file mode 100644 > index 0000000..4b7b193 > --- /dev/null > +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c > @@ -0,0 +1,812 @@ > +/*- > + * Copyright (c) 2014 Yandex LLC > + * Copyright (c) 2014 Alexander V. Chernikov > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +#include <sys/cdefs.h> > +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table_value.c 272940 2014-10-11 15:04:50Z melifaro $"); > + > +/* > + * Multi-field value support for ipfw tables. > + * > + * This file contains necessary functions to convert > + * large multi-field values into u32 indices suitable to be fed > + * to various table algorithms. Other machinery like proper refcounting, > + * internal structures resizing are also kept here. 
> + */ > + > +#include "opt_ipfw.h" > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/malloc.h> > +#include <sys/kernel.h> > +#include <sys/hash.h> > +#include <sys/lock.h> > +#include <sys/rwlock.h> > +#include <sys/rmlock.h> > +#include <sys/socket.h> > +#include <sys/socketvar.h> > +#include <sys/queue.h> > +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ > + > +#include <netinet/in.h> > +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ > +#include <netinet/ip_fw.h> > + > +#include <netpfil/ipfw/ip_fw_private.h> > +#include <netpfil/ipfw/ip_fw_table.h> > + > +static uint32_t hash_table_value(struct namedobj_instance *ni, void *key, > + uint32_t kopt); > +static int cmp_table_value(struct named_object *no, void *key, uint32_t kopt); > + > +static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd); > + > +static struct ipfw_sopt_handler scodes[] = { > + { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values }, > +}; > + > +#define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash) > + > +struct table_val_link > +{ > + struct named_object no; > + struct table_value *pval; /* Pointer to real table value */ > +}; > +#define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */ > + > +struct vdump_args { > + struct ip_fw_chain *ch; > + struct sockopt_data *sd; > + struct table_value *pval; > + int error; > +}; > + > + > +static uint32_t > +hash_table_value(struct namedobj_instance *ni, void *key, uint32_t kopt) > +{ > + > + return (hash32_buf(key, 56, 0)); > +} > + > +static int > +cmp_table_value(struct named_object *no, void *key, uint32_t kopt) > +{ > + > + return (memcmp(((struct table_val_link *)no)->pval, key, 56)); > +} > + > +static void > +mask_table_value(struct table_value *src, struct table_value *dst, > + uint32_t mask) > +{ > +#define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; } > + > + memset(dst, 0, sizeof(*dst)); > + _MCPY(tag, IPFW_VTYPE_TAG); > + 
_MCPY(pipe, IPFW_VTYPE_PIPE); > + _MCPY(divert, IPFW_VTYPE_DIVERT); > + _MCPY(skipto, IPFW_VTYPE_SKIPTO); > + _MCPY(netgraph, IPFW_VTYPE_NETGRAPH); > + _MCPY(fib, IPFW_VTYPE_FIB); > + _MCPY(nat, IPFW_VTYPE_NAT); > + _MCPY(dscp, IPFW_VTYPE_DSCP); > + _MCPY(nh4, IPFW_VTYPE_NH4); > + _MCPY(nh6, IPFW_VTYPE_NH6); > +#undef _MCPY > +} > + > +static void > +get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared, > + struct table_value **ptv, struct namedobj_instance **pvi) > +{ > + struct table_value *pval; > + struct namedobj_instance *vi; > + > + if (vshared != 0) { > + pval = (struct table_value *)ch->valuestate; > + vi = CHAIN_TO_VI(ch); > + } else { > + pval = NULL; > + vi = NULL; > + //pval = (struct table_value *)&tc->ti.data; > + } > + > + if (ptv != NULL) > + *ptv = pval; > + if (pvi != NULL) > + *pvi = vi; > +} > + > +/* > + * Update pointers to real vaues after @pval change. > + */ > +static void > +update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) > +{ > + struct vdump_args *da; > + struct table_val_link *ptv; > + struct table_value *pval; > + > + da = (struct vdump_args *)arg; > + ptv = (struct table_val_link *)no; > + > + pval = da->pval; > + ptv->pval = &pval[ptv->no.kidx]; > + > +} > + > +/* > + * Grows value storage shared among all tables. > + * Drops/reacquires UH locks. > + * Notifies other running adds on @ch shared storage resize. > + * Note function does not guarantee that free space > + * will be available after invocation, so one caller needs > + * to roll cycle himself. > + * > + * Returns 0 if case of no errors. 
> + */ > +static int > +resize_shared_value_storage(struct ip_fw_chain *ch) > +{ > + struct tables_config *tcfg; > + struct namedobj_instance *vi; > + struct table_value *pval, *valuestate, *old_valuestate; > + void *new_idx; > + struct vdump_args da; > + int new_blocks; > + int val_size, val_size_old; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + valuestate = NULL; > + new_idx = NULL; > + > + pval = (struct table_value *)ch->valuestate; > + vi = CHAIN_TO_VI(ch); > + tcfg = CHAIN_TO_TCFG(ch); > + > + val_size = tcfg->val_size * 2; > + > + if (val_size == (1 << 30)) > + return (ENOSPC); > + > + IPFW_UH_WUNLOCK(ch); > + > + valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW, > + M_WAITOK | M_ZERO); > + ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx, > + &new_blocks); > + > + IPFW_UH_WLOCK(ch); > + > + /* > + * Check if we still need to resize > + */ > + if (tcfg->val_size >= val_size) > + goto done; > + > + /* Update pointers and notify everyone we're changing @ch */ > + pval = (struct table_value *)ch->valuestate; > + rollback_toperation_state(ch, ch); > + > + /* Good. 
Let's merge */ > + memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size); > + ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks); > + > + IPFW_WLOCK(ch); > + /* Change pointers */ > + old_valuestate = ch->valuestate; > + ch->valuestate = valuestate; > + valuestate = old_valuestate; > + ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks); > + > + val_size_old = tcfg->val_size; > + tcfg->val_size = val_size; > + val_size = val_size_old; > + IPFW_WUNLOCK(ch); > + /* Update pointers to reflect resize */ > + memset(&da, 0, sizeof(da)); > + da.pval = (struct table_value *)ch->valuestate; > + ipfw_objhash_foreach(vi, update_tvalue, &da); > + > +done: > + free(valuestate, M_IPFW); > + ipfw_objhash_bitmap_free(new_idx, new_blocks); > + > + return (0); > +} > + > +/* > + * Drops reference for table value with index @kidx, stored in @pval and > + * @vi. Frees value if it has no references. > + */ > +static void > +unref_table_value(struct namedobj_instance *vi, struct table_value *pval, > + uint32_t kidx) > +{ > + struct table_val_link *ptvl; > + > + KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx)); > + if (--pval[kidx].refcnt > 0) > + return; > + > + /* Last reference, delete item */ > + ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx); > + KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx)); > + ipfw_objhash_del(vi, &ptvl->no); > + ipfw_objhash_free_idx(vi, kidx); > + free(ptvl, M_IPFW); > +} > + > +struct flush_args { > + struct ip_fw_chain *ch; > + struct table_algo *ta; > + struct table_info *ti; > + void *astate; > + ipfw_obj_tentry tent; > +}; > + > +static int > +unref_table_value_cb(void *e, void *arg) > +{ > + struct flush_args *fa; > + struct ip_fw_chain *ch; > + struct table_algo *ta; > + ipfw_obj_tentry *tent; > + int error; > + > + fa = (struct flush_args *)arg; > + > + ta = fa->ta; > + memset(&fa->tent, 0, sizeof(fa->tent)); > + tent = &fa->tent; > + error = 
ta->dump_tentry(fa->astate, fa->ti, e, tent); > + if (error != 0) > + return (error); > + > + ch = fa->ch; > + > + unref_table_value(CHAIN_TO_VI(ch), > + (struct table_value *)ch->valuestate, tent->v.kidx); > + > + return (0); > +} > + > +/* > + * Drop references for each value used in @tc. > + */ > +void > +ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, > + struct table_algo *ta, void *astate, struct table_info *ti) > +{ > + struct flush_args fa; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + memset(&fa, 0, sizeof(fa)); > + fa.ch = ch; > + fa.ta = ta; > + fa.astate = astate; > + fa.ti = ti; > + > + ta->foreach(astate, ti, unref_table_value_cb, &fa); > +} > + > +/* > + * Table operation state handler. > + * Called when we are going to change something in @tc which > + * may lead to inconsistencies in on-going table data addition. > + * > + * Here we rollback all already committed state (table values, currently) > + * and set "modified" field to non-zero value to indicate > + * that we need to restart original operation. > + */ > +void > +rollback_table_values(struct tableop_state *ts) > +{ > + struct ip_fw_chain *ch; > + struct table_value *pval; > + struct tentry_info *ptei; > + struct namedobj_instance *vi; > + int i; > + > + ch = ts->ch; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + /* Get current table value pointer */ > + get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); > + > + for (i = 0; i < ts->count; i++) { > + ptei = &ts->tei[i]; > + > + if (ptei->value == 0) > + continue; > + > + unref_table_value(vi, pval, ptei->value); > + } > +} > + > +/* > + * Allocate new value index in either shared or per-table array. > + * Function may drop/reacquire UH lock. > + * > + * Returns 0 on success. 
> + */ > +static int > +alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts, > + struct namedobj_instance *vi, uint16_t *pvidx) > +{ > + int error, vlimit; > + uint16_t vidx; > + > + IPFW_UH_WLOCK_ASSERT(ch); > + > + error = ipfw_objhash_alloc_idx(vi, &vidx); > + if (error != 0) { > + > + /* > + * We need to resize array. This involves > + * lock/unlock, so we need to check "modified" > + * state. > + */ > + ts->opstate.func(ts->tc, &ts->opstate); > + error = resize_shared_value_storage(ch); > + return (error); /* ts->modified should be set, we will restart */ > + } > + > + vlimit = ts->ta->vlimit; > + if (vlimit != 0 && vidx >= vlimit) { > + > + /* > + * Algorithm is not able to store given index. > + * We have to rollback state, start using > + * per-table value array or return error > + * if we're already using it. > + * > + * TODO: do not rollback state if > + * atomicity is not required. > + */ > + if (ts->vshared != 0) { > + /* shared -> per-table */ > + return (ENOSPC); /* TODO: proper error */ > + } > + > + /* per-table. Fail for now. */ > + return (ENOSPC); /* TODO: proper error */ > + } > + > + *pvidx = vidx; > + return (0); > +} > + > +/* > + * Drops value reference for unused values (updates, deletes, partially > + * successful adds or rollbacks). > + */ > +void > +ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, > + struct tentry_info *tei, uint32_t count, int rollback) > +{ > + int i; > + struct tentry_info *ptei; > + struct table_value *pval; > + struct namedobj_instance *vi; > + > + /* > + * We have two slightly different ADD cases here: > + * either (1) we are successful / partially successful, > + * in that case we need > + * * to ignore ADDED entries values > + * * rollback every other values (either UPDATED since > + * old value has been stored there, or some failure like > + * EXISTS or LIMIT or simply "ignored" case. 
> + * > + * (2): atomic rollback of partially successful operation > + * in that case we simply need to unref all entries. > + * > + * DELETE case is simpler: no atomic support there, so > + * we simply unref all non-zero values. > + */ > + > + /* > + * Get current table value pointers. > + * XXX: Properly read vshared > + */ > + get_value_ptrs(ch, tc, 1, &pval, &vi); > + > + for (i = 0; i < count; i++) { > + ptei = &tei[i]; > + > + if (ptei->value == 0) { > + > + /* > + * We may be deleting non-existing record. > + * Skip. > + */ > + continue; > + } > + > + if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) { > + ptei->value = 0; > + continue; > + } > + > + unref_table_value(vi, pval, ptei->value); > + ptei->value = 0; > + } > +} > + > +/* > + * Main function used to link values of entries going to be added, > + * to the index. Since we may perform many UH locks drops/acquires, > + * handle changes by checking tablestate "modified" field. > + * > + * Success: return 0. > + */ > +int > +ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts) > +{ > + int error, i, found; > + struct namedobj_instance *vi; > + struct table_config *tc; > + struct tentry_info *tei, *ptei; > + uint32_t count, vlimit; > + uint16_t vidx; > + struct table_val_link *ptv; > + struct table_value tval, *pval; > + > + /* > + * Stage 1: reference all existing values and > + * save their indices. 
> + */ > + IPFW_UH_WLOCK_ASSERT(ch); > + get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); > + > + error = 0; > + found = 0; > + vlimit = ts->ta->vlimit; > + vidx = 0; > + tc = ts->tc; > + tei = ts->tei; > + count = ts->count; > + for (i = 0; i < count; i++) { > + ptei = &tei[i]; > + ptei->value = 0; /* Ensure value is always 0 in the beginnig */ > + mask_table_value(ptei->pvalue, &tval, ts->vmask); > + ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0, > + (char *)&tval); > + if (ptv == NULL) > + continue; > + /* Deal with vlimit later */ > + if (vlimit > 0 && vlimit <= ptv->no.kidx) > + continue; > + > + /* Value found. Bump refcount */ > + ptv->pval->refcnt++; > + ptei->value = ptv->no.kidx; > + found++; > + } > + > + if (ts->count == found) { > + /* We've found all values , no need ts create new ones */ > + return (0); > + } > + > + /* > + * we have added some state here, let's attach operation > + * state ts the list ts be able ts rollback if necessary. > + */ > + add_toperation_state(ch, ts); > + /* Ensure table won't disappear */ > + tc_ref(tc); > + IPFW_UH_WUNLOCK(ch); > + > + /* > + * Stage 2: allocate objects for non-existing values. > + */ > + for (i = 0; i < count; i++) { > + ptei = &tei[i]; > + if (ptei->value != 0) > + continue; > + if (ptei->ptv != NULL) > + continue; > + ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW, > + M_WAITOK | M_ZERO); > + } > + > + /* > + * Stage 3: allocate index numbers for new values > + * and link them to index. > + */ > + IPFW_UH_WLOCK(ch); > + tc_unref(tc); > + del_toperation_state(ch, ts); > + if (ts->modified != 0) { > + > + /* > + * In general, we should free all state/indexes here > + * and return. However, we keep allocated state instead > + * to ensure we achieve some progress on each restart. 
> + */ > + return (0); > + } > + > + KASSERT(pval == ch->valuestate, ("resize_storage() notify failure")); > + > + /* Let's try to link values */ > + for (i = 0; i < count; i++) { > + ptei = &tei[i]; > + if (ptei->value != 0) { > + > + /* > + * We may be here after several process restarts, > + * so we need to update all fields that might > + * have changed. > + */ > + ptv = (struct table_val_link *)ptei->ptv; > + ptv->pval = &pval[i]; > + continue; > + } > + > + /* Check if record has appeared */ > + mask_table_value(ptei->pvalue, &tval, ts->vmask); > + ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0, > + (char *)&tval); > + if (ptv != NULL) { > + ptv->pval->refcnt++; > + ptei->value = ptv->no.kidx; > + continue; > + } > + > + /* May perform UH unlock/lock */ > + error = alloc_table_vidx(ch, ts, vi, &vidx); > + if (error != 0) { > + ts->opstate.func(ts->tc, &ts->opstate); > + return (error); > + } > + /* value storage resize has happened, return */ > + if (ts->modified != 0) > + return (0); > + > + /* Finally, we have allocated valid index, let's add entry */ > + ptei->value = vidx; > + ptv = (struct table_val_link *)ptei->ptv; > + ptei->ptv = NULL; > + > + ptv->no.kidx = vidx; > + ptv->no.name = (char *)&pval[vidx]; > + ptv->pval = &pval[vidx]; > + memcpy(ptv->pval, &tval, sizeof(struct table_value)); > + pval[vidx].refcnt = 1; > + ipfw_objhash_add(vi, &ptv->no); > + } > + > + return (0); > +} > + > +/* > + * Compability function used to import data from old > + * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes. > + */ > +void > +ipfw_import_table_value_legacy(uint32_t value, struct table_value *v) > +{ > + > + memset(v, 0, sizeof(*v)); > + v->tag = value; > + v->pipe = value; > + v->divert = value; > + v->skipto = value; > + v->netgraph = value; > + v->fib = value; > + v->nat = value; > + v->nh4 = value; /* host format */ > + v->dscp = value; > + v->limit = value; > +} > + > +/* > + * Export data to legacy table dumps opcodes. 
> + */ > +uint32_t > +ipfw_export_table_value_legacy(struct table_value *v) > +{ > + > + /* > + * TODO: provide more compatibility depending on > + * vmask value. > + */ > + return (v->tag); > +} > + > +/* > + * Imports table value from current userland format. > + * Saves value in kernel format to the same place. > + */ > +void > +ipfw_import_table_value_v1(ipfw_table_value *iv) > +{ > + struct table_value v; > + > + memset(&v, 0, sizeof(v)); > + v.tag = iv->tag; > + v.pipe = iv->pipe; > + v.divert = iv->divert; > + v.skipto = iv->skipto; > + v.netgraph = iv->netgraph; > + v.fib = iv->fib; > + v.nat = iv->nat; > + v.dscp = iv->dscp; > + v.nh4 = iv->nh4; > + v.nh6 = iv->nh6; > + v.limit = iv->limit; > + > + memcpy(iv, &v, sizeof(ipfw_table_value)); > +} > + > +/* > + * Export real table value @v to current userland format. > + * Note that @v and @piv may point to the same memory. > + */ > +void > +ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv) > +{ > + ipfw_table_value iv; > + > + memset(&iv, 0, sizeof(iv)); > + iv.tag = v->tag; > + iv.pipe = v->pipe; > + iv.divert = v->divert; > + iv.skipto = v->skipto; > + iv.netgraph = v->netgraph; > + iv.fib = v->fib; > + iv.nat = v->nat; > + iv.dscp = v->dscp; > + iv.limit = v->limit; > + iv.nh4 = v->nh4; > + iv.nh6 = v->nh6; > + > + memcpy(piv, &iv, sizeof(iv)); > +} > + > +/* > + * Exports real value data into ipfw_table_value structure. > + * Utilizes "spare1" field to store kernel index. 
> + */ > +static void > +dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) > +{ > + struct vdump_args *da; > + struct table_val_link *ptv; > + struct table_value *v; > + > + da = (struct vdump_args *)arg; > + ptv = (struct table_val_link *)no; > + > + v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v)); > + /* Out of memory, returning */ > + if (v == NULL) { > + da->error = ENOMEM; > + return; > + } > + > + memcpy(v, ptv->pval, sizeof(*v)); > + v->spare1 = ptv->no.kidx; > +} > + > +/* > + * Dumps all shared/table value data > + * Data layout (v1)(current): > + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size > + * Reply: [ ipfw_obj_lheader ipfw_table_value x N ] > + * > + * Returns 0 on success > + */ > +static int > +list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, > + struct sockopt_data *sd) > +{ > + struct _ipfw_obj_lheader *olh; > + struct namedobj_instance *vi; > + struct vdump_args da; > + uint32_t count, size; > + > + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); > + if (olh == NULL) > + return (EINVAL); > + if (sd->valsize < olh->size) > + return (EINVAL); > + > + IPFW_UH_RLOCK(ch); > + vi = CHAIN_TO_VI(ch); > + > + count = ipfw_objhash_count(vi); > + size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader); > + > + /* Fill in header regadless of buffer size */ > + olh->count = count; > + olh->objsize = sizeof(ipfw_table_value); > + > + if (size > olh->size) { > + olh->size = size; > + IPFW_UH_RUNLOCK(ch); > + return (ENOMEM); > + } > + olh->size = size; > + > + /* > + * Do the actual value dump > + */ > + memset(&da, 0, sizeof(da)); > + da.ch = ch; > + da.sd = sd; > + ipfw_objhash_foreach(vi, dump_tvalue, &da); > + > + IPFW_UH_RUNLOCK(ch); > + > + return (0); > +} > + > +void > +ipfw_table_value_init(struct ip_fw_chain *ch, int first) > +{ > + struct tables_config *tcfg; > + > + ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value), 
> + M_IPFW, M_WAITOK | M_ZERO); > + > + tcfg = ch->tblcfg; > + > + tcfg->val_size = VALDATA_START_SIZE; > + tcfg->valhash = ipfw_objhash_create(tcfg->val_size); > + ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value, > + cmp_table_value); > + > + IPFW_ADD_SOPT_HANDLER(first, scodes); > +} > + > +static void > +destroy_value(struct namedobj_instance *ni, struct named_object *no, > + void *arg) > +{ > + > + free(no, M_IPFW); > +} > + > +void > +ipfw_table_value_destroy(struct ip_fw_chain *ch, int last) > +{ > + > + IPFW_DEL_SOPT_HANDLER(last, scodes); > + > + free(ch->valuestate, M_IPFW); > + ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch); > + ipfw_objhash_destroy(CHAIN_TO_VI(ch)); > +} > + > diff --git a/example/ipfw/sys/sys/fnv_hash.h b/example/ipfw/sys/sys/fnv_hash.h > new file mode 100644 > index 0000000..f070e6e > --- /dev/null > +++ b/example/ipfw/sys/sys/fnv_hash.h > @@ -0,0 +1,71 @@ > +/*- > + * Fowler / Noll / Vo Hash (FNV Hash) > + * http://www.isthe.com/chongo/tech/comp/fnv/ > + * > + * This is an implementation of the algorithms posted above. > + * This file is placed in the public domain by Peter Wemm. 
> + * > + * $FreeBSD: head/sys/sys/fnv_hash.h 268351 2014-07-07 00:27:09Z marcel $ > + */ > +#ifndef _SYS_FNV_HASH_H_ > +#define _SYS_FNV_HASH_H_ > + > +typedef u_int32_t Fnv32_t; > +typedef u_int64_t Fnv64_t; > + > +#define FNV1_32_INIT ((Fnv32_t) 33554467UL) > +#define FNV1_64_INIT ((Fnv64_t) 0xcbf29ce484222325ULL) > + > +#define FNV_32_PRIME ((Fnv32_t) 0x01000193UL) > +#define FNV_64_PRIME ((Fnv64_t) 0x100000001b3ULL) > + > +static __inline Fnv32_t > +fnv_32_buf(const void *buf, size_t len, Fnv32_t hval) > +{ > + const u_int8_t *s = (const u_int8_t *)buf; > + > + while (len-- != 0) { > + hval *= FNV_32_PRIME; > + hval ^= *s++; > + } > + return hval; > +} > + > +static __inline Fnv32_t > +fnv_32_str(const char *str, Fnv32_t hval) > +{ > + const u_int8_t *s = (const u_int8_t *)str; > + Fnv32_t c; > + > + while ((c = *s++) != 0) { > + hval *= FNV_32_PRIME; > + hval ^= c; > + } > + return hval; > +} > + > +static __inline Fnv64_t > +fnv_64_buf(const void *buf, size_t len, Fnv64_t hval) > +{ > + const u_int8_t *s = (const u_int8_t *)buf; > + > + while (len-- != 0) { > + hval *= FNV_64_PRIME; > + hval ^= *s++; > + } > + return hval; > +} > + > +static __inline Fnv64_t > +fnv_64_str(const char *str, Fnv64_t hval) > +{ > + const u_int8_t *s = (const u_int8_t *)str; > + u_register_t c; /* 32 bit on i386, 64 bit on alpha */ > + > + while ((c = *s++) != 0) { > + hval *= FNV_64_PRIME; > + hval ^= c; > + } > + return hval; > +} > +#endif /* _SYS_FNV_HASH_H_ */ > diff --git a/example/ipfw/sys/sys/hash.h b/example/ipfw/sys/sys/hash.h > new file mode 100644 > index 0000000..bd8fa69 > --- /dev/null > +++ b/example/ipfw/sys/sys/hash.h > @@ -0,0 +1,133 @@ > +/*- > + * Copyright (c) 2001 Tobias Weingartner > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. 
Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR > + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES > + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. > + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, > + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT > + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF > + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * > + * $OpenBSD: hash.h,v 1.4 2004/05/25 18:37:23 jmc Exp $ > + * $FreeBSD: head/sys/sys/hash.h 272906 2014-10-10 19:26:26Z gnn $ > + */ > + > +#ifndef _SYS_HASH_H_ > +#define _SYS_HASH_H_ > +#include <sys/types.h> > + > +/* Convenience */ > +#ifndef HASHINIT > +#define HASHINIT 5381 > +#define HASHSTEP(x,c) (((x << 5) + x) + (c)) > +#endif > + > +/* > + * Return a 32-bit hash of the given buffer. The init > + * value should be 0, or the previous hash value to extend > + * the previous hash. > + */ > +static __inline uint32_t > +hash32_buf(const void *buf, size_t len, uint32_t hash) > +{ > + const unsigned char *p = buf; > + > + while (len--) > + hash = HASHSTEP(hash, *p++); > + > + return hash; > +} > + > +/* > + * Return a 32-bit hash of the given string. 
> + */ > +static __inline uint32_t > +hash32_str(const void *buf, uint32_t hash) > +{ > + const unsigned char *p = buf; > + > + while (*p) > + hash = HASHSTEP(hash, *p++); > + > + return hash; > +} > + > +/* > + * Return a 32-bit hash of the given string, limited by N. > + */ > +static __inline uint32_t > +hash32_strn(const void *buf, size_t len, uint32_t hash) > +{ > + const unsigned char *p = buf; > + > + while (*p && len--) > + hash = HASHSTEP(hash, *p++); > + > + return hash; > +} > + > +/* > + * Return a 32-bit hash of the given string terminated by C, > + * (as well as 0). This is mainly here as a helper for the > + * namei() hashing of path name parts. > + */ > +static __inline uint32_t > +hash32_stre(const void *buf, int end, const char **ep, uint32_t hash) > +{ > + const unsigned char *p = buf; > + > + while (*p && (*p != end)) > + hash = HASHSTEP(hash, *p++); > + > + if (ep) > + *ep = (const char *)p; > + > + return hash; > +} > + > +/* > + * Return a 32-bit hash of the given string, limited by N, > + * and terminated by C (as well as 0). This is mainly here > + * as a helper for the namei() hashing of path name parts. > + */ > +static __inline uint32_t > +hash32_strne(const void *buf, size_t len, int end, const char **ep, > + uint32_t hash) > +{ > + const unsigned char *p = buf; > + > + while (*p && (*p != end) && len--) > + hash = HASHSTEP(hash, *p++); > + > + if (ep) > + *ep = (const char *)p; > + > + return hash; > +} > + > +#ifdef _KERNEL > +/* > + * Hashing function from Bob Jenkins. Implementation in libkern/jenkins_hash.c. > + */ > +uint32_t jenkins_hash(const void *, size_t, uint32_t); > +uint32_t jenkins_hash32(const uint32_t *, size_t, uint32_t); > + > +uint32_t murmur3_aligned_32(const void *data, size_t len, uint32_t seed); > + > +#endif /* _KERNEL */ > + > +#endif /* !_SYS_HASH_H_ */
On 15 March 2016 at 12:31, Maxim Uvarov <maxim.uvarov@linaro.org> wrote: > On 03/11/16 16:25, venkatesh.vivekanandan@linaro.org wrote: > >> From: Venkatesh Vivekanandan <venkatesh.vivekanandan@linaro.org> >> >> example/ipfw files are the ones that are taken as-is from netmap-ipfw. >> source can be found @ https://github.com/luigirizzo/netmap-ipfw.git >> >> Signed-off-by: Venkatesh Vivekanandan <venkatesh.vivekanandan@linaro.org> >> --- >> v2: remove permanent stub files and generate them while building >> >> example/ipfw/BSDmakefile | 8 + >> example/ipfw/README | 76 + >> example/ipfw/UPDATE | 44 + >> example/ipfw/extra/expand_number.c | 101 + >> example/ipfw/extra/glue.c | 555 +++ >> example/ipfw/extra/glue.h | 488 ++ >> example/ipfw/extra/humanize_number.c | 167 + >> example/ipfw/extra/ipfw2_mod.c | 278 ++ >> example/ipfw/extra/linux_defs.h | 144 + >> example/ipfw/extra/missing.c | 732 +++ >> example/ipfw/extra/missing.h | 801 ++++ >> example/ipfw/extra/session.c | 644 +++ >> example/ipfw/extra/sys/contrib/pf/net/pfvar.h | 27 + >> example/ipfw/extra/sys/sys/kernel.h | 26 + >> example/ipfw/extra/sys/sys/malloc.h | 13 + >> example/ipfw/extra/sys/sys/mbuf.h | 383 ++ >> example/ipfw/extra/sys/sys/module.h | 43 + >> example/ipfw/extra/sys/sys/systm.h | 159 + >> example/ipfw/extra/sys/sys/taskqueue.h | 51 + >> example/ipfw/ipfw/altq.c | 151 + >> example/ipfw/ipfw/dummynet.c | 1410 ++++++ >> example/ipfw/ipfw/ipfw.8 | 3723 +++++++++++++++ >> example/ipfw/ipfw/ipfw2.c | 4968 >> +++++++++++++++++++++ >> example/ipfw/ipfw/ipfw2.h | 352 ++ >> example/ipfw/ipfw/ipv6.c | 536 +++ >> example/ipfw/ipfw/main.c | 628 +++ >> example/ipfw/ipfw/nat.c | 1115 +++++ >> example/ipfw/ipfw/tables.c | 2013 +++++++++ >> example/ipfw/sys/net/pfil.h | 148 + >> example/ipfw/sys/net/radix.c | 1208 +++++ >> example/ipfw/sys/net/radix.h | 168 + >> example/ipfw/sys/netgraph/ng_ipfw.h | 33 + >> example/ipfw/sys/netinet/in_cksum.c | 146 + >> example/ipfw/sys/netinet/ip_dummynet.h | 264 ++ >> 
example/ipfw/sys/netinet/ip_fw.h | 1009 +++++ >> example/ipfw/sys/netinet/tcp.h | 247 + >> example/ipfw/sys/netinet/udp.h | 69 + >> example/ipfw/sys/netpfil/ipfw/dn_heap.c | 552 +++ >> example/ipfw/sys/netpfil/ipfw/dn_heap.h | 192 + >> example/ipfw/sys/netpfil/ipfw/dn_sched.h | 192 + >> example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c | 120 + >> example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c | 229 + >> example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c | 864 ++++ >> example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c | 307 ++ >> example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c | 373 ++ >> example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c | 846 ++++ >> example/ipfw/sys/netpfil/ipfw/ip_dn_io.c | 960 ++++ >> example/ipfw/sys/netpfil/ipfw/ip_dn_private.h | 404 ++ >> example/ipfw/sys/netpfil/ipfw/ip_dummynet.c | 2334 ++++++++++ >> example/ipfw/sys/netpfil/ipfw/ip_fw2.c | 2905 ++++++++++++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c | 1604 +++++++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c | 537 +++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_log.c | 567 +++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c | 587 +++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_private.h | 625 +++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c | 3469 ++++++++++++++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_table.c | 3674 +++++++++++++++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_table.h | 246 + >> example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c | 4081 >> +++++++++++++++++ >> example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c | 812 ++++ >> example/ipfw/sys/sys/fnv_hash.h | 71 + >> example/ipfw/sys/sys/hash.h | 133 + >> 62 files changed, 49612 insertions(+) >> create mode 100644 example/ipfw/BSDmakefile >> create mode 100644 example/ipfw/README >> create mode 100644 example/ipfw/UPDATE >> create mode 100644 example/ipfw/extra/expand_number.c >> create mode 100644 example/ipfw/extra/glue.c >> create mode 100644 example/ipfw/extra/glue.h >> create mode 100644 example/ipfw/extra/humanize_number.c >> create mode 100644 
example/ipfw/extra/ipfw2_mod.c >> create mode 100644 example/ipfw/extra/linux_defs.h >> create mode 100644 example/ipfw/extra/missing.c >> create mode 100644 example/ipfw/extra/missing.h >> create mode 100644 example/ipfw/extra/session.c >> create mode 100644 example/ipfw/extra/sys/contrib/pf/net/pfvar.h >> create mode 100644 example/ipfw/extra/sys/sys/kernel.h >> create mode 100644 example/ipfw/extra/sys/sys/malloc.h >> create mode 100644 example/ipfw/extra/sys/sys/mbuf.h >> create mode 100644 example/ipfw/extra/sys/sys/module.h >> create mode 100644 example/ipfw/extra/sys/sys/systm.h >> create mode 100644 example/ipfw/extra/sys/sys/taskqueue.h >> create mode 100644 example/ipfw/ipfw/altq.c >> create mode 100644 example/ipfw/ipfw/dummynet.c >> create mode 100644 example/ipfw/ipfw/ipfw.8 >> create mode 100644 example/ipfw/ipfw/ipfw2.c >> create mode 100644 example/ipfw/ipfw/ipfw2.h >> create mode 100644 example/ipfw/ipfw/ipv6.c >> create mode 100644 example/ipfw/ipfw/main.c >> create mode 100644 example/ipfw/ipfw/nat.c >> create mode 100644 example/ipfw/ipfw/tables.c >> create mode 100644 example/ipfw/sys/net/pfil.h >> create mode 100644 example/ipfw/sys/net/radix.c >> create mode 100644 example/ipfw/sys/net/radix.h >> create mode 100644 example/ipfw/sys/netgraph/ng_ipfw.h >> create mode 100644 example/ipfw/sys/netinet/in_cksum.c >> create mode 100644 example/ipfw/sys/netinet/ip_dummynet.h >> create mode 100644 example/ipfw/sys/netinet/ip_fw.h >> create mode 100644 example/ipfw/sys/netinet/tcp.h >> create mode 100644 example/ipfw/sys/netinet/udp.h >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_heap.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_heap.h >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched.h >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c >> create mode 100644 
example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dn_io.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dn_private.h >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_dummynet.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw2.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_log.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_private.h >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table.h >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c >> create mode 100644 example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c >> create mode 100644 example/ipfw/sys/sys/fnv_hash.h >> create mode 100644 example/ipfw/sys/sys/hash.h >> >> diff --git a/example/ipfw/BSDmakefile b/example/ipfw/BSDmakefile >> new file mode 100644 >> index 0000000..810ae2b >> --- /dev/null >> +++ b/example/ipfw/BSDmakefile >> @@ -0,0 +1,8 @@ >> +# forward to use gmake >> +.PHONY: ipfw kipfw >> + >> +all: >> + gmake >> + >> +$(.TARGETS) : >> + gmake MAKE=gmake $(.TARGETS) >> diff --git a/example/ipfw/README b/example/ipfw/README >> new file mode 100644 >> index 0000000..2a55ba0 >> --- /dev/null >> +++ b/example/ipfw/README >> @@ -0,0 +1,76 @@ >> +# README FILE FOR IPFW-USER ON TOP OF NETMAP >> + >> > > This readme file should describe ODP and things related to odp. > Might be some comparison numbers between ODP and original code. 
> I removed the README file from this hunk (basically to keep this hunk limited to files that are taken as-is from netmap-ipfw) and added it as part of patch 6/6, "linux-generic: odp-ipfw: New file example/ipfw/extra/odp_ipfw.c". Patch 6/6 has all ODP-related changes. I will send v3 with these changes. > > > +This directory contains a version of ipfw and dummynet that can >> +run in userland, using NETMAP as the backend for packet I/O. >> +This permits a throughput about 10 times higher than the >> +corresponding in-kernel version. I have measured about 6.5 Mpps >> +for plain filtering, and 2.2 Mpps going through a pipe. >> +Some optimizations are possible when running on netmap pipes, >> +or other netmap ports that support zero copy. >> + >> +To build the code simply run >> + make NETMAP_INC=/some/where/with/netmap-release/sys >> + >> +pointing to the netmap 'sys' directory >> +(the makefile uses gmake underneath) >> + >> +The base version comes from FreeBSD-HEAD -r '{2012-08-03}' >> +(and subsequently updated in late 2013) >> +with small modifications listed below >> + >> + netinet/ipfw >> + ip_dn_io.c >> + support for on-stack mbufs >> + ip_fw2.c >> + some conditional compilation for functions not >> + available in userspace >> + ip_fw_log.c >> + revise snprintf, SNPARGS (MAC) >> + >> + >> +sbin/ipfw and the kernel counterpart communicate throuugh a >> +TCP socket (localhost:5555) carrying the raw data that would >> +normally be carried on seg/getsockopt. >> + >> +For testing purposes, opening a telnet session to port 5556 and >> +typing some bytes will start a fake 'infinite source' so you can >> +check how fast your ruleset works. 
>> + >> + gmake >> + dummynet/ipfw & # preferably in another window >> + telnet localhost 5556 # type some bytes to start 'traffic' >> + >> + sh -c "while true; do ipfw/ipfw show; ipfw/ipfw zero; sleep 1; >> done" >> + >> +(on an i7-3400 I get about 15 Mpps) >> + >> +Real packet I/O is possible using netmap >> info.iet.unipi.it/~luigi/netmap/ >> +You can use a couple of VALE switches (part of netmap) to connect >> +a source and sink to the userspace firewall, as follows >> + >> + s f f d >> + [pkt-gen]-->--[valeA]-->--[kipfw]-->--[valeB]-->--[pkt-gen] >> + >> +The commands to run (in separate windows) are >> + >> + # preliminarly, load the netmap module >> + sudo kldload netmap.ko >> + >> + # connect the firewall to two vale switches >> + ./kipfw valeA:f valeB:f & >> + >> + # configure ipfw/dummynet >> + ipfw/ipfw show # or other >> + >> + # start the sink >> + pkt-gen -i valeB:d -f rx >> + >> + # start an infinite source >> + pkt-gen -i valeA:s -f tx >> + >> + # plain again with the firewall and enjoy >> + ipfw/ipfw show # or other >> > Ok, that is good setup. We support VALE and netmap pktio so might be we > will > have about the same numbers. We need some 'make check' script to prove that > example still works. That can be done with pcap pktio or with netmap/VALE > or > with loop back. > I have tested it only on linux-generic and provided the steps to make it work on that platform. When it is supported on netmap/odp-dpdk, the README can be updated with those numbers. Patch set v3 is updated with linux-generic numbers. > > + >> +On my i7-3400 I get about 6.5 Mpps with a single rule, and about 2.2 Mpps >> +when going through a dummynet pipe. This is for a single process handling >> +the traffic. 
>> diff --git a/example/ipfw/UPDATE b/example/ipfw/UPDATE >> new file mode 100644 >> index 0000000..3da344f >> --- /dev/null >> +++ b/example/ipfw/UPDATE >> @@ -0,0 +1,44 @@ >> +--- 20141017 --- updating to FreeBSD head 273155 >> + >> +sys/net/pfil.h V $FreeBSD$ >> +sys/net/radix.h V $FreeBSD$ >> +sys/net/radix.c V merge, caddr_t -> u_char * >> + >> +sys/netgraph/ng_ipfw.h -- unchanged >> + >> +sys/netinet/in_cksum.c -- unchanged >> +sys/netinet/ip_dummynet.h V add DN_IS_ECN >> +sys/netinet/ip_fw.h massive changes >> +sys/netinet/tcp.h V $FreeBSD$ >> +sys/netinet/udp.h V $FreeBSD$ >> + >> +sys/netpfil/ipfw >> +dn_heap.c -- unchanged >> +dn_heap.h -- unchanged >> +dn_sched.h $FreeBSD$ >> +dn_sched_fifo.c -- unchanged >> +dn_sched_prio.c -- unchanged >> +dn_sched_qfq.c -- unchanged >> +dn_sched_rr.c -- unchanged >> +dn_sched_wf2q.c -- unchanged >> +ip_dn_glue.c V $FreeBSD$ >> +ip_dn_io.c V ecn, check missing ifp >> +ip_dn_private.h V $FreeBSD$ >> +ip_dummynet.c V $FreeBSD$, callout_reset_sbt, fs fixes, >> module >> +ip_fw2.c XXX large >> +ip_fw_dynamic.c XXX large >> +ip_fw_log.c XXX IP_FW_ARG.. TARG >> +ip_fw_pfil.c XXX small change >> +ip_fw_private.h XXX large >> +ip_fw_sockopt.c XXX huge >> +ip_fw_table.c XXX huge >> + >> +Userspace: >> +altq.c $FreeBSD$, bprintf >> +dummynet.c $FreeBSD$, ecn, bprintf >> +ipfw2.c $FreeBSD$, commands >> +ipfw2.h as above >> +ipv6.c as above >> +main.c small changes >> +nat.c internal >> + >> diff --git a/example/ipfw/extra/expand_number.c >> b/example/ipfw/extra/expand_number.c >> new file mode 100644 >> index 0000000..523fbb0 >> --- /dev/null >> +++ b/example/ipfw/extra/expand_number.c >> @@ -0,0 +1,101 @@ >> +/*- >> + * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org> >> + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> >> + * All rights reserved. 
>> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * 1. Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * 2. Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in the >> + * documentation and/or other materials provided with the >> distribution. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' >> AND >> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE >> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR >> PURPOSE >> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE >> LIABLE >> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR >> CONSEQUENTIAL >> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE >> GOODS >> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) >> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, >> STRICT >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY >> WAY >> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF >> + * SUCH DAMAGE. >> + */ >> + >> +#include <sys/cdefs.h> >> +__FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 211343 2010-08-15 >> 18:32:06Z des $"); >> + >> +#include <sys/types.h> >> +#include <ctype.h> >> +#include <errno.h> >> +#include <inttypes.h> >> +//#include <libutil.h> >> +#include <stdint.h> >> + >> +/* >> + * Convert an expression of the following forms to a uint64_t. >> + * 1) A positive decimal number. >> + * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). >> + * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 >> << 10). 
>> + * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 >> << 20). >> + * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 >> << 30). >> + * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 >> << 40). >> + * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 >> << 50). >> + * 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 >> << 60). >> + */ >> +int >> +expand_number(const char *buf, uint64_t *num) >> +{ >> + uint64_t number; >> + unsigned shift; >> + char *endptr; >> + >> + number = strtoumax(buf, &endptr, 0); >> + >> + if (endptr == buf) { >> + /* No valid digits. */ >> + errno = EINVAL; >> + return (-1); >> + } >> + >> + switch (tolower((unsigned char)*endptr)) { >> + case 'e': >> + shift = 60; >> + break; >> + case 'p': >> + shift = 50; >> + break; >> + case 't': >> + shift = 40; >> + break; >> + case 'g': >> + shift = 30; >> + break; >> + case 'm': >> + shift = 20; >> + break; >> + case 'k': >> + shift = 10; >> + break; >> + case 'b': >> + case '\0': /* No unit. */ >> + *num = number; >> + return (0); >> + default: >> + /* Unrecognized unit. 
*/ >> + errno = EINVAL; >> + return (-1); >> + } >> + >> + if ((number << shift) >> shift != number) { >> + /* Overflow */ >> + errno = ERANGE; >> + return (-1); >> + } >> + >> + *num = number << shift; >> + return (0); >> +} >> diff --git a/example/ipfw/extra/glue.c b/example/ipfw/extra/glue.c >> new file mode 100644 >> index 0000000..0786453 >> --- /dev/null >> +++ b/example/ipfw/extra/glue.c >> @@ -0,0 +1,555 @@ >> +/* >> + * Userland functions missing in linux >> + * taken from /usr/src/lib/libc/stdtime/time32.c >> + */ >> + >> +#include <stdlib.h> >> +#include <stdio.h> >> +#include <sys/types.h> >> +#include <sys/socket.h> >> +#include <netinet/in.h> /* sockaddr_in */ >> +#include <netinet/tcp.h> /* TCP_NODELAY */ >> +#include <sys/uio.h> >> +#include <unistd.h> /* uint* types */ >> +#include <errno.h> >> +#include <string.h> /* bzero */ >> +#include <arpa/inet.h> /* htonl */ >> + >> +#ifndef HAVE_NAT >> +/* dummy nat functions */ >> +void >> +ipfw_show_nat(int ac, char **av) >> +{ >> + D("unsupported"); >> +} >> + >> +void >> +ipfw_config_nat(int ac, char **av) >> +{ >> + D("unsupported"); >> +} >> +#endif /* HAVE_NAT */ >> + >> +#ifdef NEED_STRTONUM >> +/* missing in linux and windows */ >> +long long int >> +strtonum(const char *nptr, long long minval, long long maxval, >> + const char **errstr) >> +{ >> + long long ret; >> + int errno_c = errno; /* save actual errno */ >> + >> + errno = 0; >> +#ifdef TCC >> + ret = strtol(nptr, (char **)errstr, 0); >> +#else >> + ret = strtoll(nptr, (char **)errstr, 0); >> +#endif >> + /* We accept only a string that represent exactly a number (ie. >> start >> + * and end with a digit). >> + * FreeBSD version wants errstr==NULL if no error occurs, >> otherwise >> + * errstr should point to an error string. 
>> + * For our purspose, we implement only the invalid error, ranges >> + * error aren't checked >> + */ >> + if (errno != 0 || nptr == *errstr || **errstr != '\0') >> + *errstr = "invalid"; >> + else { >> + *errstr = NULL; >> + errno = errno_c; >> + } >> + return ret; >> +} >> + >> +int >> +ishexnumber(int c) >> +{ >> + return ((c >= '0' && c <= '9') || >> + (c >= 'a' && c <= 'f') || >> + (c >= 'A' && c <= 'F') ); >> +} >> + >> +#endif /* NEED_STRTONUM */ >> + >> +#ifdef __linux__ >> + >> + >> +int optreset; /* missing in linux */ >> + >> +/* >> + * not implemented in linux. >> + * taken from /usr/src/lib/libc/string/strlcpy.c >> + */ >> +size_t >> +strlcpy(dst, src, siz) >> + char *dst; >> + const char *src; >> + size_t siz; >> +{ >> + char *d = dst; >> + const char *s = src; >> + size_t n = siz; >> + >> + /* Copy as many bytes as will fit */ >> + if (n != 0 && --n != 0) { >> + do { >> + if ((*d++ = *s++) == 0) >> + break; >> + } while (--n != 0); >> + } >> + >> + /* Not enough room in dst, add NUL and traverse rest of src */ >> + if (n == 0) { >> + if (siz != 0) >> + *d = '\0'; /* NUL-terminate dst */ >> + while (*s++) >> + ; >> + } >> + >> + return(s - src - 1); /* count does not include NUL */ >> +} >> + >> + >> +#endif /* __linux__ */ >> + >> + >> +#if defined (EMULATE_SYSCTL) >> +//XXX missing prerequisites >> +#include <net/if.h> //openwrt >> +#include <netinet/ip.h> //openwrt >> +#include <netinet/ip_fw.h> >> +#include <netinet/ip_dummynet.h> >> +int do_cmd(int optname, void *optval, uintptr_t optlen); >> +#endif /* EMULATE_SYSCTL */ >> + >> +/* >> + * set or get system information >> + * XXX lock acquisition/serialize calls >> + * >> + * we export this as sys/module/ipfw_mod/parameters/___ >> + * This function get or/and set the value of the sysctl passed by >> + * the name parameter. If the old value is not desired, >> + * oldp and oldlenp should be set to NULL. 
>> + * >> + * XXX >> + * I do not know how this works in FreeBSD in the case >> + * where there are no write permission on the sysctl var. >> + * We read the value and set return variables in any way >> + * but returns -1 on write failures, regardless the >> + * read success. >> + * >> + * Since there is no information on types, in the following >> + * code we assume a length of 4 is a int. >> + * >> + * Returns 0 on success, -1 on errors. >> + */ >> +int >> +sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, >> + size_t newlen) >> +{ >> +#if defined (EMULATE_SYSCTL) >> + /* >> + * we embed the sysctl request in the usual sockopt mechanics. >> + * the sockopt buffer il filled with a dn_id with IP_DUMMYNET3 >> + * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET >> + * subcommands. >> + * the syntax of this function is fully compatible with >> + * POSIX sysctlby name: >> + * if newp and newlen are != 0 => this is a set >> + * else if oldp and oldlen are != 0 => this is a get >> + * to avoid too much overhead in the module, the >> whole >> + * sysctltable is returned, and the parsing is done >> in userland, >> + * a probe request is done to retrieve the size >> needed to >> + * transfer the table, before the real request >> + * if both old and new params = 0 => this is a print >> + * this is a special request, done only by main() >> + * to implement the extension './ipfw sysctl', >> + * a command that bypasses the normal getopt, and >> that >> + * is available on those platforms that use this >> + * sysctl emulation. 
>> + * in this case, a negative oldlen signals that *oldp >> + * is actually a FILE* to print somewhere else than >> stdout >> + */ >> + >> + int l; >> + int ret; >> + struct dn_id* oid; >> + struct sysctlhead* entry; >> + char* pstring; >> + char* pdata; >> + FILE* fp; >> + >> + if((oldlenp != NULL) && ((int)*oldlenp < 0)) >> + fp = (FILE*)oldp; >> + else >> + fp = stdout; >> + if(newp != NULL && newlen != 0) >> + { >> + //this is a set >> + l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + >> strlen(name)+1 + newlen; >> + oid = malloc(l); >> + if (oid == NULL) >> + return -1; >> + oid->len = l; >> + oid->type = DN_SYSCTL_SET; >> + oid->id = DN_API_VERSION; >> + >> + entry = (struct sysctlhead*)(oid+1); >> + pdata = (char*)(entry+1); >> + pstring = pdata + newlen; >> + >> + entry->blocklen = ((sizeof(struct sysctlhead) + >> strlen(name)+1 + newlen) + 3) & ~3; >> + entry->namelen = strlen(name)+1; >> + entry->flags = 0; >> + entry->datalen = newlen; >> + >> + bcopy(newp, pdata, newlen); >> + bcopy(name, pstring, strlen(name)+1); >> + >> + ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l); >> + if (ret != 0) >> + return -1; >> + } >> + else >> + { >> + //this is a get or a print >> + l = sizeof(struct dn_id); >> + oid = malloc(l); >> + if (oid == NULL) >> + return -1; >> + oid->len = l; >> + oid->type = DN_SYSCTL_GET; >> + oid->id = DN_API_VERSION; >> + >> + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); >> + if (ret != 0) >> + return -1; >> + >> + l=oid->id; >> + free(oid); >> + oid = malloc(l); >> + if (oid == NULL) >> + return -1; >> + oid->len = l; >> + oid->type = DN_SYSCTL_GET; >> + oid->id = DN_API_VERSION; >> + >> + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); >> + if (ret != 0) >> + return -1; >> + >> + entry = (struct sysctlhead*)(oid+1); >> + while(entry->blocklen != 0) >> + { >> + pdata = (char*)(entry+1); >> + pstring = pdata+entry->datalen; >> + >> + //time to check if this is a get or a print >> + if(name != NULL && oldp != NULL && *oldlenp > 
0) >> + { >> + //this is a get >> + if(strcmp(name,pstring) == 0) >> + { >> + //match found, sanity chech on len >> + if(*oldlenp < entry->datalen) >> + { >> + printf("%s error: buffer >> too small\n",__FUNCTION__); >> + return -1; >> + } >> + *oldlenp = entry->datalen; >> + bcopy(pdata, oldp, *oldlenp); >> + return 0; >> + } >> + } >> + else >> + { >> + //this is a print >> + if( name == NULL ) >> + goto print; >> + if ( (strncmp(pstring,name,strlen(name)) >> == 0) && ( pstring[strlen(name)]=='\0' || pstring[strlen(name)]=='.' ) ) >> + goto print; >> + else >> + goto skip; >> +print: >> + fprintf(fp, "%s: ",pstring); >> + switch( entry->flags >> 2 ) >> + { >> + case SYSCTLTYPE_LONG: >> + fprintf(fp, "%li ", >> *(long*)(pdata)); >> + break; >> + case SYSCTLTYPE_UINT: >> + fprintf(fp, "%u ", >> *(unsigned int*)(pdata)); >> + break; >> + case SYSCTLTYPE_ULONG: >> + fprintf(fp, "%lu ", >> *(unsigned long*)(pdata)); >> + break; >> + case SYSCTLTYPE_INT: >> + default: >> + fprintf(fp, "%i ", >> *(int*)(pdata)); >> + } >> + if( (entry->flags & 0x00000003) == >> CTLFLAG_RD ) >> + fprintf(fp, "\t(read only)\n"); >> + else >> + fprintf(fp, "\n"); >> +skip: ; >> + } >> + entry = (struct sysctlhead*)((unsigned >> char*)entry + entry->blocklen); >> + } >> + free(oid); >> + return 0; >> + } >> + //fallback for invalid options >> + return -1; >> + >> +#else /* __linux__ */ >> + FILE *fp; >> + char *basename = "/sys/module/ipfw_mod/parameters/"; >> + char filename[256]; /* full filename */ >> + char *varp; >> + int ret = 0; /* return value */ >> + int d; >> + >> + if (name == NULL) /* XXX set errno */ >> + return -1; >> + >> + /* locate the filename */ >> + varp = strrchr(name, '.'); >> + if (varp == NULL) /* XXX set errno */ >> + return -1; >> + >> + snprintf(filename, sizeof(filename), "%s%s", basename, varp+1); >> + >> + /* >> + * XXX we could open the file here, in rw mode >> + * but need to check if a file have write >> + * permissions. 
>> + */ >> + >> + /* check parameters */ >> + if (oldp && oldlenp) { /* read mode */ >> + fp = fopen(filename, "r"); >> + if (fp == NULL) { >> + fprintf(stderr, "%s fopen error reading filename >> %s\n", __FUNCTION__, filename); >> + return -1; >> + } >> + if (*oldlenp == 4) { >> + if (fscanf(fp, "%d", &d) == 1) >> + memcpy(oldp, &d, *oldlenp); >> + else >> + ret = -1; >> + } >> + fclose(fp); >> + } >> + >> + if (newp && newlen) { /* write */ >> + fp = fopen(filename, "w"); >> + if (fp == NULL) { >> + fprintf(stderr, "%s fopen error writing filename >> %s\n", __FUNCTION__, filename); >> + return -1; >> + } >> + if (newlen == 4) { >> + if (fprintf(fp, "%d", *(int*)newp) < 1) >> + ret = -1; >> + } >> + >> + fclose(fp); >> + } >> + >> + return ret; >> +#endif /* __linux__ */ >> +} >> + >> +/* >> + * The following two functions implement getsockopt/setsockopt >> + * replacements to talk over a TCP socket. >> + * Because the calls are synchronous, we can run blocking code >> + * and do not need to play special tricks to be selectable. >> + * The wire protocol for the emulation is the following: >> + * REQUEST: n32 req_size, level, optname; u8 data[req_size] >> + * RESPONSE: n32 resp_size, ret_code; u8 data[resp_size] >> + * data is only present if ret_code == 0 >> + * >> + * Return 0 if the message wan sent to the remote >> + * endpoint, -1 on error. >> + * >> + * If the required lenght is greater then the >> + * available buffer size, -1 is returned and >> + * optlen is the required lenght. 
>> + */ >> +enum sock_type {GET_SOCKOPT, SET_SOCKOPT}; >> + >> +struct wire_hdr { >> + uint32_t optlen; /* actual data len */ >> + uint32_t level; /* or error */ >> + uint32_t optname; /* or act len */ >> + uint32_t dir; /* in or out */ >> +}; >> + >> +/* do a complete write of the buffer */ >> +static int >> +writen(int fd, const char *buf, int len) >> +{ >> + int i; >> + >> + for (; len > 0; buf += i, len -= i) { >> + i = write(fd, buf, len); >> + ND("have %d wrote %d", len, i); >> + if (i < 0) { >> + if (errno == EAGAIN) >> + continue; >> + return -1; >> + } >> + } >> + return 0; >> +} >> + >> +/* do a complete read */ >> +static int >> +readn(int fd, char *buf, int len) >> +{ >> + int i, pos; >> + >> + for (pos = 0; pos < len; pos += i) { >> + i = read(fd, buf + pos, len - pos); >> + ND("have %d want %d got %d", pos, len, i); >> + if (i < 0) { >> + if (errno == EAGAIN) >> + continue; >> + return -1; >> + } >> + } >> + ND("full read got %d", pos); >> + return 0; >> +} >> + >> +int >> +__sockopt2(int s, int level, int optname, void *optval, socklen_t >> *optlen, >> + enum sopt_dir dir) >> +{ >> + struct wire_hdr r; >> + int len = optlen && optval ? 
*optlen : 0; >> + int new_errno; >> + >> + ND("dir %d optlen %d level %d optname %d", dir, len, level, >> optname); >> + /* send request to the server */ >> + r.optlen = htonl(len); >> + r.level = htonl(level); >> + r.optname = htonl(optname); >> + r.dir = htonl(dir); >> + >> + if (writen(s, (const char *) &r, sizeof(r))) >> + return -1; /* error writing */ >> + >> + /* send data, if present */ >> + if (len < 0) { >> + fprintf(stderr, "%s invalid args found\n", __FUNCTION__); >> + return -1; >> + } else if (len > 0) { >> + if (writen(s, optval, len)) >> + return -1; /* error writing */ >> + } >> + >> + /* read response size and error code */ >> + if (readn(s, (char *)&r, sizeof(r))) >> + return -1; /* error reading */ >> + len = ntohl(r.optlen); >> + ND("got header, datalen %d", len); >> + if (len > 0) { >> + if (readn(s, optval, len)) { >> + return -1; /* error reading */ >> + } >> + } >> + if (optlen) >> + *optlen = ntohl(r.optlen); /* actual len */ >> + new_errno = ntohl(r.level); >> + if (new_errno) >> + errno = new_errno; >> + return (new_errno ? -1 : 0); >> +} >> + >> +/* >> + * getsockopt() replacement. >> + */ >> +int >> +getsockopt2(int s, int level, int optname, void *optval, >> + socklen_t *optlen) >> +{ >> + return __sockopt2(s, level, optname, optval, optlen, SOPT_GET); >> +} >> + >> +/* >> + * setsockopt() replacement >> + */ >> +int >> +setsockopt2(int s, int level, int optname, void *optval, >> + socklen_t optlen) >> +{ >> + /* optlen not changed, use the local address */ >> + return __sockopt2(s, level, optname, optval, &optlen, SOPT_SET); >> +} >> + >> +#ifdef socket >> +#undef socket /* we want the real one */ >> +#endif >> +/* >> + * This function replaces the socket() call to connect to >> + * the ipfw control socket. >> + * We actually ignore the paramerers if IPFW_HOST and IPFW_PORT >> + * are defined. 
>> + */ >> +int >> +do_connect(const char *addr, int port) >> +{ >> + int conn_fd; >> + >> + /* open the socket */ >> +#ifdef NETLINK >> + >> +struct rtnl_handle rth; >> + >> + conn_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); >> +#else >> + struct sockaddr_in server; /* server address */ >> + const char *s; >> + >> + conn_fd = socket(AF_INET, SOCK_STREAM, 0); >> + if (conn_fd < 0) { >> + perror("socket"); >> + return -1; >> + } >> +#endif >> +#ifndef NETLINK >> + /* fill the sockaddr structure with server address */ >> + bzero(&server, sizeof(server)); >> + server.sin_family = AF_INET; >> + >> + /* override the host if set in the environment */ >> + s = getenv("IPFW_HOST"); >> + if (s) >> + addr = s; >> + inet_aton(addr, &server.sin_addr); >> + s = getenv("IPFW_PORT"); >> + if (s && atoi(s) > 0) >> + port = atoi(s); >> + server.sin_port = htons(port); >> + >> + /* connect to the server */ >> + if (connect(conn_fd, (struct sockaddr*) &server, sizeof(server)) >> < 0) { >> + perror("connect"); >> + return -1; >> + } >> +#ifdef setsockopt /* we want the real one here */ >> +#undef setsockopt >> +#undef getsockopt >> +#endif >> + { >> + int on = 1, ret; >> + ret = setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, &on, >> sizeof(on)); >> + ND("set TCP_NODELAY %d returns %d", on, ret); >> + } >> + if (0) >> + fprintf(stderr, "connected to %s:%d\n", >> + inet_ntoa(server.sin_addr), >> ntohs(server.sin_port)); >> +#endif >> + return conn_fd; >> +} >> diff --git a/example/ipfw/extra/glue.h b/example/ipfw/extra/glue.h >> new file mode 100644 >> index 0000000..97b25bf >> --- /dev/null >> +++ b/example/ipfw/extra/glue.h >> @@ -0,0 +1,488 @@ >> +/* >> + * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * 1. 
Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * 2. Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in the >> + * documentation and/or other materials provided with the >> distribution. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND >> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE >> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR >> PURPOSE >> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE >> LIABLE >> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR >> CONSEQUENTIAL >> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE >> GOODS >> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) >> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, >> STRICT >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY >> WAY >> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF >> + * SUCH DAMAGE. >> + */ >> +/* >> + * $Id: glue.h 8327 2011-03-22 17:01:35Z marta $ >> + * >> + * glue code to adapt the FreeBSD version to linux and windows, >> + * userland and kernel. >> + * This is included before any other headers, so we do not have >> + * a chance to override any #define that should appear in other >> + * headers. >> + * First handle headers for userland and kernel. Then common code >> + * (including headers that require a specific order of inclusion), >> + * then the user- and kernel- specific parts. 
>> + */ >> + >> +#ifndef _GLUE_H >> +#define _GLUE_H >> + >> +/* >> + * common definitions to allow portability >> + */ >> +#ifndef __FBSDID >> +#define __FBSDID(x) struct __hack >> +#endif /* FBSDID */ >> + >> +#include <stdint.h> /* linux needs it in addition to sys/types.h */ >> +#include <sys/types.h> /* for size_t */ >> + >> +#define true 1 /* stdbool */ >> +#ifdef _KERNEL /* prevent a warning */ >> +#undef _KERNEL >> +#include <sys/ioctl.h> >> +#include <sys/time.h> >> +#include <errno.h> /* we want errno */ >> +#define _KERNEL >> +#else >> +#include <sys/ioctl.h> >> +#endif >> + >> +#include <time.h> >> +#ifndef USERSPACE >> +#include <netinet/ether.h> >> +#endif >> + >> + >> +/*----- */ >> + >> +/* ipfw2.c - from timeconv.h */ >> +static __inline time_t >> +_long_to_time(long tlong) >> +{ >> + if (sizeof(long) == sizeof(__int32_t)) >> + return((time_t)(__int32_t)(tlong)); >> + return((time_t)tlong); >> +} >> + >> +#define min(a, b) ((a) < (b) ? (a) : (b) ) // radix.c >> +/* >> + * debugging macros from ip_dn_private.h >> + */ >> +#include <sys/time.h> >> +#include <stdio.h> >> +extern char *strrchr(const char *, int); >> +static inline const char *xyz(const char *s) { >> + static char buf[128]; >> + struct timeval t; >> + const char *ret = strrchr(s, '/'); >> + if (ret) s = ret + 1; >> + gettimeofday(&t, NULL); >> + buf[sizeof(buf) - 1] = '\0'; >> + snprintf(buf, sizeof(buf), "[%4d.%06d] %s", >> + (int)(t.tv_sec % 1000), (int)(t.tv_usec), s); >> + return buf; >> +} >> + >> +#define ND(fmt, ...) do {} while (0) >> +#define D1(fmt, ...) do {} while (0) >> +#define D(fmt, ...) fprintf(stderr, "%s:%-10s [%d] " fmt "\n", \ >> + xyz(__FILE__), __FUNCTION__, __LINE__, ## __VA_ARGS__) >> + >> +/* Rate limited version of "D", lps indicates how many per second */ >> +#define RD(lps, format, ...) 
\ >> + do { \ >> + static int t0, __cnt; \ >> + struct timeval __xxts; \ >> + gettimeofday(&__xxts, NULL); \ >> + if (t0 != __xxts.tv_sec) { \ >> + t0 = __xxts.tv_sec; \ >> + __cnt = 0; \ >> + } \ >> + if (__cnt++ < lps) { \ >> + D(format, ##__VA_ARGS__); \ >> + } \ >> + } while (0) >> + >> +#define DX(lev, fmt, ...) do { \ >> + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) >> +/* end debugging macros */ >> + >> + >> +/* >> + * sbin/ipfw on non-freebsd platform >> + */ >> +#ifdef NEED_STRTONUM >> +/* prototypes from libutil */ >> +/* humanize_number(3) */ >> +#define HN_DECIMAL 0x01 >> +#define HN_NOSPACE 0x02 >> +#define HN_B 0x04 >> +#define HN_DIVISOR_1000 0x08 >> +#define HN_IEC_PREFIXES 0x10 >> + >> +#define HN_GETSCALE 0x10 >> +#define HN_AUTOSCALE 0x20 >> + >> + >> +int humanize_number(char *_buf, size_t _len, int64_t _number, >> + const char *_suffix, int _scale, int _flags); >> +int expand_number(const char *buf, uint64_t *num); >> + >> + >> +long long >> +strtonum(const char *nptr, long long minval, long long maxval, >> + const char **errstr); >> +#ifndef __APPLE__ >> +int ishexnumber(int c); >> +#endif >> +#endif /* NEED_STRTONUM */ >> + >> +#ifdef NEED_SYSCTLBYNAME /* and other linux calls */ >> +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, >> + void *newp, size_t newlen); >> +#define setprogname(x) /* not present in linux */ >> + >> +extern int optreset; /* not present in linux */ >> + >> +long long int strtonum(const char *nptr, long long minval, >> + long long maxval, const char **errstr); >> + >> + >> +struct ether_addr; >> +struct ether_addr * ether_aton(const char *a); >> + >> +#define ICMP6_MAXTYPE 201 >> +#define __u6_addr in6_u >> +#define in6_u __in6_u /* missing type for ipv6 (linux 2.6.28) */ >> + >> + >> +#define __u6_addr32 u6_addr32 >> +/* on freebsd sys/socket.h pf specific */ >> +#define NET_RT_IFLIST 3 /* survey interface list */ >> + >> +#define RTM_VERSION 5 /* Up the ante and ignore older versions >> 
*/ >> + >> +#endif // NEED_SYSCTLBYNAME >> + >> +#ifdef NEED_SIN_LEN >> +/* >> + * linux at least does not have sin_len and sin6_len, so we remap >> + * to some safe fields (check use of sin6_flowinfo XXX) >> + */ >> +#define sin_len sin_zero[0] >> +#define sin6_len sin6_flowinfo >> +#endif /* NEED_SIN_LEN */ >> + >> +#ifdef NEED_ROUNDUP2 /* in freensd is in sys/param.h */ >> +/* round up to the next power of 2 (y) */ >> +#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of >> two */ >> +#endif // NEED_ROUNDUP2 >> + >> +/* possibly redundant, does not harm */ >> +size_t strlcpy(char * dst, const char * src, size_t siz); >> + >> +/* >> + * Part 2: common userland and kernel definitions >> + */ >> + >> +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to >> destination */ >> +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively >> prohibited */ >> +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ >> +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ >> + >> +/* >> + * linux: sysctl are mapped into /sys/module/ipfw_mod parameters >> + * windows: they are emulated via get/setsockopt >> + */ >> +#define CTLFLAG_RD 1 >> +#define CTLFLAG_RDTUN 1 >> +#define CTLFLAG_RW 2 >> +#define CTLFLAG_SECURE3 0 /* unsupported */ >> +#define CTLFLAG_VNET 0 /* unsupported */ >> + >> +/* if needed, queue.h must be included here after list.h */ >> + >> +/* >> + * our own struct thread >> + */ >> +struct thread { /* ip_fw_sockopt */ >> + void *sopt_td; >> + void *td_ucred; >> +}; >> + >> +enum sopt_dir { SOPT_GET, SOPT_SET }; >> + >> +struct sockopt { >> + enum sopt_dir sopt_dir; /* is this a get or a set? 
*/ >> + int sopt_level; /* second arg of [gs]etsockopt */ >> + int sopt_name; /* third arg of [gs]etsockopt */ >> + void *sopt_val; /* fourth arg of [gs]etsockopt */ >> + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ >> + struct thread *sopt_td; /* calling thread or null if kernel */ >> +}; >> + >> + >> +/* >> + * List of values used for set/getsockopt options. >> + * The base value on FreeBSD is defined as a macro, >> + * if not available we will use our own enum. >> + * The TABLE_BASE value is used in the kernel. >> + */ >> +#define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ >> +#define IP_FW_TABLE_ADD (_IPFW_SOCKOPT_BASE + 0) >> +#define IP_FW_TABLE_DEL (_IPFW_SOCKOPT_BASE + 1) >> +#define IP_FW_TABLE_FLUSH (_IPFW_SOCKOPT_BASE + 2) >> +#define IP_FW_TABLE_GETSIZE (_IPFW_SOCKOPT_BASE + 3) >> +#define IP_FW_TABLE_LIST (_IPFW_SOCKOPT_BASE + 4) >> +#define IP_FW_DYN_GET (_IPFW_SOCKOPT_BASE + 5) >> + >> +#define IP_FW3 (_IPFW_SOCKOPT_BASE + 8) >> +#define IP_DUMMYNET3 (_IPFW_SOCKOPT_BASE + 9) >> + >> +#define IP_FW_ADD (_IPFW_SOCKOPT_BASE + 10) >> +#define IP_FW_DEL (_IPFW_SOCKOPT_BASE + 11) >> +#define IP_FW_FLUSH (_IPFW_SOCKOPT_BASE + 12) >> +#define IP_FW_ZERO (_IPFW_SOCKOPT_BASE + 13) >> +#define IP_FW_GET (_IPFW_SOCKOPT_BASE + 14) >> +#define IP_FW_RESETLOG (_IPFW_SOCKOPT_BASE + 15) >> + >> +#define IP_FW_NAT_CFG (_IPFW_SOCKOPT_BASE + 16) >> +#define IP_FW_NAT_DEL (_IPFW_SOCKOPT_BASE + 17) >> +#define IP_FW_NAT_GET_CONFIG (_IPFW_SOCKOPT_BASE + 18) >> +#define IP_FW_NAT_GET_LOG (_IPFW_SOCKOPT_BASE + 19) >> + >> +#define IP_DUMMYNET_CONFIGURE (_IPFW_SOCKOPT_BASE + 20) >> +#define IP_DUMMYNET_DEL (_IPFW_SOCKOPT_BASE + 21) >> +#define IP_DUMMYNET_FLUSH (_IPFW_SOCKOPT_BASE + 22) >> + /* 63 is missing */ >> +#define IP_DUMMYNET_GET (_IPFW_SOCKOPT_BASE + 24) >> +#define _IPFW_SOCKOPT_END (_IPFW_SOCKOPT_BASE + 25) >> + >> +/* >> + * Part 3: userland stuff for linux/windows >> + */ >> + >> + >> +/* >> + * now remap functions for userland or linux kernel 
etc. >> + */ >> +#ifdef USERSPACE >> +/* >> + * definitions used when the programs communicate through userspace. >> + * We need to define the socket and addresses used to talk, and >> + * the userland side must also remap socket() and [gs]etsockopt() >> + * to appropriate wrappers. >> + */ >> + >> +#define LOCALADDR "127.0.0.1" >> +#define IPFW_PORT 5555 >> + >> +#ifndef KERNEL_SIDE >> +#ifdef _KERNEL >> +#error _KERNEL defined in user space >> +#endif >> +int do_connect(const char *addr, int port); >> +#include <sys/socket.h> /* for socklen_t */ >> + >> +#define socket(a, b, c) do_connect(LOCALADDR, IPFW_PORT) >> +#define setsockopt setsockopt2 >> +#define getsockopt getsockopt2 >> +int getsockopt2(int s, int lev, int optname, void *optval, socklen_t >> *optlen); >> +int setsockopt2(int s, int lev, int optname, void *optval, socklen_t >> optlen); >> +#endif /* KERNEL_SIDE */ >> + >> +#endif /* USERSPACE */ >> + >> +/* >> + * Part 5: windows specific stuff and sysctl emulation >> + */ >> + >> +/******************* >> +* SYSCTL emulation * >> +********************/ >> +#ifdef EMULATE_SYSCTL >> + >> +/* this needs to be here, as it is part of the user-kernel messages */ >> +/* flag is set with the last 2 bits for access, as defined in glue.h >> + * and the rest for type >> + */ >> +enum { >> + SYSCTLTYPE_INT = 0, >> + SYSCTLTYPE_UINT = 1, >> + SYSCTLTYPE_SHORT = 2, >> + SYSCTLTYPE_USHORT = 3, >> + SYSCTLTYPE_LONG = 4, >> + SYSCTLTYPE_ULONG = 5, >> + SYSCTLTYPE_STRING = 6, >> + >> + /* the following are SYSCTL_PROC equivalents of the above, >> + * where the SYSCTLTYPE is shifted 2 bits, >> + * and SYSCTLTYPE_PROC is set >> + */ >> + SYSCTLTYPE_PROC = 0x100, >> + CTLTYPE_INT = (0x100 | (0<<2)), >> + CTLTYPE_UINT = (0x100 | (1<<2)), >> + CTLTYPE_LONG = (0x100 | (4<<2)), >> + CTLTYPE_ULONG = (0x100 | (5<<2)) >> +}; >> + >> +struct sysctlhead { >> + uint32_t blocklen; //total size of the entry >> + uint32_t namelen; //strlen(name) + '\0' >> + uint32_t flags; //type and 
access >> + uint32_t datalen; >> +}; >> + >> + >> +#endif /* EMULATE_SYSCTL */ >> +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void >> *newp, >> + size_t newlen); >> + >> +#ifndef __FreeBSD__ >> +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) >> +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) >> +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) >> + >> +static inline int fls(int _n) >> +{ >> + unsigned int n = _n; >> + int i = 0; >> + for (i = 0; n > 0; n >>= 1, i++) >> + ; >> + return i; >> +} >> + >> +static inline unsigned long __fls(unsigned long word) >> +{ >> + return fls(word) - 1; >> +} >> + >> + >> +#endif /* !FreeBSD */ >> + >> +#ifdef KERNEL_SIDE >> +/* sys/counter.h , to be moved to a file */ >> +typedef uint64_t *counter_u64_t; // XXX kernel >> +static inline void counter_u64_add(counter_u64_t c, int64_t v) >> +{ >> + *c += v; >> +} >> +static inline void counter_u64_zero(counter_u64_t c) >> +{ >> + *c = 0; >> +} >> +static inline uint64_t counter_u64_fetch(counter_u64_t c) >> +{ >> + return *c; >> +} >> + >> +struct rm_priotracker { >> +}; >> + >> +#define vslock(_a, _b) (0) >> +#define vsunlock(_a, _b) >> + >> +typedef uint64_t u_register_t; // XXX not on osx ? >> + >> +typedef uintptr_t eventhandler_tag; >> +#define EVENTHANDLER_REGISTER(_a, _b, ...) (uintptr_t)_b; >> +#define EVENTHANDLER_DEREGISTER(_a, _b, ...) (void)_b; >> + >> +// XXX this needs to be completed >> +#define if_name(_ifp) (_ifp->if_xname) >> +#define ifunit_ref(_n) NULL // XXX >> +#define if_rele(_n) >> + >> +#define rtalloc1_fib(_a, ...) 
NULL >> +#define rt_key(_a) NULL >> +#define rt_mask(_a) NULL >> +#define RTFREE_LOCKED(_a) ((void)NULL) >> +struct rtentry { >> +}; >> +#define rt_tables_get_rnh(_a, _b) NULL >> + >> +#endif /* KERNEL_SIDE */ >> + >> +#ifdef _KERNEL >> +/* XXX kernel support */ >> +/* on freebsd net/if.h XXX used */ >> +#ifdef linux >> +#define div64(a,b) (((int64_t)a)/((int64_t)b)) >> +#define LINUX_VERSION_CODE 30003 >> +#define KERNEL_VERSION(a,b,c) (a*10000+b*100 + c) >> +#define __printflike(a,b) >> +#endif /* linux */ >> + >> +#endif /* _KERNEL */ >> + >> +#ifndef __FreeBSD__ >> +#ifndef IFNAMSIZ >> +#define IFNAMSIZ 16 >> +#endif >> +#include "missing.h" >> + >> +struct if_data { >> + /* ... */ >> + u_long ifi_mtu; /* maximum transmission unit */ >> +}; >> + >> +#endif >> + >> +#ifdef __APPLE__ >> +#include <sys/socketvar.h> // need in kernel >> + >> +/* needed both in kernel and userspace */ >> +struct if_data64 { // XXX Darwin version >> + /* ... */ >> + u_long ifi_mtu; /* maximum transmission unit */ >> +}; >> + >> +struct net_event_data { >> +}; >> + >> +struct in_addr; >> +#endif /* __APPLE__ */ >> + >> +#define __PAST_END(v, idx) v[idx] >> + >> +/* >> + * a fast copy routine >> + */ >> +#include <strings.h> >> +// XXX only for multiples of 64 bytes, non overlapped. 
>> +static inline void >> +_pkt_copy(const void *_src, void *_dst, int l) >> +{ >> + const uint64_t *src = _src; >> + uint64_t *dst = _dst; >> +#define likely(x) __builtin_expect(!!(x), 1) >> +#define unlikely(x) __builtin_expect(!!(x), 0) >> + if (unlikely(l >= 1024)) { >> + bcopy(src, dst, l); >> + return; >> + } >> + for (; l > 0; l-=64) { >> + *dst++ = *src++; >> + *dst++ = *src++; >> + *dst++ = *src++; >> + *dst++ = *src++; >> + *dst++ = *src++; >> + *dst++ = *src++; >> + *dst++ = *src++; >> + *dst++ = *src++; >> + } >> +} >> + >> +#endif /* !_GLUE_H */ >> diff --git a/example/ipfw/extra/humanize_number.c >> b/example/ipfw/extra/humanize_number.c >> new file mode 100644 >> index 0000000..0b7382f >> --- /dev/null >> +++ b/example/ipfw/extra/humanize_number.c >> @@ -0,0 +1,167 @@ >> +/* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp >> $ */ >> + >> +/* >> + * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. >> + * All rights reserved. >> + * >> + * This code is derived from software contributed to The NetBSD >> Foundation >> + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, >> + * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * 1. Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * 2. Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in the >> + * documentation and/or other materials provided with the >> distribution. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND >> CONTRIBUTORS >> + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> LIMITED >> + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A >> PARTICULAR >> + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR >> CONTRIBUTORS >> + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR >> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF >> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR >> BUSINESS >> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER >> IN >> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR >> OTHERWISE) >> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED >> OF THE >> + * POSSIBILITY OF SUCH DAMAGE. >> + */ >> + >> +#include <sys/cdefs.h> >> +__FBSDID("$FreeBSD: head/lib/libutil/humanize_number.c 220582 2011-04-12 >> 22:48:03Z delphij $"); >> + >> +#include <sys/types.h> >> +#include <assert.h> >> +#include <inttypes.h> >> +#include <stdio.h> >> +#include <stdlib.h> >> +#include <string.h> >> +#include <locale.h> >> +//#include <libutil.h> >> + >> +static const int maxscale = 7; >> + >> +int >> +humanize_number(char *buf, size_t len, int64_t quotient, >> + const char *suffix, int scale, int flags) >> +{ >> + const char *prefixes, *sep; >> + int i, r, remainder, s1, s2, sign; >> + int64_t divisor, max; >> + size_t baselen; >> + >> + assert(buf != NULL); >> + assert(suffix != NULL); >> + assert(scale >= 0); >> + assert(scale < maxscale || (((scale & (HN_AUTOSCALE | >> HN_GETSCALE)) != 0))); >> + assert(!((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES))); >> + >> + remainder = 0; >> + >> + if (flags & HN_IEC_PREFIXES) { >> + baselen = 2; >> + /* >> + * Use the prefixes for power of two recommended by >> + * the International Electrotechnical Commission >> + * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...). 
>> + * >> + * HN_IEC_PREFIXES implies a divisor of 1024 here >> + * (use of HN_DIVISOR_1000 would have triggered >> + * an assertion earlier). >> + */ >> + divisor = 1024; >> + if (flags & HN_B) >> + prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; >> + else >> + prefixes = "\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; >> + } else { >> + baselen = 1; >> + if (flags & HN_DIVISOR_1000) >> + divisor = 1000; >> + else >> + divisor = 1024; >> + >> + if (flags & HN_B) >> + prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; >> + else >> + prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; >> + } >> + >> +#define SCALE2PREFIX(scale) (&prefixes[(scale) * 3]) >> + >> + if (scale < 0 || (scale >= maxscale && >> + (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0)) >> + return (-1); >> + >> + if (buf == NULL || suffix == NULL) >> + return (-1); >> + >> + if (len > 0) >> + buf[0] = '\0'; >> + if (quotient < 0) { >> + sign = -1; >> + quotient = -quotient; >> + baselen += 2; /* sign, digit */ >> + } else { >> + sign = 1; >> + baselen += 1; /* digit */ >> + } >> + if (flags & HN_NOSPACE) >> + sep = ""; >> + else { >> + sep = " "; >> + baselen++; >> + } >> + baselen += strlen(suffix); >> + >> + /* Check if enough room for `x y' + suffix + `\0' */ >> + if (len < baselen + 1) >> + return (-1); >> + >> + if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { >> + /* See if there is additional columns can be used. */ >> + for (max = 1, i = len - baselen; i-- > 0;) >> + max *= 10; >> + >> + /* >> + * Divide the number until it fits the given column. >> + * If there will be an overflow by the rounding below, >> + * divide once more. 
>> + */ >> + for (i = 0; >> + (quotient >= max || (quotient == max - 1 && remainder >> >= 950)) && >> + i < maxscale; i++) { >> + remainder = quotient % divisor; >> + quotient /= divisor; >> + } >> + >> + if (scale & HN_GETSCALE) >> + return (i); >> + } else { >> + for (i = 0; i < scale && i < maxscale; i++) { >> + remainder = quotient % divisor; >> + quotient /= divisor; >> + } >> + } >> + >> + /* If a value <= 9.9 after rounding and ... */ >> + if (quotient <= 9 && remainder < 950 && i > 0 && flags & >> HN_DECIMAL) { >> + /* baselen + \0 + .N */ >> + if (len < baselen + 1 + 2) >> + return (-1); >> + s1 = (int)quotient + ((remainder + 50) / 1000); >> + s2 = ((remainder + 50) / 100) % 10; >> + r = snprintf(buf, len, "%d%s%d%s%s%s", >> + sign * s1, localeconv()->decimal_point, s2, >> + sep, SCALE2PREFIX(i), suffix); >> + } else >> + r = snprintf(buf, len, "%" PRId64 "%s%s%s", >> + sign * (quotient + (remainder + 50) / 1000), >> + sep, SCALE2PREFIX(i), suffix); >> + >> + return (r); >> +} >> + >> diff --git a/example/ipfw/extra/ipfw2_mod.c >> b/example/ipfw/extra/ipfw2_mod.c >> new file mode 100644 >> index 0000000..974c6af >> --- /dev/null >> +++ b/example/ipfw/extra/ipfw2_mod.c >> @@ -0,0 +1,278 @@ >> +/* >> + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * 1. Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * 2. Redistributions in binary form must reproduce the above copyright >> + * notice, this list of conditions and the following disclaimer in the >> + * documentation and/or other materials provided with the >> distribution. 
>> + * >> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND >> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE >> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR >> PURPOSE >> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE >> LIABLE >> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR >> CONSEQUENTIAL >> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE >> GOODS >> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) >> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, >> STRICT >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY >> WAY >> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF >> + * SUCH DAMAGE. >> + */ >> + >> +/* >> + * $Id: ipfw2_mod.c 7787 2010-11-19 21:15:50Z marta $ >> + * >> + * The main interface to build ipfw+dummynet as a linux module. >> + * (and possibly as a windows module as well, though that part >> + * is not complete yet). >> + * >> + * The control interface uses the sockopt mechanism >> + * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). >> + * >> + * The data interface uses the netfilter interface, at the moment >> + * hooked to the PRE_ROUTING and POST_ROUTING hooks. >> + * Unfortunately the netfilter interface is a moving target, >> + * so we need a set of macros to adapt to the various cases. >> + * >> + * In the netfilter hook we just mark packet as 'QUEUE' and then >> + * let the queue handler to do the whole work (filtering and >> + * possibly emulation). >> + * As we receive packets, we wrap them with an mbuf descriptor >> + * so the existing ipfw+dummynet code runs unmodified. >> + */ >> + >> +#include <sys/cdefs.h> > >
diff --git a/example/ipfw/BSDmakefile b/example/ipfw/BSDmakefile new file mode 100644 index 0000000..810ae2b --- /dev/null +++ b/example/ipfw/BSDmakefile @@ -0,0 +1,8 @@ +# forward to use gmake +.PHONY: ipfw kipfw + +all: + gmake + +$(.TARGETS) : + gmake MAKE=gmake $(.TARGETS) diff --git a/example/ipfw/README b/example/ipfw/README new file mode 100644 index 0000000..2a55ba0 --- /dev/null +++ b/example/ipfw/README @@ -0,0 +1,76 @@ +# README FILE FOR IPFW-USER ON TOP OF NETMAP + +This directory contains a version of ipfw and dummynet that can +run in userland, using NETMAP as the backend for packet I/O. +This permits a throughput about 10 times higher than the +corresponding in-kernel version. I have measured about 6.5 Mpps +for plain filtering, and 2.2 Mpps going through a pipe. +Some optimizations are possible when running on netmap pipes, +or other netmap ports that support zero copy. + +To build the code simply run + make NETMAP_INC=/some/where/with/netmap-release/sys + +pointing to the netmap 'sys' directory +(the makefile uses gmake underneath) + +The base version comes from FreeBSD-HEAD -r '{2012-08-03}' +(and subsequently updated in late 2013) +with small modifications listed below + + netinet/ipfw + ip_dn_io.c + support for on-stack mbufs + ip_fw2.c + some conditional compilation for functions not + available in userspace + ip_fw_log.c + revise snprintf, SNPARGS (MAC) + + +sbin/ipfw and the kernel counterpart communicate through a +TCP socket (localhost:5555) carrying the raw data that would +normally be carried on set/getsockopt. + +For testing purposes, opening a telnet session to port 5556 and +typing some bytes will start a fake 'infinite source' so you can +check how fast your ruleset works.
+ + gmake + dummynet/ipfw & # preferably in another window + telnet localhost 5556 # type some bytes to start 'traffic' + + sh -c "while true; do ipfw/ipfw show; ipfw/ipfw zero; sleep 1; done" + +(on an i7-3400 I get about 15 Mpps) + +Real packet I/O is possible using netmap info.iet.unipi.it/~luigi/netmap/ +You can use a couple of VALE switches (part of netmap) to connect +a source and sink to the userspace firewall, as follows + + s f f d + [pkt-gen]-->--[valeA]-->--[kipfw]-->--[valeB]-->--[pkt-gen] + +The commands to run (in separate windows) are + + # preliminarily, load the netmap module + sudo kldload netmap.ko + + # connect the firewall to two vale switches + ./kipfw valeA:f valeB:f & + + # configure ipfw/dummynet + ipfw/ipfw show # or other + + # start the sink + pkt-gen -i valeB:d -f rx + + # start an infinite source + pkt-gen -i valeA:s -f tx + + # plain again with the firewall and enjoy + ipfw/ipfw show # or other + +On my i7-3400 I get about 6.5 Mpps with a single rule, and about 2.2 Mpps +when going through a dummynet pipe. This is for a single process handling +the traffic.
diff --git a/example/ipfw/UPDATE b/example/ipfw/UPDATE new file mode 100644 index 0000000..3da344f --- /dev/null +++ b/example/ipfw/UPDATE @@ -0,0 +1,44 @@ +--- 20141017 --- updating to FreeBSD head 273155 + +sys/net/pfil.h V $FreeBSD$ +sys/net/radix.h V $FreeBSD$ +sys/net/radix.c V merge, caddr_t -> u_char * + +sys/netgraph/ng_ipfw.h -- unchanged + +sys/netinet/in_cksum.c -- unchanged +sys/netinet/ip_dummynet.h V add DN_IS_ECN +sys/netinet/ip_fw.h massive changes +sys/netinet/tcp.h V $FreeBSD$ +sys/netinet/udp.h V $FreeBSD$ + +sys/netpfil/ipfw +dn_heap.c -- unchanged +dn_heap.h -- unchanged +dn_sched.h $FreeBSD$ +dn_sched_fifo.c -- unchanged +dn_sched_prio.c -- unchanged +dn_sched_qfq.c -- unchanged +dn_sched_rr.c -- unchanged +dn_sched_wf2q.c -- unchanged +ip_dn_glue.c V $FreeBSD$ +ip_dn_io.c V ecn, check missing ifp +ip_dn_private.h V $FreeBSD$ +ip_dummynet.c V $FreeBSD$, callout_reset_sbt, fs fixes, module +ip_fw2.c XXX large +ip_fw_dynamic.c XXX large +ip_fw_log.c XXX IP_FW_ARG.. TARG +ip_fw_pfil.c XXX small change +ip_fw_private.h XXX large +ip_fw_sockopt.c XXX huge +ip_fw_table.c XXX huge + +Userspace: +altq.c $FreeBSD$, bprintf +dummynet.c $FreeBSD$, ecn, bprintf +ipfw2.c $FreeBSD$, commands +ipfw2.h as above +ipv6.c as above +main.c small changes +nat.c internal + diff --git a/example/ipfw/extra/expand_number.c b/example/ipfw/extra/expand_number.c new file mode 100644 index 0000000..523fbb0 --- /dev/null +++ b/example/ipfw/extra/expand_number.c @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org> + * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 211343 2010-08-15 18:32:06Z des $"); + +#include <sys/types.h> +#include <ctype.h> +#include <errno.h> +#include <inttypes.h> +//#include <libutil.h> +#include <stdint.h> + +/* + * Convert an expression of the following forms to a uint64_t. + * 1) A positive decimal number. + * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). + * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10). + * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20). + * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30). + * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40). + * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50). + * 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60). 
+ */ +int +expand_number(const char *buf, uint64_t *num) +{ + uint64_t number; + unsigned shift; + char *endptr; + + number = strtoumax(buf, &endptr, 0); + + if (endptr == buf) { + /* No valid digits. */ + errno = EINVAL; + return (-1); + } + + switch (tolower((unsigned char)*endptr)) { + case 'e': + shift = 60; + break; + case 'p': + shift = 50; + break; + case 't': + shift = 40; + break; + case 'g': + shift = 30; + break; + case 'm': + shift = 20; + break; + case 'k': + shift = 10; + break; + case 'b': + case '\0': /* No unit. */ + *num = number; + return (0); + default: + /* Unrecognized unit. */ + errno = EINVAL; + return (-1); + } + + if ((number << shift) >> shift != number) { + /* Overflow */ + errno = ERANGE; + return (-1); + } + + *num = number << shift; + return (0); +} diff --git a/example/ipfw/extra/glue.c b/example/ipfw/extra/glue.c new file mode 100644 index 0000000..0786453 --- /dev/null +++ b/example/ipfw/extra/glue.c @@ -0,0 +1,555 @@ +/* + * Userland functions missing in linux + * taken from /usr/src/lib/libc/stdtime/time32.c + */ + +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> /* sockaddr_in */ +#include <netinet/tcp.h> /* TCP_NODELAY */ +#include <sys/uio.h> +#include <unistd.h> /* uint* types */ +#include <errno.h> +#include <string.h> /* bzero */ +#include <arpa/inet.h> /* htonl */ + +#ifndef HAVE_NAT +/* dummy nat functions */ +void +ipfw_show_nat(int ac, char **av) +{ + D("unsupported"); +} + +void +ipfw_config_nat(int ac, char **av) +{ + D("unsupported"); +} +#endif /* HAVE_NAT */ + +#ifdef NEED_STRTONUM +/* missing in linux and windows */ +long long int +strtonum(const char *nptr, long long minval, long long maxval, + const char **errstr) +{ + long long ret; + int errno_c = errno; /* save actual errno */ + + errno = 0; +#ifdef TCC + ret = strtol(nptr, (char **)errstr, 0); +#else + ret = strtoll(nptr, (char **)errstr, 0); +#endif + /* We accept only a string that represent 
exactly a number (ie. start + * and end with a digit). + * FreeBSD version wants errstr==NULL if no error occurs, otherwise + * errstr should point to an error string. + * For our purspose, we implement only the invalid error, ranges + * error aren't checked + */ + if (errno != 0 || nptr == *errstr || **errstr != '\0') + *errstr = "invalid"; + else { + *errstr = NULL; + errno = errno_c; + } + return ret; +} + +int +ishexnumber(int c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F') ); +} + +#endif /* NEED_STRTONUM */ + +#ifdef __linux__ + + +int optreset; /* missing in linux */ + +/* + * not implemented in linux. + * taken from /usr/src/lib/libc/string/strlcpy.c + */ +size_t +strlcpy(dst, src, siz) + char *dst; + const char *src; + size_t siz; +{ + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0 && --n != 0) { + do { + if ((*d++ = *s++) == 0) + break; + } while (--n != 0); + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return(s - src - 1); /* count does not include NUL */ +} + + +#endif /* __linux__ */ + + +#if defined (EMULATE_SYSCTL) +//XXX missing prerequisites +#include <net/if.h> //openwrt +#include <netinet/ip.h> //openwrt +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +int do_cmd(int optname, void *optval, uintptr_t optlen); +#endif /* EMULATE_SYSCTL */ + +/* + * set or get system information + * XXX lock acquisition/serialize calls + * + * we export this as sys/module/ipfw_mod/parameters/___ + * This function get or/and set the value of the sysctl passed by + * the name parameter. If the old value is not desired, + * oldp and oldlenp should be set to NULL. + * + * XXX + * I do not know how this works in FreeBSD in the case + * where there are no write permission on the sysctl var. 
+ * We read the value and set return variables in any way + * but returns -1 on write failures, regardless the + * read success. + * + * Since there is no information on types, in the following + * code we assume a length of 4 is a int. + * + * Returns 0 on success, -1 on errors. + */ +int +sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) +{ +#if defined (EMULATE_SYSCTL) + /* + * we embed the sysctl request in the usual sockopt mechanics. + * the sockopt buffer il filled with a dn_id with IP_DUMMYNET3 + * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET + * subcommands. + * the syntax of this function is fully compatible with + * POSIX sysctlby name: + * if newp and newlen are != 0 => this is a set + * else if oldp and oldlen are != 0 => this is a get + * to avoid too much overhead in the module, the whole + * sysctltable is returned, and the parsing is done in userland, + * a probe request is done to retrieve the size needed to + * transfer the table, before the real request + * if both old and new params = 0 => this is a print + * this is a special request, done only by main() + * to implement the extension './ipfw sysctl', + * a command that bypasses the normal getopt, and that + * is available on those platforms that use this + * sysctl emulation. 
+ * in this case, a negative oldlen signals that *oldp + * is actually a FILE* to print somewhere else than stdout + */ + + int l; + int ret; + struct dn_id* oid; + struct sysctlhead* entry; + char* pstring; + char* pdata; + FILE* fp; + + if((oldlenp != NULL) && ((int)*oldlenp < 0)) + fp = (FILE*)oldp; + else + fp = stdout; + if(newp != NULL && newlen != 0) + { + //this is a set + l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + strlen(name)+1 + newlen; + oid = malloc(l); + if (oid == NULL) + return -1; + oid->len = l; + oid->type = DN_SYSCTL_SET; + oid->id = DN_API_VERSION; + + entry = (struct sysctlhead*)(oid+1); + pdata = (char*)(entry+1); + pstring = pdata + newlen; + + entry->blocklen = ((sizeof(struct sysctlhead) + strlen(name)+1 + newlen) + 3) & ~3; + entry->namelen = strlen(name)+1; + entry->flags = 0; + entry->datalen = newlen; + + bcopy(newp, pdata, newlen); + bcopy(name, pstring, strlen(name)+1); + + ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l); + if (ret != 0) + return -1; + } + else + { + //this is a get or a print + l = sizeof(struct dn_id); + oid = malloc(l); + if (oid == NULL) + return -1; + oid->len = l; + oid->type = DN_SYSCTL_GET; + oid->id = DN_API_VERSION; + + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0) + return -1; + + l=oid->id; + free(oid); + oid = malloc(l); + if (oid == NULL) + return -1; + oid->len = l; + oid->type = DN_SYSCTL_GET; + oid->id = DN_API_VERSION; + + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0) + return -1; + + entry = (struct sysctlhead*)(oid+1); + while(entry->blocklen != 0) + { + pdata = (char*)(entry+1); + pstring = pdata+entry->datalen; + + //time to check if this is a get or a print + if(name != NULL && oldp != NULL && *oldlenp > 0) + { + //this is a get + if(strcmp(name,pstring) == 0) + { + //match found, sanity chech on len + if(*oldlenp < entry->datalen) + { + printf("%s error: buffer too small\n",__FUNCTION__); + return -1; + } + *oldlenp = entry->datalen; + 
bcopy(pdata, oldp, *oldlenp); + return 0; + } + } + else + { + //this is a print + if( name == NULL ) + goto print; + if ( (strncmp(pstring,name,strlen(name)) == 0) && ( pstring[strlen(name)]=='\0' || pstring[strlen(name)]=='.' ) ) + goto print; + else + goto skip; +print: + fprintf(fp, "%s: ",pstring); + switch( entry->flags >> 2 ) + { + case SYSCTLTYPE_LONG: + fprintf(fp, "%li ", *(long*)(pdata)); + break; + case SYSCTLTYPE_UINT: + fprintf(fp, "%u ", *(unsigned int*)(pdata)); + break; + case SYSCTLTYPE_ULONG: + fprintf(fp, "%lu ", *(unsigned long*)(pdata)); + break; + case SYSCTLTYPE_INT: + default: + fprintf(fp, "%i ", *(int*)(pdata)); + } + if( (entry->flags & 0x00000003) == CTLFLAG_RD ) + fprintf(fp, "\t(read only)\n"); + else + fprintf(fp, "\n"); +skip: ; + } + entry = (struct sysctlhead*)((unsigned char*)entry + entry->blocklen); + } + free(oid); + return 0; + } + //fallback for invalid options + return -1; + +#else /* __linux__ */ + FILE *fp; + char *basename = "/sys/module/ipfw_mod/parameters/"; + char filename[256]; /* full filename */ + char *varp; + int ret = 0; /* return value */ + int d; + + if (name == NULL) /* XXX set errno */ + return -1; + + /* locate the filename */ + varp = strrchr(name, '.'); + if (varp == NULL) /* XXX set errno */ + return -1; + + snprintf(filename, sizeof(filename), "%s%s", basename, varp+1); + + /* + * XXX we could open the file here, in rw mode + * but need to check if a file have write + * permissions. 
+ */ + + /* check parameters */ + if (oldp && oldlenp) { /* read mode */ + fp = fopen(filename, "r"); + if (fp == NULL) { + fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename); + return -1; + } + if (*oldlenp == 4) { + if (fscanf(fp, "%d", &d) == 1) + memcpy(oldp, &d, *oldlenp); + else + ret = -1; + } + fclose(fp); + } + + if (newp && newlen) { /* write */ + fp = fopen(filename, "w"); + if (fp == NULL) { + fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename); + return -1; + } + if (newlen == 4) { + if (fprintf(fp, "%d", *(int*)newp) < 1) + ret = -1; + } + + fclose(fp); + } + + return ret; +#endif /* __linux__ */ +} + +/* + * The following two functions implement getsockopt/setsockopt + * replacements to talk over a TCP socket. + * Because the calls are synchronous, we can run blocking code + * and do not need to play special tricks to be selectable. + * The wire protocol for the emulation is the following: + * REQUEST: n32 req_size, level, optname; u8 data[req_size] + * RESPONSE: n32 resp_size, ret_code; u8 data[resp_size] + * data is only present if ret_code == 0 + * + * Return 0 if the message wan sent to the remote + * endpoint, -1 on error. + * + * If the required lenght is greater then the + * available buffer size, -1 is returned and + * optlen is the required lenght. 
+ */ +enum sock_type {GET_SOCKOPT, SET_SOCKOPT}; + +struct wire_hdr { + uint32_t optlen; /* actual data len */ + uint32_t level; /* or error */ + uint32_t optname; /* or act len */ + uint32_t dir; /* in or out */ +}; + +/* do a complete write of the buffer */ +static int +writen(int fd, const char *buf, int len) +{ + int i; + + for (; len > 0; buf += i, len -= i) { + i = write(fd, buf, len); + ND("have %d wrote %d", len, i); + if (i < 0) { + if (errno == EAGAIN) + continue; + return -1; + } + } + return 0; +} + +/* do a complete read */ +static int +readn(int fd, char *buf, int len) +{ + int i, pos; + + for (pos = 0; pos < len; pos += i) { + i = read(fd, buf + pos, len - pos); + ND("have %d want %d got %d", pos, len, i); + if (i < 0) { + if (errno == EAGAIN) + continue; + return -1; + } + } + ND("full read got %d", pos); + return 0; +} + +int +__sockopt2(int s, int level, int optname, void *optval, socklen_t *optlen, + enum sopt_dir dir) +{ + struct wire_hdr r; + int len = optlen && optval ? *optlen : 0; + int new_errno; + + ND("dir %d optlen %d level %d optname %d", dir, len, level, optname); + /* send request to the server */ + r.optlen = htonl(len); + r.level = htonl(level); + r.optname = htonl(optname); + r.dir = htonl(dir); + + if (writen(s, (const char *) &r, sizeof(r))) + return -1; /* error writing */ + + /* send data, if present */ + if (len < 0) { + fprintf(stderr, "%s invalid args found\n", __FUNCTION__); + return -1; + } else if (len > 0) { + if (writen(s, optval, len)) + return -1; /* error writing */ + } + + /* read response size and error code */ + if (readn(s, (char *)&r, sizeof(r))) + return -1; /* error reading */ + len = ntohl(r.optlen); + ND("got header, datalen %d", len); + if (len > 0) { + if (readn(s, optval, len)) { + return -1; /* error reading */ + } + } + if (optlen) + *optlen = ntohl(r.optlen); /* actual len */ + new_errno = ntohl(r.level); + if (new_errno) + errno = new_errno; + return (new_errno ? 
-1 : 0); +} + +/* + * getsockopt() replacement. + */ +int +getsockopt2(int s, int level, int optname, void *optval, + socklen_t *optlen) +{ + return __sockopt2(s, level, optname, optval, optlen, SOPT_GET); +} + +/* + * setsockopt() replacement + */ +int +setsockopt2(int s, int level, int optname, void *optval, + socklen_t optlen) +{ + /* optlen not changed, use the local address */ + return __sockopt2(s, level, optname, optval, &optlen, SOPT_SET); +} + +#ifdef socket +#undef socket /* we want the real one */ +#endif +/* + * This function replaces the socket() call to connect to + * the ipfw control socket. + * We actually ignore the paramerers if IPFW_HOST and IPFW_PORT + * are defined. + */ +int +do_connect(const char *addr, int port) +{ + int conn_fd; + + /* open the socket */ +#ifdef NETLINK + +struct rtnl_handle rth; + + conn_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); +#else + struct sockaddr_in server; /* server address */ + const char *s; + + conn_fd = socket(AF_INET, SOCK_STREAM, 0); + if (conn_fd < 0) { + perror("socket"); + return -1; + } +#endif +#ifndef NETLINK + /* fill the sockaddr structure with server address */ + bzero(&server, sizeof(server)); + server.sin_family = AF_INET; + + /* override the host if set in the environment */ + s = getenv("IPFW_HOST"); + if (s) + addr = s; + inet_aton(addr, &server.sin_addr); + s = getenv("IPFW_PORT"); + if (s && atoi(s) > 0) + port = atoi(s); + server.sin_port = htons(port); + + /* connect to the server */ + if (connect(conn_fd, (struct sockaddr*) &server, sizeof(server)) < 0) { + perror("connect"); + return -1; + } +#ifdef setsockopt /* we want the real one here */ +#undef setsockopt +#undef getsockopt +#endif + { + int on = 1, ret; + ret = setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + ND("set TCP_NODELAY %d returns %d", on, ret); + } + if (0) + fprintf(stderr, "connected to %s:%d\n", + inet_ntoa(server.sin_addr), ntohs(server.sin_port)); +#endif + return conn_fd; +} diff --git 
a/example/ipfw/extra/glue.h b/example/ipfw/extra/glue.h new file mode 100644 index 0000000..97b25bf --- /dev/null +++ b/example/ipfw/extra/glue.h @@ -0,0 +1,488 @@ +/* + * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * $Id: glue.h 8327 2011-03-22 17:01:35Z marta $ + * + * glue code to adapt the FreeBSD version to linux and windows, + * userland and kernel. + * This is included before any other headers, so we do not have + * a chance to override any #define that should appear in other + * headers. + * First handle headers for userland and kernel. 
Then common code + * (including headers that require a specific order of inclusion), + * then the user- and kernel- specific parts. + */ + +#ifndef _GLUE_H +#define _GLUE_H + +/* + * common definitions to allow portability + */ +#ifndef __FBSDID +#define __FBSDID(x) struct __hack +#endif /* FBSDID */ + +#include <stdint.h> /* linux needs it in addition to sys/types.h */ +#include <sys/types.h> /* for size_t */ + +#define true 1 /* stdbool */ +#ifdef _KERNEL /* prevent a warning */ +#undef _KERNEL +#include <sys/ioctl.h> +#include <sys/time.h> +#include <errno.h> /* we want errno */ +#define _KERNEL +#else +#include <sys/ioctl.h> +#endif + +#include <time.h> +#ifndef USERSPACE +#include <netinet/ether.h> +#endif + + +/*----- */ + +/* ipfw2.c - from timeconv.h */ +static __inline time_t +_long_to_time(long tlong) +{ + if (sizeof(long) == sizeof(__int32_t)) + return((time_t)(__int32_t)(tlong)); + return((time_t)tlong); +} + +#define min(a, b) ((a) < (b) ? (a) : (b) ) // radix.c +/* + * debugging macros from ip_dn_private.h + */ +#include <sys/time.h> +#include <stdio.h> +extern char *strrchr(const char *, int); +static inline const char *xyz(const char *s) { + static char buf[128]; + struct timeval t; + const char *ret = strrchr(s, '/'); + if (ret) s = ret + 1; + gettimeofday(&t, NULL); + buf[sizeof(buf) - 1] = '\0'; + snprintf(buf, sizeof(buf), "[%4d.%06d] %s", + (int)(t.tv_sec % 1000), (int)(t.tv_usec), s); + return buf; +} + +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) fprintf(stderr, "%s:%-10s [%d] " fmt "\n", \ + xyz(__FILE__), __FUNCTION__, __LINE__, ## __VA_ARGS__) + +/* Rate limited version of "D", lps indicates how many per second */ +#define RD(lps, format, ...) 
\ + do { \ + static int t0, __cnt; \ + struct timeval __xxts; \ + gettimeofday(&__xxts, NULL); \ + if (t0 != __xxts.tv_sec) { \ + t0 = __xxts.tv_sec; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) { \ + D(format, ##__VA_ARGS__); \ + } \ + } while (0) + +#define DX(lev, fmt, ...) do { \ + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) +/* end debugging macros */ + + +/* + * sbin/ipfw on non-freebsd platform + */ +#ifdef NEED_STRTONUM +/* prototypes from libutil */ +/* humanize_number(3) */ +#define HN_DECIMAL 0x01 +#define HN_NOSPACE 0x02 +#define HN_B 0x04 +#define HN_DIVISOR_1000 0x08 +#define HN_IEC_PREFIXES 0x10 + +#define HN_GETSCALE 0x10 +#define HN_AUTOSCALE 0x20 + + +int humanize_number(char *_buf, size_t _len, int64_t _number, + const char *_suffix, int _scale, int _flags); +int expand_number(const char *buf, uint64_t *num); + + +long long +strtonum(const char *nptr, long long minval, long long maxval, + const char **errstr); +#ifndef __APPLE__ +int ishexnumber(int c); +#endif +#endif /* NEED_STRTONUM */ + +#ifdef NEED_SYSCTLBYNAME /* and other linux calls */ +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, + void *newp, size_t newlen); +#define setprogname(x) /* not present in linux */ + +extern int optreset; /* not present in linux */ + +long long int strtonum(const char *nptr, long long minval, + long long maxval, const char **errstr); + + +struct ether_addr; +struct ether_addr * ether_aton(const char *a); + +#define ICMP6_MAXTYPE 201 +#define __u6_addr in6_u +#define in6_u __in6_u /* missing type for ipv6 (linux 2.6.28) */ + + +#define __u6_addr32 u6_addr32 +/* on freebsd sys/socket.h pf specific */ +#define NET_RT_IFLIST 3 /* survey interface list */ + +#define RTM_VERSION 5 /* Up the ante and ignore older versions */ + +#endif // NEED_SYSCTLBYNAME + +#ifdef NEED_SIN_LEN +/* + * linux at least does not have sin_len and sin6_len, so we remap + * to some safe fields (check use of sin6_flowinfo XXX) + */ +#define sin_len 
sin_zero[0] +#define sin6_len sin6_flowinfo +#endif /* NEED_SIN_LEN */ + +#ifdef NEED_ROUNDUP2 /* in freensd is in sys/param.h */ +/* round up to the next power of 2 (y) */ +#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ +#endif // NEED_ROUNDUP2 + +/* possibly redundant, does not harm */ +size_t strlcpy(char * dst, const char * src, size_t siz); + +/* + * Part 2: common userland and kernel definitions + */ + +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ + +/* + * linux: sysctl are mapped into /sys/module/ipfw_mod parameters + * windows: they are emulated via get/setsockopt + */ +#define CTLFLAG_RD 1 +#define CTLFLAG_RDTUN 1 +#define CTLFLAG_RW 2 +#define CTLFLAG_SECURE3 0 /* unsupported */ +#define CTLFLAG_VNET 0 /* unsupported */ + +/* if needed, queue.h must be included here after list.h */ + +/* + * our own struct thread + */ +struct thread { /* ip_fw_sockopt */ + void *sopt_td; + void *td_ucred; +}; + +enum sopt_dir { SOPT_GET, SOPT_SET }; + +struct sockopt { + enum sopt_dir sopt_dir; /* is this a get or a set? */ + int sopt_level; /* second arg of [gs]etsockopt */ + int sopt_name; /* third arg of [gs]etsockopt */ + void *sopt_val; /* fourth arg of [gs]etsockopt */ + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ + struct thread *sopt_td; /* calling thread or null if kernel */ +}; + + +/* + * List of values used for set/getsockopt options. + * The base value on FreeBSD is defined as a macro, + * if not available we will use our own enum. + * The TABLE_BASE value is used in the kernel. 
+ */ +#define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ +#define IP_FW_TABLE_ADD (_IPFW_SOCKOPT_BASE + 0) +#define IP_FW_TABLE_DEL (_IPFW_SOCKOPT_BASE + 1) +#define IP_FW_TABLE_FLUSH (_IPFW_SOCKOPT_BASE + 2) +#define IP_FW_TABLE_GETSIZE (_IPFW_SOCKOPT_BASE + 3) +#define IP_FW_TABLE_LIST (_IPFW_SOCKOPT_BASE + 4) +#define IP_FW_DYN_GET (_IPFW_SOCKOPT_BASE + 5) + +#define IP_FW3 (_IPFW_SOCKOPT_BASE + 8) +#define IP_DUMMYNET3 (_IPFW_SOCKOPT_BASE + 9) + +#define IP_FW_ADD (_IPFW_SOCKOPT_BASE + 10) +#define IP_FW_DEL (_IPFW_SOCKOPT_BASE + 11) +#define IP_FW_FLUSH (_IPFW_SOCKOPT_BASE + 12) +#define IP_FW_ZERO (_IPFW_SOCKOPT_BASE + 13) +#define IP_FW_GET (_IPFW_SOCKOPT_BASE + 14) +#define IP_FW_RESETLOG (_IPFW_SOCKOPT_BASE + 15) + +#define IP_FW_NAT_CFG (_IPFW_SOCKOPT_BASE + 16) +#define IP_FW_NAT_DEL (_IPFW_SOCKOPT_BASE + 17) +#define IP_FW_NAT_GET_CONFIG (_IPFW_SOCKOPT_BASE + 18) +#define IP_FW_NAT_GET_LOG (_IPFW_SOCKOPT_BASE + 19) + +#define IP_DUMMYNET_CONFIGURE (_IPFW_SOCKOPT_BASE + 20) +#define IP_DUMMYNET_DEL (_IPFW_SOCKOPT_BASE + 21) +#define IP_DUMMYNET_FLUSH (_IPFW_SOCKOPT_BASE + 22) + /* 63 is missing */ +#define IP_DUMMYNET_GET (_IPFW_SOCKOPT_BASE + 24) +#define _IPFW_SOCKOPT_END (_IPFW_SOCKOPT_BASE + 25) + +/* + * Part 3: userland stuff for linux/windows + */ + + +/* + * now remap functions for userland or linux kernel etc. + */ +#ifdef USERSPACE +/* + * definitions used when the programs communicate through userspace. + * We need to define the socket and addresses used to talk, and + * the userland side must also remap socket() and [gs]etsockopt() + * to appropriate wrappers. 
+ */ + +#define LOCALADDR "127.0.0.1" +#define IPFW_PORT 5555 + +#ifndef KERNEL_SIDE +#ifdef _KERNEL +#error _KERNEL defined in user space +#endif +int do_connect(const char *addr, int port); +#include <sys/socket.h> /* for socklen_t */ + +#define socket(a, b, c) do_connect(LOCALADDR, IPFW_PORT) +#define setsockopt setsockopt2 +#define getsockopt getsockopt2 +int getsockopt2(int s, int lev, int optname, void *optval, socklen_t *optlen); +int setsockopt2(int s, int lev, int optname, void *optval, socklen_t optlen); +#endif /* KERNEL_SIDE */ + +#endif /* USERSPACE */ + +/* + * Part 5: windows specific stuff and sysctl emulation + */ + +/******************* +* SYSCTL emulation * +********************/ +#ifdef EMULATE_SYSCTL + +/* this needs to be here, as it is part of the user-kernel messages */ +/* flag is set with the last 2 bits for access, as defined in glue.h + * and the rest for type + */ +enum { + SYSCTLTYPE_INT = 0, + SYSCTLTYPE_UINT = 1, + SYSCTLTYPE_SHORT = 2, + SYSCTLTYPE_USHORT = 3, + SYSCTLTYPE_LONG = 4, + SYSCTLTYPE_ULONG = 5, + SYSCTLTYPE_STRING = 6, + + /* the following are SYSCTL_PROC equivalents of the above, + * where the SYSCTLTYPE is shifted 2 bits, + * and SYSCTLTYPE_PROC is set + */ + SYSCTLTYPE_PROC = 0x100, + CTLTYPE_INT = (0x100 | (0<<2)), + CTLTYPE_UINT = (0x100 | (1<<2)), + CTLTYPE_LONG = (0x100 | (4<<2)), + CTLTYPE_ULONG = (0x100 | (5<<2)) +}; + +struct sysctlhead { + uint32_t blocklen; //total size of the entry + uint32_t namelen; //strlen(name) + '\0' + uint32_t flags; //type and access + uint32_t datalen; +}; + + +#endif /* EMULATE_SYSCTL */ +int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen); + +#ifndef __FreeBSD__ +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) + +static inline int fls(int _n) +{ + unsigned int n = _n; + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; 
+} + +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} + + +#endif /* !FreeBSD */ + +#ifdef KERNEL_SIDE +/* sys/counter.h , to be moved to a file */ +typedef uint64_t *counter_u64_t; // XXX kernel +static inline void counter_u64_add(counter_u64_t c, int64_t v) +{ + *c += v; +} +static inline void counter_u64_zero(counter_u64_t c) +{ + *c = 0; +} +static inline uint64_t counter_u64_fetch(counter_u64_t c) +{ + return *c; +} + +struct rm_priotracker { +}; + +#define vslock(_a, _b) (0) +#define vsunlock(_a, _b) + +typedef uint64_t u_register_t; // XXX not on osx ? + +typedef uintptr_t eventhandler_tag; +#define EVENTHANDLER_REGISTER(_a, _b, ...) (uintptr_t)_b; +#define EVENTHANDLER_DEREGISTER(_a, _b, ...) (void)_b; + +// XXX this needs to be completed +#define if_name(_ifp) (_ifp->if_xname) +#define ifunit_ref(_n) NULL // XXX +#define if_rele(_n) + +#define rtalloc1_fib(_a, ...) NULL +#define rt_key(_a) NULL +#define rt_mask(_a) NULL +#define RTFREE_LOCKED(_a) ((void)NULL) +struct rtentry { +}; +#define rt_tables_get_rnh(_a, _b) NULL + +#endif /* KERNEL_SIDE */ + +#ifdef _KERNEL +/* XXX kernel support */ +/* on freebsd net/if.h XXX used */ +#ifdef linux +#define div64(a,b) (((int64_t)a)/((int64_t)b)) +#define LINUX_VERSION_CODE 30003 +#define KERNEL_VERSION(a,b,c) (a*10000+b*100 + c) +#define __printflike(a,b) +#endif /* linux */ + +#endif /* _KERNEL */ + +#ifndef __FreeBSD__ +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif +#include "missing.h" + +struct if_data { + /* ... */ + u_long ifi_mtu; /* maximum transmission unit */ +}; + +#endif + +#ifdef __APPLE__ +#include <sys/socketvar.h> // need in kernel + +/* needed both in kernel and userspace */ +struct if_data64 { // XXX Darwin version + /* ... 
*/ + u_long ifi_mtu; /* maximum transmission unit */ +}; + +struct net_event_data { +}; + +struct in_addr; +#endif /* __APPLE__ */ + +#define __PAST_END(v, idx) v[idx] + +/* + * a fast copy routine + */ +#include <strings.h> +// XXX only for multiples of 64 bytes, non overlapped. +static inline void +_pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = _src; + uint64_t *dst = _dst; +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + if (unlikely(l >= 1024)) { + bcopy(src, dst, l); + return; + } + for (; l > 0; l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + +#endif /* !_GLUE_H */ diff --git a/example/ipfw/extra/humanize_number.c b/example/ipfw/extra/humanize_number.c new file mode 100644 index 0000000..0b7382f --- /dev/null +++ b/example/ipfw/extra/humanize_number.c @@ -0,0 +1,167 @@ +/* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $ */ + +/* + * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, + * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/lib/libutil/humanize_number.c 220582 2011-04-12 22:48:03Z delphij $"); + +#include <sys/types.h> +#include <assert.h> +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <locale.h> +//#include <libutil.h> + +static const int maxscale = 7; + +int +humanize_number(char *buf, size_t len, int64_t quotient, + const char *suffix, int scale, int flags) +{ + const char *prefixes, *sep; + int i, r, remainder, s1, s2, sign; + int64_t divisor, max; + size_t baselen; + + assert(buf != NULL); + assert(suffix != NULL); + assert(scale >= 0); + assert(scale < maxscale || (((scale & (HN_AUTOSCALE | HN_GETSCALE)) != 0))); + assert(!((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES))); + + remainder = 0; + + if (flags & HN_IEC_PREFIXES) { + baselen = 2; + /* + * Use the prefixes for power of two recommended by + * the International Electrotechnical Commission + * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...). + * + * HN_IEC_PREFIXES implies a divisor of 1024 here + * (use of HN_DIVISOR_1000 would have triggered + * an assertion earlier). 
+ */ + divisor = 1024; + if (flags & HN_B) + prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; + else + prefixes = "\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; + } else { + baselen = 1; + if (flags & HN_DIVISOR_1000) + divisor = 1000; + else + divisor = 1024; + + if (flags & HN_B) + prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; + else + prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; + } + +#define SCALE2PREFIX(scale) (&prefixes[(scale) * 3]) + + if (scale < 0 || (scale >= maxscale && + (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0)) + return (-1); + + if (buf == NULL || suffix == NULL) + return (-1); + + if (len > 0) + buf[0] = '\0'; + if (quotient < 0) { + sign = -1; + quotient = -quotient; + baselen += 2; /* sign, digit */ + } else { + sign = 1; + baselen += 1; /* digit */ + } + if (flags & HN_NOSPACE) + sep = ""; + else { + sep = " "; + baselen++; + } + baselen += strlen(suffix); + + /* Check if enough room for `x y' + suffix + `\0' */ + if (len < baselen + 1) + return (-1); + + if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { + /* See if there is additional columns can be used. */ + for (max = 1, i = len - baselen; i-- > 0;) + max *= 10; + + /* + * Divide the number until it fits the given column. + * If there will be an overflow by the rounding below, + * divide once more. + */ + for (i = 0; + (quotient >= max || (quotient == max - 1 && remainder >= 950)) && + i < maxscale; i++) { + remainder = quotient % divisor; + quotient /= divisor; + } + + if (scale & HN_GETSCALE) + return (i); + } else { + for (i = 0; i < scale && i < maxscale; i++) { + remainder = quotient % divisor; + quotient /= divisor; + } + } + + /* If a value <= 9.9 after rounding and ... 
*/ + if (quotient <= 9 && remainder < 950 && i > 0 && flags & HN_DECIMAL) { + /* baselen + \0 + .N */ + if (len < baselen + 1 + 2) + return (-1); + s1 = (int)quotient + ((remainder + 50) / 1000); + s2 = ((remainder + 50) / 100) % 10; + r = snprintf(buf, len, "%d%s%d%s%s%s", + sign * s1, localeconv()->decimal_point, s2, + sep, SCALE2PREFIX(i), suffix); + } else + r = snprintf(buf, len, "%" PRId64 "%s%s%s", + sign * (quotient + (remainder + 50) / 1000), + sep, SCALE2PREFIX(i), suffix); + + return (r); +} + diff --git a/example/ipfw/extra/ipfw2_mod.c b/example/ipfw/extra/ipfw2_mod.c new file mode 100644 index 0000000..974c6af --- /dev/null +++ b/example/ipfw/extra/ipfw2_mod.c @@ -0,0 +1,278 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: ipfw2_mod.c 7787 2010-11-19 21:15:50Z marta $ + * + * The main interface to build ipfw+dummynet as a linux module. + * (and possibly as a windows module as well, though that part + * is not complete yet). + * + * The control interface uses the sockopt mechanism + * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). + * + * The data interface uses the netfilter interface, at the moment + * hooked to the PRE_ROUTING and POST_ROUTING hooks. + * Unfortunately the netfilter interface is a moving target, + * so we need a set of macros to adapt to the various cases. + * + * In the netfilter hook we just mark packet as 'QUEUE' and then + * let the queue handler to do the whole work (filtering and + * possibly emulation). + * As we receive packets, we wrap them with an mbuf descriptor + * so the existing ipfw+dummynet code runs unmodified. + */ + +#include <sys/cdefs.h> +#include <sys/mbuf.h> /* sizeof struct mbuf */ +#include <sys/param.h> /* NGROUPS */ +#include <netinet/in.h> /* in_addr */ +#include <netinet/ip_fw.h> /* ip_fw_ctl_t, ip_fw_chk_t */ +#include <netinet/ip_dummynet.h> /* ip_dn_ctl_t, ip_dn_io_t */ +#include <net/pfil.h> /* PFIL_IN, PFIL_OUT */ +#include <net/route.h> /* inet_iif */ + +#include <netpfil/ipfw/ip_fw_private.h> /* ip_fw_ctl_t, ip_fw_chk_t */ + +/* + * Here we allocate some global variables used in the firewall. 
+ */ +//ip_dn_ctl_t *ip_dn_ctl_ptr; +int (*ip_dn_ctl_ptr)(struct sockopt *); + +ip_fw_ctl_t *ip_fw_ctl_ptr; + +int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); + +void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +/* Divert hooks. */ +void (*ip_divert_ptr)(struct mbuf *m, int incoming); + +/* ng_ipfw hooks. */ +ng_ipfw_input_t *ng_ipfw_input_p = NULL; + + +/*--- + * Control hooks: + * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention. + * then call the ipfw handler in order to manage requests. + * In turn this is called by the linux set/get handlers. + */ +static int +ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user) +{ + struct thread t; + int ret = EINVAL; + + memset(s, 0, sizeof(*s)); + s->sopt_name = cmd; + s->sopt_dir = dir; + s->sopt_valsize = len; + s->sopt_val = user; + + /* sopt_td is not used but it is referenced */ + memset(&t, 0, sizeof(t)); + s->sopt_td = &t; + + if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 || + cmd < IP_DUMMYNET_CONFIGURE)) + ret = ip_fw_ctl_ptr(s); + else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 || + cmd >= IP_DUMMYNET_CONFIGURE)) + ret = ip_dn_ctl_ptr(s); + + return -ret; /* errors are < 0 on linux */ +} + + + +/* + * setsockopt hook has no return value other than the error code. + */ +int +do_ipfw_set_ctl(void *sk, int cmd, + void __user *user, unsigned int len) +{ + struct sockopt s; /* pass arguments */ + return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user); +} + +/* + * getsockopt can can return a block of data in response. + */ +int +do_ipfw_get_ctl(void *sk, + int cmd, void __user *user, int *len) +{ + struct sockopt s; /* pass arguments */ + int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user); + + *len = s.sopt_valsize; /* return lenght back to the caller */ + return ret; +} + + +/* + * Module glue - init and exit function. 
+ */ +#include <sys/module.h> +/* descriptors for the children, until i find a way for the + * linker to produce them + */ +extern moduledata_t *moddesc_ipfw; +extern moduledata_t *moddesc_dummynet; +extern moduledata_t *moddesc_dn_fifo; +extern moduledata_t *moddesc_dn_wf2qp; +extern moduledata_t *moddesc_dn_rr; +extern moduledata_t *moddesc_dn_qfq; +extern moduledata_t *moddesc_dn_prio; +extern int (*sysinit_ipfw_init)(void *); +extern int (*sysuninit_ipfw_destroy)(void *); +extern int (*sysinit_vnet_ipfw_init)(void *); +extern int (*sysuninit_vnet_ipfw_uninit)(void *); + +/*--- + * Glue code to implement the registration of children with the parent. + * Each child should call my_mod_register() when linking, so that + * module_init() and module_exit() can call init_children() and + * fini_children() to provide the necessary initialization. + * We use the same mechanism for MODULE_ and SYSINIT_. + * The former only get a pointer to the moduledata, + * the latter have two function pointers (init/uninit) + */ +#include <sys/module.h> +struct mod_args { + const char *name; + int order; + struct moduledata *mod; + int (*init)(void *); + int (*uninit)(void *); +}; + +static unsigned int mod_idx; +static struct mod_args mods[10]; /* hard limit to 10 modules */ + +int +my_mod_register(const char *name, int order, + struct moduledata *mod, int (*init)(void *), int (*uninit)(void *)); +/* + * my_mod_register should be called automatically as the init + * functions in the submodules. Unfortunately this compiler/linker + * trick is not supported yet so we call it manually. 
+ */ +int +my_mod_register(const char *name, int order, + struct moduledata *mod, int (*init)(void *), int (*uninit)(void *)) +{ + struct mod_args m; + + m.name = name; + m.order = order; + m.mod = mod; + m.init = init; + m.uninit = uninit; + + ND("called for %s", name); + if (mod_idx < sizeof(mods) / sizeof(mods[0])) + mods[mod_idx++] = m; + return 0; +} + +static void +init_children(void) +{ + unsigned int i; + + /* Call the functions registered at init time. */ + printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx); + for (i = 0; i < mod_idx; i++) { + struct mod_args *m = &mods[i]; + printf("+++ start module %d %s %s at %p order 0x%x\n", + i, m->name, m->mod ? m->mod->name : "SYSINIT", + m->mod, m->order); + if (m->mod && m->mod->evhand) + m->mod->evhand(NULL, MOD_LOAD, m->mod->priv); + else if (m->init) + m->init(NULL); + } +} + +static void +fini_children(void) +{ + int i; + + /* Call the functions registered at init time. */ + for (i = mod_idx - 1; i >= 0; i--) { + struct mod_args *m = &mods[i]; + printf("+++ end module %d %s %s at %p order 0x%x\n", + i, m->name, m->mod ? 
m->mod->name : "SYSINIT", + m->mod, m->order); + if (m->mod && m->mod->evhand) + m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv); + else if (m->uninit) + m->uninit(NULL); + } +} +/*--- end of module binding helper functions ---*/ + +int +ipfw_module_init(void) +{ + int ret = 0; + + my_mod_register("ipfw", 1, moddesc_ipfw, NULL, NULL); + my_mod_register("sy_ipfw", 2, NULL, + sysinit_ipfw_init, sysuninit_ipfw_destroy); + my_mod_register("sy_Vnet_ipfw", 3, NULL, + sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit); + my_mod_register("dummynet", 4, moddesc_dummynet, NULL, NULL); + my_mod_register("dn_fifo", 5, moddesc_dn_fifo, NULL, NULL); + my_mod_register("dn_wf2qp", 6, moddesc_dn_wf2qp, NULL, NULL); + my_mod_register("dn_rr", 7, moddesc_dn_rr, NULL, NULL); + my_mod_register("dn_qfq", 8, moddesc_dn_qfq, NULL, NULL); + my_mod_register("dn_prio", 9, moddesc_dn_prio, NULL, NULL); + init_children(); + +#ifdef EMULATE_SYSCTL + keinit_GST(); +#endif + + return ret; +} + +/* module shutdown */ +void +ipfw_module_exit(void) +{ +#ifdef EMULATE_SYSCTL + keexit_GST(); +#endif + + fini_children(); + + printf("%s unloaded\n", __FUNCTION__); +} diff --git a/example/ipfw/extra/linux_defs.h b/example/ipfw/extra/linux_defs.h new file mode 100644 index 0000000..b7994cf --- /dev/null +++ b/example/ipfw/extra/linux_defs.h @@ -0,0 +1,144 @@ +#ifndef __LINUX_DEFS_ +#define __LINUX_DEFS_ + +/* define, includes and functions missing in linux */ + +#ifdef __linux__ +/* include and define */ +#include <arpa/inet.h> /* inet_ntoa */ +#include <netinet/tcp.h> + +#include <linux/errno.h> /* error define */ +#include <stdint.h> /* u_int32_t */ +#include <stdio.h> /* snprintf */ + +typedef struct mtx spinlock_t; +typedef struct mtx rwlock_t; + +/* + * some network structure can be defined in the bsd way + * by using the _FAVOR_BSD definition. This is not true + * for icmp structure. 
+ * XXX struct icmp contains bsd names in + * /usr/include/netinet/ip_icmp.h + */ +#define icmp_code code +#define icmp_type type + +/* linux in6_addr has no member __u6_addr + * replace the whole structure ? + */ +#define __u6_addr __in6_u +// #define __u6_addr32 u6_addr32 + +/* defined in linux/sctp.h with no bsd definition */ +struct sctphdr { + uint16_t src_port; /* source port */ + uint16_t dest_port; /* destination port */ + uint32_t v_tag; /* verification tag of packet */ + uint32_t checksum; /* Adler32 C-Sum */ + /* chunks follow... */ +} SCTP_PACKED; + +/* missing definition */ +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_ACK 0x10 + +#define RTF_CLONING 0x100 /* generate new routes on use */ + +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_CARP 112 /* CARP */ +#define IPPROTO_IPV4 IPPROTO_IPIP /* for compatibility */ + +#define CARP_VERSION 2 +#define CARP_ADVERTISEMENT 0x01 + +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) /* linux/stdlib */ + +#define IP_FORWARDING 0x1 /* most of ip header exists */ + +#define NETISR_IP 2 /* same as AF_INET */ + +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. 
*/ + +extern int securelevel; + +struct carp_header { +#if BYTE_ORDER == LITTLE_ENDIAN + u_int8_t carp_type:4, + carp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int8_t carp_version:4, + carp_type:4; +#endif +}; + +struct pim { +}; + +struct route { + struct rtentry *ro_rt; + struct sockaddr ro_dst; +}; + + +#if 0 // already in main header +struct ifaltq { + void *ifq_head; +}; + +struct ifnet { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ + struct ifaltq if_snd; /* output queue (includes altq) */ +}; + +/* involves mbufs */ +int in_cksum(struct mbuf *m, int len); +#define divert_cookie(mtag) 0 +#define divert_info(mtag) 0 +#define INADDR_TO_IFP(a, b) b = NULL +#define pf_find_mtag(a) NULL +#define pf_get_mtag(a) NULL +#define AF_LINK AF_ASH /* ? linux/socket.h */ + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + sa_family_t af; /* for ECN */ + u_int32_t qid; /* queue id */ +}; +#endif + +/* radix related */ + +#if 0 +struct radix_node { + caddr_t rn_key; /* object of search */ + caddr_t rn_mask; /* netmask, if present */ +}; +#endif + + +/* missing functions */ + +/* from bsd sys/queue.h */ +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +/* depending of linux version */ +#ifndef ETHERTYPE_IPV6 +#define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ +#endif + +#endif /* __linux__ */ +#endif /* !__LINUX_DEFS_ */ diff --git a/example/ipfw/extra/missing.c b/example/ipfw/extra/missing.c new file mode 100644 index 0000000..1713bdd --- /dev/null +++ b/example/ipfw/extra/missing.c @@ -0,0 +1,732 @@ +/* + * $Id$ + * + * Support to compile the kernel side of ipfw/dummynet in userland. 
+ * This file contains variables and functions that are not available in + * userland. It is compiled in a kernel-like environment, so + * it has _KERNEL defined, together with malloc() and free(). + * They must be redefined here as we build the real thing. + */ + +#include "glue.h" /* normally comes from the command line */ +#include "missing.h" /* normally comes from the command line */ +#undef _KERNEL +#include <sys/types.h> +#include <pthread.h> +#include <sys/select.h> +#include <sys/time.h> /* timersub */ +#define _KERNEL + +#include <sys/types.h> +#include <sys/taskqueue.h> + +#include <sys/mbuf.h> +#undef malloc +#undef free + +#include <stdlib.h> // calloc + +#include <netinet/in.h> /* struct sockaddr, route, sockopt... */ +#include <netinet/in_systm.h> + +#if 0 +#define IF_NAMESIZE 16 /* ip_fw.h */ +#define IFNAMSIZ IF_NAMESIZE /* ip_fw.h */ +#endif + + +/* + * Global variables in the kernel + */ +int ticks; /* kernel ticks counter */ +int hz = 5000; /* default clock time */ +long tick = 0; /* set to 1000000/hz by main() */ +int bootverbose = 0; +time_t time_uptime = 0; +struct timeval boottime; + +int max_protohdr = 14 + 4 + 20 + 20; /* mac, vlan, ip, tcp */ +int max_linkhdr; +int ip_defttl; +u_long in_ifaddrhmask; /* mask for hash table */ +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +u_int rt_numfibs = RT_NUMFIBS; + +void +module_register_init(const void *foo) +{ + D("start for %p", foo); +} + +/* defined as assert */ +#include <assert.h> +void +panic(const char *fmt, ...) +{ + assert(1); /* NOTE(review): assert(1) never fires, so panic() returns without reporting fmt or aborting */ +} + +void +getmicrouptime(struct timeval *tv) +{ + gettimeofday(tv, NULL); +} + +/* + * pfil hook support. + * We make pfil_head_get return a non-null pointer, which is then ignored + * in our 'add-hook' routines. 
+ */ +struct pfil_head; +typedef int (pfil_hook_t) + (void *, struct mbuf **, struct ifnet *, int, struct inpcb *); + +struct pfil_head * +pfil_head_get(int proto, u_long flags) +{ + static int dummy; + D("called"); + return (struct pfil_head *)(void *)&dummy; +} + +int +pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) +{ + D("called"); + return 0; +} + +int +pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) +{ + D("called"); + return 0; +} + +/* from sys/netinet/ip_output.c */ +int +ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo, struct inpcb *inp) +{ + D("unimplemented"); + return 0; +} + +struct tags_freelist tags_freelist; +int tags_minlen = 64; +int tags_freelist_count = 0; +static int tags_freelist_max = 0; + +struct mbuf *mbuf_freelist; + +void +m_freem(struct mbuf *m) +{ + struct m_tag *t; + + /* free the m_tag chain */ + while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) { + ND("free tag %p", &m->m_pkthdr.tags); + SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link); + SLIST_INSERT_HEAD(&tags_freelist, t, m_tag_link); + tags_freelist_count++; + if (tags_freelist_count > tags_freelist_max) { + static int pr=0; + if ((pr++ % 1000) == 0) + D("new max %d", tags_freelist_count); + tags_freelist_max = tags_freelist_count; + } + } + if (m->m_flags & M_STACK) { + ND("free invalid mbuf %p", m); + return; + } + /* free the mbuf */ + ND("free(m = %p, M_IPFW);", m); + m->m_next = mbuf_freelist; + mbuf_freelist = m; +} + +/* from net/netisr.c */ +int +netisr_dispatch(u_int proto, struct mbuf *m) +{ + if ((int)proto < 0) + m_freem(m); + else if (m->__m_callback) + m->__m_callback(m, proto); + else + D("unimplemented proto %d mbuf %p", proto, m); + return 0; +} + +/* define empty body for kernel function */ +int +priv_check(struct thread *td, int priv) +{ + /* once connected, always allow */ + ND("called"); + return 0; +} + +int +securelevel_ge(struct ucred *cr, int level) +{ 
+ /* we are always secure... */ + ND("called"); + return 0; +} + +int +sysctl_handle_int(SYSCTL_HANDLER_ARGS) +{ + int tmp; + + ND("called"); + if (!req || !req->oldptr || req->oldlen != sizeof(int)) + return EINVAL; + tmp = arg1 ? *(int *)arg1 : arg2; + bcopy(&tmp, req->oldptr, sizeof(int)); + /* XXX check the SET routine */ + if (req->newptr && arg1) + bcopy(req->newptr, arg1, sizeof(int)); + return 0; +} + +int +sysctl_handle_long(SYSCTL_HANDLER_ARGS) +{ + ND("called"); + sysctl_handle_int(oidp, arg1, arg2, req); + return 0; +} + +void +ether_demux(struct ifnet *ifp, struct mbuf *m) +{ + if (m->__m_callback) + m->__m_callback(m, 0); + else + D("missing callback mbuf %p", m); + return; +} + +int +ether_output_frame(struct ifnet *ifp, struct mbuf *m) +{ + D("incomplete"); + return 0; +} + +void +in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) +{ + D("called"); + return; +} + +void +icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu) +{ + D("called"); + return; +} + +void +rtfree(struct rtentry *rt) +{ + D("called"); + return; +} + +u_short +in_cksum_skip(struct mbuf *m, int len, int skip) +{ + D("called"); + return 0; +} + +u_short +in_cksum_hdr(struct ip *ip) +{ + D("called"); + return 0; +} + + +struct mbuf * +ip_reass(struct mbuf *clone) +{ + D("called"); + return clone; +} +#ifdef INP_LOCK_ASSERT +#undef INP_LOCK_ASSERT +#define INP_LOCK_ASSERT(a) +#endif + +int +jailed(struct ucred *cred) +{ + D("called"); + return 0; +} + +/* +* Return 1 if an internet address is for a ``local'' host +* (one to which we have a connection). If subnetsarelocal +* is true, this includes other subnets of the local net. +* Otherwise, it includes only the directly-connected (sub)nets. 
+*/ +int +in_localaddr(struct in_addr in) +{ + D("called"); + return 1; +} + +#if 0 +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +{ + return 1; +} +#endif + +/* + * Procedures for the callout interface + * + * callout_init() initializes a descriptor, + * callout_reset() starts a timer + * callout_stop() stops a timer + * + * Internally we hold a list of callout entries etc etc. + */ + +struct callout_tailq callout_head; + +#include <sys/systm.h> +void +callout_init(struct callout *c, int mpsafe) +{ + D("c %p mpsafe %d", c, mpsafe); + bzero(c, sizeof(*c)); +} + +int +callout_reset_on(struct callout *c, int due_ticks, void (*func)(void *), void *arg, int p) +{ + return callout_reset(c, due_ticks, func, arg); +} + +int +callout_reset(struct callout *c, int due_ticks, void (*func)(void *), void *arg) +{ + struct callout *cur; + + ND("c %p ticks %d f %p(%p)", c, due_ticks, func, arg); + if (c->c_flags & CALLOUT_ACTIVE) { + D(" --- callout was already active"); + return -1; + } + c->c_time = ticks + due_ticks; /* XXX not the original meaning */ + c->c_func = func; + c->c_arg = arg; + c->c_flags |= CALLOUT_ACTIVE; + TAILQ_FOREACH(cur, &callout_head, c_links.tqe) { + if ( (c->c_time - cur->c_time) < 0) + break; + } + if (cur) + TAILQ_INSERT_BEFORE(cur, c, c_links.tqe); + else + TAILQ_INSERT_TAIL(&callout_head, c, c_links.tqe); + return 0; /* no error */ +} + +int +_callout_stop_safe(struct callout *c, int safe) +{ + D("c %p safe %d", c, safe); + TAILQ_REMOVE(&callout_head, c, c_links.tqe); + return 0; +} + +int +callout_drain(struct callout *c) +{ + _callout_stop_safe(c, 1); + return 0; +} + +void +callout_startup(void) +{ + D("start"); + TAILQ_INIT( &callout_head); +} + +void +callout_run(void) +{ + struct callout *cur, *tmp; + + ND("Run pending callouts tick %d", ticks); + TAILQ_FOREACH_SAFE(cur, &callout_head, c_links.tqe, tmp) { + int delta = ticks - cur->c_time; + if (delta < 0) { // early ? 
+ //fprintf(stderr, "c %p due at %d\n", cur, cur->c_time); + continue; + } + if (delta > 100) + RD(1,"running %p due at %d now %d", cur, cur->c_time, ticks); + TAILQ_REMOVE(&callout_head, cur, c_links.tqe); + cur->c_flags &= ~CALLOUT_ACTIVE; + cur->c_func(cur->c_arg); + } +} + +/* + * the taskqueue type is actually opaque + */ +struct taskqueue { + STAILQ_ENTRY(taskqueue) tq_link; + STAILQ_HEAD(, task) tq_queue; + const char *tq_name; + taskqueue_enqueue_fn tq_enqueue; + void *tq_context; + struct task *tq_running; + int tq_pcount; + int tq_spin; + int tq_flags; +}; + +#if 0 +/* + * instead of enqueueing, we run this immediately. + */ +int +taskqueue_enqueue(struct taskqueue *queue, struct task *task) +{ + task->ta_func(task->ta_context, 1); + return 0; +} +#endif + +void +taskqueue_thread_enqueue(void *context) +{ + D("ctx %p", context); +} + +struct taskqueue * +taskqueue_create_fast(const char *name, int mflags, + taskqueue_enqueue_fn enqueue, void *context) +{ + struct taskqueue *tq; + + tq = calloc(1, sizeof(*tq)); + if (tq == NULL) + return NULL; + D("start %s fn %p ctx %p", name, enqueue, context); + return tq; +} + +int +taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, + const char *name, ...) 
+{ + D("tqp %p count %d (dummy)", tqp, count); + return 0; +} + +void +taskqueue_drain(struct taskqueue *queue, struct task *task) +{ + D("q %p task %p", queue, task); +} + +void +taskqueue_free(struct taskqueue *queue) +{ + D("q %p", queue); + free(queue); +} + +void * +kern_malloc(int sz) +{ + return calloc(sz, 1); /* most of the time we want zeroed memory */ +} + +void +kern_free(void *p) +{ + free(p); +} + +#ifdef linux +size_t +strlcpy(char *dst, const char *src, size_t siz) +{ + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0 && --n != 0) { + do { + if ((*d++ = *s++) == 0) + break; + } while (--n != 0); + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return(s - src - 1); /* count does not include NUL */ +} +#endif // linux + +#ifdef EMULATE_SYSCTL +/* + * Support for sysctl emulation. + * We transfer options as part of the IP_DUMMYNET3 sockopt emulation, + * so we need to include ip_fw.h and ip_dummynet.h + */ + +#include <netinet/ip_fw.h> /* struct ip_fw_args */ +#include <netinet/ip_dummynet.h> /* struct dn_id */ +static struct sysctltable GST; + +int +kesysctl_emu_get(struct sockopt* sopt) +{ + struct dn_id* oid = sopt->sopt_val; + struct sysctlhead* entry; + int sizeneeded = sizeof(struct dn_id) + GST.totalsize + + sizeof(struct sysctlhead); + unsigned char* pstring; + unsigned char* pdata; + int i; + + if (sopt->sopt_valsize < sizeneeded) { + // this is a probe to retrieve the space needed for + // a dump of the sysctl table + oid->id = sizeneeded; + sopt->sopt_valsize = sizeof(struct dn_id); + return 0; + } + + entry = (struct sysctlhead*)(oid+1); + /* [entry][data(datalen)][name(namelen)] */ + ND("copying values"); + for( i=0; i<GST.count; i++) { + ND("entry %d %s flags 0x%x", + i, GST.entry[i].name, GST.entry[i].head.flags); + entry->blocklen = GST.entry[i].head.blocklen; + 
entry->namelen = GST.entry[i].head.namelen; + entry->flags = GST.entry[i].head.flags; + entry->datalen = GST.entry[i].head.datalen; + pdata = (unsigned char*)(entry+1); + pstring = pdata+GST.entry[i].head.datalen; + if (entry->flags & SYSCTLTYPE_PROC) { + //int (*f)(SYSCTL_HANDLER_ARGS); + sysctl_h_fn_t *f; + int tmp = 0, ret; + struct sysctl_req req; + + bzero(&req, sizeof(req)); + req.oldlen = req.newlen = sizeof(int); + req.oldptr = &tmp; + f = GST.entry[i].fn; + ND("-- %s is a proc -- at %p", GST.entry[i].name, f); + ret = f(NULL, NULL, (int)(intptr_t)(GST.entry[i].data), &req); + ND("-- %s returns %d", GST.entry[i].name, ret); + bcopy(&tmp, pdata, sizeof(tmp)); + } else { + bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen); + } + bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen); + entry = (struct sysctlhead*) + ((unsigned char*)(entry) + GST.entry[i].head.blocklen); + } + sopt->sopt_valsize = sizeneeded; + return 0; +} + +int +kesysctl_emu_set(void* p, int l) +{ + struct sysctlhead* entry; + unsigned char* pdata; + unsigned char* pstring; + int i = 0; + + entry = (struct sysctlhead*)(((struct dn_id*)p)+1); + pdata = (unsigned char*)(entry+1); + pstring = pdata + entry->datalen; + + for (i=0; i<GST.count; i++) { + if (strcmp(GST.entry[i].name, (char *)pstring) != 0) + continue; + ND("%s: match found! 
%s\n",__FUNCTION__,pstring); + //sanity check on len, not really useful now since + //we only accept int32 + if (entry->datalen != GST.entry[i].head.datalen) { + printf("%s: len mismatch, user %d vs kernel %d\n", + __FUNCTION__, entry->datalen, + GST.entry[i].head.datalen); + return -1; + } + // check access (at the moment flags handles only the R/W rights + //later on will be type + access + if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) { + printf("%s: the entry %s is read only\n", + __FUNCTION__,GST.entry[i].name); + return -1; + } + if (GST.entry[i].head.flags & SYSCTLTYPE_PROC) { + int (*f)(SYSCTL_HANDLER_ARGS); + int tmp = 0, ret; + struct sysctl_req req; + + bzero(&req, sizeof(req)); + req.oldlen = req.newlen = sizeof(int); + req.oldptr = &tmp; + req.newptr = pdata; + f = GST.entry[i].fn; + ND("-- %s is a proc -- at %p", GST.entry[i].name, f); + ret = f(NULL, NULL, (int)(intptr_t)(GST.entry[i].data), &req); + ND("-- %s returns %d", GST.entry[i].name, ret); + } else { + bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen); + } + return 0; + } + D("%s: match not found\n",__FUNCTION__); + return 0; +} + +/* convert all _ to . until the first . 
*/ +static void +underscoretopoint(char* s) +{ + for (; *s && *s != '.'; s++) + if (*s == '_') + *s = '.'; +} + +static int +formatnames(void) +{ + int i; + int size=0; + char* name; + + for (i=0; i<GST.count; i++) + size += GST.entry[i].head.namelen; + GST.namebuffer = malloc(size); + if (GST.namebuffer == NULL) + return -1; + name = GST.namebuffer; + for (i=0; i<GST.count; i++) { + bcopy(GST.entry[i].name, name, GST.entry[i].head.namelen); + underscoretopoint(name); + GST.entry[i].name = name; + name += GST.entry[i].head.namelen; + } + return 0; +} + +static void +dumpGST(void) +{ + int i; + + for (i=0; i<GST.count; i++) { + printf("SYSCTL: entry %i\n", i); + printf("name %s\n", GST.entry[i].name); + printf("namelen %i\n", GST.entry[i].head.namelen); + printf("type %i access %i\n", + GST.entry[i].head.flags >> 2, + GST.entry[i].head.flags & 0x00000003); + printf("data %i\n", *(int*)(GST.entry[i].data)); + printf("datalen %i\n", GST.entry[i].head.datalen); + printf("blocklen %i\n", GST.entry[i].head.blocklen); + } +} + +void sysctl_addgroup_f1(void); +void sysctl_addgroup_f2(void); +void sysctl_addgroup_f3(void); +void sysctl_addgroup_f4(void); + +void +keinit_GST(void) +{ + int ret; + + sysctl_addgroup_f1(); + sysctl_addgroup_f2(); + sysctl_addgroup_f3(); + sysctl_addgroup_f4(); + ret = formatnames(); + if (ret != 0) + printf("conversion of names failed for some reason\n"); + if (0) + dumpGST(); // XXX debugging + printf("*** Global Sysctl Table entries = %i, total size = %i ***\n", + GST.count, GST.totalsize); +} + +void +keexit_GST(void) +{ + if (GST.namebuffer != NULL) + free(GST.namebuffer); + bzero(&GST, sizeof(GST)); +} + +void +sysctl_pushback(char* name, int flags, int datalen, void* data, sysctl_h_fn_t *fn) +{ + if (GST.count >= GST_HARD_LIMIT) { + printf("WARNING: global sysctl table full, this entry will not be added," + "please recompile the module increasing the table size\n"); + return; + } + GST.entry[GST.count].head.namelen = strlen(name)+1; //add 
space for '\0' + GST.entry[GST.count].name = name; + GST.entry[GST.count].head.flags = flags; + GST.entry[GST.count].data = data; + GST.entry[GST.count].fn = fn; + GST.entry[GST.count].head.datalen = datalen; + GST.entry[GST.count].head.blocklen = + ((sizeof(struct sysctlhead) + GST.entry[GST.count].head.namelen + + GST.entry[GST.count].head.datalen)+3) & ~3; + GST.totalsize += GST.entry[GST.count].head.blocklen; + GST.count++; +} +#endif /* EMULATE_SYSCTL */ + +extern int mainloop(int argc, char *argv[]); + +/* + * main program for ipfw kernel side when running an userspace emulation: + * open a socket on which we receive requests from userland, + * another socket for calls from the 'kernel' (simulating packet + * arrivals etc), and then periodically run the tick handler. + */ +int +main(int argc, char *argv[]) +{ + tick = 1000000/hz; + D("initializing tick to %ld", tick); + return mainloop(argc, argv); +} diff --git a/example/ipfw/extra/missing.h b/example/ipfw/extra/missing.h new file mode 100644 index 0000000..b5b65b2 --- /dev/null +++ b/example/ipfw/extra/missing.h @@ -0,0 +1,801 @@ +/* + * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id: missing.h 8377 2011-04-04 16:08:27Z marta $ + * + * Header for kernel variables and functions that are not available in + * userland. + */ + +#ifndef _MISSING_H_ +#define _MISSING_H_ + +#define KLD_MODULE /* disable kernel dependencies */ + +/* defined as assert */ +void panic(const char *fmt, ...); + +#define KASSERT(exp,msg) do { \ + if (__predict_false(!(exp))) \ + panic msg; \ +} while (0) +/* don't bother to optimize */ +#ifndef __predict_false +#define __predict_false(x) (x) /* __builtin_expect((exp), 0) */ +#endif // XXX + + +#ifdef _KERNEL +#define NEED_KERNEL +#undef _KERNEL +#endif + +#include <stdio.h> // printf +#include <sys/socket.h> // IFNAMSIZ ? +#include <string.h> // strncmp +#include <stdlib.h> // bsearch +#ifdef NEED_KERNEL +#define _KERNEL +#include <sys/cdefs.h> +#include <sys/param.h> + +#define __user // not defined here ? +#define __init +#define __exit + +/* portability features, to be set before the rest: */ +#define WITHOUT_BPF /* do not use bpf logging */ + +#define MALLOC_DECLARE(x) struct __hack /* nothing */ +// XXX kernel malloc/free +extern void *kern_malloc(int); +extern void kern_free(void *); +#define malloc(_size, type, flags) kern_malloc(_size) +#define free(_var, type) kern_free(_var) + +/* inet_ntoa_r() differs in userspace and kernel. + * We load netinet/in.h so we get the kernel prototype ? 
+ * but we also need to put #defines in the two places where + * it is used XXX fixme + */ +#include <netinet/in.h> + +/* log() conflicts with the math function. + * Revise, modifying the first argument. + */ +#define LOG_ERR 0x100 +#define LOG_INFO 0x200 +#ifndef LOG_SECURITY +#define LOG_SECURITY 0x400 +#endif + +#define log(_level, fmt, arg...) do { \ + int __attribute__((unused)) _querty = _level; \ + printf("kernel: " fmt, ##arg); } while (0) + +#endif /* _KERNEL */ + +/* + * Kernel locking support. + * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c + * + * In linux we use spinlock_bh to implement both. + * For 'struct rwlock' we need an #ifdef to change it to spinlock_t + */ + +#ifndef DEFINE_SPINLOCK /* this is for linux 2.4 */ +#if defined(__APPLE__) +#define DEFINE_SPINLOCK(x) struct mtx x; +#else /* linux ? */ +#define DEFINE_SPINLOCK(x) spinlock_t x // = SPIN_LOCK_UNLOCKED +#endif +#endif + +/* 20111031 + * redefine mutex in terms of threads. + */ + +#undef _KERNEL +// #include <sys/types.h> +#include <pthread.h> +#ifdef NEED_KERNEL +#define _KERNEL +#endif +struct mtx { + pthread_mutex_t p0; +}; +struct rwlock { + pthread_mutex_t p0; +}; +struct rmlock { + pthread_mutex_t p0; +}; +extern pthread_mutex_t dummynet_mtx_p; +extern pthread_mutex_t ipfw_dyn_mtx_p; +extern pthread_mutex_t pfil_global_lock_p; + +#define mtx_assert(a, b) +/* + * the first argument to mtx_init is often a static variable, + * so use (void)m to prevent a compiler warning + */ +#define mtx_init(m, a,b,c) do { \ + (void)m; pthread_mutex_init(&((m)->p0), NULL); } while (0) +#define MTX_SYSINIT(a, m, c, d) // pthread_mutex_init(m##_p, NULL) +#define mtx_lock(m) pthread_mutex_lock(m.p0) +#define mtx_unlock(m) pthread_mutex_unlock(m.p0) +#define mtx_destroy(m) pthread_mutex_destroy(m.p0) +#if 1 +//------------------ + +#if 1 // used for IPFW_UH +#define rw_assert(a, b) +#define rw_destroy(_l) +#define rw_init(_l, msg) // XXX mtx_init((_l), 0, 0, 0) +#define rw_rlock(_l) 
mtx_lock(_l) +#define rw_runlock(_l) mtx_unlock(_l) +#define rw_wlock(_l) mtx_lock(_l) +#define rw_wunlock(_l) mtx_unlock(_l) +#define rw_init_flags(_l, s, v) +#endif // XXX not used anymore + +#define rm_init(_l, msg) // mtx_init(...) +#define rm_rlock(_l, _t) ((void)_t, mtx_lock(_l)) +#define rm_runlock(_l, _t) mtx_unlock(_l) +#define rm_wlock(_l) mtx_lock(_l) +#define rm_wunlock(_l) mtx_unlock(_l) +#define rm_destroy(_l) // XXX +#define rm_assert(_l, _w) // XXX + + +#endif // locking on linux ? + +/* end of locking support */ + +/* + * Reference to an ipfw rule that can be carried outside critical sections. + * A rule is identified by rulenum:rule_id which is ordered. + * In version chain_id the rule can be found in slot 'slot', so + * we don't need a lookup if chain_id == chain->id. + * + * On exit from the firewall this structure refers to the rule after + * the matching one (slot points to the new rule; rulenum:rule_id-1 + * is the matching rule), and additional info (e.g. info often contains + * the insn argument or tablearg in the low 16 bits, in host format). + * On entry, the structure is valid if slot>0, and refers to the starting + * rules. 'info' contains the reason for reinject, e.g. divert port, + * divert direction, and so on. + */ +struct ipfw_rule_ref { + uint32_t slot; /* slot for matching rule */ + uint32_t rulenum; /* matching rule number */ + uint32_t rule_id; /* matching rule id */ + uint32_t chain_id; /* ruleset id */ + uint32_t info; /* see below */ +}; + +/* ISO C restricts enumerator values to range of 'int' + * so we need IN to have a smaller value + */ +enum { + IPFW_INFO_MASK = 0x0000ffff, + IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ + IPFW_INFO_IN = 0x00800000, /* incoming, overloads dir */ + IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ + IPFW_IS_MASK = 0x30000000, /* which source ? 
*/ + IPFW_IS_DIVERT = 0x20000000, + IPFW_IS_DUMMYNET =0x10000000, + IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ +}; + +/* in netinet/in.h */ +#define in_nullhost(x) ((x).s_addr == INADDR_ANY) + +/* ip_dummynet.c */ +#ifndef __FreeBSD_version +#define __FreeBSD_version 500035 +#endif + +/* define some macro for ip_dummynet */ + +struct malloc_type { +}; + +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ + struct malloc_type type[1]; void *md_dummy_ ## type = type + +#define CTASSERT(x) + + +/* + * gettimeofday would be in sys/time.h but it is not + * visible if _KERNEL is defined + */ +//int gettimeofday(struct timeval *, struct timezone *); + + +extern int hz; +extern long tick; /* exists in 2.4 but not in 2.6 */ +extern int bootverbose; +extern struct timeval boottime; + +/* time_uptime is a FreeBSD variable increased each second */ +extern time_t time_uptime; + +extern int max_linkhdr; +extern int ip_defttl; +extern u_long in_ifaddrhmask; /* mask for hash table */ +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +/*-------------------------------------------------*/ + +/* define, includes and functions missing in linux */ +/* include and define */ +#include <arpa/inet.h> /* inet_ntoa */ + +struct mbuf; +// XXX #define M_MCAST 0x04 /* send/received as link-level multicast */ + + +/* used by ip_dummynet.c */ +void reinject_drop(struct mbuf* m); + +#include <sys/socket.h> /* for ETHERTYPE_IP */ + +#ifdef _KERNEL +#define IF_NAMESIZE 16 +#ifndef IFNAMSIZ +#define IFNAMSIZ IF_NAMESIZE +#endif +//#include <net/if.h> /* IFNAMESIZ */ +#endif + +/* + * some network structure can be defined in the bsd way + * by using the _FAVOR_BSD definition. This is not true + * for icmp structure. 
+ * XXX struct icmp contains bsd names in + * /usr/include/netinet/ip_icmp.h + */ + +/* missing definition */ +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_ACK 0x10 + +/* 20131101 IPTOS from ip.h */ +/* + * Definitions for DiffServ Codepoints as per RFC2474 + */ +#define IPTOS_DSCP_CS0 0x00 +#define IPTOS_DSCP_CS1 0x20 +#define IPTOS_DSCP_AF11 0x28 +#define IPTOS_DSCP_AF12 0x30 +#define IPTOS_DSCP_AF13 0x38 +#define IPTOS_DSCP_CS2 0x40 +#define IPTOS_DSCP_AF21 0x48 +#define IPTOS_DSCP_AF22 0x50 +#define IPTOS_DSCP_AF23 0x58 +#define IPTOS_DSCP_CS3 0x60 +#define IPTOS_DSCP_AF31 0x68 +#define IPTOS_DSCP_AF32 0x70 +#define IPTOS_DSCP_AF33 0x78 +#define IPTOS_DSCP_CS4 0x80 +#define IPTOS_DSCP_AF41 0x88 +#define IPTOS_DSCP_AF42 0x90 +#define IPTOS_DSCP_AF43 0x98 +#define IPTOS_DSCP_CS5 0xa0 +#define IPTOS_DSCP_EF 0xb8 +#define IPTOS_DSCP_CS6 0xc0 +#define IPTOS_DSCP_CS7 0xe0 + +/* + * ECN (Explicit Congestion Notification) codepoints in RFC3168 mapped to the + * lower 2 bits of the TOS field. + */ +#define IPTOS_ECN_NOTECT 0x00 /* not-ECT */ +#define IPTOS_ECN_ECT1 0x01 /* ECN-capable transport (1) */ +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ +#define IPTOS_ECN_MASK 0x03 /* ECN field mask */ + +/*------------------------- */ + +#define RTF_CLONING 0x100 /* generate new routes on use */ + +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_CARP 112 /* CARP */ +#define CARP_VERSION 2 +#define CARP_ADVERTISEMENT 0x01 +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ +#define IP_FORWARDING 0x1 /* most of ip header exists */ +#define NETISR_IP 2 /* same as AF_INET */ +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. 
*/ + +extern int securelevel; + +#define if_xname name +#define if_snd XXX + +// XXX we could use this to point to the incoming peer +struct ifnet { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ + uint32_t if_index; // IP_FW_3 +}; + +struct ifaltq { + + void *ifq_head; +}; +int ffs(int); // XXX where +int fls(int); // XXX where + +struct ip; +/* machine/in_cksum.h */ +int in_cksum(struct mbuf *m, int len); +#ifndef __FreeBSD__ +u_short in_cksum_hdr(struct ip *); +#endif + + +#define CTR3(a, ...) +#define uma_zone_set_max(a, b) // XXX + +/* + * ifnet->if_snd is used in ip_dummynet.c to take the transmission + * clock. + */ +#if defined( __linux__) +#define if_xname name +#define if_snd XXX + +struct route_in6 { +}; + +#elif defined( _WIN32 ) +/* used in ip_dummynet.c */ +struct ifnet { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ +// struct ifaltq if_snd; /* output queue (includes altq) */ +}; + +struct net_device { + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ +}; +#elif defined(__APPLE__) +typedef u_int32_t tcp_cc; +#ifndef s6_addr32 // XXX +#define s6_addr32 __u6_addr.__u6_addr32 +#endif +#include <netinet/tcp.h> + +struct route_in6 { +}; + +struct icmphdr { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ +}; + +#define IPPROTO_SCTP 132 /* SCTP */ + +/* defined in linux/sctp.h with no bsd definition */ +struct sctphdr { + uint16_t src_port; /* source port */ + uint16_t dest_port; /* destination port */ + uint32_t v_tag; /* verification tag of packet */ + uint32_t checksum; /* Adler32 C-Sum */ + /* chunks follow... 
*/ +}; + +struct carp_header { +#if BYTE_ORDER == LITTLE_ENDIAN + u_int8_t carp_type:4, + carp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int8_t carp_version:4, + carp_type:4; +#endif +}; + + +struct pim { + int dummy; /* windows compiler does not like empty definition */ +}; + +#endif + +/* involves mbufs */ +//int in_cksum(struct mbuf *m, int len); +#define divert_cookie(mtag) 0 +#define divert_info(mtag) 0 +#define pf_find_mtag(a) NULL +#define pf_get_mtag(a) NULL +#if !defined(_WIN32) && !defined(AF_LINK) +#define AF_LINK AF_ASH /* ? our sys/socket.h */ +#endif + +/* search the local ip addresses, used for the "me" keyword */ +#define INADDR_TO_IFP(ip, b) b = NULL + +/* we don't pullup, either success or free and fail */ +#define m_pullup(m, x) \ + ((m)->m_len >= x ? (m) : (FREE_PKT(m), NULL)) + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + sa_family_t af; /* for ECN */ + u_int32_t qid; /* queue id */ +}; + +/* missing kernel functions */ +char *inet_ntoa(struct in_addr ina); +long random(void); + +/* + * Return the result of a/b + * + * this is used in linux kernel space, + * since the 64bit division needs to + * be done using a macro + */ +//int64_t div64(int64_t a, int64_t b); + +/* from bsd sys/queue.h */ +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +/*-------------------------------------------------*/ +#define RT_NUMFIBS 1 +extern u_int rt_numfibs; + +/* involves kernel locking function */ +#ifdef RTFREE +#undef RTFREE +#define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n"); +#endif + +void getmicrouptime(struct timeval *tv); + +/* from sys/netinet/ip_output.c */ +struct ip_moptions; +struct route; +struct ip; + +struct inpcb; 
+struct mbuf *ip_reass(struct mbuf *); +int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo, struct inpcb *inp); + +/* from net/netisr.c -- fails on FreeBSD */ +int netisr_dispatch(u_int proto, struct mbuf *m); + + +/* definition moved in missing.c */ +int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); +int copyout(const void *kaddr, void *uaddr, size_t len); + +int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); + +/* defined in session.c */ +int priv_check(struct thread *td, int priv); + +/* struct ucred is in linux/socket.h and has pid, uid, gid. + * We need a 'bsd_ucred' to store also the extra info + */ + +struct bsd_ucred { + uid_t uid; + gid_t gid; + uint32_t xid; + uint32_t nid; +}; + +#ifdef _KERNEL + +#if 0 // XXX +int +cred_check(void *insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, + struct sk_buff *skb); +#endif + +struct ucred; +int securelevel_ge(struct ucred *cr, int level); + +/* + * stripped down version of the sysctl api + */ +struct sysctl_oid; +struct sysctl_req { + void *oldptr; /* store here the original value */ + int oldlen; + void *newptr; /* NULL on reads */ + int newlen; +}; + +#ifdef _WIN32 +#define module_param_named(_name, _var, _ty, _perm) +#else /* !_WIN32 */ + +#endif /* !_WIN32 so maybe __linux__ */ + +#if 0 // XXX disable sysctl defined (__linux__) && !defined (EMULATE_SYSCTL) +#define SYSCTL_DECL(_1) +#define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8) +#define SYSCTL_NODE(_1, _2, _3, _4, _5, _6) +#define _SYSCTL_BASE(_name, _var, _ty, _perm) \ + module_param_named(_name, *(_var), _ty, \ + ( (_perm) == CTLFLAG_RD) ? 
0444: 0644 ) +#define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b) + +#define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, int, _mode) + +#define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, long, _mode) + +#define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, ulong, _mode) + +#define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc) \ + _SYSCTL_BASE(_name, _var, uint, _mode) + +#define TUNABLE_INT(_name, _ptr) + +#define SYSCTL_VNET_PROC SYSCTL_PROC +#define SYSCTL_VNET_INT SYSCTL_INT +#define SYSCTL_VNET_UINT SYSCTL_UINT + +#endif + +#define SYSCTL_HANDLER_ARGS \ + struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req +typedef int (sysctl_h_fn_t)(SYSCTL_HANDLER_ARGS); +int sysctl_handle_int(SYSCTL_HANDLER_ARGS); +int sysctl_handle_long(SYSCTL_HANDLER_ARGS); + +#ifdef EMULATE_SYSCTL /* mandatory here */ + +#define STRINGIFY(x) #x + +#ifdef SYSCTL_NODE +#undef SYSCTL_NODE +#endif +#define SYSCTL_NODE(a,b,c,d,e,f) int a; (void)a +#define SYSCTL_DECL(a) + +#define GST_HARD_LIMIT 100 + +/* In the module, GST is implemented as an array of + * sysctlentry, but while passing data to the userland + * pointers are useless, the buffer is actually made of: + * - sysctlhead (fixed size, containing lengths) + * - data (typically 32 bit) + * - name (zero-terminated and padded to mod4) + */ + +struct sysctlentry { + struct sysctlhead head; + char* name; + void* data; + sysctl_h_fn_t *fn; +}; + +struct sysctltable { + int count; //number of valid tables + int totalsize; //total size of valid entries of al the valid tables + void* namebuffer; //a buffer for all chained names + struct sysctlentry entry[GST_HARD_LIMIT]; +}; + +#ifdef SYSBEGIN +#undef SYSBEGIN +#endif +#define SYSBEGIN(x) void sysctl_addgroup_##x() { +#ifdef SYSEND +#undef SYSEND +#endif +#define SYSEND } + +/* XXX remove duplication */ +#define 
SYSCTL_INT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e, NULL) + +#define SYSCTL_UINT(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e, NULL) + +#define SYSCTL_LONG(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e, NULL) + +#define SYSCTL_ULONG(a,b,c,d,e,f,g) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e, NULL) +#define TUNABLE_INT(a,b) + +#define SYSCTL_PROC(a,b,c,d,e,f,g,h,i) \ + sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ + (d), 4 /* XXX large */, (void *)(f /* arg2 */), g) + +#define SYSCTL_VNET_PROC SYSCTL_PROC +#define SYSCTL_VNET_INT SYSCTL_INT +#define SYSCTL_VNET_UINT SYSCTL_UINT + +void keinit_GST(void); +void keexit_GST(void); +int kesysctl_emu_set(void* p, int l); +int kesysctl_emu_get(struct sockopt* sopt); +void sysctl_pushback(char* name, int flags, int datalen, void* data, sysctl_h_fn_t *fn); + +#endif /* EMULATE_SYSCTL */ + +struct ifnet; +void ether_demux(struct ifnet *ifp, struct mbuf *m); + +int ether_output_frame(struct ifnet *ifp, struct mbuf *m); + +void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); + +void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu); + +#define in_localip(_x) (0) + +#ifndef __FreeBSD__ +struct rtentry; +#endif +void rtfree(struct rtentry *rt); + +u_short in_cksum_skip(struct mbuf *m, int len, int skip); + +#ifdef INP_LOCK_ASSERT +#undef INP_LOCK_ASSERT +#define INP_LOCK_ASSERT(a) +#endif + +int jailed(struct ucred *cred); + +/* +* Return 1 if an internet address is for a ``local'' host +* (one to which we have a connection). If subnetsarelocal +* is true, this includes other subnets of the local net. +* Otherwise, it includes only the directly-connected (sub)nets. 
+*/ +int in_localaddr(struct in_addr in); + +int fnmatch(const char *pattern, const char *string, int flags); + +/* vnet wrappers, in vnet.h and ip_var.h */ +//int ipfw_init(void); +//void ipfw_destroy(void); + +#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ +#define MTAG_IPFW_RULE 1262273568 /* rule reference */ +#define MTAG_IPFW_CALL 1308397630 /* call stack */ + +#ifdef __APPLE__ +#define offsetof(type, field) __builtin_offsetof(type, field) +#endif +struct ip_fw_args; +extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); + +#if 1 /* include vnet.h */ +#define curvnet NULL +#define CURVNET_SET(_v) +#define CURVNET_RESTORE() +#define VNET_ASSERT(condition) + +#define VNET_NAME(n) n +#define VNET_DECLARE(t, n) extern t n +#define VNET_DEFINE(t, n) t n +#define _VNET_PTR(b, n) &VNET_NAME(n) +/* + * Virtualized global variable accessor macros. + */ +#define VNET_VNET_PTR(vnet, n) (&(n)) +#define VNET_VNET(vnet, n) (n) + +#define VNET_PTR(n) (&(n)) +#define VNET(n) (n) + +#define IS_DEFAULT_VNET(x) (1) // always true +#endif + +VNET_DECLARE(int, ip_defttl); +#define V_ip_defttl VNET(ip_defttl); + + +// int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); +// XXX used in netmap_io.c +int ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); +int ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); + +/* hooks for divert */ +extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); + +extern int (*ip_dn_ctl_ptr)(struct sockopt *); +typedef int ip_fw_ctl_t(struct sockopt *); +extern ip_fw_ctl_t *ip_fw_ctl_ptr; + + +/* netgraph prototypes */ +typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); +extern ng_ipfw_input_t *ng_ipfw_input_p; + +/* For kernel ipfw_ether and ipfw_bridge. 
*/ +struct ip_fw_args; + +#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) +#define V_tcbinfo VNET(tcbinfo) +#define V_udbinfo VNET(udbinfo) +#endif /* _KERNEL */ + +// sys/eventhandler.h +#define EVENTHANDLER_DECLARE(a, b) + +/* application specific */ +struct sess; +typedef int (handler_t)(struct sess *sess, void *arg); + +/* + * flags to control the callback + * WANT_READ select on read + * WANT_WRITE select on write + * WANT_RUN run unconditionally + * WANT_DELETE session is exiting + */ +enum flags_t { + WANT_READ=1, WANT_WRITE=2, WANT_RUN=4, + WANT_DELETE=0x8000 +}; + +struct sess { + struct sess *next; + int fd; + handler_t *func; + void *arg; + enum flags_t flags; + void *private; /* pointer managed by the session code */ +}; +struct sess * +new_session(int fd, handler_t *func, void *arg, enum flags_t flags); + + +void netmap_add_port(const char *dev); +#endif /* !_MISSING_H_ */ diff --git a/example/ipfw/extra/session.c b/example/ipfw/extra/session.c new file mode 100644 index 0000000..333edd3 --- /dev/null +++ b/example/ipfw/extra/session.c @@ -0,0 +1,644 @@ +/* + * Session handler to simulate soopt* and network communication + * over a TCP socket, and also run the callbacks. 
+ */ + +#ifdef _KERNEL +#undef _KERNEL +#endif +/* these headers need to be compiled without _KERNEL */ +#include <sys/types.h> +#include <sys/select.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> // TCP_NODELAY +#include <sys/cpuset.h> // freebsd, used in rmlock +#include <net/pfil.h> // PFIL_IN +#include <sys/errno.h> +extern int errno; + + +#ifdef free +/* we are built in a pseudo-kernel env so malloc and free are redefined */ +#undef free +#undef malloc +#endif /* free */ + +#include <stdio.h> +#include <pthread.h> +#include <fcntl.h> +#include <sys/time.h> /* timersub */ +#include <stdlib.h> +#include <string.h> +#include <unistd.h> /* read() */ + +#include <sys/mbuf.h> /* mbuf */ +#define _KERNEL + +/* args for ipfw */ +#include <netinet/ip_fw.h> +#include <netpfil/ipfw/ip_fw_private.h> + +/* + * Global variables need to be somewhere... + */ +void ip_dn_init(void); +int ipfw_init(void); +void ipfw_destroy(void); + +extern int (*ip_fw_ctl_ptr)(struct sockopt *); +extern int (*ip_dn_ctl_ptr)(struct sockopt *); +extern struct ip_fw *ip_fw_default_rule; + +extern int ticks; /* kernel ticks counter */ + +int callout_startup(void); +int callout_run(void); + +/* + * generic handler for sockopt functions + */ +static int +ctl_handler(struct sockopt *sopt) +{ + int error = EINVAL; + + ND("called, level %d", sopt->sopt_level); + if (sopt->sopt_level != IPPROTO_IP) + return (EINVAL); + switch (sopt->sopt_name) { + default: + D("command not recognised %d", sopt->sopt_name); + break; + case IP_FW3: // XXX untested + case IP_FW_ADD: /* ADD actually returns the body... 
*/ + case IP_FW_GET: + case IP_FW_DEL: + case IP_FW_TABLE_GETSIZE: + case IP_FW_TABLE_LIST: + case IP_FW_NAT_GET_CONFIG: + case IP_FW_NAT_GET_LOG: + case IP_FW_FLUSH: + case IP_FW_ZERO: + case IP_FW_RESETLOG: + case IP_FW_TABLE_ADD: + case IP_FW_TABLE_DEL: + case IP_FW_TABLE_FLUSH: + case IP_FW_NAT_CFG: + case IP_FW_NAT_DEL: + if (ip_fw_ctl_ptr != NULL) + error = ip_fw_ctl_ptr(sopt); + else { + D("ipfw not enabled"); + error = ENOPROTOOPT; + } + break; + + case IP_DUMMYNET_GET: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET3: + if (ip_dn_ctl_ptr != NULL) + error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break ; + } + ND("returning error %d", error); + return error; +} + +/* + * copy data back to userland + */ +int +sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) +{ + size_t valsize = sopt->sopt_valsize; + + ND("data len %d sopt_len %d", (int)len, (int)valsize); + if (len < valsize) + sopt->sopt_valsize = valsize = len; + bcopy(buf, sopt->sopt_val, valsize); + return 0; +} + +int +copyout(const void *kaddr, void *uaddr, size_t len) +{ + bcopy(kaddr, uaddr, len); + return 0; /* no fault */ +} + +/* + * copy data from userland to kernel + */ +int +sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) +{ + size_t valsize = sopt->sopt_valsize; + + ND("have %d len %d minlen %d", (int)valsize, (int)len, (int)minlen); + if (valsize < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +/* + * session description for event-based programming + */ +/* event-based session support */ + +#define SOCK_QLEN 5 /* listen lenght for incoming connection */ + +static struct sess *all_sessions, *new_sessions; + +struct sess * +new_session(int fd, handler_t *func, void *arg, enum flags_t flags) +{ + struct sess *desc; + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) + return NULL; + desc->fd = fd; 
+ desc->func = func; + desc->arg = arg; + desc->flags = flags; + desc->next = new_sessions; + new_sessions = desc; + return desc; +} + +/* remove deleted sessions, merge with new ones */ +static void +merge_sessions(void) +{ + struct sess *cur, *prev, *tmp; + + for (prev = NULL, cur = all_sessions; cur; prev = cur, cur = tmp) { + tmp = cur->next; + if ( (cur->flags & WANT_DELETE) == 0) + continue; + if (prev) + prev->next = cur->next; + else + all_sessions = cur->next; + memset(cur, 0, sizeof(*cur)); + free(cur); + cur = prev; + } + if (prev) + prev->next = new_sessions; + else + all_sessions = new_sessions; + new_sessions = NULL; +} + +/* set the fdset, return the fdmax+1 for select() */ +int +set_sessions(fd_set *r, fd_set *w) +{ + struct sess *cur; + int fd_max = -1; + int count = 0,ready = 0; + + FD_ZERO(r); + FD_ZERO(w); + merge_sessions(); + for (cur = all_sessions; cur; cur = cur->next) { + count++; + if (cur->flags & WANT_RUN) { + ND("WANT_RUN on session %p", cur); + cur->flags &= ~WANT_RUN; + cur->func(cur, cur->arg); + } + if (cur->flags & WANT_READ) + FD_SET(cur->fd, r); + if (cur->flags & WANT_WRITE) + FD_SET(cur->fd, w); + if (cur->flags & (WANT_WRITE|WANT_READ)) { + ready ++; + if (cur->fd > fd_max) + fd_max = cur->fd; + } + } + ND("%d session %d waiting", count, ready); + return fd_max + 1; +} + +int +run_sessions(fd_set *r, fd_set *w) +{ + struct sess *cur; + + for (cur = all_sessions; cur; cur = cur->next) { + int fd = cur->fd; + // fprintf(stderr, "%s sess %p\n", __FUNCTION__, cur); + if (FD_ISSET(fd, r) || FD_ISSET(fd, w)) + cur->func(cur, cur->arg); + } + return 0; +} + +struct sess_buf { + int len; /* allocation length */ + int used; /* bytes used */ + int start; /* start position for next write */ + char data[0]; +}; + +struct sess_buf * +get_buf(int size, struct sess_buf *old) +{ + struct sess_buf *p = old; + + if (!p) { + ND("new buffer size %d", size); + p = calloc(1, sizeof(*p) + size); + } else if (p->len >= size) { + return p; + } else { 
+ ND("calling realloc %p %d", old, size); + p = realloc(old, sizeof(*p) + size); + } + if (!p) { + if (old) + free(old); + } else { + p->len = size; + } + return p; +} + +/* + * do a non-blocking read into the buffer, reallocating if space + * is needed. + */ +static struct sess_buf * +get_data(int fd, struct sess_buf *buf, int want) +{ + int l; + + buf = get_buf(want, buf); + if (buf == NULL) + return buf; + l = read(fd, buf->data + buf->used, want - buf->used); + if (l > 0) + buf->used += l; + return buf; +} + +/* + * Handler for a request coming from the control socket. + */ +enum sockopt_state { + READING = 0, WRITING = 1 +}; + +struct sockopt_desc { + int state; /* internal state */ + struct sess_buf *rd; + struct sess_buf *wr; +}; + +/* header prepended to data in all transactions */ +struct rx_hdr { + uint32_t optlen; /* data len */ + uint32_t level; /* or error ? */ + uint32_t optname; /* or desired len ? */ + uint32_t dir; /* in or out */ +}; + +/* + * Return the number of remainig bytes from the buffer. + * The meessage is int optname; [int optlen; int data] + * where the second part is present or not depending on the + * message type. + */ +int +get_want(struct sess_buf *rd, struct rx_hdr *r) +{ + struct rx_hdr _r; + int l = sizeof(_r); + + if (r == NULL) + r = &_r; + if (!rd || rd->used < l) { + ND("short buffer (%d), return %d to bootstrap", + rd ? 
rd->used : -1, l); + return l; + } + bcopy(rd->data, r, l); + /* header fields are in network format, convert to host fmt */ + r->optlen = ntohl(r->optlen); + r->level = ntohl(r->level); + r->optname = ntohl(r->optname); + r->dir = ntohl(r->dir); + l += r->optlen; + return l; +} + +/* + * The sockopt commands are sent in network format (at least the header) + */ +int +sockopt_handler(struct sess *sess, void *arg) +{ + struct sockopt_desc *d; + int error = 1; + + ND("sess %p arg %p", sess, arg); + if (sess->private == NULL) + sess->private = calloc(1, sizeof(struct sockopt_desc)); + d = sess->private; + if (d == NULL) + goto done; + if (sess->flags & WANT_READ) { + int l, want, prev; + struct rx_hdr r; + struct sockopt sopt; + struct thread dummy; + + want = get_want(d->rd, &r); + prev = d->rd ? d->rd->used : 0; + ND("total message size is %d (prev %d)", want, prev); + + d->rd = get_data(sess->fd, d->rd, want); + l = d->rd ? d->rd->used : 0; + ND("read %d prev %d want %d", l, prev, want); + if (l == prev) /* no data -> error */ + goto done; + want = get_want(d->rd, &r); + ND("again, want %d l %d", want, l); + if (l < want) /* must read more data */ + return 0; + sopt.sopt_dir = r.dir; + sopt.sopt_level = r.level; + sopt.sopt_name = r.optname; + sopt.sopt_val = + (l <= sizeof(r)) ? 
NULL : d->rd->data + sizeof(r); + sopt.sopt_valsize = r.optlen; + sopt.sopt_td = &dummy; + ND("dir 0x%x lev %d opt %d optval %p optlen %d", + sopt.sopt_dir, + sopt.sopt_level, + sopt.sopt_name, + sopt.sopt_val, + (int)sopt.sopt_valsize); + + /* now call the handler */ + r.level = htonl(ctl_handler(&sopt)); + ND("handler returns %d", ntohl(r.level)); + r.optlen = htonl(0); /* default len */ + r.dir = htonl(sopt.sopt_dir); + /* prepare the buffer for writing */ + if (d->wr != NULL) { /* previous write buffer */ + free(d->wr); + } + d->wr = d->rd; + d->rd = NULL; + d->wr->used = sopt.sopt_valsize + sizeof(r); + d->wr->start = 0; + /* now update the header */ + if (sopt.sopt_dir == SOPT_GET) + r.optlen = htonl(sopt.sopt_valsize); + + bcopy(&r, d->wr->data, sizeof(r)); + + sess->flags = WANT_WRITE; + return 0; + } + if (sess->flags & WANT_WRITE) { + struct sess_buf *wr = d->wr; + + int l = write(sess->fd, wr->data + wr->start, + wr->used - wr->start); + ND("written %d bytes out of %d", l, + wr->used - wr->start); + if (l <= 0) { + if (errno == EAGAIN) + return 0; + goto done; /* error */ + } + wr->start += l; + if (wr->start < wr->used) + return 0; + // prepare for another rpc + sess->flags = WANT_READ; + return 0; + //goto done; + } +done: + ND("closing session"); + if (d) { + if (sess->fd >= 0) + close(sess->fd); + if (d->rd) + free(d->rd); + if (d->wr) + free(d->wr); + d->rd = d->wr = NULL; + free(d); /* private data */ + sess->flags = WANT_DELETE; + } + return error; +} + + +/* + * testing code when reading fake packets from socket 5556. 
+ * Turns out that ipfw_check_hook() is a lot slower than ipfw_chk() + * XXX new ipfw uses ipfw_check_frame or ipfw_check_packet + */ +int +packet_handler(struct sess *sess, void *arg) +{ + char fake_buf[2048]; + struct mbuf dm; + int i; + + bzero(&dm, sizeof(dm)); + dm.m_data = fake_buf + 14; /* skip mac hdr */ + dm.m_len = dm.m_pkthdr.len = 128; + fake_buf[14] = 0x45; // ip + *(uint16_t *)(fake_buf+16) = htons(64); // bytes + *(uint32_t *)(fake_buf+26) = htonl(0x01020304); // src + *(uint32_t *)(fake_buf+30) = htonl(0x05060708); // dst + { +#if 0 + struct ip_fw_args args; + bzero(&args, sizeof(args)); + args.m = &dm; + for (i = 0; i < 1000; i++) + ipfw_chk(&args); +#else + struct ifnet *ifp = NULL; + struct inpcb *inp = NULL; + struct mbuf *m = &dm; + ND("sess %p arg %p", sess, arg); + for (i = 0; i < 1000; i++) + ipfw_check_packet(NULL, &m, ifp, PFIL_IN, inp); +#endif + } + return 0; +} + + +/* + * This task accepts a new connection and creates a new session. + */ +static int +listener(struct sess *sess, void *arg) +{ + int fd; + + ND("sess %p arg %p", sess, arg); + fd = accept(sess->fd, NULL, NULL); + if (fd < 0) + return -1; + fcntl(fd, F_SETFL, O_NONBLOCK); +#ifdef setsockopt /* make sure we don't redefine it */ +#error cannot compile this +#endif + { + int on = 1, ret; + ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + ND("TCP_NODELAY returns %d", ret); + } + new_session(fd, sess->arg ? sockopt_handler: packet_handler, + sess->arg, WANT_READ); + return 0; +} + +/* + * listen on a socket, + * return the listen fd or -1 on error. 
+ */ +static int +do_server(const char *addr, int port) +{ + int fd = -1, on; + struct sockaddr_in server; + + /* open the listen socket */ + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror( "socket" ); + return -1; + } + + on = 1; +#ifdef SO_REUSEADDR + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) + perror("SO_REUSEADDR failed(non fatal)"); +#endif +#ifdef SO_REUSEPORT + on = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) == -1) + perror("SO_REUSEPORT failed(non fatal)"); +#endif + + /* fill the server struct */ + bzero(&server, sizeof(server)); + server.sin_family = AF_INET; + inet_aton(addr, &server.sin_addr); + server.sin_port = htons(port); + + /* bind the local address */ + if (bind(fd, (struct sockaddr*) &server, sizeof(server)) < 0) { + perror( "bind" ); + return -1; + } + D("+++ listening tcp %s:%d", + inet_ntoa(server.sin_addr), ntohs(server.sin_port)); + + /* listen for incoming connection */ + if (listen(fd, SOCK_QLEN) < 0) { + perror( "listen" ); + return -1; + } + return fd; +} + +extern int ipfw_module_init(void); + +/* + * main program for ipfw kernel side when running an userspace emulation: + * open a socket on which we receive requests from userland, + * another socket for calls from the 'kernel' (simulating packet + * arrivals etc), and then periodically run the tick handler. 
+ */ +int +mainloop(int argc, char *argv[]) +{ + int listen_fd; + struct timeval t0; + const char *s, *addr = LOCALADDR; + int port = IPFW_PORT; + int i; + int old_ticks; + uint64_t callouts = 0, skipped = 0; + + gettimeofday(&t0, NULL); + old_ticks = ticks = 0; + callout_startup(); + + ipfw_module_init(); + + /* override the host if set in the environment */ + s = getenv("IPFW_HOST"); + if (s) + addr = s; + s = getenv("IPFW_PORT"); + if (s && atoi(s) > 0) + port = atoi(s); + /* start the server */ + listen_fd = do_server(addr, port); + if (listen_fd < 0) { + printf("Error starting server\n"); + return -1; + } + new_session(listen_fd, listener, (void *)1, WANT_READ); + +#ifdef WITH_NETMAP + for (i = 1; i < argc; i++) { + netmap_add_port(argv[i]); + } +#endif /* WITH_NETMAP */ + +#if 0 // test code: a telnet on 5556 becomes an infinite source + { + int net_fd = do_server(addr, port+1); + if (net_fd >= 0) + new_session(net_fd, listener, NULL, WANT_READ); + } +#endif + + for (;;) { + struct timeval now, delta = { 0, tick} ; + int n; + fd_set r, w; + + n = set_sessions(&r, &w); + select(n, &r, &w, NULL, &delta); + run_sessions(&r, &w); + gettimeofday(&now, 0); + timersub(&now, &t0, &delta); + /* compute absolute ticks. */ + ticks = (delta.tv_sec * hz) + (delta.tv_usec * hz) / 1000000; + if (old_ticks != ticks) { + callouts++; + callout_run(); + old_ticks = ticks; + } else { + skipped++; + } + RD(1, "callouts %lu skipped %lu", (u_long)callouts, (u_long)skipped); + } + ipfw_destroy(); + return 0; +} diff --git a/example/ipfw/extra/sys/contrib/pf/net/pfvar.h b/example/ipfw/extra/sys/contrib/pf/net/pfvar.h new file mode 100644 index 0000000..257bbd6 --- /dev/null +++ b/example/ipfw/extra/sys/contrib/pf/net/pfvar.h @@ -0,0 +1,27 @@ +/* + * replacement for FreeBSD's pfqueue.h + */ +#include <sys/queue.h> + +#define DIOCSTARTALTQ _IO ('D', 42) +#define DIOCSTOPALTQ _IO ('D', 43) + +struct pf_altq { + TAILQ_ENTRY(pf_altq) entries; + /* ... 
*/ + u_int32_t qid; /* return value */ + +#define PF_QNAME_SIZE 64 + char qname[PF_QNAME_SIZE]; /* queue name */ + +}; + +struct pfioc_altq { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + struct pf_altq altq; +}; + +#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) +#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) diff --git a/example/ipfw/extra/sys/sys/kernel.h b/example/ipfw/extra/sys/sys/kernel.h new file mode 100644 index 0000000..f234d4a --- /dev/null +++ b/example/ipfw/extra/sys/sys/kernel.h @@ -0,0 +1,26 @@ +/* + * from freebsd's kernel.h + */ +#ifndef _SYS_KERNEL_H_ +#define _SYS_KERNEL_H_ + +#define SYSINIT(a, b, c, d, e) \ + int (*sysinit_ ## d)(void *) = (int (*)(void *))(d) +#define VNET_SYSINIT(a, b, c, d, e) \ + SYSINIT(a, b, c, d, e) +#define SYSUNINIT(a, b, c, d, e) \ + int (*sysuninit_ ## d)(void *) = (int (*)(void *))(d) +#define VNET_SYSUNINIT(a, b, c, d, e) \ + SYSUNINIT(a, b, c, d, e) + +/* + * Some enumerated orders; "ANY" sorts last. + */ +enum sysinit_elem_order { + SI_ORDER_FIRST = 0x0000000, /* first*/ + SI_ORDER_SECOND = 0x0000001, /* second*/ + SI_ORDER_THIRD = 0x0000002, /* third*/ + SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ + SI_ORDER_ANY = 0xfffffff /* last*/ +}; +#endif diff --git a/example/ipfw/extra/sys/sys/malloc.h b/example/ipfw/extra/sys/sys/malloc.h new file mode 100644 index 0000000..9bc64a3 --- /dev/null +++ b/example/ipfw/extra/sys/sys/malloc.h @@ -0,0 +1,13 @@ +/* + * $Id$ + * replacement for sys/malloc.h to compile kernel in userspace + */ + +#ifndef _SYS_MALLOC_H_ +#define _SYS_MALLOC_H_ + +#define M_WAITOK 0x0000 /* can block */ +#define M_NOWAIT 0x0001 /* do not block */ +#define M_ZERO 0x0100 /* bzero the allocation */ +#endif /* _SYS_MALLOC_H_ */ + diff --git a/example/ipfw/extra/sys/sys/mbuf.h b/example/ipfw/extra/sys/sys/mbuf.h new file mode 100644 index 0000000..1f3af63 --- /dev/null +++ b/example/ipfw/extra/sys/sys/mbuf.h @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2012 Luigi 
Rizzo, Universita` di Pisa + * + * BSD copyright. + * + * A simple compatibility interface to map mbufs onto userspace structs + */ + +#ifndef _SYS_MBUF_H_ +#define _SYS_MBUF_H_ +#define VM_UMA_H // kill this one // maybe not needed +#define _VM_UMA_H_ // kill this one too + +// #include <sys/malloc.h> /* we use free() */ +/* hopefully queue.h is already included by someone else */ +#include <sys/queue.h> +#ifdef _KERNEL + +/* bzero not present on linux, but this should go in glue.h */ +// #define bzero(s, n) memset(s, 0, n) + +/* + * We implement a very simplified UMA allocator where the backend + * is simply malloc, and uma_zone only stores the length of the components. + */ +typedef int uma_zone_t; /* the zone size */ + +#define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len) +typedef int (*uma_init)(void *mem, int size, int flags); +typedef void (*uma_fini)(void *mem, int size); + + +#define uma_zfree(zone, item) free(item, M_IPFW) +#define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags) +#define uma_zdestroy(zone) do {} while (0) + +/*- + * Macros for type conversion: + * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. + */ +#define mtod(m, t) ((t)((m)->m_data)) + +#endif /* _KERNEL */ + +/* + * Packet tag structure (see below for details). + */ +struct m_tag { + SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ + u_int16_t m_tag_id; /* Tag ID */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_cookie; /* ABI/Module ID */ +// void (*m_tag_free)(struct m_tag *); +}; + +/* + * Auxiliary structure to store values from the sk_buf. + * Note that we should not alter the sk_buff, and if we do + * so make sure to keep the values in sync between the mbuf + * and the sk_buff (especially m_len and m_pkthdr.len). 
+ */ + +struct skbuf; + +struct mbuf { + struct mbuf *m_next; + struct mbuf *m_nextpkt; + void * m_data; /* XXX should change to caddr_t */ + int32_t m_len; /* length in this mbuf */ + int m_flags; + struct { + struct ifnet *rcvif; + int len; /* total packet len */ + SLIST_HEAD (packet_tags, m_tag) tags; + } m_pkthdr; + struct skbuf *m_skb; + int __max_m_len; /* original value */ + + /* + * in-stack mbuffers point to an external buffer, + * the two variables below contain base and size, + * and have M_STACK set in m_flags. + * Buffers from the heap have __m_extbuf = (char *)m + MSIZE + */ + void *__m_extbuf; /* external buffer base */ + int __m_extlen; /* data in ext buffer */ + void (*__m_callback)(struct mbuf *, int); + void *__m_peer; /* argument attached to the mbuf */ +}; + +/* + * note we also have M_FASTFWD_OURS mapped to M_PROTO1 0x10 + */ +#ifndef M_SKIP_FIREWALL /* XXX conflict in FreeBSD */ +#define M_SKIP_FIREWALL 0x01 /* skip firewall processing */ +#else +#define M_PROTO3 0x01 // FreeBSD 10 and 11 +#endif /* XXX conflict in FreeBSD */ + +#define M_BCAST 0x02 /* send/received as link-level broadcast */ +#define M_MCAST 0x04 /* send/received as link-level multicast */ +#define M_PROTO1 0x10 +#define M_PROTO2 0x20 +#define M_FASTFWD_OURS M_PROTO1 +#define M_IP_NEXTHOP M_PROTO2 +#define M_STACK 0x1000 /* allocated on the stack */ + +void m_freem(struct mbuf *m); + +#ifdef _KERNEL + +/* + * m_dup() is used in the TEE case, currently unsupported so we + * just return. + */ +static __inline struct mbuf *m_dup(struct mbuf *m, int n) +{ + (void)m; /* UNUSED */ + (void)n; /* UNUSED */ + D("unimplemented, expect panic"); + return NULL; +} + + +static __inline void +m_tag_prepend(struct mbuf *m, struct m_tag *t) +{ + ND("m %p tag %p", m, t); + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); +} + +/* + * Unlink a tag from the list of tags associated with an mbuf. 
+ */ +static __inline void +m_tag_unlink(struct mbuf *m, struct m_tag *t) +{ + + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); +} + +/* + * Return the next tag in the list of tags associated with an mbuf. + */ +static __inline struct m_tag * +m_tag_next(struct mbuf *m, struct m_tag *t) +{ + D("mbuf %p tag %p", m, t); + return (SLIST_NEXT(t, m_tag_link)); +} + +extern SLIST_HEAD (tags_freelist, m_tag) tags_freelist; +extern int tags_minlen; +extern int tags_freelist_count; + +extern int max_protohdr; /* uipc_mbuf.c - max proto header */ + +/* + * Create an mtag of the given type + */ +static __inline struct m_tag * +m_tag_alloc(uint32_t cookie, int type, int length, int wait) +{ + static int maxlen = 0; + int l = length + sizeof(struct m_tag); + struct m_tag *m = NULL; + + if (l > maxlen) { + D("new maxlen %d (%d)", l, length ); + maxlen = l; + } + if (l <= tags_minlen) { + l = tags_minlen; + m = SLIST_FIRST(&tags_freelist); + } + if (m) { + SLIST_REMOVE_HEAD(&tags_freelist, m_tag_link); + ND("allocate from freelist"); + tags_freelist_count--; + } else { + ND("size %d allocate from malloc", l); + m = malloc(l, 0, M_NOWAIT); + } + if (m) { + bzero(m, l); + m->m_tag_id = type; + m->m_tag_len = length; + m->m_tag_cookie = cookie; + ND("tag %p cookie %d type %d", m, cookie, type); + } + return m; +} + +#define MTAG_ABI_COMPAT 0 /* compatibility ABI */ + +static __inline struct m_tag * +m_tag_get(int type, int length, int wait) +{ + return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); +} + +static __inline struct m_tag * +m_tag_first(struct mbuf *m) +{ + struct m_tag *t; + t = SLIST_FIRST(&m->m_pkthdr.tags); + ND("mbuf %p has %p", m, t); + return t; +} + +static __inline void +m_tag_delete(struct mbuf *m, struct m_tag *t) +{ + D("mbuf %p tag %p, ******* unimplemented", m, t); +} + +static __inline struct m_tag * +m_tag_locate(struct mbuf *m, u_int32_t cookie, int x, struct m_tag *t) +{ + struct m_tag *tag; + + ND("search %d %d in mbuf %p at %p", cookie, x, 
m, t); + if (t) + D("--- XXX ignore non-null t %p", t); + tag = SLIST_FIRST(&m->m_pkthdr.tags); + if (tag == NULL) + return NULL; + + ND("found tag %p cookie %d type %d (want %d %d)", + tag, tag->m_tag_cookie, tag->m_tag_id, cookie, x); + if (tag->m_tag_cookie != cookie || tag->m_tag_id != x) { + ND("want %d %d have %d %d, expect panic", + cookie, x, tag->m_tag_cookie, tag->m_tag_id); + return NULL; + } else + return tag; +} + +static __inline struct m_tag * +m_tag_find(struct mbuf *m, int type, struct m_tag *start) +{ + D("m %p", m); + return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : + m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); +} + +#define M_SETFIB(_m, _fib) /* nothing on linux */ + + +/* m_pullup is not supported, there is a macro in missing.h */ + +#define M_GETFIB(_m) 0 + +/* macro used to create a new mbuf */ +#define MT_DATA 1 /* dynamic (data) allocation */ +#ifndef MSIZE // defined on osx +#define MSIZE 256 /* size of an mbuf */ +#endif +#define MGETHDR(_m, _how, _type) ((_m) = m_gethdr((_how), (_type))) +#define MY_MCLBYTES 2048 /* XXX make slightly less */ + + +extern struct mbuf *mbuf_freelist; + +/* allocate and init a new mbuf using the same structure of FreeBSD */ +/* + * XXX for the userspace version, we actually allocate + * MCLBYTES right after the buffer to store a copy of the packet. + */ +static __inline struct mbuf * +m_gethdr(int how, short type) +{ + struct mbuf *m; + static const struct mbuf m0; /* zero-initialized */ + + if (mbuf_freelist) { + m = mbuf_freelist; + mbuf_freelist = m->m_next; + *m = m0; + } else { + m = malloc(MY_MCLBYTES, M_IPFW, M_NOWAIT); + } + + ND("new mbuf %p", m); + if (m == NULL) { + panic("mgethdr failed"); + return m; + } + + /* here we have MSIZE - sizeof(struct mbuf) available */ + m->m_data = m + 1; + m->__m_extbuf = (char *)m + MSIZE; + m->__m_extlen = MY_MCLBYTES - MSIZE; + + return m; +} + + +/* + * Arrange to prepend space of size plen to mbuf m. 
If a new mbuf must be + * allocated, how specifies whether to wait. If the allocation fails, the + * original mbuf chain is freed and m is set to NULL. + */ +static inline void M_PREPEND(struct mbuf *m, int plen, int how) +{ \ + if (plen < 0 || plen + m->m_len > m->__max_m_len) { + D("size too large"); + } else { + m->m_data -= plen; + m->m_len += plen; + } +} + +static inline void +m_adj(struct mbuf *mp, int req_len) +{ + if (req_len < 0 || req_len > mp->m_len) { + D("no m_adj for len %d in mlen %d", req_len, mp->m_len); + } else { + mp->m_data += req_len; + mp->m_len -= req_len; + } +} + +#define M_PREPEND_GOOD(m, plen, how) do { \ + struct mbuf **_mmp = &(m); \ + struct mbuf *_mm = *_mmp; \ + int _mplen = (plen); \ + int __mhow = (how); \ + \ + MBUF_CHECKSLEEP(how); \ + if (M_LEADINGSPACE(_mm) >= _mplen) { \ + _mm->m_data -= _mplen; \ + _mm->m_len += _mplen; \ + } else \ + _mm = m_prepend(_mm, _mplen, __mhow); \ + if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ + _mm->m_pkthdr.len += _mplen; \ + *_mmp = _mm; \ +} while (0) + +/* + * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise + * tags are expected to ``vanish'' when they pass through a network + * interface. For most interfaces this happens normally as the tags are + * reclaimed when the mbuf is free'd. However in some special cases + * reclaiming must be done manually. An example is packets that pass through + * the loopback interface. Also, one must be careful to do this when + * ``turning around'' packets (e.g., icmp_reflect). + * + * To mark a tag persistent bit-or this flag in when defining the tag id. + * The tag will then be treated as described above. + */ +#define MTAG_PERSISTENT 0x800 + +#define PACKET_TAG_NONE 0 /* Nadda */ + +/* Packet tags for use with PACKET_ABI_COMPAT. 
*/ +#define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ +#define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ +#define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ +#define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ +#define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ +#define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ +#define PACKET_TAG_GIF 8 /* GIF processing done */ +#define PACKET_TAG_GRE 9 /* GRE processing done */ +#define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ +#define PACKET_TAG_ENCAP 11 /* Encap. processing */ +#define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ +#define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ +#define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ +#define PACKET_TAG_DUMMYNET 15 /* dummynet info */ +#define PACKET_TAG_DIVERT 17 /* divert info */ +#define PACKET_TAG_IPFORWARD 18 /* ipforward info */ +#define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ +#define PACKET_TAG_PF 21 /* PF + ALTQ information */ +#define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ +#define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ +#define PACKET_TAG_CARP 28 /* CARP info */ + +#endif /* _KERNEL */ +#endif /* !_SYS_MBUF_H_ */ diff --git a/example/ipfw/extra/sys/sys/module.h b/example/ipfw/extra/sys/sys/module.h new file mode 100644 index 0000000..310e22b --- /dev/null +++ b/example/ipfw/extra/sys/sys/module.h @@ -0,0 +1,43 @@ +/* + * trivial module support + */ +#ifndef _SYS_MODULE_H_ +#define _SYS_MODULE_H_ +typedef struct module *module_t; +typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); + +typedef enum modeventtype { + MOD_LOAD, + MOD_UNLOAD, + MOD_SHUTDOWN, + MOD_QUIESCE +} modeventtype_t; + +typedef struct moduledata { + const char *name; /* module name */ + modeventhand_t evhand; /* event handler */ + void *priv; /* extra 
data */ +} moduledata_t; + +/* + * Hook the module descriptor, md, into our list of things to do. + * We should in principle respect the order of loading. + * + * XXX use the gcc .init functions + */ +#define DECLARE_MODULE(a, md, c,d) \ + moduledata_t *moddesc_##a = &md + +/* + * XXX MODULE_VERSION is define in linux too + */ +#define MODULE_DEPEND(a,b,c,d,e) struct __module_depend +#if 1 // !defined(__FreeBSD__) // defined( __linux__ ) || defined( _WIN32 ) +#undef MODULE_VERSION +#define MODULE_VERSION(a,b) struct __module_version +#endif + +#define FEATURE(a, b) struct __feature + +#endif /* _SYS_MODULE_H_ */ + diff --git a/example/ipfw/extra/sys/sys/systm.h b/example/ipfw/extra/sys/sys/systm.h new file mode 100644 index 0000000..94036c9 --- /dev/null +++ b/example/ipfw/extra/sys/sys/systm.h @@ -0,0 +1,159 @@ +#ifndef _SYS_SYSTM_H_ +#define _SYS_SYSTM_H_ + +#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ +#define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ + +#if defined(USERSPACE) // freebsd userspace + +#include <sys/queue.h> +#ifdef __FreeBSD__ +#include <sys/taskqueue.h> +#endif + +/// SLIST_HEAD(callout_list, callout); +struct callout; +TAILQ_HEAD(callout_tailq, callout); +struct callout { + union { + //SLIST_ENTRY(callout) sle; + TAILQ_ENTRY(callout) tqe; + } c_links; + int c_time; /* ticks to the event */ + void *c_arg; /* function argument */ + void (*c_func)(void *); /* function to call */ + struct lock_object *c_lock; /* lock to handle */ + int c_flags; /* state of this entry */ + volatile int c_cpu; /* CPU we're scheduled on */ + +}; + + +int callout_drain(struct callout *c); +void callout_init(struct callout *c, int safe); +int callout_reset(struct callout *c, int ticks, void (*fn)(void *), void *arg); +int callout_reset_on(struct callout *c, int ticks, void (*fn)(void *), void *arg, int cpu); + +#else /* linux or windows */ + +#ifndef _WIN32 /* this is the linux version */ +/* callout support, in <sys/callout.h> on FreeBSD 
*/ +/* + * callout support on linux module is done using timers + */ +#include <linux/timer.h> +#ifdef LINUX_24 +#include <linux/sched.h> /* jiffies definition is here in 2.4 */ +#endif +#define callout timer_list +static __inline int +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) +{ + co->expires = jiffies + ticks; + co->function = (void (*)(unsigned long))fn; + co->data = (unsigned long)arg; + /* + * Linux 2.6.31 and above has add_timer_on(co, cpu), + * otherwise add_timer() always schedules a callout on the same + * CPU used the first time, so we don't need more. + */ + add_timer(co); + return 0; +} + +#define callout_init(co, safe) init_timer(co) +#define callout_drain(co) del_timer(co) +#define callout_stop(co) del_timer(co) + +#else /* _WIN32 */ +#include <ndis.h> + +/* This is the windows part for callout support */ +struct callout { + KTIMER thetimer; + KDPC timerdpc; + int dpcinitialized; + LARGE_INTEGER duetime; +}; + +void dummynet (void*); +VOID dummynet_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ); + +VOID ipfw_dpc( + __in struct _KDPC *Dpc, + __in_opt PVOID DeferredContext, + __in_opt PVOID SystemArgument1, + __in_opt PVOID SystemArgument2 + ); + +/* callout_reset must handle two problems: + * - dummynet() scheduler must be run always on the same processor + * because do_gettimeofday() is based on cpu performance counter, and + * _occasionally_ can leap backward in time if we query another cpu. + * typically this won't happen that much, and the cpu will almost always + * be the same even without the affinity restriction, but better to be sure. + * - ipfw_tick() does not have the granularity requirements of dummynet() + * but we need to pass a pointer as argument. 
+ * + * for these reasons, if we are called for dummynet() timer, + * KeInitializeDpc is called only once as it should be, and the thread + * is forced on cpu0 (which is always present), while if we're called + * for ipfw_tick(), we re-initialize the DPC each time, using + * parameter DeferredContext to pass the needed pointer. since this + * timer is called only once a sec, this won't hurt that much. + */ +static __inline int +callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) +{ + if(fn == &dummynet) + { + if(co->dpcinitialized == 0) + { + KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL); + KeSetTargetProcessorDpc(&co->timerdpc, cpu); + co->dpcinitialized = 1; + } + } + else + { + KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg); + } + co->duetime.QuadPart = (-ticks)*10000; + KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc); + return 0; +} + +static __inline void +callout_init(struct callout* co, int safe) +{ + printf("%s: initializing timer at %p\n",__FUNCTION__,co); + KeInitializeTimer(&co->thetimer); +} + +static __inline int +callout_drain(struct callout* co) +{ + BOOLEAN canceled = KeCancelTimer(&co->thetimer); + while (canceled != TRUE) + { + canceled = KeCancelTimer(&co->thetimer); + } + printf("%s: stopping timer at %p\n",__FUNCTION__,co); + return 0; +} + +static __inline int +callout_stop(struct callout* co) +{ + return callout_drain(co); +} + +#endif /* _WIN32 */ +#endif /* linux or windows */ + +#endif /* _SYS_SYSTM_H_ */ diff --git a/example/ipfw/extra/sys/sys/taskqueue.h b/example/ipfw/extra/sys/sys/taskqueue.h new file mode 100644 index 0000000..a9f79a0 --- /dev/null +++ b/example/ipfw/extra/sys/sys/taskqueue.h @@ -0,0 +1,51 @@ +#ifndef _SYS_TASKQUEUE_H_ +#define _SYS_TASKQUEUE_H_ + +/* + * Remap taskqueue to direct calls + */ + +#ifdef _WIN32 +struct task { + void (*func)(void*, int); +}; +#define taskqueue_enqueue_fast(tq, ta) (ta)->func(NULL,1) +#define TASK_INIT(a,b,c,d) do { \ + (a)->func = (c); } while 
(0) +#else +struct task { + void (*func)(void); +}; +#define taskqueue_enqueue_fast(tq, ta) (ta)->func() +#define TASK_INIT(a,b,c,d) do { \ + (a)->func = (void (*)(void))c; } while (0) + + +#endif +typedef void (*taskqueue_enqueue_fn)(void *context); + +// #define taskqueue_create(_a, _b, _c, _d) NULL +struct taskqueue *taskqueue_create_fast(const char *name, int mflags, + taskqueue_enqueue_fn enqueue, + void *context); +void taskqueue_thread_enqueue(void *context); + + +// #define taskqueue_create_fast(_a, _b, _c, _d) NULL +int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, + const char *name, ...) __printflike(4, 5); + + +// #define taskqueue_drain(_a, _b) /* XXX to be completed */ +// #define taskqueue_free(_a) /* XXX to be completed */ +void taskqueue_drain(struct taskqueue *queue, struct task *task); +void taskqueue_free(struct taskqueue *queue); + + +#define PRI_MIN (0) /* Highest priority. */ +#define PRI_MIN_ITHD (PRI_MIN) +#ifndef __FreeBSD__ +#define PI_NET (PRI_MIN_ITHD + 16) +#endif + +#endif /* !_SYS_TASKQUEUE_H_ */ diff --git a/example/ipfw/ipfw/altq.c b/example/ipfw/ipfw/altq.c new file mode 100644 index 0000000..ba6b639 --- /dev/null +++ b/example/ipfw/ipfw/altq.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/altq.c 270424 2014-08-23 17:37:18Z melifaro $ + * + * altq interface + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/sockio.h> + +#include "ipfw2.h" + +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <fcntl.h> + +#include <net/if.h> /* IFNAMSIZ */ +#include <net/pfvar.h> +#include <netinet/in.h> /* in_addr */ +#include <netinet/ip_fw.h> + +/* + * Map between current altq queue id numbers and names. + */ +static TAILQ_HEAD(, pf_altq) altq_entries = + TAILQ_HEAD_INITIALIZER(altq_entries); + +void +altq_set_enabled(int enabled) +{ + int pffd; + + pffd = open("/dev/pf", O_RDWR); + if (pffd == -1) + err(EX_UNAVAILABLE, + "altq support opening pf(4) control device"); + if (enabled) { + if (ioctl(pffd, DIOCSTARTALTQ) != 0 && errno != EEXIST) + err(EX_UNAVAILABLE, "enabling altq"); + } else { + if (ioctl(pffd, DIOCSTOPALTQ) != 0 && errno != ENOENT) + err(EX_UNAVAILABLE, "disabling altq"); + } + close(pffd); +} + +static void +altq_fetch(void) +{ + struct pfioc_altq pfioc; + struct pf_altq *altq; + int pffd; + unsigned int mnr; + static int altq_fetched = 0; + + if (altq_fetched) + return; + altq_fetched = 1; + pffd = open("/dev/pf", O_RDONLY); + if (pffd == -1) { + warn("altq support opening pf(4) control device"); + return; + } + bzero(&pfioc, sizeof(pfioc)); + if (ioctl(pffd, DIOCGETALTQS, &pfioc) != 0) { + warn("altq support getting queue list"); + close(pffd); + return; + } + mnr = pfioc.nr; + for (pfioc.nr = 0; pfioc.nr < mnr; pfioc.nr++) { + if (ioctl(pffd, DIOCGETALTQ, &pfioc) != 0) { + if (errno == EBUSY) + break; + warn("altq support getting queue list"); + close(pffd); + return; + } + if (pfioc.altq.qid == 0) + continue; + altq = safe_calloc(1, sizeof(*altq)); + *altq = pfioc.altq; + TAILQ_INSERT_TAIL(&altq_entries, altq, entries); + } + 
close(pffd); +} + +u_int32_t +altq_name_to_qid(const char *name) +{ + struct pf_altq *altq; + + altq_fetch(); + TAILQ_FOREACH(altq, &altq_entries, entries) + if (strcmp(name, altq->qname) == 0) + break; + if (altq == NULL) + errx(EX_DATAERR, "altq has no queue named `%s'", name); + return altq->qid; +} + +static const char * +altq_qid_to_name(u_int32_t qid) +{ + struct pf_altq *altq; + + altq_fetch(); + TAILQ_FOREACH(altq, &altq_entries, entries) + if (qid == altq->qid) + break; + if (altq == NULL) + return NULL; + return altq->qname; +} + +void +print_altq_cmd(struct buf_pr *bp, ipfw_insn_altq *altqptr) +{ + if (altqptr) { + const char *qname; + + qname = altq_qid_to_name(altqptr->qid); + if (qname == NULL) + bprintf(bp, " altq ?<%u>", altqptr->qid); + else + bprintf(bp, " altq %s", qname); + } +} diff --git a/example/ipfw/ipfw/dummynet.c b/example/ipfw/ipfw/dummynet.c new file mode 100644 index 0000000..1938307 --- /dev/null +++ b/example/ipfw/ipfw/dummynet.c @@ -0,0 +1,1410 @@ +/* + * Copyright (c) 2002-2003,2010 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * $FreeBSD: head/sbin/ipfw/dummynet.c 270424 2014-08-23 17:37:18Z melifaro $ + * + * dummynet support + */ + +#include <sys/types.h> +#include <sys/socket.h> +/* XXX there are several sysctl leftover here */ +#include <sys/sysctl.h> + +#include "ipfw2.h" + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <libutil.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> + +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <arpa/inet.h> /* inet_ntoa */ + + +static struct _s_x dummynet_params[] = { + { "plr", TOK_PLR }, + { "noerror", TOK_NOERROR }, + { "buckets", TOK_BUCKETS }, + { "dst-ip", TOK_DSTIP }, + { "src-ip", TOK_SRCIP }, + { "dst-port", TOK_DSTPORT }, + { "src-port", TOK_SRCPORT }, + { "proto", TOK_PROTO }, + { "weight", TOK_WEIGHT }, + { "lmax", TOK_LMAX }, + { "maxlen", TOK_LMAX }, + { "all", TOK_ALL }, + { "mask", TOK_MASK }, /* alias for both */ + { "sched_mask", TOK_SCHED_MASK }, + { "flow_mask", TOK_FLOW_MASK }, + { "droptail", TOK_DROPTAIL }, + { "ecn", TOK_ECN }, + { "red", TOK_RED }, + { "gred", TOK_GRED }, + { "bw", TOK_BW }, + { "bandwidth", TOK_BW }, + { "delay", TOK_DELAY }, + { "link", TOK_LINK }, + { "pipe", TOK_PIPE }, + { "queue", TOK_QUEUE }, + { "flowset", TOK_FLOWSET }, + { "sched", TOK_SCHED }, + { "pri", TOK_PRI }, + { "priority", TOK_PRI }, + { "type", TOK_TYPE }, + { "flow-id", TOK_FLOWID}, + { "dst-ipv6", TOK_DSTIP6}, + { "dst-ip6", TOK_DSTIP6}, + { "src-ipv6", TOK_SRCIP6}, + { "src-ip6", TOK_SRCIP6}, + { "profile", TOK_PROFILE}, + { "burst", TOK_BURST}, + { "dummynet-params", TOK_NULL }, + { NULL, 0 } /* terminator */ +}; + +#define O_NEXT(p, len) ((void *)((char *)p + len)) + +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} + +/* make room in the buffer and move the pointer forward */ 
+static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + +#if 0 +static int +sort_q(void *arg, const void *pa, const void *pb) +{ + int rev = (co.do_sort < 0); + int field = rev ? -co.do_sort : co.do_sort; + long long res = 0; + const struct dn_flow_queue *a = pa; + const struct dn_flow_queue *b = pb; + + switch (field) { + case 1: /* pkts */ + res = a->len - b->len; + break; + case 2: /* bytes */ + res = a->len_bytes - b->len_bytes; + break; + + case 3: /* tot pkts */ + res = a->tot_pkts - b->tot_pkts; + break; + + case 4: /* tot bytes */ + res = a->tot_bytes - b->tot_bytes; + break; + } + if (res < 0) + res = -1; + if (res > 0) + res = 1; + return (int)(rev ? res : -res); +} +#endif + +/* print a mask and header for the subsequent list of flows */ +static void +print_mask(struct ipfw_flow_id *id) +{ + if (!IS_IP6_FLOW_ID(id)) { + printf(" " + "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", + id->extra ? "queue," : "", + id->proto, + id->src_ip, id->src_port, + id->dst_ip, id->dst_port); + } else { + char buf[255]; + printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", + id->extra ? "queue," : "", + id->proto, id->flow_id6); + inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); + printf("%s/0x%04x -> ", buf, id->src_port); + inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); + printf("%s/0x%04x\n", buf, id->dst_port); + } +} + +static void +print_header(struct ipfw_flow_id *id) +{ + if (!IS_IP6_FLOW_ID(id)) + printf("BKT Prot ___Source IP/port____ " + "____Dest. IP/port____ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + else + printf("BKT ___Prot___ _flow-id_ " + "______________Source IPv6/port_______________ " + "_______________Dest. 
IPv6/port_______________ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); +} + +static void +list_flow(struct buf_pr *bp, struct dn_flow *ni) +{ + char buff[255]; + struct protoent *pe = NULL; + struct in_addr ina; + struct ipfw_flow_id *id = &ni->fid; + + pe = getprotobynumber(id->proto); + /* XXX: Should check for IPv4 flows */ + bprintf(bp, "%3u%c", (ni->oid.id) & 0xff, + id->extra ? '*' : ' '); + if (!IS_IP6_FLOW_ID(id)) { + if (pe) + bprintf(bp, "%-4s ", pe->p_name); + else + bprintf(bp, "%4u ", id->proto); + ina.s_addr = htonl(id->src_ip); + bprintf(bp, "%15s/%-5d ", + inet_ntoa(ina), id->src_port); + ina.s_addr = htonl(id->dst_ip); + bprintf(bp, "%15s/%-5d ", + inet_ntoa(ina), id->dst_port); + } else { + /* Print IPv6 flows */ + if (pe != NULL) + bprintf(bp, "%9s ", pe->p_name); + else + bprintf(bp, "%9u ", id->proto); + bprintf(bp, "%7d %39s/%-5d ", id->flow_id6, + inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), + id->src_port); + bprintf(bp, " %39s/%-5d ", + inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), + id->dst_port); + } + pr_u64(bp, &ni->tot_pkts, 4); + pr_u64(bp, &ni->tot_bytes, 8); + bprintf(bp, "%2u %4u %3u", + ni->length, ni->len_bytes, ni->drops); +} + +static void +print_flowset_parms(struct dn_fs *fs, char *prefix) +{ + int l; + char qs[30]; + char plr[30]; + char red[90]; /* Display RED parameters */ + + l = fs->qsize; + if (fs->flags & DN_QSIZE_BYTES) { + if (l >= 8192) + sprintf(qs, "%d KB", l / 1024); + else + sprintf(qs, "%d B", l); + } else + sprintf(qs, "%3d sl.", l); + if (fs->plr) + sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); + else + plr[0] = '\0'; + + if (fs->flags & DN_IS_RED) { /* RED parameters */ + sprintf(red, + "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", + (fs->flags & DN_IS_GENTLE_RED) ? 
'G' : ' ', + 1.0 * fs->w_q / (double)(1 << SCALE_RED), + fs->min_th, + fs->max_th, + 1.0 * fs->max_p / (double)(1 << SCALE_RED)); + if (fs->flags & DN_IS_ECN) + strncat(red, " (ecn)", 6); + } else + sprintf(red, "droptail"); + + if (prefix[0]) { + printf("%s %s%s %d queues (%d buckets) %s\n", + prefix, qs, plr, fs->oid.id, fs->buckets, red); + prefix[0] = '\0'; + } else { + printf("q%05d %s%s %d flows (%d buckets) sched %d " + "weight %d lmax %d pri %d %s\n", + fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, + fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); + if (fs->flags & DN_HAVE_MASK) + print_mask(&fs->flow_mask); + } +} + +static void +print_extra_delay_parms(struct dn_profile *p) +{ + double loss; + if (p->samples_no <= 0) + return; + + loss = p->loss_level; + loss /= p->samples_no; + printf("\t profile: name \"%s\" loss %f samples %d\n", + p->name, loss, p->samples_no); +} + +static void +flush_buf(char *buf) +{ + if (buf[0]) + printf("%s\n", buf); + buf[0] = '\0'; +} + +/* + * generic list routine. We expect objects in a specific order, i.e. + * PIPES AND SCHEDULERS: + * link; scheduler; internal flowset if any; instances + * we can tell a pipe from the number. 
+ * + * FLOWSETS: + * flowset; queues; + * link i (int queue); scheduler i; si(i) { flowsets() : queues } + */ +static void +list_pipes(struct dn_id *oid, struct dn_id *end) +{ + char buf[160]; /* pending buffer */ + int toPrint = 1; /* print header */ + struct buf_pr bp; + + buf[0] = '\0'; + bp_alloc(&bp, 4096); + for (; oid != end; oid = O_NEXT(oid, oid->len)) { + if (oid->len < sizeof(*oid)) + errx(1, "invalid oid len %d\n", oid->len); + + switch (oid->type) { + default: + flush_buf(buf); + printf("unrecognized object %d size %d\n", oid->type, oid->len); + break; + case DN_TEXT: /* list of attached flowsets */ + { + int i, l; + struct { + struct dn_id id; + uint32_t p[0]; + } *d = (void *)oid; + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); + if (l == 0) + break; + printf(" Children flowsets: "); + for (i = 0; i < l; i++) + printf("%u ", d->p[i]); + printf("\n"); + break; + } + case DN_CMD_GET: + if (co.verbose) + printf("answer for cmd %d, len %d\n", oid->type, oid->id); + break; + case DN_SCH: { + struct dn_sch *s = (struct dn_sch *)oid; + flush_buf(buf); + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", + s->sched_nr, + s->name, s->flags, s->buckets, s->oid.id); + if (s->flags & DN_HAVE_MASK) + print_mask(&s->sched_mask); + } + break; + + case DN_FLOW: + if (toPrint != 0) { + print_header(&((struct dn_flow *)oid)->fid); + toPrint = 0; + } + list_flow(&bp, (struct dn_flow *)oid); + printf("%s\n", bp.buf); + break; + + case DN_LINK: { + struct dn_link *p = (struct dn_link *)oid; + double b = p->bandwidth; + char bwbuf[30]; + char burst[5 + 7]; + + /* This starts a new object so flush buffer */ + flush_buf(buf); + /* data rate */ + if (b == 0) + sprintf(bwbuf, "unlimited "); + else if (b >= 1000000) + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); + else if (b >= 1000) + sprintf(bwbuf, "%7.3f Kbit/s", b/1000); + else + sprintf(bwbuf, "%7.3f bit/s ", b); + + if (humanize_number(burst, sizeof(burst), p->burst, + "", HN_AUTOSCALE, 0) < 0 || co.verbose) + 
sprintf(burst, "%d", (int)p->burst); + sprintf(buf, "%05d: %s %4d ms burst %s", + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); + } + break; + + case DN_FS: + print_flowset_parms((struct dn_fs *)oid, buf); + break; + case DN_PROFILE: + flush_buf(buf); + print_extra_delay_parms((struct dn_profile *)oid); + } + flush_buf(buf); // XXX does it really go here ? + } + + bp_free(&bp); +} + +/* + * Delete pipe, queue or scheduler i + */ +int +ipfw_delete_pipe(int do_pipe, int i) +{ + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : + ( (do_pipe == 2) ? DN_FS : DN_SCH); + cmd.a[0] = i; + i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); + if (i) { + i = 1; + warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); + } + return i; +} + +/* + * Code to parse delay profiles. + * + * Some link types introduce extra delays in the transmission + * of a packet, e.g. because of MAC level framing, contention on + * the use of the channel, MAC level retransmissions and so on. + * From our point of view, the channel is effectively unavailable + * for this extra time, which is constant or variable depending + * on the link type. Additionally, packets may be dropped after this + * time (e.g. on a wireless link after too many retransmissions). + * We can model the additional delay with an empirical curve + * that represents its distribution. + * + * cumulative probability + * 1.0 ^ + * | + * L +-- loss-level x + * | ****** + * | * + * | ***** + * | * + * | ** + * | * + * +-------*-------------------> + * delay + * + * The empirical curve may have both vertical and horizontal lines. + * Vertical lines represent constant delay for a range of + * probabilities; horizontal lines correspond to a discontinuty + * in the delay distribution: the link will use the largest delay + * for a given probability. 
+ * + * To pass the curve to dummynet, we must store the parameters + * in a file as described below, and issue the command + * + * ipfw pipe <n> config ... bw XXX profile <filename> ... + * + * The file format is the following, with whitespace acting as + * a separator and '#' indicating the beginning a comment: + * + * samples N + * the number of samples used in the internal + * representation (2..1024; default 100); + * + * loss-level L + * The probability above which packets are lost. + * (0.0 <= L <= 1.0, default 1.0 i.e. no loss); + * + * name identifier + * Optional a name (listed by "ipfw pipe show") + * to identify the distribution; + * + * "delay prob" | "prob delay" + * One of these two lines is mandatory and defines + * the format of the following lines with data points. + * + * XXX YYY + * 2 or more lines representing points in the curve, + * with either delay or probability first, according + * to the chosen format. + * The unit for delay is milliseconds. + * + * Data points does not need to be ordered or equal to the number + * specified in the "samples" line. ipfw will sort and interpolate + * the curve as needed. + * + * Example of a profile file: + + name bla_bla_bla + samples 100 + loss-level 0.86 + prob delay + 0 200 # minimum overhead is 200ms + 0.5 200 + 0.5 300 + 0.8 1000 + 0.9 1300 + 1 1300 + + * Internally, we will convert the curve to a fixed number of + * samples, and when it is time to transmit a packet we will + * model the extra delay as extra bits in the packet. + * + */ + +#define ED_MAX_LINE_LEN 256+ED_MAX_NAME_LEN +#define ED_TOK_SAMPLES "samples" +#define ED_TOK_LOSS "loss-level" +#define ED_TOK_NAME "name" +#define ED_TOK_DELAY "delay" +#define ED_TOK_PROB "prob" +#define ED_TOK_BW "bw" +#define ED_SEPARATORS " \t\n" +#define ED_MIN_SAMPLES_NO 2 + +/* + * returns 1 if s is a non-negative number, with at least one '.' 
/*
 * Returns 1 if s is a plain non-negative decimal number: one or more
 * digits with at most one '.' anywhere in the string ("12", "0.86", ".5"
 * and "5." all qualify).  Returns 0 for anything else, including a sign,
 * an exponent, the empty string and a lone ".".
 */
static int
is_valid_number(const char *s)
{
	int seen_digit = 0;
	int seen_dot = 0;

	for (; *s != '\0'; s++) {
		/* the <ctype.h> functions require an unsigned char value */
		if (isdigit((unsigned char)*s)) {
			seen_digit = 1;
			continue;
		}
		if (*s != '.' || seen_dot)
			return 0;
		seen_dot = 1;
	}
	return seen_digit;
}
int do_points = 0; + struct point points[ED_MAX_SAMPLES_NO]; + int points_no = 0; + + /* XXX link never NULL? */ + p->link_nr = link->link_nr; + + profile_name[0] = '\0'; + f = fopen(filename, "r"); + if (f == NULL) + err(EX_UNAVAILABLE, "fopen: %s", filename); + + while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ + char *s, *cur = line, *name = NULL, *arg = NULL; + + ++lineno; + + /* parse the line */ + while (cur) { + s = strsep(&cur, ED_SEPARATORS); + if (s == NULL || *s == '#') + break; + if (*s == '\0') + continue; + if (arg) + errx(ED_EFMT("too many arguments")); + if (name == NULL) + name = s; + else + arg = s; + } + if (name == NULL) /* empty line */ + continue; + if (arg == NULL) + errx(ED_EFMT("missing arg for %s"), name); + + if (!strcasecmp(name, ED_TOK_SAMPLES)) { + if (samples > 0) + errx(ED_EFMT("duplicate ``samples'' line")); + if (atoi(arg) <=0) + errx(ED_EFMT("invalid number of samples")); + samples = atoi(arg); + if (samples>ED_MAX_SAMPLES_NO) + errx(ED_EFMT("too many samples, maximum is %d"), + ED_MAX_SAMPLES_NO); + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_BW)) { + char buf[IFNAMSIZ]; + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); + } else if (!strcasecmp(name, ED_TOK_LOSS)) { + if (loss != -1.0) + errx(ED_EFMT("duplicated token: %s"), name); + if (!is_valid_number(arg)) + errx(ED_EFMT("invalid %s"), arg); + loss = atof(arg); + if (loss > 1) + errx(ED_EFMT("%s greater than 1.0"), name); + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_NAME)) { + if (profile_name[0] != '\0') + errx(ED_EFMT("duplicated token: %s"), name); + strncpy(profile_name, arg, sizeof(profile_name) - 1); + profile_name[sizeof(profile_name)-1] = '\0'; + do_points = 0; + } else if (!strcasecmp(name, ED_TOK_DELAY)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + delay_first = 1; + do_points = 1; + } else if (!strcasecmp(name, ED_TOK_PROB)) { + if (do_points) + errx(ED_EFMT("duplicated token: %s"), name); + 
delay_first = 0; + do_points = 1; + } else if (do_points) { + if (!is_valid_number(name) || !is_valid_number(arg)) + errx(ED_EFMT("invalid point found")); + if (delay_first) { + points[points_no].delay = atof(name); + points[points_no].prob = atof(arg); + } else { + points[points_no].delay = atof(arg); + points[points_no].prob = atof(name); + } + if (points[points_no].prob > 1.0) + errx(ED_EFMT("probability greater than 1.0")); + ++points_no; + } else { + errx(ED_EFMT("unrecognised command '%s'"), name); + } + } + + fclose (f); + + if (samples == -1) { + warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); + samples = 100; + } + + if (loss == -1.0) { + warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); + loss = 1; + } + + /* make sure that there are enough points. */ + if (points_no < ED_MIN_SAMPLES_NO) + errx(ED_EFMT("too few samples, need at least %d"), + ED_MIN_SAMPLES_NO); + + qsort(points, points_no, sizeof(struct point), compare_points); + + /* interpolation */ + for (i = 0; i<points_no-1; ++i) { + double y1 = points[i].prob * samples; + double x1 = points[i].delay; + double y2 = points[i+1].prob * samples; + double x2 = points[i+1].delay; + + int ix = y1; + int stop = y2; + + if (x1 == x2) { + for (; ix<stop; ++ix) + p->samples[ix] = x1; + } else { + double m = (y2-y1)/(x2-x1); + double c = y1 - m*x1; + for (; ix<stop ; ++ix) + p->samples[ix] = (ix - c)/m; + } + } + p->samples_no = samples; + p->loss_level = loss * samples; + strncpy(p->name, profile_name, sizeof(p->name)); +} + +/* + * configuration of pipes, schedulers, flowsets. + * When we configure a new scheduler, an empty pipe is created, so: + * + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility + * sched N+Delta type fifo sched_mask ... + * pipe N+Delta <parameters> + * flowset N+Delta pipe N+Delta (no parameters) + * sched N type wf2q+ sched_mask ... 
+ * pipe N <parameters> + * + * do_pipe = 2 -> flowset N config + * flowset N parameters + * + * do_pipe = 3 -> sched N config + * sched N parameters (default no pipe) + * optional Pipe N config ... + * pipe ==> + */ +void +ipfw_config_pipe(int ac, char **av) +{ + int i; + u_int j; + char *end; + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + struct ipfw_flow_id *mask = NULL; + int lmax; + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; + + /* + * allocate space for 1 header, + * 1 scheduler, 1 link, 1 flowset, 1 profile + */ + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); + + av++; ac--; + /* Pipe number */ + if (ac && isdigit(**av)) { + i = atoi(*av); av++; ac--; + } else + i = -1; + if (i <= 0) + errx(EX_USAGE, "need a pipe/flowset/sched number"); + base = buf = safe_calloc(1, lmax); + /* all commands start with a 'CONFIGURE' and a version */ + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + switch (co.do_pipe) { + case 1: /* "pipe N config ..." */ + /* Allocate space for the WF2Q+ scheduler, its link + * and the FIFO flowset. Set the number, but leave + * the scheduler subtype and other parameters to 0 + * so the kernel will use appropriate defaults. + * XXX todo: add a flag to record if a parameter + * is actually configured. + * If we do a 'pipe config' mask -> sched_mask. + * The FIFO scheduler and link are derived from the + * WF2Q+ one in the kernel. 
+ */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + sch->sched_nr = i; + sch->oid.subtype = 0; /* defaults to WF2Q+ */ + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + *flags |= DN_PIPE_CMD; + + p->link_nr = i; + + /* This flowset is only for the FIFO scheduler */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + break; + + case 2: /* "queue N config ... " */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + fs->fs_nr = i; + mask = &fs->flow_mask; + flags = &fs->flags; + buckets = &fs->buckets; + break; + + case 3: /* "sched N config ..." */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + fs = o_next(&buf, sizeof(*fs), DN_FS); + sch->sched_nr = i; + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + /* fs is used only with !MULTIQUEUE schedulers */ + fs->fs_nr = i + DN_MAX_ID; + fs->sched_nr = i; + break; + } + /* set to -1 those fields for which we want to reuse existing + * values from the kernel. + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. + * XXX todo: support reuse of the mask. 
+ */ + if (p) + p->bandwidth = -1; + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) + fs->par[j] = -1; + while (ac > 0) { + double d; + int tok = match_token(dummynet_params, *av); + ac--; av++; + + switch(tok) { + case TOK_NOERROR: + NEED(fs, "noerror is only for pipes"); + fs->flags |= DN_NOERROR; + break; + + case TOK_PLR: + NEED(fs, "plr is only for pipes"); + NEED1("plr needs argument 0..1\n"); + d = strtod(av[0], NULL); + if (d > 1) + d = 1; + else if (d < 0) + d = 0; + fs->plr = (int)(d*0x7fffffff); + ac--; av++; + break; + + case TOK_QUEUE: + NEED(fs, "queue is only for pipes or flowsets"); + NEED1("queue needs queue size\n"); + end = NULL; + fs->qsize = strtoul(av[0], &end, 0); + if (*end == 'K' || *end == 'k') { + fs->flags |= DN_QSIZE_BYTES; + fs->qsize *= 1024; + } else if (*end == 'B' || + _substrcmp2(end, "by", "bytes") == 0) { + fs->flags |= DN_QSIZE_BYTES; + } + ac--; av++; + break; + + case TOK_BUCKETS: + NEED(fs, "buckets is only for pipes or flowsets"); + NEED1("buckets needs argument\n"); + *buckets = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_FLOW_MASK: + case TOK_SCHED_MASK: + case TOK_MASK: + NEED(mask, "tok_mask"); + NEED1("mask needs mask specifier\n"); + /* + * per-flow queue, mask is dst_ip, dst_port, + * src_ip, src_port, proto measured in bits + */ + + bzero(mask, sizeof(*mask)); + end = NULL; + + while (ac >= 1) { + uint32_t *p32 = NULL; + uint16_t *p16 = NULL; + uint32_t *p20 = NULL; + struct in6_addr *pa6 = NULL; + uint32_t a; + + tok = match_token(dummynet_params, *av); + ac--; av++; + switch(tok) { + case TOK_ALL: + /* + * special case, all bits significant + * except 'extra' (the queue number) + */ + mask->dst_ip = ~0; + mask->src_ip = ~0; + mask->dst_port = ~0; + mask->src_port = ~0; + mask->proto = ~0; + n2mask(&mask->dst_ip6, 128); + n2mask(&mask->src_ip6, 128); + mask->flow_id6 = ~0; + *flags |= DN_HAVE_MASK; + goto end_mask; + + case TOK_QUEUE: + mask->extra = ~0; + *flags |= DN_HAVE_MASK; + goto 
end_mask; + + case TOK_DSTIP: + mask->addr_type = 4; + p32 = &mask->dst_ip; + break; + + case TOK_SRCIP: + mask->addr_type = 4; + p32 = &mask->src_ip; + break; + + case TOK_DSTIP6: + mask->addr_type = 6; + pa6 = &mask->dst_ip6; + break; + + case TOK_SRCIP6: + mask->addr_type = 6; + pa6 = &mask->src_ip6; + break; + + case TOK_FLOWID: + mask->addr_type = 6; + p20 = &mask->flow_id6; + break; + + case TOK_DSTPORT: + p16 = &mask->dst_port; + break; + + case TOK_SRCPORT: + p16 = &mask->src_port; + break; + + case TOK_PROTO: + break; + + default: + ac++; av--; /* backtrack */ + goto end_mask; + } + if (ac < 1) + errx(EX_USAGE, "mask: value missing"); + if (*av[0] == '/') { + a = strtoul(av[0]+1, &end, 0); + if (pa6 == NULL) + a = (a == 32) ? ~0 : (1 << a) - 1; + } else + a = strtoul(av[0], &end, 0); + if (p32 != NULL) + *p32 = a; + else if (p16 != NULL) { + if (a > 0xFFFF) + errx(EX_DATAERR, + "port mask must be 16 bit"); + *p16 = (uint16_t)a; + } else if (p20 != NULL) { + if (a > 0xfffff) + errx(EX_DATAERR, + "flow_id mask must be 20 bit"); + *p20 = (uint32_t)a; + } else if (pa6 != NULL) { + if (a > 128) + errx(EX_DATAERR, + "in6addr invalid mask len"); + else + n2mask(pa6, a); + } else { + if (a > 0xFF) + errx(EX_DATAERR, + "proto mask must be 8 bit"); + mask->proto = (uint8_t)a; + } + if (a != 0) + *flags |= DN_HAVE_MASK; + ac--; av++; + } /* end while, config masks */ +end_mask: + break; + + case TOK_RED: + case TOK_GRED: + NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); + fs->flags |= DN_IS_RED; + if (tok == TOK_GRED) + fs->flags |= DN_IS_GENTLE_RED; + /* + * the format for parameters is w_q/min_th/max_th/max_p + */ + if ((end = strsep(&av[0], "/"))) { + double w_q = strtod(end, NULL); + if (w_q > 1 || w_q <= 0) + errx(EX_DATAERR, "0 < w_q <= 1"); + fs->w_q = (int) (w_q * (1 << SCALE_RED)); + } + if ((end = strsep(&av[0], "/"))) { + fs->min_th = strtoul(end, &end, 0); + if (*end == 'K' || *end == 'k') + fs->min_th *= 1024; + } + if ((end = strsep(&av[0], "/"))) { 
+ fs->max_th = strtoul(end, &end, 0); + if (*end == 'K' || *end == 'k') + fs->max_th *= 1024; + } + if ((end = strsep(&av[0], "/"))) { + double max_p = strtod(end, NULL); + if (max_p > 1 || max_p < 0) + errx(EX_DATAERR, "0 <= max_p <= 1"); + fs->max_p = (int)(max_p * (1 << SCALE_RED)); + } + ac--; av++; + break; + + case TOK_ECN: + fs->flags |= DN_IS_ECN; + break; + + case TOK_DROPTAIL: + NEED(fs, "droptail is only for flowsets"); + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + break; + + case TOK_BW: + NEED(p, "bw is only for links"); + NEED1("bw needs bandwidth or interface\n"); + read_bandwidth(av[0], &p->bandwidth, NULL, 0); + ac--; av++; + break; + + case TOK_DELAY: + NEED(p, "delay is only for links"); + NEED1("delay needs argument 0..10000ms\n"); + p->delay = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_TYPE: { + int l; + NEED(sch, "type is only for schedulers"); + NEED1("type needs a string"); + l = strlen(av[0]); + if (l == 0 || l > 15) + errx(1, "type %s too long\n", av[0]); + strcpy(sch->name, av[0]); + sch->oid.subtype = 0; /* use string */ + ac--; av++; + break; + } + + case TOK_WEIGHT: + NEED(fs, "weight is only for flowsets"); + NEED1("weight needs argument\n"); + fs->par[0] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_LMAX: + NEED(fs, "lmax is only for flowsets"); + NEED1("lmax needs argument\n"); + fs->par[1] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PRI: + NEED(fs, "priority is only for flowsets"); + NEED1("priority needs argument\n"); + fs->par[2] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_SCHED: + case TOK_PIPE: + NEED(fs, "pipe/sched"); + NEED1("pipe/link/sched needs number\n"); + fs->sched_nr = strtoul(av[0], &end, 0); + ac--; av++; + break; + + case TOK_PROFILE: + NEED((!pf), "profile already set"); + NEED(p, "profile"); + { + NEED1("extra delay needs the file name\n"); + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + load_extra_delays(av[0], pf, p); //XXX can't fail? 
+ --ac; ++av; + } + break; + + case TOK_BURST: + NEED(p, "burst"); + NEED1("burst needs argument\n"); + errno = 0; + if (expand_number(av[0], &p->burst) < 0) + if (errno != ERANGE) + errx(EX_DATAERR, + "burst: invalid argument"); + if (errno || p->burst > (1ULL << 48) - 1) + errx(EX_DATAERR, + "burst: out of range (0..2^48-1)"); + ac--; av++; + break; + + default: + errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); + } + } + + /* check validity of parameters */ + if (p) { + if (p->delay > 10000) + errx(EX_DATAERR, "delay must be < 10000"); + if (p->bandwidth == -1) + p->bandwidth = 0; + } + if (fs) { + /* XXX accept a 0 scheduler to keep the default */ + if (fs->flags & DN_QSIZE_BYTES) { + size_t len; + long limit; + + len = sizeof(limit); + if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", + &limit, &len, NULL, 0) == -1) + limit = 1024*1024; + if (fs->qsize > limit) + errx(EX_DATAERR, "queue size must be < %ldB", limit); + } else { + size_t len; + long limit; + + len = sizeof(limit); + if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", + &limit, &len, NULL, 0) == -1) + limit = 100; + if (fs->qsize > limit) + errx(EX_DATAERR, "2 <= queue size <= %ld", limit); + } + + if ((fs->flags & DN_IS_ECN) && !(fs->flags & DN_IS_RED)) + errx(EX_USAGE, "enable red/gred for ECN"); + + if (fs->flags & DN_IS_RED) { + size_t len; + int lookup_depth, avg_pkt_size; + + if (!(fs->flags & DN_IS_ECN) && (fs->min_th >= fs->max_th)) + errx(EX_DATAERR, "min_th %d must be < than max_th %d", + fs->min_th, fs->max_th); + else if ((fs->flags & DN_IS_ECN) && (fs->min_th > fs->max_th)) + errx(EX_DATAERR, "min_th %d must be =< than max_th %d", + fs->min_th, fs->max_th); + + if (fs->max_th == 0) + errx(EX_DATAERR, "max_th must be > 0"); + + len = sizeof(int); + if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", + &lookup_depth, &len, NULL, 0) == -1) + lookup_depth = 256; + if (lookup_depth == 0) + errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" + " must be 
greater than zero"); + + len = sizeof(int); + if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", + &avg_pkt_size, &len, NULL, 0) == -1) + avg_pkt_size = 512; + + if (avg_pkt_size == 0) + errx(EX_DATAERR, + "net.inet.ip.dummynet.red_avg_pkt_size must" + " be greater than zero"); + +#if 0 /* the following computation is now done in the kernel */ + /* + * Ticks needed for sending a medium-sized packet. + * Unfortunately, when we are configuring a WF2Q+ queue, we + * do not have bandwidth information, because that is stored + * in the parent pipe, and also we have multiple queues + * competing for it. So we set s=0, which is not very + * correct. But on the other hand, why do we want RED with + * WF2Q+ ? + */ + if (p.bandwidth==0) /* this is a WF2Q+ queue */ + s = 0; + else + s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; + /* + * max idle time (in ticks) before avg queue size becomes 0. + * NOTA: (3/w_q) is approx the value x so that + * (1-w_q)^x < 10^-3. + */ + w_q = ((double)fs->w_q) / (1 << SCALE_RED); + idle = s * 3. / w_q; + fs->lookup_step = (int)idle / lookup_depth; + if (!fs->lookup_step) + fs->lookup_step = 1; + weight = 1 - w_q; + for (t = fs->lookup_step; t > 1; --t) + weight *= 1 - w_q; + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); +#endif /* code moved in the kernel */ + } + } + + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); + + if (i) + err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); +} + +void +dummynet_flush(void) +{ + struct dn_id oid; + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_cmd(IP_DUMMYNET3, &oid, oid.len); +} + +/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' + * Returns the number of ranges, and possibly stores them + * in the array v of size len. 
+ */ +static int +parse_range(int ac, char *av[], uint32_t *v, int len) +{ + int n = 0; + char *endptr, *s; + uint32_t base[2]; + + if (v == NULL || len < 2) { + v = base; + len = 2; + } + + for (s = *av; s != NULL; av++, ac--) { + v[0] = strtoul(s, &endptr, 10); + v[1] = (*endptr != '-') ? v[0] : + strtoul(endptr+1, &endptr, 10); + if (*endptr == '\0') { /* prepare for next round */ + s = (ac > 0) ? *(av+1) : NULL; + } else { + if (*endptr != ',') { + warn("invalid number: %s", s); + s = ++endptr; + continue; + } + /* continue processing from here */ + s = ++endptr; + ac++; + av--; + } + if (v[1] < v[0] || + v[1] >= DN_MAX_ID-1 || + v[1] >= DN_MAX_ID-1) { + continue; /* invalid entry */ + } + n++; + /* translate if 'pipe list' */ + if (co.do_pipe == 1) { + v[0] += DN_MAX_ID; + v[1] += DN_MAX_ID; + } + v = (n*2 < len) ? v + 2 : base; + } + return n; +} + +/* main entry point for dummynet list functions. co.do_pipe indicates + * which function we want to support. + * av may contain filtering arguments, either individual entries + * or ranges, or lists (space or commas are valid separators). + * Format for a range can be n1-n2 or n3 n4 n5 ... + * In a range n1 must be <= n2, otherwise the range is ignored. + * A number 'n4' is translate in a range 'n4-n4' + * All number must be > 0 and < DN_MAX_ID-1 + */ +void +dummynet_list(int ac, char *av[], int show_counters) +{ + struct dn_id *oid, *x = NULL; + int ret, i; + int n; /* # of ranges */ + u_int buflen, l; + u_int max_size; /* largest obj passed up */ + + (void)show_counters; // XXX unused, but we should use it. + ac--; + av++; /* skip 'list' | 'show' word */ + + n = parse_range(ac, av, NULL, 0); /* Count # of ranges. 
*/ + + /* Allocate space to store ranges */ + l = sizeof(*oid) + sizeof(uint32_t) * n * 2; + oid = safe_calloc(1, l); + oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); + + if (n > 0) /* store ranges in idx */ + parse_range(ac, av, (uint32_t *)(oid + 1), n*2); + /* + * Compute the size of the largest object returned. If the + * response leaves at least this much spare space in the + * buffer, then surely the response is complete; otherwise + * there might be a risk of truncation and we will need to + * retry with a larger buffer. + * XXX don't bother with smaller structs. + */ + max_size = sizeof(struct dn_fs); + if (max_size < sizeof(struct dn_sch)) + max_size = sizeof(struct dn_sch); + if (max_size < sizeof(struct dn_flow)) + max_size = sizeof(struct dn_flow); + + switch (co.do_pipe) { + case 1: + oid->subtype = DN_LINK; /* list pipe */ + break; + case 2: + oid->subtype = DN_FS; /* list queue */ + break; + case 3: + oid->subtype = DN_SCH; /* list sched */ + break; + } + + /* + * Ask the kernel an estimate of the required space (result + * in oid.id), unless we are requesting a subset of objects, + * in which case the kernel does not give an exact answer. + * In any case, space might grow in the meantime due to the + * creation of new queues, so we must be prepared to retry. 
+ */ + if (n > 0) { + buflen = 4*1024; + } else { + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0 || oid->id <= sizeof(*oid)) + goto done; + buflen = oid->id + max_size; + oid->len = sizeof(*oid); /* restore */ + } + /* Try a few times, until the buffer fits */ + for (i = 0; i < 20; i++) { + l = buflen; + x = safe_realloc(x, l); + bcopy(oid, x, oid->len); + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); + if (ret != 0 || x->id <= sizeof(*oid)) + goto done; /* no response */ + if (l + max_size <= buflen) + break; /* ok */ + buflen *= 2; /* double for next attempt */ + } + list_pipes(x, O_NEXT(x, l)); +done: + if (x) + free(x); + free(oid); +} diff --git a/example/ipfw/ipfw/ipfw.8 b/example/ipfw/ipfw/ipfw.8 new file mode 100644 index 0000000..9b8946b --- /dev/null +++ b/example/ipfw/ipfw/ipfw.8 @@ -0,0 +1,3723 @@ +.\" +.\" $FreeBSD: head/sbin/ipfw/ipfw.8 274925 2014-11-23 21:00:00Z joel $ +.\" +.Dd Aug 13, 2014 +.Dt IPFW 8 +.Os +.Sh NAME +.Nm ipfw +.Nd User interface for firewall, traffic shaper, packet scheduler, +in-kernel NAT. +.Sh SYNOPSIS +.Ss FIREWALL CONFIGURATION +.Nm +.Op Fl cq +.Cm add +.Ar rule +.Nm +.Op Fl acdefnNStT +.Op Cm set Ar N +.Brq Cm list | show +.Op Ar rule | first-last ... +.Nm +.Op Fl f | q +.Op Cm set Ar N +.Cm flush +.Nm +.Op Fl q +.Op Cm set Ar N +.Brq Cm delete | zero | resetlog +.Op Ar number ... +.Pp +.Nm +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... 
+.Nm +.Cm set move +.Op Cm rule +.Ar number Cm to Ar number +.Nm +.Cm set swap Ar number number +.Nm +.Cm set show +.Ss SYSCTL SHORTCUTS +.Nm +.Cm enable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Nm +.Cm disable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Ss LOOKUP TABLES +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm create Ar create-options +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm destroy +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm modify Ar modify-options +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm swap Ar name +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm add Ar table-key Op Ar value +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm add Op Ar table-key Ar value ... +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm atomic add Op Ar table-key Ar value ... +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm delete Op Ar table-key ... +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm lookup Ar addr +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm lock +.Nm +.Oo Cm set Ar N Oc Cm table Ar name Cm unlock +.Nm +.Oo Cm set Ar N Oc Cm table +.Brq Ar name | all +.Cm list +.Nm +.Oo Cm set Ar N Oc Cm table +.Brq Ar name | all +.Cm info +.Nm +.Oo Cm set Ar N Oc Cm table +.Brq Ar name | all +.Cm detail +.Nm +.Oo Cm set Ar N Oc Cm table +.Brq Ar name | all +.Cm flush +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) +.Nm +.Brq Cm pipe | queue | sched +.Ar number +.Cm config +.Ar config-options +.Nm +.Op Fl s Op Ar field +.Brq Cm pipe | queue | sched +.Brq Cm delete | list | show +.Op Ar number ... 
+.Ss IN-KERNEL NAT +.Nm +.Op Fl q +.Cm nat +.Ar number +.Cm config +.Ar config-options +.Pp +.Nm +.Op Fl cfnNqS +.Oo +.Fl p Ar preproc +.Oo +.Ar preproc-flags +.Oc +.Oc +.Ar pathname +.Ss INTERNAL DIAGNOSTICS +.Nm +.Cm internal iflist +.Nm +.Cm internal talist +.Nm +.Cm internal vlist +.Sh DESCRIPTION +The +.Nm +utility is the user interface for controlling the +.Xr ipfw 4 +firewall, the +.Xr dummynet 4 +traffic shaper/packet scheduler, and the +in-kernel NAT services. +.Pp +A firewall configuration, or +.Em ruleset , +is made of a list of +.Em rules +numbered from 1 to 65535. +Packets are passed to the firewall +from a number of different places in the protocol stack +(depending on the source and destination of the packet, +it is possible for the firewall to be +invoked multiple times on the same packet). +The packet passed to the firewall is compared +against each of the rules in the +.Em ruleset , +in rule-number order +(multiple rules with the same number are permitted, in which case +they are processed in order of insertion). +When a match is found, the action corresponding to the +matching rule is performed. +.Pp +Depending on the action and certain system settings, packets +can be reinjected into the firewall at some rule after the +matching one for further processing. +.Pp +A ruleset always includes a +.Em default +rule (numbered 65535) which cannot be modified or deleted, +and matches all packets. +The action associated with the +.Em default +rule can be either +.Cm deny +or +.Cm allow +depending on how the kernel is configured. +.Pp +If the ruleset includes one or more rules with the +.Cm keep-state +or +.Cm limit +option, +the firewall will have a +.Em stateful +behaviour, i.e., upon a match it will create +.Em dynamic rules , +i.e., rules that match packets with the same 5-tuple +(protocol, source and destination addresses and ports) +as the packet which caused their creation. 
+Dynamic rules, which have a limited lifetime, are checked +at the first occurrence of a +.Cm check-state , +.Cm keep-state +or +.Cm limit +rule, and are typically used to open the firewall on-demand to +legitimate traffic only. +See the +.Sx STATEFUL FIREWALL +and +.Sx EXAMPLES +Sections below for more information on the stateful behaviour of +.Nm . +.Pp +All rules (including dynamic ones) have a few associated counters: +a packet count, a byte count, a log count and a timestamp +indicating the time of the last match. +Counters can be displayed or reset with +.Nm +commands. +.Pp +Each rule belongs to one of 32 different +.Em sets +, and there are +.Nm +commands to atomically manipulate sets, such as enable, +disable, swap sets, move all rules in a set to another +one, delete all rules in a set. +These can be useful to +install temporary configurations, or to test them. +See Section +.Sx SETS OF RULES +for more information on +.Em sets . +.Pp +Rules can be added with the +.Cm add +command; deleted individually or in groups with the +.Cm delete +command, and globally (except those in set 31) with the +.Cm flush +command; displayed, optionally with the content of the +counters, using the +.Cm show +and +.Cm list +commands. +Finally, counters can be reset with the +.Cm zero +and +.Cm resetlog +commands. +.Pp +.Ss COMMAND OPTIONS +The following general options are available when invoking +.Nm : +.Bl -tag -width indent +.It Fl a +Show counter values when listing rules. +The +.Cm show +command implies this option. +.It Fl b +Only show the action and the comment, not the body of a rule. +Implies +.Fl c . +.It Fl c +When entering or showing rules, print them in compact form, +i.e., omitting the "ip from any to any" string +when this does not carry any additional information. +.It Fl d +When listing, show dynamic rules in addition to static ones. +.It Fl e +When listing and +.Fl d +is specified, also show expired dynamic rules. 
+.It Fl f +Do not ask for confirmation for commands that can cause problems +if misused, i.e., +.Cm flush . +If there is no tty associated with the process, this is implied. +.It Fl i +When listing a table (see the +.Sx LOOKUP TABLES +section below for more information on lookup tables), format values +as IP addresses. +By default, values are shown as integers. +.It Fl n +Only check syntax of the command strings, without actually passing +them to the kernel. +.It Fl N +Try to resolve addresses and service names in output. +.It Fl q +Be quiet when executing the +.Cm add , +.Cm nat , +.Cm zero , +.Cm resetlog +or +.Cm flush +commands; +(implies +.Fl f ) . +This is useful when updating rulesets by executing multiple +.Nm +commands in a script +(e.g., +.Ql sh\ /etc/rc.firewall ) , +or by processing a file with many +.Nm +rules across a remote login session. +It also stops a table add or delete +from failing if the entry already exists or is not present. +.Pp +The reason why this option may be important is that +for some of these actions, +.Nm +may print a message; if the action results in blocking the +traffic to the remote client, +the remote login session will be closed +and the rest of the ruleset will not be processed. +Access to the console would then be required to recover. +.It Fl S +When listing rules, show the +.Em set +each rule belongs to. +If this flag is not specified, disabled rules will not be +listed. +.It Fl s Op Ar field +When listing pipes, sort according to one of the four +counters (total or current packets or bytes). +.It Fl t +When listing, show last match timestamp converted with ctime(). +.It Fl T +When listing, show last match timestamp as seconds from the epoch. +This form can be more convenient for postprocessing by scripts. +.El +.Ss LIST OF RULES AND PREPROCESSING +To ease configuration, rules can be put into a file which is +processed using +.Nm +as shown in the last synopsis line. +An absolute +.Ar pathname +must be used. 
+The file will be read line by line and applied as arguments to the +.Nm +utility. +.Pp +Optionally, a preprocessor can be specified using +.Fl p Ar preproc +where +.Ar pathname +is to be piped through. +Useful preprocessors include +.Xr cpp 1 +and +.Xr m4 1 . +If +.Ar preproc +does not start with a slash +.Pq Ql / +as its first character, the usual +.Ev PATH +name search is performed. +Care should be taken with this in environments where not all +file systems are mounted (yet) by the time +.Nm +is being run (e.g.\& when they are mounted over NFS). +Once +.Fl p +has been specified, any additional arguments are passed on to the preprocessor +for interpretation. +This allows for flexible configuration files (like conditionalizing +them on the local hostname) and the use of macros to centralize +frequently required arguments like IP addresses. +.Ss TRAFFIC SHAPER CONFIGURATION +The +.Nm +.Cm pipe , queue +and +.Cm sched +commands are used to configure the traffic shaper and packet scheduler. +See the +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +Section below for details. +.Pp +If the world and the kernel get out of sync the +.Nm +ABI may break, preventing you from being able to add any rules. +This can adversely affect the booting process. +You can use +.Nm +.Cm disable +.Cm firewall +to temporarily disable the firewall to regain access to the network, +allowing you to fix the problem. +.Sh PACKET FLOW +A packet is checked against the active ruleset in multiple places +in the protocol stack, under control of several sysctl variables. +These places and variables are shown below, and it is important to +have this picture in mind in order to design a correct ruleset. 
+.Bd -literal -offset indent + ^ to upper layers V + | | + +----------->-----------+ + ^ V + [ip(6)_input] [ip(6)_output] net.inet(6).ip(6).fw.enable=1 + | | + ^ V + [ether_demux] [ether_output_frame] net.link.ether.ipfw=1 + | | + +-->--[bdg_forward]-->--+ net.link.bridge.ipfw=1 + ^ V + | to devices | +.Ed +.Pp +The number of +times the same packet goes through the firewall can +vary between 0 and 4 depending on packet source and +destination, and system configuration. +.Pp +Note that as packets flow through the stack, headers can be +stripped or added to it, and so they may or may not be available +for inspection. +E.g., incoming packets will include the MAC header when +.Nm +is invoked from +.Cm ether_demux() , +but the same packets will have the MAC header stripped off when +.Nm +is invoked from +.Cm ip_input() +or +.Cm ip6_input() . +.Pp +Also note that each packet is always checked against the complete ruleset, +irrespective of the place where the check occurs, or the source of the packet. +If a rule contains some match patterns or actions which are not valid +for the place of invocation (e.g.\& trying to match a MAC header within +.Cm ip_input +or +.Cm ip6_input ), +the match pattern will not match, but a +.Cm not +operator in front of such patterns +.Em will +cause the pattern to +.Em always +match on those packets. +It is thus the responsibility of +the programmer, if necessary, to write a suitable ruleset to +differentiate among the possible places. 
+.Cm skipto +rules can be useful here, as an example: +.Bd -literal -offset indent +# packets from ether_demux or bdg_forward +ipfw add 10 skipto 1000 all from any to any layer2 in +# packets from ip_input +ipfw add 10 skipto 2000 all from any to any not layer2 in +# packets from ip_output +ipfw add 10 skipto 3000 all from any to any not layer2 out +# packets from ether_output_frame +ipfw add 10 skipto 4000 all from any to any layer2 out +.Ed +.Pp +(yes, at the moment there is no way to differentiate between +ether_demux and bdg_forward). +.Sh SYNTAX +In general, each keyword or argument must be provided as +a separate command line argument, with no leading or trailing +spaces. +Keywords are case-sensitive, whereas arguments may +or may not be case-sensitive depending on their nature +(e.g.\& uid's are, hostnames are not). +.Pp +Some arguments (e.g., port or address lists) are comma-separated +lists of values. +In this case, spaces after commas ',' are allowed to make +the line more readable. +You can also put the entire +command (including flags) into a single argument. +E.g., the following forms are equivalent: +.Bd -literal -offset indent +ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8 +ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 +ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" +.Ed +.Sh RULE FORMAT +The format of firewall rules is the following: +.Bd -ragged -offset indent +.Bk -words +.Op Ar rule_number +.Op Cm set Ar set_number +.Op Cm prob Ar match_probability +.Ar action +.Op Cm log Op Cm logamount Ar number +.Op Cm altq Ar queue +.Oo +.Bro Cm tag | untag +.Brc Ar number +.Oc +.Ar body +.Ek +.Ed +.Pp +where the body of the rule specifies which information is used +for filtering packets, among the following: +.Pp +.Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact +.It Layer-2 header fields +When available +.It IPv4 and IPv6 Protocol +TCP, UDP, ICMP, etc. +.It Source and dest. 
addresses and ports +.It Direction +See Section +.Sx PACKET FLOW +.It Transmit and receive interface +By name or address +.It Misc. IP header fields +Version, type of service, datagram length, identification, +fragment flag (non-zero IP offset), +Time To Live +.It IP options +.It IPv6 Extension headers +Fragmentation, Hop-by-Hop options, +Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options. +.It IPv6 Flow-ID +.It Misc. TCP header fields +TCP flags (SYN, FIN, ACK, RST, etc.), +sequence number, acknowledgment number, +window +.It TCP options +.It ICMP types +for ICMP packets +.It ICMP6 types +for ICMP6 packets +.It User/group ID +When the packet can be associated with a local socket. +.It Divert status +Whether a packet came from a divert socket (e.g., +.Xr natd 8 ) . +.It Fib annotation state +Whether a packet has been tagged for using a specific FIB (routing table) +in future forwarding decisions. +.El +.Pp +Note that some of the above information, e.g.\& source MAC or IP addresses and +TCP/UDP ports, can be easily spoofed, so filtering on those fields +alone might not guarantee the desired results. +.Bl -tag -width indent +.It Ar rule_number +Each rule is associated with a +.Ar rule_number +in the range 1..65535, with the latter reserved for the +.Em default +rule. +Rules are checked sequentially by rule number. +Multiple rules can have the same number, in which case they are +checked (and listed) according to the order in which they have +been added. +If a rule is entered without specifying a number, the kernel will +assign one in such a way that the rule becomes the last one +before the +.Em default +rule. +Automatic rule numbers are assigned by incrementing the last +non-default rule number by the value of the sysctl variable +.Ar net.inet.ip.fw.autoinc_step +which defaults to 100. +If this is not possible (e.g.\& because we would go beyond the +maximum allowed rule number), the number of the last +non-default value is used instead. 
+.It Cm set Ar set_number +Each rule is associated with a +.Ar set_number +in the range 0..31. +Sets can be individually disabled and enabled, so this parameter +is of fundamental importance for atomic ruleset manipulation. +It can be also used to simplify deletion of groups of rules. +If a rule is entered without specifying a set number, +set 0 will be used. +.br +Set 31 is special in that it cannot be disabled, +and rules in set 31 are not deleted by the +.Nm ipfw flush +command (but you can delete them with the +.Nm ipfw delete set 31 +command). +Set 31 is also used for the +.Em default +rule. +.It Cm prob Ar match_probability +A match is only declared with the specified probability +(floating point number between 0 and 1). +This can be useful for a number of applications such as +random packet drop or +(in conjunction with +.Nm dummynet ) +to simulate the effect of multiple paths leading to out-of-order +packet delivery. +.Pp +Note: this condition is checked before any other condition, including +ones such as keep-state or check-state which might have side effects. +.It Cm log Op Cm logamount Ar number +Packets matching a rule with the +.Cm log +keyword will be made available for logging in two ways: +if the sysctl variable +.Va net.inet.ip.fw.verbose +is set to 0 (default), one can use +.Xr bpf 4 +attached to the +.Li ipfw0 +pseudo interface. +This pseudo interface can be created after a boot +manually by using the following command: +.Bd -literal -offset indent +# ifconfig ipfw0 create +.Ed +.Pp +Or, automatically at boot time by adding the following +line to the +.Xr rc.conf 5 +file: +.Bd -literal -offset indent +firewall_logif="YES" +.Ed +.Pp +There is no overhead if no +.Xr bpf 4 +is attached to the pseudo interface. +.Pp +If +.Va net.inet.ip.fw.verbose +is set to 1, packets will be logged to +.Xr syslogd 8 +with a +.Dv LOG_SECURITY +facility up to a maximum of +.Cm logamount +packets. 
+If no +.Cm logamount +is specified, the limit is taken from the sysctl variable +.Va net.inet.ip.fw.verbose_limit . +In both cases, a value of 0 means unlimited logging. +.Pp +Once the limit is reached, logging can be re-enabled by +clearing the logging counter or the packet counter for that entry, see the +.Cm resetlog +command. +.Pp +Note: logging is done after all other packet matching conditions +have been successfully verified, and before performing the final +action (accept, deny, etc.) on the packet. +.It Cm tag Ar number +When a packet matches a rule with the +.Cm tag +keyword, the numeric tag for the given +.Ar number +in the range 1..65534 will be attached to the packet. +The tag acts as an internal marker (it is not sent out over +the wire) that can be used to identify these packets later on. +This can be used, for example, to provide trust between interfaces +and to start doing policy-based filtering. +A packet can have multiple tags at the same time. +Tags are "sticky", meaning once a tag is applied to a packet by a +matching rule it exists until explicit removal. +Tags are kept with the packet everywhere within the kernel, but are +lost when packet leaves the kernel, for example, on transmitting +packet out to the network or sending packet to a +.Xr divert 4 +socket. +.Pp +To check for previously applied tags, use the +.Cm tagged +rule option. +To delete previously applied tag, use the +.Cm untag +keyword. +.Pp +Note: since tags are kept with the packet everywhere in kernelspace, +they can be set and unset anywhere in the kernel network subsystem +(using the +.Xr mbuf_tags 9 +facility), not only by means of the +.Xr ipfw 4 +.Cm tag +and +.Cm untag +keywords. +For example, there can be a specialized +.Xr netgraph 4 +node doing traffic analyzing and tagging for later inspecting +in firewall. 
+.It Cm untag Ar number +When a packet matches a rule with the +.Cm untag +keyword, the tag with the number +.Ar number +is searched among the tags attached to this packet and, +if found, removed from it. +Other tags bound to the packet, if present, are left untouched. +.It Cm altq Ar queue +When a packet matches a rule with the +.Cm altq +keyword, the ALTQ identifier for the given +.Ar queue +(see +.Xr altq 4 ) +will be attached. +Note that this ALTQ tag is only meaningful for packets going "out" of IPFW, +and not being rejected or going to divert sockets. +Note that if there is insufficient memory at the time the packet is +processed, it will not be tagged, so it is wise to make your ALTQ +"default" queue policy account for this. +If multiple +.Cm altq +rules match a single packet, only the first one adds the ALTQ classification +tag. +In doing so, traffic may be shaped by using +.Cm count Cm altq Ar queue +rules for classification early in the ruleset, then later applying +the filtering decision. +For example, +.Cm check-state +and +.Cm keep-state +rules may come later and provide the actual filtering decisions in +addition to the fallback ALTQ tag. +.Pp +You must run +.Xr pfctl 8 +to set up the queues before IPFW will be able to look them up by name, +and if the ALTQ disciplines are rearranged, the rules containing the +queue identifiers in the kernel will likely have gone stale and need +to be reloaded. +Stale queue identifiers will probably result in misclassification. +.Pp +All system ALTQ processing can be turned on or off via +.Nm +.Cm enable Ar altq +and +.Nm +.Cm disable Ar altq . +The usage of +.Va net.inet.ip.fw.one_pass +is irrelevant to ALTQ traffic shaping, as the actual rule action is followed +always after adding an ALTQ tag. +.El +.Ss RULE ACTIONS +A rule can be associated with one of the following actions, which +will be executed when the packet matches the body of the rule. 
+.Bl -tag -width indent +.It Cm allow | accept | pass | permit +Allow packets that match rule. +The search terminates. +.It Cm check-state +Checks the packet against the dynamic ruleset. +If a match is found, execute the action associated with +the rule which generated this dynamic rule, otherwise +move to the next rule. +.br +.Cm Check-state +rules do not have a body. +If no +.Cm check-state +rule is found, the dynamic ruleset is checked at the first +.Cm keep-state +or +.Cm limit +rule. +.It Cm count +Update counters for all packets that match rule. +The search continues with the next rule. +.It Cm deny | drop +Discard packets that match this rule. +The search terminates. +.It Cm divert Ar port +Divert packets that match this rule to the +.Xr divert 4 +socket bound to port +.Ar port . +The search terminates. +.It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port +Change the next-hop on matching packets to +.Ar ipaddr , +which can be an IP address or a host name. +For IPv4, the next hop can also be supplied by the last table +looked up for the packet by using the +.Cm tablearg +keyword instead of an explicit address. +The search terminates if this rule matches. +.Pp +If +.Ar ipaddr +is a local address, then matching packets will be forwarded to +.Ar port +(or the port number in the packet if one is not specified in the rule) +on the local machine. +.br +If +.Ar ipaddr +is not a local address, then the port number +(if specified) is ignored, and the packet will be +forwarded to the remote address, using the route as found in +the local routing table for that IP. +.br +A +.Ar fwd +rule will not match layer-2 packets (those received +on ether_input, ether_output, or bridged). +.br +The +.Cm fwd +action does not change the contents of the packet at all. +In particular, the destination address remains unmodified, so +packets forwarded to another system will usually be rejected by that system +unless there is a matching rule on that system to capture them. 
+For packets forwarded locally, +the local address of the socket will be +set to the original destination address of the packet. +This makes the +.Xr netstat 1 +entry look rather weird but is intended for +use with transparent proxy servers. +.It Cm nat Ar nat_nr | tablearg +Pass packet to a +nat instance +(for network address translation, address redirect, etc.): +see the +.Sx NETWORK ADDRESS TRANSLATION (NAT) +Section for further information. +.It Cm pipe Ar pipe_nr +Pass packet to a +.Nm dummynet +.Dq pipe +(for bandwidth limitation, delay, etc.). +See the +.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +Section for further information. +The search terminates; however, on exit from the pipe and if +the +.Xr sysctl 8 +variable +.Va net.inet.ip.fw.one_pass +is not set, the packet is passed again to the firewall code +starting from the next rule. +.It Cm queue Ar queue_nr +Pass packet to a +.Nm dummynet +.Dq queue +(for bandwidth limitation using WF2Q+). +.It Cm reject +(Deprecated). +Synonym for +.Cm unreach host . +.It Cm reset +Discard packets that match this rule, and if the +packet is a TCP packet, try to send a TCP reset (RST) notice. +The search terminates. +.It Cm reset6 +Discard packets that match this rule, and if the +packet is a TCP packet, try to send a TCP reset (RST) notice. +The search terminates. +.It Cm skipto Ar number | tablearg +Skip all subsequent rules numbered less than +.Ar number . +The search continues with the first rule numbered +.Ar number +or higher. +It is possible to use the +.Cm tablearg +keyword with a skipto for a +.Em computed +skipto. Skipto may work either in O(log(N)) or in O(1) depending +on amount of memory and/or sysctl variables. +See the +.Sx SYSCTL VARIABLES +section for more details. +.It Cm call Ar number | tablearg +The current rule number is saved in the internal stack and +ruleset processing continues with the first rule numbered +.Ar number +or higher. 
+If later a rule with the +.Cm return +action is encountered, the processing returns to the first rule +with number of this +.Cm call +rule plus one or higher +(the same behaviour as with packets returning from +.Xr divert 4 +socket after a +.Cm divert +action). +This can be used to make something like assembly language +.Dq subroutine +calls to rules with common checks for different interfaces, etc. +.Pp +Rule with any number could be called, not just forward jumps as with +.Cm skipto . +So, to prevent endless loops in case of mistakes, both +.Cm call +and +.Cm return +actions don't do any jumps and simply go to the next rule if memory +cannot be allocated or the stack overflows/underflows. +.Pp +Internally, the stack for rule numbers is implemented using the +.Xr mbuf_tags 9 +facility and currently has a size of 16 entries. +As mbuf tags are lost when a packet leaves the kernel, +.Cm divert +should not be used in subroutines to avoid endless loops +and other undesired effects. +.It Cm return +Takes rule number saved to internal stack by the last +.Cm call +action and returns ruleset processing to the first rule +with number greater than number of corresponding +.Cm call +rule. +See description of the +.Cm call +action for more details. +.Pp +Note that +.Cm return +rules usually end a +.Dq subroutine +and thus are unconditional, but +.Nm +command-line utility currently requires every action except +.Cm check-state +to have body. +While it is sometimes useful to return only on some packets, +usually you want to print just +.Dq return +for readability. +A workaround for this is to use new syntax and +.Fl c +switch: +.Bd -literal -offset indent +# Add a rule without actual body +ipfw add 2999 return via any + +# List rules without "from any to any" part +ipfw -c list +.Ed +.Pp +This cosmetic annoyance may be fixed in future releases. +.It Cm tee Ar port +Send a copy of packets matching this rule to the +.Xr divert 4 +socket bound to port +.Ar port . 
+The search continues with the next rule. +.It Cm unreach Ar code +Discard packets that match this rule, and try to send an ICMP +unreachable notice with code +.Ar code , +where +.Ar code +is a number from 0 to 255, or one of these aliases: +.Cm net , host , protocol , port , +.Cm needfrag , srcfail , net-unknown , host-unknown , +.Cm isolated , net-prohib , host-prohib , tosnet , +.Cm toshost , filter-prohib , host-precedence +or +.Cm precedence-cutoff . +The search terminates. +.It Cm unreach6 Ar code +Discard packets that match this rule, and try to send an ICMPv6 +unreachable notice with code +.Ar code , +where +.Ar code +is one of the numbers 0, 1, 3 or 4, or one of these aliases: +.Cm no-route , admin-prohib , address +or +.Cm port . +The search terminates. +.It Cm netgraph Ar cookie +Divert packet into netgraph with given +.Ar cookie . +The search terminates. +If packet is later returned from netgraph it is either +accepted or continues with the next rule, depending on +.Va net.inet.ip.fw.one_pass +sysctl variable. +.It Cm ngtee Ar cookie +A copy of packet is diverted into netgraph, original +packet continues with the next rule. +See +.Xr ng_ipfw 4 +for more information on +.Cm netgraph +and +.Cm ngtee +actions. +.It Cm setfib Ar fibnum | tablearg +The packet is tagged so as to use the FIB (routing table) +.Ar fibnum +in any subsequent forwarding decisions. +In the current implementation, this is limited to the values 0 through 15, see +.Xr setfib 2 . +Processing continues at the next rule. +It is possible to use the +.Cm tablearg +keyword with setfib. +If the tablearg value is not within the compiled range of fibs, +the packet's fib is set to 0. +.It Cm setdscp Ar DSCP | number | tablearg +Set specified DiffServ codepoint for an IPv4/IPv6 packet. +Processing continues at the next rule. 
+Supported values are: +.Pp +.Cm CS0 +.Pq Dv 000000 , +.Cm CS1 +.Pq Dv 001000 , +.Cm CS2 +.Pq Dv 010000 , +.Cm CS3 +.Pq Dv 011000 , +.Cm CS4 +.Pq Dv 100000 , +.Cm CS5 +.Pq Dv 101000 , +.Cm CS6 +.Pq Dv 110000 , +.Cm CS7 +.Pq Dv 111000 , +.Cm AF11 +.Pq Dv 001010 , +.Cm AF12 +.Pq Dv 001100 , +.Cm AF13 +.Pq Dv 001110 , +.Cm AF21 +.Pq Dv 010010 , +.Cm AF22 +.Pq Dv 010100 , +.Cm AF23 +.Pq Dv 010110 , +.Cm AF31 +.Pq Dv 011010 , +.Cm AF32 +.Pq Dv 011100 , +.Cm AF33 +.Pq Dv 011110 , +.Cm AF41 +.Pq Dv 100010 , +.Cm AF42 +.Pq Dv 100100 , +.Cm AF43 +.Pq Dv 100110 , +.Cm EF +.Pq Dv 101110 , +.Cm BE +.Pq Dv 000000 . +Additionally, the DSCP value can be specified by number (0..63). +It is also possible to use the +.Cm tablearg +keyword with setdscp. +If the tablearg value is not within the 0..63 range, the lower 6 bits of the supplied +value are used. +.It Cm reass +Queue and reassemble IP fragments. +If the packet is not fragmented, counters are updated and +processing continues with the next rule. +If the packet is the last logical fragment, the packet is reassembled and, if +.Va net.inet.ip.fw.one_pass +is set to 0, processing continues with the next rule. +Otherwise, the packet is allowed to pass and the search terminates. +If the packet is a fragment in the middle of a logical group of fragments, +it is consumed and +processing stops immediately. +.Pp +Fragment handling can be tuned via +.Va net.inet.ip.maxfragpackets +and +.Va net.inet.ip.maxfragsperpacket +which limit, respectively, the maximum number of processable +fragments (default: 800) and +the maximum number of fragments per packet (default: 16). +.Pp +NOTA BENE: since fragments do not contain port numbers, +they should be avoided with the +.Nm reass +rule. +Alternatively, direction-based (like +.Nm in +/ +.Nm out ) +and source-based (like +.Nm via ) +match patterns can be used to select fragments. 
+.Pp +Usually a simple rule like: +.Bd -literal -offset indent +# reassemble incoming fragments +ipfw add reass all from any to any in +.Ed +.Pp +is all you need at the beginning of your ruleset. +.El +.Ss RULE BODY +The body of a rule contains zero or more patterns (such as +specific source and destination addresses or ports, +protocol options, incoming or outgoing interfaces, etc.) +that the packet must match in order to be recognised. +In general, the patterns are connected by (implicit) +.Cm and +operators -- i.e., all must match in order for the +rule to match. +Individual patterns can be prefixed by the +.Cm not +operator to reverse the result of the match, as in +.Pp +.Dl "ipfw add 100 allow ip from not 1.2.3.4 to any" +.Pp +Additionally, sets of alternative match patterns +.Pq Em or-blocks +can be constructed by putting the patterns in +lists enclosed between parentheses ( ) or braces { }, and +using the +.Cm or +operator as follows: +.Pp +.Dl "ipfw add 100 allow ip from { x or not y or z } to any" +.Pp +Only one level of parentheses is allowed. +Beware that most shells have special meanings for parentheses +or braces, so it is advisable to put a backslash \\ in front of them +to prevent such interpretations. +.Pp +The body of a rule must in general include a source and destination +address specifier. +The keyword +.Ar any +can be used in various places to specify that the content of +a required field is irrelevant. +.Pp +The rule body has the following format: +.Bd -ragged -offset indent +.Op Ar proto Cm from Ar src Cm to Ar dst +.Op Ar options +.Ed +.Pp +The first part (proto from src to dst) is for backward +compatibility with earlier versions of +.Fx . +In modern +.Fx +any match pattern (including MAC headers, IP protocols, +addresses and ports) can be specified in the +.Ar options +section. +.Pp +Rule fields have the following meaning: +.Bl -tag -width indent +.It Ar proto : protocol | Cm { Ar protocol Cm or ... 
} +.It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number +An IP protocol specified by number or name +(for a complete list see +.Pa /etc/protocols ) , +or one of the following keywords: +.Bl -tag -width indent +.It Cm ip4 | ipv4 +Matches IPv4 packets. +.It Cm ip6 | ipv6 +Matches IPv6 packets. +.It Cm ip | all +Matches any packet. +.El +.Pp +The +.Cm ipv6 +in +.Cm proto +option will be treated as inner protocol. +And, the +.Cm ipv4 +is not available in +.Cm proto +option. +.Pp +The +.Cm { Ar protocol Cm or ... } +format (an +.Em or-block ) +is provided for convenience only but its use is deprecated. +.It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports +An address (or a list, see below) +optionally followed by +.Ar ports +specifiers. +.Pp +The second format +.Em ( or-block +with multiple addresses) is provided for convenience only and +its use is discouraged. +.It Ar addr : Oo Cm not Oc Bro +.Cm any | me | me6 | +.Cm table Ns Pq Ar name Ns Op , Ns Ar value +.Ar | addr-list | addr-set +.Brc +.Bl -tag -width indent +.It Cm any +matches any IP address. +.It Cm me +matches any IP address configured on an interface in the system. +.It Cm me6 +matches any IPv6 address configured on an interface in the system. +The address list is evaluated at the time the packet is +analysed. +.It Cm table Ns Pq Ar name Ns Op , Ns Ar value +Matches any IPv4 or IPv6 address for which an entry exists in the lookup table +.Ar name . +If an optional 32-bit unsigned +.Ar value +is also specified, an entry will match only if it has this value. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.El +.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list +.It Ar ip-addr : +A host or subnet address specified in one of the following ways: +.Bl -tag -width indent +.It Ar numeric-ip | hostname +Matches a single IPv4 address, specified as dotted-quad or a hostname. 
+Hostnames are resolved at the time the rule is added to the firewall list. +.It Ar addr Ns / Ns Ar masklen +Matches all addresses with base +.Ar addr +(specified as an IP address, a network number, or a hostname) +and mask width of +.Cm masklen +bits. +As an example, 1.2.3.4/25 or 1.2.3.0/25 will match +all IP numbers from 1.2.3.0 to 1.2.3.127 . +.It Ar addr Ns : Ns Ar mask +Matches all addresses with base +.Ar addr +(specified as an IP address, a network number, or a hostname) +and the mask of +.Ar mask , +specified as a dotted quad. +As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match +1.*.3.*. +This form is advised only for non-contiguous +masks. +It is better to resort to the +.Ar addr Ns / Ns Ar masklen +format for contiguous masks, which is more compact and less +error-prone. +.El +.It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm } +.It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list +Matches all addresses with base address +.Ar addr +(specified as an IP address, a network number, or a hostname) +and whose last byte is in the list between braces { } . +Note that there must be no spaces between braces and +numbers (spaces after commas are allowed). +Elements of the list can be specified as single entries +or ranges. +The +.Ar masklen +field is used to limit the size of the set of addresses, +and can have any value between 24 and 32. +If not specified, +it will be assumed as 24. +.br +This format is particularly useful to handle sparse address sets +within a single rule. +Because the matching occurs using a +bitmask, it takes constant time and dramatically reduces +the complexity of rulesets. +.br +As an example, an address specified as 1.2.3.4/24{128,35-55,89} +or 1.2.3.0/24{128,35-55,89} +will match the following IP addresses: +.br +1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 . 
+.It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list +.It Ar ip6-addr : +A host or subnet specified in one of the following ways: +.Bl -tag -width indent +.It Ar numeric-ip | hostname +Matches a single IPv6 address as allowed by +.Xr inet_pton 3 +or a hostname. +Hostnames are resolved at the time the rule is added to the firewall +list. +.It Ar addr Ns / Ns Ar masklen +Matches all IPv6 addresses with base +.Ar addr +(specified as allowed by +.Xr inet_pton 3 +or a hostname) +and mask width of +.Cm masklen +bits. +.El +.Pp +No support for sets of IPv6 addresses is provided because IPv6 addresses +are typically random past the initial prefix. +.It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports +For protocols which support port numbers (such as TCP and UDP), optional +.Cm ports +may be specified as one or more ports or port ranges, separated +by commas but no spaces, and an optional +.Cm not +operator. +The +.Ql \&- +notation specifies a range of ports (including boundaries). +.Pp +Service names (from +.Pa /etc/services ) +may be used instead of numeric port values. +The length of the port list is limited to 30 ports or ranges, +though one can specify larger ranges by using an +.Em or-block +in the +.Cm options +section of the rule. +.Pp +A backslash +.Pq Ql \e +can be used to escape the dash +.Pq Ql - +character in a service name (from a shell, the backslash must be +typed twice to avoid the shell itself interpreting it as an escape +character). +.Pp +.Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any" +.Pp +Fragmented packets which have a non-zero offset (i.e., not the first +fragment) will never match a rule which has one or more port +specifications. +See the +.Cm frag +option for details on matching fragmented packets. +.El +.Ss RULE OPTIONS (MATCH PATTERNS) +Additional match patterns can be used within +rules. 
+Zero or more of these so-called +.Em options +can be present in a rule, optionally prefixed by the +.Cm not +operator, and possibly grouped into +.Em or-blocks . +.Pp +The following match patterns can be used (listed in alphabetical order): +.Bl -tag -width indent +.It Cm // this is a comment. +Inserts the specified text as a comment in the rule. +Everything following // is considered as a comment and stored in the rule. +You can have comment-only rules, which are listed as having a +.Cm count +action followed by the comment. +.It Cm bridged +Alias for +.Cm layer2 . +.It Cm diverted +Matches only packets generated by a divert socket. +.It Cm diverted-loopback +Matches only packets coming from a divert socket back into the IP stack +input for delivery. +.It Cm diverted-output +Matches only packets going from a divert socket back outward to the IP +stack output for delivery. +.It Cm dst-ip Ar ip-address +Matches IPv4 packets whose destination IP is one of the address(es) +specified as argument. +.It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address +Matches IPv6 packets whose destination IP is one of the address(es) +specified as argument. +.It Cm dst-port Ar ports +Matches IP packets whose destination port is one of the port(s) +specified as argument. +.It Cm established +Matches TCP packets that have the RST or ACK bits set. +.It Cm ext6hdr Ar header +Matches IPv6 packets containing the extended header given by +.Ar header . +Supported headers are: +.Pp +Fragment +.Pq Cm frag , +Hop-by-Hop options +.Pq Cm hopopt , +any type of Routing Header +.Pq Cm route , +Source routing Routing Header Type 0 +.Pq Cm rthdr0 , +Mobile IPv6 Routing Header Type 2 +.Pq Cm rthdr2 , +Destination options +.Pq Cm dstopt , +IPsec authentication headers +.Pq Cm ah , +and IPsec encapsulated security payload headers +.Pq Cm esp . +.It Cm fib Ar fibnum +Matches a packet that has been tagged to use +the given FIB (routing table) number. 
+.It Cm flow Ar table Ns Pq Ar name Ns Op , Ns Ar value +Search for the flow entry in lookup table +.Ar name . +If not found, the match fails. +Otherwise, the match succeeds and +.Cm tablearg +is set to the value extracted from the table. +.Pp +This option can be useful to quickly dispatch traffic based on +certain packet fields. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.It Cm flow-id Ar labels +Matches IPv6 packets containing any of the flow labels given in +.Ar labels . +.Ar labels +is a comma separated list of numeric flow labels. +.It Cm frag +Matches packets that are fragments and not the first +fragment of an IP datagram. +Note that these packets will not have +the next protocol header (e.g.\& TCP, UDP) so options that look into +these headers cannot match. +.It Cm gid Ar group +Matches all TCP or UDP packets sent by or received for a +.Ar group . +A +.Ar group +may be specified by name or number. +.It Cm jail Ar prisonID +Matches all TCP or UDP packets sent by or received for the +jail whose prison ID is +.Ar prisonID . +.It Cm icmptypes Ar types +Matches ICMP packets whose ICMP type is in the list +.Ar types . +The list may be specified as any combination of +individual types (numeric) separated by commas. +.Em Ranges are not allowed . +The supported ICMP types are: +.Pp +echo reply +.Pq Cm 0 , +destination unreachable +.Pq Cm 3 , +source quench +.Pq Cm 4 , +redirect +.Pq Cm 5 , +echo request +.Pq Cm 8 , +router advertisement +.Pq Cm 9 , +router solicitation +.Pq Cm 10 , +time-to-live exceeded +.Pq Cm 11 , +IP header bad +.Pq Cm 12 , +timestamp request +.Pq Cm 13 , +timestamp reply +.Pq Cm 14 , +information request +.Pq Cm 15 , +information reply +.Pq Cm 16 , +address mask request +.Pq Cm 17 +and address mask reply +.Pq Cm 18 . +.It Cm icmp6types Ar types +Matches ICMP6 packets whose ICMP6 type is in the list of +.Ar types . 
+The list may be specified as any combination of +individual types (numeric) separated by commas. +.Em Ranges are not allowed . +.It Cm in | out +Matches incoming or outgoing packets, respectively. +.Cm in +and +.Cm out +are mutually exclusive (in fact, +.Cm out +is implemented as +.Cm not in Ns No ). +.It Cm ipid Ar id-list +Matches IPv4 packets whose +.Cm ip_id +field has value included in +.Ar id-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm iplen Ar len-list +Matches IP packets whose total length, including header and data, is +in the set +.Ar len-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm ipoptions Ar spec +Matches packets whose IPv4 header contains the comma separated list of +options specified in +.Ar spec . +The supported IP options are: +.Pp +.Cm ssrr +(strict source route), +.Cm lsrr +(loose source route), +.Cm rr +(record packet route) and +.Cm ts +(timestamp). +The absence of a particular option may be denoted +with a +.Ql \&! . +.It Cm ipprecedence Ar precedence +Matches IPv4 packets whose precedence field is equal to +.Ar precedence . +.It Cm ipsec +Matches packets that have IPSEC history associated with them +(i.e., the packet comes encapsulated in IPSEC, the kernel +has IPSEC support and IPSEC_FILTERTUNNEL option, and can correctly +decapsulate it). +.Pp +Note that specifying +.Cm ipsec +is different from specifying +.Cm proto Ar ipsec +as the latter will only look at the specific IP protocol field, +irrespective of IPSEC kernel support and the validity of the IPSEC data. +.Pp +Further note that this flag is silently ignored in kernels without +IPSEC support. +It does not affect rule processing when given and the +rules are handled as if with no +.Cm ipsec +flag. 
+.It Cm iptos Ar spec +Matches IPv4 packets whose +.Cm tos +field contains the comma separated list of +service types specified in +.Ar spec . +The supported IP types of service are: +.Pp +.Cm lowdelay +.Pq Dv IPTOS_LOWDELAY , +.Cm throughput +.Pq Dv IPTOS_THROUGHPUT , +.Cm reliability +.Pq Dv IPTOS_RELIABILITY , +.Cm mincost +.Pq Dv IPTOS_MINCOST , +.Cm congestion +.Pq Dv IPTOS_ECN_CE . +The absence of a particular type may be denoted +with a +.Ql \&! . +.It Cm dscp spec Ns Op , Ns Ar spec +Matches IPv4/IPv6 packets whose +.Cm DS +field value is contained in +.Ar spec +mask. +Multiple values can be specified via +the comma separated list. +Value can be one of keywords used in +.Cm setdscp +action or exact number. +.It Cm ipttl Ar ttl-list +Matches IPv4 packets whose time to live is included in +.Ar ttl-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm ipversion Ar ver +Matches IP packets whose IP version field is +.Ar ver . +.It Cm keep-state +Upon a match, the firewall will create a dynamic rule, whose +default behaviour is to match bidirectional traffic between +source and destination IP/port using the same protocol. +The rule has a limited lifetime (controlled by a set of +.Xr sysctl 8 +variables), and the lifetime is refreshed every time a matching +packet is found. +.It Cm layer2 +Matches only layer2 packets, i.e., those passed to +.Nm +from ether_demux() and ether_output_frame(). +.It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N +The firewall will only allow +.Ar N +connections with the same +set of parameters as specified in the rule. +One or more +of source and destination addresses and ports can be +specified. +Currently, +only IPv4 flows are supported. +.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar name +Search an entry in lookup table +.Ar name +that matches the field specified as argument. +If not found, the match fails. 
+Otherwise, the match succeeds and +.Cm tablearg +is set to the value extracted from the table. +.Pp +This option can be useful to quickly dispatch traffic based on +certain packet fields. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.It Cm { MAC | mac } Ar dst-mac src-mac +Match packets with a given +.Ar dst-mac +and +.Ar src-mac +addresses, specified as the +.Cm any +keyword (matching any MAC address), or six groups of hex digits +separated by colons, +and optionally followed by a mask indicating the significant bits. +The mask may be specified using either of the following methods: +.Bl -enum -width indent +.It +A slash +.Pq / +followed by the number of significant bits. +For example, an address with 33 significant bits could be specified as: +.Pp +.Dl "MAC 10:20:30:40:50:60/33 any" +.It +An ampersand +.Pq & +followed by a bitmask specified as six groups of hex digits separated +by colons. +For example, an address in which the last 16 bits are significant could +be specified as: +.Pp +.Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any" +.Pp +Note that the ampersand character has a special meaning in many shells +and should generally be escaped. +.El +Note that the order of MAC addresses (destination first, +source second) is +the same as on the wire, but the opposite of the one used for +IP addresses. +.It Cm mac-type Ar mac-type +Matches packets whose Ethernet Type field +corresponds to one of those specified as argument. +.Ar mac-type +is specified in the same way as +.Cm port numbers +(i.e., one or more comma-separated single values or ranges). +You can use symbolic names for known values such as +.Em vlan , ipv4, ipv6 . +Values can be entered as decimal or hexadecimal (if prefixed by 0x), +and they are always printed as hexadecimal (unless the +.Cm -N +option is used, in which case symbolic resolution will be attempted). +.It Cm proto Ar protocol +Matches packets with the corresponding IP protocol. 
+.It Cm recv | xmit | via Brq Ar ifX | Ar if Ns Cm * | Ar table Ns Po Ar name Ns Oo , Ns Ar value Oc Pc | Ar ipno | Ar any +Matches packets received, transmitted or going through, +respectively, the interface specified by exact name +.Po Ar ifX Pc , +by device name +.Po Ar if* Pc , +by IP address, or through some interface. +Table +.Ar name +may be used to match interface by its kernel ifindex. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.Pp +The +.Cm via +keyword causes the interface to always be checked. +If +.Cm recv +or +.Cm xmit +is used instead of +.Cm via , +then only the receive or transmit interface (respectively) +is checked. +By specifying both, it is possible to match packets based on +both receive and transmit interface, e.g.: +.Pp +.Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1" +.Pp +The +.Cm recv +interface can be tested on either incoming or outgoing packets, +while the +.Cm xmit +interface can only be tested on outgoing packets. +So +.Cm out +is required (and +.Cm in +is invalid) whenever +.Cm xmit +is used. +.Pp +A packet might not have a receive or transmit interface: packets +originating from the local host have no receive interface, +while packets destined for the local host have no transmit +interface. +.It Cm setup +Matches TCP packets that have the SYN bit set but no ACK bit. +This is the short form of +.Dq Li tcpflags\ syn,!ack . +.It Cm sockarg +Matches packets that are associated to a local socket and +for which the SO_USER_COOKIE socket option has been set +to a non-zero value. +As a side effect, the value of the +option is made available as +.Cm tablearg +value, which in turn can be used as +.Cm skipto +or +.Cm pipe +number. +.It Cm src-ip Ar ip-address +Matches IPv4 packets whose source IP is one of the address(es) +specified as an argument. +.It Cm src-ip6 Ar ip6-address +Matches IPv6 packets whose source IP is one of the address(es) +specified as an argument. 
+.It Cm src-port Ar ports +Matches IP packets whose source port is one of the port(s) +specified as argument. +.It Cm tagged Ar tag-list +Matches packets whose tags are included in +.Ar tag-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +Tags can be applied to the packet using +.Cm tag +rule action parameter (see its description for details on tags). +.It Cm tcpack Ar ack +TCP packets only. +Match if the TCP header acknowledgment number field is set to +.Ar ack . +.It Cm tcpdatalen Ar tcpdatalen-list +Matches TCP packets whose length of TCP data is +.Ar tcpdatalen-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm tcpflags Ar spec +TCP packets only. +Match if the TCP header contains the comma separated list of +flags specified in +.Ar spec . +The supported TCP flags are: +.Pp +.Cm fin , +.Cm syn , +.Cm rst , +.Cm psh , +.Cm ack +and +.Cm urg . +The absence of a particular flag may be denoted +with a +.Ql \&! . +A rule which contains a +.Cm tcpflags +specification can never match a fragmented packet which has +a non-zero offset. +See the +.Cm frag +option for details on matching fragmented packets. +.It Cm tcpseq Ar seq +TCP packets only. +Match if the TCP header sequence number field is set to +.Ar seq . +.It Cm tcpwin Ar tcpwin-list +Matches TCP packets whose header window field is set to +.Ar tcpwin-list , +which is either a single value or a list of values or ranges +specified in the same way as +.Ar ports . +.It Cm tcpoptions Ar spec +TCP packets only. +Match if the TCP header contains the comma separated list of +options specified in +.Ar spec . +The supported TCP options are: +.Pp +.Cm mss +(maximum segment size), +.Cm window +(tcp window advertisement), +.Cm sack +(selective ack), +.Cm ts +(rfc1323 timestamp) and +.Cm cc +(rfc1644 t/tcp connection count). +The absence of a particular option may be denoted +with a +.Ql \&! .
+.It Cm uid Ar user +Match all TCP or UDP packets sent by or received for a +.Ar user . +A +.Ar user +may be matched by name or identification number. +.It Cm verrevpath +For incoming packets, +a routing table lookup is done on the packet's source address. +If the interface on which the packet entered the system matches the +outgoing interface for the route, +the packet matches. +If the interfaces do not match up, +the packet does not match. +All outgoing packets or packets with no incoming interface match. +.Pp +The name and functionality of the option is intentionally similar to +the Cisco IOS command: +.Pp +.Dl ip verify unicast reverse-path +.Pp +This option can be used to make anti-spoofing rules to reject all +packets with source addresses not from this interface. +See also the option +.Cm antispoof . +.It Cm versrcreach +For incoming packets, +a routing table lookup is done on the packet's source address. +If a route to the source address exists, but not the default route +or a blackhole/reject route, the packet matches. +Otherwise, the packet does not match. +All outgoing packets match. +.Pp +The name and functionality of the option is intentionally similar to +the Cisco IOS command: +.Pp +.Dl ip verify unicast source reachable-via any +.Pp +This option can be used to make anti-spoofing rules to reject all +packets whose source address is unreachable. +.It Cm antispoof +For incoming packets, the packet's source address is checked if it +belongs to a directly connected network. +If the network is directly connected, then the interface the packet +came on in is compared to the interface the network is connected to. +When incoming interface and directly connected interface are not the +same, the packet does not match. +Otherwise, the packet does match. +All outgoing packets match. +.Pp +This option can be used to make anti-spoofing rules to reject all +packets that pretend to be from a directly connected network but do +not come in through that interface. 
+This option is similar to but more restricted than +.Cm verrevpath +because it engages only on packets with source addresses of directly +connected networks instead of all source addresses. +.El +.Sh LOOKUP TABLES +Lookup tables are useful to handle large sparse sets of +addresses or other search keys (e.g., ports, jail IDs, interface names). +In the rest of this section we will use the term ``key''. +Table name needs to match the following spec: +.Ar table-name . +Tables with the same name can be created in different +.Ar sets . +However, rule links to the tables in +.Ar set 0 +by default. +This behavior can be controlled by +.Va net.inet.ip.fw.tables_sets +variable. +See the +.Sx SETS OF RULES +section for more information. +There may be up to 65535 different lookup tables. +.Pp +The following table types are supported: +.Bl -tag -width indent +.It Ar table-type : Ar addr | iface | number | flow +.It Ar table-key : Ar addr Ns Oo / Ns Ar masklen Oc | iface-name | number | flow-spec +.It Ar flow-spec : Ar flow-field Ns Op , Ns Ar flow-spec +.It Ar flow-field : src-ip | proto | src-port | dst-ip | dst-port +.It Cm addr +matches IPv4 or IPv6 address. +Each entry is represented by an +.Ar addr Ns Op / Ns Ar masklen +and will match all addresses with base +.Ar addr +(specified as an IPv4/IPv6 address, or a hostname) and mask width of +.Ar masklen +bits. +If +.Ar masklen +is not specified, it defaults to 32 for IPv4 and 128 for IPv6. +When looking up an IP address in a table, the most specific +entry will match. +.It Cm iface +matches interface names. +Each entry is represented by string treated as interface name. +Wildcards are not supported. +.It Cm number +matches protocol ports, uids/gids or jail IDs. +Each entry is represented by 32-bit unsigned integer. +Ranges are not supported. +.It Cm flow +Matches packet fields specified by +.Ar flow +type suboptions with table entries. +.El +.Pp +Tables require explicit creation via +.Cm create +before use.
+.Pp +The following creation options are supported: +.Bl -tag -width indent +.It Ar create-options : Ar create-option | create-options +.It Ar create-option : Cm type Ar table-type | Cm valtype Ar value-mask | Cm algo Ar algo-desc | +.Cm limit Ar number | Cm locked +.It Cm type +Table key type. +.It Cm valtype +Table value mask. +.It Cm algo +Table algorithm to use (see below). +.It Cm limit +Maximum number of items that may be inserted into table. +.It Cm locked +Restrict any table modifications. +.El +.Pp +Some of these options may be modified later via +.Cm modify +keyword. +The following options can be changed: +.Bl -tag -width indent +.It Ar modify-options : Ar modify-option | modify-options +.It Ar modify-option : Cm limit Ar number +.It Cm limit +Alter maximum number of items that may be inserted into table. +.El +.Pp +Additionally, table can be locked or unlocked using +.Cm lock +or +.Cm unlock +commands. +.Pp +Tables of the same +.Ar type +can be swapped with each other using +.Cm swap Ar name +command. +Swap may fail if tables limits are set and data exchange +would result in limits hit. +Operation is performed atomically. +.Pp +One or more entries can be added to a table at once using +.Cm add +command. +Addition of all items is performed atomically. +By default, error in addition of one entry does not influence +addition of other entries. However, non-zero error code is returned +in that case. +Special +.Cm atomic +keyword may be specified before +.Cm add +to indicate all-or-none add request. +.Pp +One or more entries can be removed from a table at once using +.Cm delete +command. +By default, error in removal of one entry does not influence +removing of other entries. However, non-zero error code is returned +in that case. +.Pp +It may be possible to check what entry will be found on particular +.Ar table-key +using +.Cm lookup +.Ar table-key +command. +This functionality is optional and may be unsupported in some algorithms.
+.Pp +The following operations can be performed on +.Ar one +or +.Cm all +tables: +.Bl -tag -width indent +.It Cm list +List all entries. +.It Cm flush +Removes all entries. +.It Cm info +Shows generic table information. +.It Cm detail +Shows generic table information and algo-specific data. +.El +.Pp +The following lookup algorithms are supported: +.Bl -tag -width indent +.It Ar algo-desc : algo-name | "algo-name algo-data" +.It Ar algo-name: Ar addr:radix | addr:hash | iface:array | number:array | flow:hash +.It Cm addr:radix +Separate Radix trees for IPv4 and IPv6, the same way as the routing table (see +.Xr route 4 ) . +Default choice for +.Ar addr +type. +.It Cm addr:hash +Separate auto-growing hashes for IPv4 and IPv6. +Accepts entries with the same mask length specified initially via +.Cm "addr:hash masks=/v4,/v6" +algorithm creation options. +Assume /32 and /128 masks by default. +Search removes host bits (according to mask) from supplied address and checks +resulting key in appropriate hash. +Mostly optimized for /64 and byte-ranged IPv6 masks. +.It Cm iface:array +Array storing sorted indexes for entries which are presented in the system. +Optimized for very fast lookup. +.It Cm number:array +Array storing sorted u32 numbers. +.It Cm flow:hash +Auto-growing hash storing flow entries. +Search calculates hash on required packet fields and searches for matching +entries in selected bucket. +.El +.Pp +The +.Cm tablearg +feature provides the ability to use a value, looked up in the table, as +the argument for a rule action, action parameter or rule option. +This can significantly reduce number of rules in some configurations. +If two tables are used in a rule, the result of the second (destination) +is used. +.Pp +Each record may hold one or more values according to +.Ar value-mask . +This mask is set on table creation via +.Cm valtype +option. 
+The following value types are supported: +.Bl -tag -width indent +.It Ar value-mask : Ar value-type Ns Op , Ns Ar value-mask +.It Ar value-type : Ar skipto | pipe | fib | nat | dscp | tag | divert | +.Ar netgraph | limit | ipv4 +.It Cm skipto +rule number to jump to. +.It Cm pipe +Pipe number to use. +.It Cm fib +fib number to match/set. +.It Cm nat +nat number to jump to. +.It Cm dscp +dscp value to match/set. +.It Cm tag +tag number to match/set. +.It Cm divert +port number to divert traffic to. +.It Cm netgraph +hook number to move packet to. +.It Cm limit +maximum number of connections. +.It Cm ipv4 +IPv4 nexthop to fwd packets to. +.El +.Pp +The +.Cm tablearg +argument can be used with the following actions: +.Cm nat, pipe , queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib, +action parameters: +.Cm tag, untag, +rule options: +.Cm limit, tagged. +.Pp +When used with the +.Cm skipto +action, the user should be aware that the code will walk the ruleset +up to a rule equal to, or past, the given number. +.Pp +See the +.Sx EXAMPLES +Section for example usage of tables and the tablearg keyword. +.Sh SETS OF RULES +Each rule or table belongs to one of 32 different +.Em sets +, numbered 0 to 31. +Set 31 is reserved for the default rule. +.Pp +By default, rules or tables are put in set 0, unless you use the +.Cm set N +attribute when adding a new rule or table. +Sets can be individually and atomically enabled or disabled, +so this mechanism permits an easy way to store multiple configurations +of the firewall and quickly (and atomically) switch between them. +.Pp +By default, tables from set 0 are referenced when adding rule with +table opcodes regardless of rule set. +This behavior can be changed by setting +.Va net.inet.ip.fw.tables_sets +variable to 1. +Rule's set will then be used for table references. +.Pp +The command to enable/disable sets is +.Bd -ragged -offset indent +.Nm +.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
+.Ed +.Pp +where multiple +.Cm enable +or +.Cm disable +sections can be specified. +Command execution is atomic on all the sets specified in the command. +By default, all sets are enabled. +.Pp +When you disable a set, its rules behave as if they do not exist +in the firewall configuration, with only one exception: +.Bd -ragged -offset indent +dynamic rules created from a rule before it had been disabled +will still be active until they expire. +In order to delete +dynamic rules you have to explicitly delete the parent rule +which generated them. +.Ed +.Pp +The set number of rules can be changed with the command +.Bd -ragged -offset indent +.Nm +.Cm set move +.Brq Cm rule Ar rule-number | old-set +.Cm to Ar new-set +.Ed +.Pp +Also, you can atomically swap two rulesets with the command +.Bd -ragged -offset indent +.Nm +.Cm set swap Ar first-set second-set +.Ed +.Pp +See the +.Sx EXAMPLES +Section on some possible uses of sets of rules. +.Sh STATEFUL FIREWALL +Stateful operation is a way for the firewall to dynamically +create rules for specific flows when packets that +match a given pattern are detected. +Support for stateful +operation comes through the +.Cm check-state , keep-state +and +.Cm limit +options of +.Nm rules . +.Pp +Dynamic rules are created when a packet matches a +.Cm keep-state +or +.Cm limit +rule, causing the creation of a +.Em dynamic +rule which will match all and only packets with +a given +.Em protocol +between a +.Em src-ip/src-port dst-ip/dst-port +pair of addresses +.Em ( src +and +.Em dst +are used here only to denote the initial match addresses, but they +are completely equivalent afterwards). +Dynamic rules will be checked at the first +.Cm check-state, keep-state +or +.Cm limit +occurrence, and the action performed upon a match will be the same +as in the parent rule. +.Pp +Note that no additional attributes other than protocol and IP addresses +and ports are checked on dynamic rules. 
+.Pp +The typical use of dynamic rules is to keep a closed firewall configuration, +but let the first TCP SYN packet from the inside network install a +dynamic rule for the flow so that packets belonging to that session +will be allowed through the firewall: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add allow tcp from my-subnet to any setup keep-state" +.Dl "ipfw add deny tcp from any to any" +.Pp +A similar approach can be used for UDP, where an UDP packet coming +from the inside will install a dynamic rule to let the response through +the firewall: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add allow udp from my-subnet to any keep-state" +.Dl "ipfw add deny udp from any to any" +.Pp +Dynamic rules expire after some time, which depends on the status +of the flow and the setting of some +.Cm sysctl +variables. +See Section +.Sx SYSCTL VARIABLES +for more details. +For TCP sessions, dynamic rules can be instructed to periodically +send keepalive packets to refresh the state of the rule when it is +about to expire. +.Pp +See Section +.Sx EXAMPLES +for more examples on how to use dynamic rules. +.Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION +.Nm +is also the user interface for the +.Nm dummynet +traffic shaper, packet scheduler and network emulator, a subsystem that +can artificially queue, delay or drop packets +emulating the behaviour of certain network links +or queueing systems. +.Pp +.Nm dummynet +operates by first using the firewall to select packets +using any match pattern that can be used in +.Nm +rules. +Matching packets are then passed to either of two +different objects, which implement the traffic regulation: +.Bl -hang -offset XXXX +.It Em pipe +A +.Em pipe +emulates a +.Em link +with given bandwidth and propagation delay, +driven by a FIFO scheduler and a single queue with programmable +queue size and packet loss rate. 
+Packets are appended to the queue as they come out from +.Nm ipfw , +and then transferred in FIFO order to the link at the desired rate. +.It Em queue +A +.Em queue +is an abstraction used to implement packet scheduling +using one of several packet scheduling algorithms. +Packets sent to a +.Em queue +are first grouped into flows according to a mask on the 5-tuple. +Flows are then passed to the scheduler associated to the +.Em queue , +and each flow uses scheduling parameters (weight and others) +as configured in the +.Em queue +itself. +A scheduler in turn is connected to an emulated link, +and arbitrates the link's bandwidth among backlogged flows according to +weights and to the features of the scheduling algorithm in use. +.El +.Pp +In practice, +.Em pipes +can be used to set hard limits to the bandwidth that a flow can use, whereas +.Em queues +can be used to determine how different flows share the available bandwidth. +.Pp +A graphical representation of the binding of queues, +flows, schedulers and links is below. +.Bd -literal -offset indent + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ +.Ed +It is important to understand the role of the SCHED_MASK +and FLOW_MASK, which are configured through the commands +.Dl "ipfw sched N config mask SCHED_MASK ..." +and +.Dl "ipfw queue X config mask FLOW_MASK ..." . +.Pp +The SCHED_MASK is used to assign flows to one or more +scheduler instances, one for each +value of the packet's 5-tuple after applying SCHED_MASK. +As an example, using ``src-ip 0xffffff00'' creates one instance +for each /24 source subnet.
+.Pp +The FLOW_MASK, together with the SCHED_MASK, is used to split +packets into flows. +As an example, using +``src-ip 0x000000ff'' +together with the previous SCHED_MASK makes a flow for +each individual source address. +In turn, flows for each /24 +subnet will be sent to the same scheduler instance. +.Pp +The above diagram holds even for the +.Em pipe +case, with the only restriction that a +.Em pipe +only supports a SCHED_MASK, and forces the use of a FIFO +scheduler (these are for backward compatibility reasons; +in fact, internally, a +.Nm dummynet's +pipe is implemented exactly as above). +.Pp +There are two modes of +.Nm dummynet +operation: +.Dq normal +and +.Dq fast . +The +.Dq normal +mode tries to emulate a real link: the +.Nm dummynet +scheduler ensures that the packet will not leave the pipe faster than it +would on the real link with a given bandwidth. +The +.Dq fast +mode allows certain packets to bypass the +.Nm dummynet +scheduler (if packet flow does not exceed pipe's bandwidth). +This is the reason why the +.Dq fast +mode requires less CPU cycles per packet (on average) and packet latency +can be significantly lower in comparison to a real link with the same +bandwidth. +The default mode is +.Dq normal . +The +.Dq fast +mode can be enabled by setting the +.Va net.inet.ip.dummynet.io_fast +.Xr sysctl 8 +variable to a non-zero value. +.Pp +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION +The +.Em pipe , +.Em queue +and +.Em scheduler +configuration commands are the following: +.Bd -ragged -offset indent +.Cm pipe Ar number Cm config Ar pipe-configuration +.Pp +.Cm queue Ar number Cm config Ar queue-configuration +.Pp +.Cm sched Ar number Cm config Ar sched-configuration +.Ed +.Pp +The following parameters can be configured for a pipe: +.Pp +.Bl -tag -width indent -compact +.It Cm bw Ar bandwidth | device +Bandwidth, measured in +.Sm off +.Op Cm K | M +.Brq Cm bit/s | Byte/s . +.Sm on +.Pp +A value of 0 (default) means unlimited bandwidth. 
+The unit must immediately follow the number, as in +.Pp +.Dl "ipfw pipe 1 config bw 300Kbit/s" +.Pp +If a device name is specified instead of a numeric value, as in +.Pp +.Dl "ipfw pipe 1 config bw tun0" +.Pp +then the transmit clock is supplied by the specified device. +At the moment only the +.Xr tun 4 +device supports this +functionality, for use in conjunction with +.Xr ppp 8 . +.Pp +.It Cm delay Ar ms-delay +Propagation delay, measured in milliseconds. +The value is rounded to the next multiple of the clock tick +(typically 10ms, but it is a good practice to run kernels +with +.Dq "options HZ=1000" +to reduce +the granularity to 1ms or less). +The default value is 0, meaning no delay. +.Pp +.It Cm burst Ar size +If the data to be sent exceeds the pipe's bandwidth limit +(and the pipe was previously idle), up to +.Ar size +bytes of data are allowed to bypass the +.Nm dummynet +scheduler, and will be sent as fast as the physical link allows. +Any additional data will be transmitted at the rate specified +by the +.Nm pipe +bandwidth. +The burst size depends on how long the pipe has been idle; +the effective burst size is calculated as follows: +MAX( +.Ar size +, +.Nm bw +* pipe_idle_time). +.Pp +.It Cm profile Ar filename +A file specifying the additional overhead incurred in the transmission +of a packet on the link. +.Pp +Some link types introduce extra delays in the transmission +of a packet, e.g., because of MAC level framing, contention on +the use of the channel, MAC level retransmissions and so on. +From our point of view, the channel is effectively unavailable +for this extra time, which is constant or variable depending +on the link type. +Additionally, packets may be dropped after this +time (e.g., on a wireless link after too many retransmissions). +We can model the additional delay with an empirical curve +that represents its distribution. 
+.Bd -literal -offset indent + cumulative probability + 1.0 ^ + | + L +-- loss-level x + | ****** + | * + | ***** + | * + | ** + | * + +-------*-------------------> + delay +.Ed +The empirical curve may have both vertical and horizontal lines. +Vertical lines represent constant delay for a range of +probabilities. +Horizontal lines correspond to a discontinuity in the delay +distribution: the pipe will use the largest delay for a +given probability. +.Pp +The file format is the following, with whitespace acting as +a separator and '#' indicating the beginning of a comment: +.Bl -tag -width indent +.It Cm name Ar identifier +optional name (listed by "ipfw pipe show") +to identify the delay distribution; +.It Cm bw Ar value +the bandwidth used for the pipe. +If not specified here, it must be present +explicitly as a configuration parameter for the pipe; +.It Cm loss-level Ar L +the probability above which packets are lost. +(0.0 <= L <= 1.0, default 1.0 i.e., no loss); +.It Cm samples Ar N +the number of samples used in the internal +representation of the curve (2..1024; default 100); +.It Cm "delay prob" | "prob delay" +One of these two lines is mandatory and defines +the format of the following lines with data points. +.It Ar XXX Ar YYY +2 or more lines representing points in the curve, +with either delay or probability first, according +to the chosen format. +The unit for delay is milliseconds. +Data points do not need to be sorted. +Also, the number of actual lines can be different +from the value of the "samples" parameter: +.Nm +utility will sort and interpolate +the curve as needed.
+.El +.Pp +Example of a profile file: +.Bd -literal -offset indent +name bla_bla_bla +samples 100 +loss-level 0.86 +prob delay +0 200 # minimum overhead is 200ms +0.5 200 +0.5 300 +0.8 1000 +0.9 1300 +1 1300 +#configuration file end +.Ed +.El +.Pp +The following parameters can be configured for a queue: +.Pp +.Bl -tag -width indent -compact +.It Cm pipe Ar pipe_nr +Connects a queue to the specified pipe. +Multiple queues (with the same or different weights) can be connected to +the same pipe, which specifies the aggregate rate for the set of queues. +.Pp +.It Cm weight Ar weight +Specifies the weight to be used for flows matching this queue. +The weight must be in the range 1..100, and defaults to 1. +.El +.Pp +The following case-insensitive parameters can be configured for a +scheduler: +.Pp +.Bl -tag -width indent -compact +.It Cm type Ar {fifo | wf2q+ | rr | qfq} +specifies the scheduling algorithm to use. +.Bl -tag -width indent -compact +.It Cm fifo +is just a FIFO scheduler (which means that all packets +are stored in the same queue as they arrive to the scheduler). +FIFO has O(1) per-packet time complexity, with very low +constants (estimate 60-80ns on a 2GHz desktop machine) +but gives no service guarantees. +.It Cm wf2q+ +implements the WF2Q+ algorithm, which is a Weighted Fair Queueing +algorithm which permits flows to share bandwidth according to +their weights. +Note that weights are not priorities; even a flow +with a minuscule weight will never starve. +WF2Q+ has O(log N) per-packet processing cost, where N is the number +of flows, and is the default algorithm used by previous versions +of dummynet's queues. +.It Cm rr +implements the Deficit Round Robin algorithm, which has O(1) processing +costs (roughly, 100-150ns per packet) +and permits bandwidth allocation according to weights, but +with poor service guarantees.
+.It Cm qfq +implements the QFQ algorithm, which is a very fast variant of +WF2Q+, with similar service guarantees and O(1) processing +costs (roughly, 200-250ns per packet). +.El +.El +.Pp +In addition to the type, all parameters allowed for a pipe can also +be specified for a scheduler. +.Pp +Finally, the following parameters can be configured for both +pipes and queues: +.Pp +.Bl -tag -width XXXX -compact +.It Cm buckets Ar hash-table-size +Specifies the size of the hash table used for storing the +various queues. +Default value is 64 controlled by the +.Xr sysctl 8 +variable +.Va net.inet.ip.dummynet.hash_size , +allowed range is 16 to 65536. +.Pp +.It Cm mask Ar mask-specifier +Packets sent to a given pipe or queue by an +.Nm +rule can be further classified into multiple flows, each of which is then +sent to a different +.Em dynamic +pipe or queue. +A flow identifier is constructed by masking the IP addresses, +ports and protocol types as specified with the +.Cm mask +options in the configuration of the pipe or queue. +For each different flow identifier, a new pipe or queue is created +with the same parameters as the original object, and matching packets +are sent to it. +.Pp +Thus, when +.Em dynamic pipes +are used, each flow will get the same bandwidth as defined by the pipe, +whereas when +.Em dynamic queues +are used, each flow will share the parent's pipe bandwidth evenly +with other flows generated by the same queue (note that other queues +with different weights might be connected to the same pipe). +.br +Available mask specifiers are a combination of one or more of the following: +.Pp +.Cm dst-ip Ar mask , +.Cm dst-ip6 Ar mask , +.Cm src-ip Ar mask , +.Cm src-ip6 Ar mask , +.Cm dst-port Ar mask , +.Cm src-port Ar mask , +.Cm flow-id Ar mask , +.Cm proto Ar mask +or +.Cm all , +.Pp +where the latter means all bits in all fields are significant. 
+.Pp +.It Cm noerror +When a packet is dropped by a +.Nm dummynet +queue or pipe, the error +is normally reported to the caller routine in the kernel, in the +same way as it happens when a device queue fills up. +Setting this +option reports the packet as successfully delivered, which can be +needed for some experimental setups where you want to simulate +loss or congestion at a remote router. +.Pp +.It Cm plr Ar packet-loss-rate +Packet loss rate. +Argument +.Ar packet-loss-rate +is a floating-point number between 0 and 1, with 0 meaning no +loss, 1 meaning 100% loss. +The loss rate is internally represented on 31 bits. +.Pp +.It Cm queue Brq Ar slots | size Ns Cm Kbytes +Queue size, in +.Ar slots +or +.Cm KBytes . +Default value is 50 slots, which +is the typical queue size for Ethernet devices. +Note that for slow speed links you should keep the queue +size short or your traffic might be affected by a significant +queueing delay. +E.g., 50 max-sized ethernet packets (1500 bytes) mean 600Kbit +or 20s of queue on a 30Kbit/s pipe. +Even worse effects can result if you get packets from an +interface with a much larger MTU, e.g.\& the loopback interface +with its 16KB packets. +The +.Xr sysctl 8 +variables +.Em net.inet.ip.dummynet.pipe_byte_limit +and +.Em net.inet.ip.dummynet.pipe_slot_limit +control the maximum lengths that can be specified. +.Pp +.It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p +[ecn] +Make use of the RED (Random Early Detection) queue management algorithm. +.Ar w_q +and +.Ar max_p +are floating +point numbers between 0 and 1 (inclusive), while +.Ar min_th +and +.Ar max_th +are integer numbers specifying thresholds for queue management +(thresholds are computed in bytes if the queue has been defined +in bytes, in slots otherwise). +The two parameters can also be of the same value if needed. The +.Nm dummynet +also supports the gentle RED variant (gred) and ECN (Explicit Congestion +Notification) as optional. 
Three +.Xr sysctl 8 +variables can be used to control the RED behaviour: +.Bl -tag -width indent +.It Va net.inet.ip.dummynet.red_lookup_depth +specifies the accuracy in computing the average queue +when the link is idle (defaults to 256, must be greater than zero) +.It Va net.inet.ip.dummynet.red_avg_pkt_size +specifies the expected average packet size (defaults to 512, must be +greater than zero) +.It Va net.inet.ip.dummynet.red_max_pkt_size +specifies the expected maximum packet size, only used when queue +thresholds are in bytes (defaults to 1500, must be greater than zero). +.El +.El +.Pp +When used with IPv6 data, +.Nm dummynet +currently has several limitations. +Information necessary to route link-local packets to an +interface is not available after processing by +.Nm dummynet +so those packets are dropped in the output path. +Care should be taken to ensure that link-local packets are not passed to +.Nm dummynet . +.Sh CHECKLIST +Here are some important points to consider when designing your +rules: +.Bl -bullet +.It +Remember that you filter both packets going +.Cm in +and +.Cm out . +Most connections need packets going in both directions. +.It +Remember to test very carefully. +It is a good idea to be near the console when doing this. +If you cannot be near the console, +use an auto-recovery script such as the one in +.Pa /usr/share/examples/ipfw/change_rules.sh . +.It +Do not forget the loopback interface. +.El +.Sh FINE POINTS +.Bl -bullet +.It +There are circumstances where fragmented datagrams are unconditionally +dropped. +TCP packets are dropped if they do not contain at least 20 bytes of +TCP header, UDP packets are dropped if they do not contain a full 8 +byte UDP header, and ICMP packets are dropped if they do not contain +4 bytes of ICMP header, enough to specify the ICMP type, code, and +checksum. +These packets are simply logged as +.Dq pullup failed +since there may not be enough good data in the packet to produce a +meaningful log entry. 
+.It +Another type of packet is unconditionally dropped, a TCP packet with a +fragment offset of one. +This is a valid packet, but it only has one use, to try +to circumvent firewalls. +When logging is enabled, these packets are +reported as being dropped by rule -1. +.It +If you are logged in over a network, loading the +.Xr kld 4 +version of +.Nm +is probably not as straightforward as you would think. +The following command line is recommended: +.Bd -literal -offset indent +kldload ipfw && \e +ipfw add 32000 allow ip from any to any +.Ed +.Pp +Along the same lines, doing an +.Bd -literal -offset indent +ipfw flush +.Ed +.Pp +in similar surroundings is also a bad idea. +.It +The +.Nm +filter list may not be modified if the system security level +is set to 3 or higher +(see +.Xr init 8 +for information on system security levels). +.El +.Sh PACKET DIVERSION +A +.Xr divert 4 +socket bound to the specified port will receive all packets +diverted to that port. +If no socket is bound to the destination port, or if the divert module is +not loaded, or if the kernel was not compiled with divert socket support, +the packets are dropped. +.Sh NETWORK ADDRESS TRANSLATION (NAT) +.Nm +supports in-kernel NAT using the kernel version of +.Xr libalias 3 . +.Pp +The nat configuration command is the following: +.Bd -ragged -offset indent +.Bk -words +.Cm nat +.Ar nat_number +.Cm config +.Ar nat-configuration +.Ek +.Ed +.Pp +The following parameters can be configured: +.Bl -tag -width indent +.It Cm ip Ar ip_address +Define an ip address to use for aliasing. +.It Cm if Ar nic +Use ip address of NIC for aliasing, dynamically changing +it if NIC's ip address changes. +.It Cm log +Enable logging on this nat instance. +.It Cm deny_in +Deny any incoming connection from outside world. +.It Cm same_ports +Try to leave the alias port numbers unchanged from +the actual local port numbers. 
+.It Cm unreg_only +Traffic on the local network not originating from an +unregistered address space will be ignored. +.It Cm reset +Reset table of the packet aliasing engine on address change. +.It Cm reverse +Reverse the way libalias handles aliasing. +.It Cm proxy_only +Obey transparent proxy rules only, packet aliasing is not performed. +.It Cm skip_global +Skip instance in case of global state lookup (see below). +.El +.Pp +Some special values can be supplied instead of +.Va nat_number : +.Bl -tag -width indent +.It Cm global +Looks up translation state in all configured nat instances. +If an entry is found, packet is aliased according to that entry. +If no entry was found in any of the instances, packet is passed unchanged, +and no new entry will be created. +See section +.Sx MULTIPLE INSTANCES +in +.Xr natd 8 +for more information. +.It Cm tablearg +Uses argument supplied in lookup table. +See +.Sx LOOKUP TABLES +section below for more information on lookup tables. +.El +.Pp +To let the packet continue after being (de)aliased, set the sysctl variable +.Va net.inet.ip.fw.one_pass +to 0. +For more information about aliasing modes, refer to +.Xr libalias 3 . +See Section +.Sx EXAMPLES +for some examples about nat usage. +.Ss REDIRECT AND LSNAT SUPPORT IN IPFW +Redirect and LSNAT support follow closely the syntax used in +.Xr natd 8 . +See Section +.Sx EXAMPLES +for some examples on how to do redirect and lsnat. +.Ss SCTP NAT SUPPORT +SCTP nat can be configured in a similar manner to TCP through the +.Nm +command line tool. +The main difference is that +.Nm sctp nat +does not do port translation. +Since the local and global side ports will be the same, +there is no need to specify both. 
+Ports are redirected as follows: +.Bd -ragged -offset indent +.Bk -words +.Cm nat +.Ar nat_number +.Cm config if +.Ar nic +.Cm redirect_port sctp +.Ar ip_address [,addr_list] {[port | port-port] [,ports]} +.Ek +.Ed +.Pp +Most +.Nm sctp nat +configuration can be done in real-time through the +.Xr sysctl 8 +interface. +All may be changed dynamically, though the hash_table size will only +change for new +.Nm nat +instances. +See +.Sx SYSCTL VARIABLES +for more info. +.Sh LOADER TUNABLES +Tunables can be set in +.Xr loader 8 +prompt, +.Xr loader.conf 5 +or +.Xr kenv 1 +before ipfw module gets loaded. +.Bl -tag -width indent +.It Va net.inet.ip.fw.default_to_accept: No 0 +Defines ipfw last rule behavior. +This value overrides +.Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)" +from kernel configuration file. +.It Va net.inet.ip.fw.tables_max: No 128 +Defines number of tables available in ipfw. +Number cannot exceed 65534. +.El +.Sh SYSCTL VARIABLES +A set of +.Xr sysctl 8 +variables controls the behaviour of the firewall and +associated modules +.Pq Nm dummynet , bridge , sctp nat . +These are shown below together with their default value +(but always check with the +.Xr sysctl 8 +command what value is actually in use) and meaning: +.Bl -tag -width indent +.It Va net.inet.ip.alias.sctp.accept_global_ootb_addip: No 0 +Defines how the +.Nm nat +responds to receipt of global OOTB ASCONF-AddIP: +.Bl -tag -width indent +.It Cm 0 +No response (unless a partially matching association exists - +ports and vtags match but global address does not) +.It Cm 1 +.Nm nat +will accept and process all OOTB global AddIP messages. +.El +.Pp +Option 1 should never be selected as this forms a security risk. +An attacker can +establish multiple fake associations by sending AddIP messages. +.It Va net.inet.ip.alias.sctp.chunk_proc_limit: No 5 +Defines the maximum number of chunks in an SCTP packet that will be +parsed for a +packet that matches an existing association. 
+This value is enforced to be greater than or equal to +.Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit . +A high value is +a DoS risk yet setting too low a value may result in +important control chunks in +the packet not being located and parsed. +.It Va net.inet.ip.alias.sctp.error_on_ootb: No 1 +Defines when the +.Nm nat +responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets. +An OOTB packet is a packet that arrives with no existing association +registered in the +.Nm nat +and is not an INIT or ASCONF-AddIP packet: +.Bl -tag -width indent +.It Cm 0 +ErrorM is never sent in response to OOTB packets. +.It Cm 1 +ErrorM is only sent to OOTB packets received on the local side. +.It Cm 2 +ErrorM is sent to the local side and on the global side ONLY if there is a +partial match (ports and vtags match but the source global IP does not). +This value is only useful if the +.Nm nat +is tracking global IP addresses. +.It Cm 3 +ErrorM is sent in response to all OOTB packets on both +the local and global side +(DoS risk). +.El +.Pp +At the moment the default is 0, since the ErrorM packet is not yet +supported by most SCTP stacks. +When it is supported, and if not tracking +global addresses, we recommend setting this value to 1 to allow +multi-homed local hosts to function with the +.Nm nat . +To track global addresses, we recommend setting this value to 2 to +allow global hosts to be informed when they need to (re)send an +ASCONF-AddIP. +Value 3 should never be chosen (except for debugging) as the +.Nm nat +will respond to all OOTB global packets (a DoS risk). +.It Va net.inet.ip.alias.sctp.hashtable_size: No 2003 +Size of hash tables used for +.Nm nat +lookups (100 < prime_number < 1000001). +This value sets the +.Nm hash table +size for any future created +.Nm nat +instance and therefore must be set prior to creating a +.Nm nat +instance. +The table sizes may be changed to suit specific needs. 
+If there will be few +concurrent associations, and memory is scarce, you may make these smaller. +If there will be many thousands (or millions) of concurrent associations, you +should make these larger. +A prime number is best for the table size. +The sysctl +update function will adjust your input value to the next highest prime number. +.It Va net.inet.ip.alias.sctp.holddown_time: No 0 +Hold association in table for this many seconds after receiving a +SHUTDOWN-COMPLETE. +This allows endpoints to correct shutdown gracefully if a +shutdown_complete is lost and retransmissions are required. +.It Va net.inet.ip.alias.sctp.init_timer: No 15 +Timeout value while waiting for (INIT-ACK|AddIP-ACK). +This value cannot be 0. +.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit: No 2 +Defines the maximum number of chunks in an SCTP packet that will be parsed when +no existing association exists that matches that packet. +Ideally this packet +will only be an INIT or ASCONF-AddIP packet. +A higher value may become a DoS +risk as malformed packets can consume processing resources. +.It Va net.inet.ip.alias.sctp.param_proc_limit: No 25 +Defines the maximum number of parameters within a chunk that will be +parsed in a +packet. +As for other similar sysctl variables, larger values pose a DoS risk. +.It Va net.inet.ip.alias.sctp.log_level: No 0 +Level of detail in the system log messages (0 \- minimal, 1 \- event, +2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug). +May be a good +option in high loss environments. +.It Va net.inet.ip.alias.sctp.shutdown_time: No 15 +Timeout value while waiting for SHUTDOWN-COMPLETE. +This value cannot be 0. 
+.It Va net.inet.ip.alias.sctp.track_global_addresses: No 0 +Enables/disables global IP address tracking within the +.Nm nat +and places an +upper limit on the number of addresses tracked for each association: +.Bl -tag -width indent +.It Cm 0 +Global tracking is disabled +.It Cm >1 +Enables tracking, the maximum number of addresses tracked for each +association is limited to this value +.El +.Pp +This variable is fully dynamic, the new value will be adopted for all newly +arriving associations, existing associations are treated +as they were previously. +Global tracking will decrease the number of collisions within the +.Nm nat +at a cost +of increased processing load, memory usage, complexity, and possible +.Nm nat +state +problems in complex networks with multiple +.Nm nats . +We recommend not tracking +global IP addresses, this will still result in a fully functional +.Nm nat . +.It Va net.inet.ip.alias.sctp.up_timer: No 300 +Timeout value to keep an association up with no traffic. +This value cannot be 0. +.It Va net.inet.ip.dummynet.expire : No 1 +Lazily delete dynamic pipes/queue once they have no pending traffic. +You can disable this by setting the variable to 0, in which case +the pipes/queues will only be deleted when the threshold is reached. +.It Va net.inet.ip.dummynet.hash_size : No 64 +Default size of the hash table used for dynamic pipes/queues. +This value is used when no +.Cm buckets +option is specified when configuring a pipe/queue. +.It Va net.inet.ip.dummynet.io_fast : No 0 +If set to a non-zero value, +the +.Dq fast +mode of +.Nm dummynet +operation (see above) is enabled. +.It Va net.inet.ip.dummynet.io_pkt +Number of packets passed to +.Nm dummynet . +.It Va net.inet.ip.dummynet.io_pkt_drop +Number of packets dropped by +.Nm dummynet . +.It Va net.inet.ip.dummynet.io_pkt_fast +Number of packets bypassed by the +.Nm dummynet +scheduler. 
+.It Va net.inet.ip.dummynet.max_chain_len : No 16 +Target value for the maximum number of pipes/queues in a hash bucket. +The product +.Cm max_chain_len*hash_size +is used to determine the threshold over which empty pipes/queues +will be expired even when +.Cm net.inet.ip.dummynet.expire=0 . +.It Va net.inet.ip.dummynet.red_lookup_depth : No 256 +.It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512 +.It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500 +Parameters used in the computations of the drop probability +for the RED algorithm. +.It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576 +.It Va net.inet.ip.dummynet.pipe_slot_limit : No 100 +The maximum queue size that can be specified in bytes or packets. +These limits prevent accidental exhaustion of resources such as mbufs. +If you raise these limits, +you should make sure the system is configured so that sufficient resources +are available. +.It Va net.inet.ip.fw.autoinc_step : No 100 +Delta between rule numbers when auto-generating them. +The value must be in the range 1..1000. +.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets +The current number of buckets in the hash table for dynamic rules +(read-only). +.It Va net.inet.ip.fw.debug : No 1 +Controls debugging messages produced by +.Nm . +.It Va net.inet.ip.fw.default_rule : No 65535 +The default rule number (read-only). +By the design of +.Nm , +the default rule is the last one, so its number +can also serve as the highest number allowed for a rule. +.It Va net.inet.ip.fw.dyn_buckets : No 256 +The number of buckets in the hash table for dynamic rules. +Must be a power of 2, up to 65536. +It only takes effect when all dynamic rules have expired, so you +are advised to use a +.Cm flush +command to make sure that the hash table is resized. +.It Va net.inet.ip.fw.dyn_count : No 3 +Current number of dynamic rules +(read-only). 
+.It Va net.inet.ip.fw.dyn_keepalive : No 1 +Enables generation of keepalive packets for +.Cm keep-state +rules on TCP sessions. +A keepalive is generated to both +sides of the connection every 5 seconds for the last 20 +seconds of the lifetime of the rule. +.It Va net.inet.ip.fw.dyn_max : No 8192 +Maximum number of dynamic rules. +When you hit this limit, no more dynamic rules can be +installed until old ones expire. +.It Va net.inet.ip.fw.dyn_ack_lifetime : No 300 +.It Va net.inet.ip.fw.dyn_syn_lifetime : No 20 +.It Va net.inet.ip.fw.dyn_fin_lifetime : No 1 +.It Va net.inet.ip.fw.dyn_rst_lifetime : No 1 +.It Va net.inet.ip.fw.dyn_udp_lifetime : No 5 +.It Va net.inet.ip.fw.dyn_short_lifetime : No 30 +These variables control the lifetime, in seconds, of dynamic +rules. +Upon the initial SYN exchange the lifetime is kept short, +then increased after both SYN have been seen, then decreased +again during the final FIN exchange or when a RST is received. +Both +.Em dyn_fin_lifetime +and +.Em dyn_rst_lifetime +must be strictly lower than 5 seconds, the period of +repetition of keepalives. +The firewall enforces that. +.It Va net.inet.ip.fw.dyn_keep_states: No 0 +Keep dynamic states on rule/set deletion. +States are relinked to default rule (65535). +This can be handy for ruleset reload. +Turned off by default. +.It Va net.inet.ip.fw.enable : No 1 +Enables the firewall. +Setting this variable to 0 lets you run your machine without +firewall even if compiled in. +.It Va net.inet6.ip6.fw.enable : No 1 +provides the same functionality as above for the IPv6 case. +.It Va net.inet.ip.fw.one_pass : No 1 +When set, the packet exiting from the +.Nm dummynet +pipe or from +.Xr ng_ipfw 4 +node is not passed through the firewall again. +Otherwise, after an action, the packet is +reinjected into the firewall at the next rule. +.It Va net.inet.ip.fw.tables_max : No 128 +Maximum number of tables. +.It Va net.inet.ip.fw.verbose : No 1 +Enables verbose messages. 
+.It Va net.inet.ip.fw.verbose_limit : No 0 +Limits the number of messages produced by a verbose firewall. +.It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1 +If enabled, packets with unknown IPv6 Extension Headers will be denied. +.It Va net.link.ether.ipfw : No 0 +Controls whether layer-2 packets are passed to +.Nm . +Default is no. +.It Va net.link.bridge.ipfw : No 0 +Controls whether bridged packets are passed to +.Nm . +Default is no. +.El +.Sh INTERNAL DIAGNOSTICS +There are some commands that may be useful to understand current state +of certain subsystems inside kernel module. +These commands provide debugging output which may change without notice. +.Pp +Currently the following commands are available as +.Cm internal +sub-options: +.Bl -tag -width indent +.It Cm iflist +Lists all interfaces which are currently tracked by +.Nm +with their in-kernel status. +.It Cm talist +Lists all table lookup algorithms currently available. +.El +.Sh EXAMPLES +There are far too many possible uses of +.Nm +so this Section will only give a small set of examples. +.Pp +.Ss BASIC PACKET FILTERING +This command adds an entry which denies all tcp packets from +.Em cracker.evil.org +to the telnet port of +.Em wolf.tambov.su +from being forwarded by the host: +.Pp +.Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet" +.Pp +This one disallows any connection from the entire cracker's +network to my host: +.Pp +.Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org" +.Pp +A first and efficient way to limit access (not using dynamic rules) +is the use of the following rules: +.Pp +.Dl "ipfw add allow tcp from any to any established" +.Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup" +.Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup" +.Dl "..." 
+.Dl "ipfw add deny tcp from any to any" +.Pp +The first rule will be a quick match for normal TCP packets, +but it will not match the initial SYN packet, which will be +matched by the +.Cm setup +rules only for selected source/destination pairs. +All other SYN packets will be rejected by the final +.Cm deny +rule. +.Pp +If you administer one or more subnets, you can take advantage +of the address sets and or-blocks and write extremely +compact rulesets which selectively enable services to blocks +of clients, as below: +.Pp +.Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q" +.Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q" +.Dl "" +.Dl "ipfw add allow ip from ${goodguys} to any" +.Dl "ipfw add deny ip from ${badguys} to any" +.Dl "... normal policies ..." +.Pp +The +.Cm verrevpath +option could be used to do automated anti-spoofing by adding the +following to the top of a ruleset: +.Pp +.Dl "ipfw add deny ip from any to any not verrevpath in" +.Pp +This rule drops all incoming packets that appear to be coming to the +system on the wrong interface. +For example, a packet with a source +address belonging to a host on a protected internal network would be +dropped if it tried to enter the system from an external interface. +.Pp +The +.Cm antispoof +option could be used to do similar but more restricted anti-spoofing +by adding the following to the top of a ruleset: +.Pp +.Dl "ipfw add deny ip from any to any not antispoof in" +.Pp +This rule drops all incoming packets that appear to be coming from another +directly connected system but on the wrong interface. +For example, a packet with a source address of +.Li 192.168.0.0/24 , +configured on +.Li fxp0 , +but coming in on +.Li fxp1 +would be dropped. 
+.Pp +The +.Cm setdscp +option could be used to (re)mark user traffic, +by adding the following to the appropriate place in ruleset: +.Pp +.Dl "ipfw add setdscp be ip from any to any dscp af11,af21" +.Ss DYNAMIC RULES +In order to protect a site from flood attacks involving fake +TCP packets, it is safer to use dynamic rules: +.Pp +.Dl "ipfw add check-state" +.Dl "ipfw add deny tcp from any to any established" +.Dl "ipfw add allow tcp from my-net to any setup keep-state" +.Pp +This will let the firewall install dynamic rules only for +those connections which start with a regular SYN packet coming +from the inside of our network. +Dynamic rules are checked when encountering the first +occurrence of a +.Cm check-state , +.Cm keep-state +or +.Cm limit +rule. +A +.Cm check-state +rule should usually be placed near the beginning of the +ruleset to minimize the amount of work scanning the ruleset. +Your mileage may vary. +.Pp +To limit the number of connections a user can open +you can use the following type of rules: +.Pp +.Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10" +.Dl "ipfw add allow tcp from any to me setup limit src-addr 4" +.Pp +The former (assuming it runs on a gateway) will allow each host +on a /24 network to open at most 10 TCP connections. +The latter can be placed on a server to make sure that a single +client does not use more than 4 simultaneous connections. +.Pp +.Em BEWARE : +stateful rules can be subject to denial-of-service attacks +by a SYN-flood which opens a huge number of dynamic rules. +The effects of such attacks can be partially limited by +acting on a set of +.Xr sysctl 8 +variables which control the operation of the firewall. 
+.Pp +Here is a good usage of the +.Cm list +command to see accounting records and timestamp information: +.Pp +.Dl ipfw -at list +.Pp +or in short form without timestamps: +.Pp +.Dl ipfw -a list +.Pp +which is equivalent to: +.Pp +.Dl ipfw show +.Pp +Next rule diverts all incoming packets from 192.168.2.0/24 +to divert port 5000: +.Pp +.Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in +.Ss TRAFFIC SHAPING +The following rules show some of the applications of +.Nm +and +.Nm dummynet +for simulations and the like. +.Pp +This rule drops random incoming packets with a probability +of 5%: +.Pp +.Dl "ipfw add prob 0.05 deny ip from any to any in" +.Pp +A similar effect can be achieved making use of +.Nm dummynet +pipes: +.Pp +.Dl "ipfw add pipe 10 ip from any to any" +.Dl "ipfw pipe 10 config plr 0.05" +.Pp +We can use pipes to artificially limit bandwidth, e.g.\& on a +machine acting as a router, if we want to limit traffic from +local clients on 192.168.2.0/24 we do: +.Pp +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" +.Dl "ipfw pipe 1 config bw 300Kbit/s queue 50KBytes" +.Pp +note that we use the +.Cm out +modifier so that the rule is not used twice. +Remember in fact that +.Nm +rules are checked both on incoming and outgoing packets. +.Pp +Should we want to simulate a bidirectional link with bandwidth +limitations, the correct way is the following: +.Pp +.Dl "ipfw add pipe 1 ip from any to any out" +.Dl "ipfw add pipe 2 ip from any to any in" +.Dl "ipfw pipe 1 config bw 64Kbit/s queue 10Kbytes" +.Dl "ipfw pipe 2 config bw 64Kbit/s queue 10Kbytes" +.Pp +The above can be very useful, e.g.\& if you want to see how +your fancy Web page will look for a residential user who +is connected only through a slow link. +You should not use only one pipe for both directions, unless +you want to simulate a half-duplex medium (e.g.\& AppleTalk, +Ethernet, IRDA). 
+It is not necessary that both pipes have the same configuration, +so we can also simulate asymmetric links. +.Pp +Should we want to verify network performance with the RED queue +management algorithm: +.Pp +.Dl "ipfw add pipe 1 ip from any to any" +.Dl "ipfw pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1" +.Pp +Another typical application of the traffic shaper is to +introduce some delay in the communication. +This can significantly affect applications which do a lot of Remote +Procedure Calls, and where the round-trip-time of the +connection often becomes a limiting factor much more than +bandwidth: +.Pp +.Dl "ipfw add pipe 1 ip from any to any out" +.Dl "ipfw add pipe 2 ip from any to any in" +.Dl "ipfw pipe 1 config delay 250ms bw 1Mbit/s" +.Dl "ipfw pipe 2 config delay 250ms bw 1Mbit/s" +.Pp +Per-flow queueing can be useful for a variety of purposes. +A very simple one is counting traffic: +.Pp +.Dl "ipfw add pipe 1 tcp from any to any" +.Dl "ipfw add pipe 1 udp from any to any" +.Dl "ipfw add pipe 1 ip from any to any" +.Dl "ipfw pipe 1 config mask all" +.Pp +The above set of rules will create queues (and collect +statistics) for all traffic. +Because the pipes have no limitations, the only effect is +collecting statistics. +Note that we need 3 rules, not just the last one, because +when +.Nm +tries to match IP packets it will not consider ports, so we +would not see connections on separate ports as different +ones. 
+.Pp +A more sophisticated example is limiting the outbound traffic +on a net with per-host limits, rather than per-network limits: +.Pp +.Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" +.Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in" +.Dl "ipfw pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" +.Dl "ipfw pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" +.Ss LOOKUP TABLES +In the following example, we need to create several traffic bandwidth +classes and we need different hosts/networks to fall into different classes. +We create one pipe for each class and configure them accordingly. +Then we create a single table and fill it with IP subnets and addresses. +For each subnet/host we set the argument equal to the number of the pipe +that it should use. +Then we classify traffic using a single rule: +.Pp +.Dl "ipfw pipe 1 config bw 1000Kbyte/s" +.Dl "ipfw pipe 4 config bw 4000Kbyte/s" +.Dl "..." +.Dl "ipfw table T1 create type addr" +.Dl "ipfw table T1 add 192.168.2.0/24 1" +.Dl "ipfw table T1 add 192.168.0.0/27 4" +.Dl "ipfw table T1 add 192.168.0.2 1" +.Dl "..." +.Dl "ipfw add pipe tablearg ip from 'table(T1)' to any" +.Pp +Using the +.Cm fwd +action, the table entries may include hostnames and IP addresses. +.Pp +.Dl "ipfw table T2 create type addr ftype ip" +.Dl "ipfw table T2 add 192.168.2.0/24 10.23.2.1" +.Dl "ipfw table T2 add 192.168.0.0/27 router1.dmz" +.Dl "..." +.Dl "ipfw add 100 fwd tablearg ip from any to 'table(T2)'" +.Pp +In the following example a per-interface firewall is created: +.Pp +.Dl "ipfw table IN create type iface valtype skipto,fib" +.Dl "ipfw table IN add vlan20 12000,12" +.Dl "ipfw table IN add vlan30 13000,13" +.Dl "ipfw table OUT create type iface valtype skipto" +.Dl "ipfw table OUT add vlan20 22000" +.Dl "ipfw table OUT add vlan30 23000" +.Dl ".." 
+.Dl "ipfw add 100 setfib tablearg ip from any to any recv 'table(IN)' in" +.Dl "ipfw add 200 skipto tablearg ip from any to any recv 'table(IN)' in" +.Dl "ipfw add 300 skipto tablearg ip from any to any xmit 'table(OUT)' out" +.Pp +The following example illustrates the usage of flow tables: +.Pp +.Dl "ipfw table fl create type flow:flow:src-ip,proto,dst-ip,dst-port" +.Dl "ipfw table fl add 2a02:6b8:77::88,tcp,2a02:6b8:77::99,80 11" +.Dl "ipfw table fl add 10.0.0.1,udp,10.0.0.2,53 12" +.Dl ".." +.Dl "ipfw add 100 allow ip from any to any flow 'table(fl,11)' recv ix0" +.Ss SETS OF RULES +To add a set of rules atomically, e.g.\& set 18: +.Pp +.Dl "ipfw set disable 18" +.Dl "ipfw add NN set 18 ... # repeat as needed" +.Dl "ipfw set enable 18" +.Pp +To delete a set of rules atomically the command is simply: +.Pp +.Dl "ipfw delete set 18" +.Pp +To test a ruleset and disable it and regain control if something goes wrong: +.Pp +.Dl "ipfw set disable 18" +.Dl "ipfw add NN set 18 ... # repeat as needed" +.Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18" +.Pp +Here if everything goes well, you press control-C before the "sleep" +terminates, and your ruleset will be left active. +Otherwise, e.g.\& if +you cannot access your box, the ruleset will be disabled after +the sleep terminates thus restoring the previous situation. 
+.Pp +To show rules of the specific set: +.Pp +.Dl "ipfw set 18 show" +.Pp +To show rules of the disabled set: +.Pp +.Dl "ipfw -S set 18 show" +.Pp +To clear the counters of a specific rule of the specific set: +.Pp +.Dl "ipfw set 18 zero NN" +.Pp +To delete a specific rule of the specific set: +.Pp +.Dl "ipfw set 18 delete NN" +.Ss NAT, REDIRECT AND LSNAT +First redirect all the traffic to nat instance 123: +.Pp +.Dl "ipfw add nat 123 all from any to any" +.Pp +Then to configure nat instance 123 to alias all the outgoing traffic with ip +192.168.0.123, blocking all incoming connections, trying to keep +same ports on both sides, clearing aliasing table on address change +and keeping a log of traffic/link statistics: +.Pp +.Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports" +.Pp +Or to change address of instance 123, aliasing table will be cleared (see +reset option): +.Pp +.Dl "ipfw nat 123 config ip 10.0.0.1" +.Pp +To see configuration of nat instance 123: +.Pp +.Dl "ipfw nat 123 show config" +.Pp +To show logs of all the instances in range 111-999: +.Pp +.Dl "ipfw nat 111-999 show" +.Pp +To see configurations of all instances: +.Pp +.Dl "ipfw nat show config" +.Pp +Or a redirect rule with mixed modes could look like: +.Pp +.Dl "ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66" +.Dl " redirect_port tcp 192.168.0.1:80 500" +.Dl " redirect_proto udp 192.168.1.43 192.168.1.1" +.Dl " redirect_addr 192.168.0.10,192.168.0.11" +.Dl " 10.0.0.100 # LSNAT" +.Dl " redirect_port tcp 192.168.0.1:80,192.168.0.10:22" +.Dl " 500 # LSNAT" +.Pp +or it could be split into: +.Pp +.Dl "ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66" +.Dl "ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500" +.Dl "ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1" +.Dl "ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12" +.Dl " 10.0.0.100" +.Dl "ipfw nat 5 config redirect_port tcp" +.Dl " 192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500" +.Sh SEE 
ALSO +.Xr cpp 1 , +.Xr m4 1 , +.Xr altq 4 , +.Xr divert 4 , +.Xr dummynet 4 , +.Xr if_bridge 4 , +.Xr ip 4 , +.Xr ipfirewall 4 , +.Xr ng_ipfw 4 , +.Xr protocols 5 , +.Xr services 5 , +.Xr init 8 , +.Xr kldload 8 , +.Xr reboot 8 , +.Xr sysctl 8 , +.Xr syslogd 8 +.Sh HISTORY +The +.Nm +utility first appeared in +.Fx 2.0 . +.Nm dummynet +was introduced in +.Fx 2.2.8 . +Stateful extensions were introduced in +.Fx 4.0 . +.Nm ipfw2 +was introduced in Summer 2002. +.Sh AUTHORS +.An Ugen J. S. Antsilevich , +.An Poul-Henning Kamp , +.An Alex Nash , +.An Archie Cobbs , +.An Luigi Rizzo . +.Pp +.An -nosplit +API based upon code written by +.An Daniel Boulet +for BSDI. +.Pp +Dummynet was introduced by Luigi Rizzo in 1997-1998. +.Pp +Some early work (1999-2000) on the +.Nm dummynet +traffic shaper was supported by Akamba Corp. +.Pp +The ipfw core (ipfw2) has been completely redesigned and +reimplemented by Luigi Rizzo in summer 2002. +Further +actions and +options have been added by various developers over the years. +.Pp +.An -nosplit +In-kernel NAT support written by +.An Paolo Pisati Aq Mt piso@FreeBSD.org +as part of a Summer of Code 2005 project. +.Pp +SCTP +.Nm nat +support has been developed by +.An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au . +The primary developers and maintainers are David Hayes and Jason But. +For further information visit: +.Aq http://www.caia.swin.edu.au/urp/SONATA +.Pp +Delay profiles have been developed by Alessandro Cerri and +Luigi Rizzo, supported by the +European Commission within Projects Onelab and Onelab2. +.Sh BUGS +The syntax has grown over the years and sometimes it might be confusing. +Unfortunately, backward compatibility prevents cleaning up mistakes +made in the definition of the syntax. +.Pp +.Em !!! WARNING !!! +.Pp +Misconfiguring the firewall can put your computer in an unusable state, +possibly shutting down network services and requiring console access to +regain control of it. 
+.Pp +Incoming packet fragments diverted by +.Cm divert +are reassembled before delivery to the socket. +The action used on those packets is the one from the +rule which matches the first fragment of the packet. +.Pp +Packets diverted to userland, and then reinserted by a userland process +may lose various packet attributes. +The packet source interface name +will be preserved if it is shorter than 8 bytes and the userland process +saves and reuses the sockaddr_in +(as does +.Xr natd 8 ) ; +otherwise, it may be lost. +If a packet is reinserted in this manner, later rules may be incorrectly +applied, making the order of +.Cm divert +rules in the rule sequence very important. +.Pp +Dummynet drops all packets with IPv6 link-local addresses. +.Pp +Rules using +.Cm uid +or +.Cm gid +may not behave as expected. +In particular, incoming SYN packets may +have no uid or gid associated with them since they do not yet belong +to a TCP connection, and the uid/gid associated with a packet may not +be as expected if the associated process calls +.Xr setuid 2 +or similar system calls. +.Pp +Rule syntax is subject to the command line environment and some patterns +may need to be escaped with the backslash character +or quoted appropriately. +.Pp +Due to the architecture of +.Xr libalias 3 , +ipfw nat is not compatible with the TCP segmentation offloading (TSO). +Thus, to reliably nat your network traffic, please disable TSO +on your NICs using +.Xr ifconfig 8 . +.Pp +ICMP error messages are not implicitly matched by dynamic rules +for the respective conversations. +To avoid failures of network error detection and path MTU discovery, +ICMP error messages may need to be allowed explicitly through static +rules. +.Pp +Rules using +.Cm call +and +.Cm return +actions may lead to confusing behaviour if the ruleset has mistakes, +and/or interaction with other subsystems (netgraph, dummynet, etc.) is used.
+One possible case for this is packet leaving +.Nm +in subroutine on the input pass, while later on output encountering unpaired +.Cm return +first. +As the call stack is kept intact after input pass, packet will suddenly +return to the rule number used on input pass, not on output one. +Order of processing should be checked carefully to avoid such mistakes. diff --git a/example/ipfw/ipfw/ipfw2.c b/example/ipfw/ipfw/ipfw2.c new file mode 100644 index 0000000..b8ef6ee --- /dev/null +++ b/example/ipfw/ipfw/ipfw2.c @@ -0,0 +1,4968 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/ipfw2.c 273253 2014-10-18 15:18:31Z melifaro $ + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/sysctl.h> + +#include "ipfw2.h" + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <grp.h> +#include <netdb.h> +#include <pwd.h> +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <time.h> /* ctime */ +#include <timeconv.h> /* _long_to_time */ +#include <unistd.h> +#include <fcntl.h> +#include <stddef.h> /* offsetof */ + +#include <net/ethernet.h> +#include <net/if.h> /* only IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/in_systm.h> /* only n_short, n_long */ +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_fw.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +struct cmdline_opts co; /* global options */ + +struct format_opts { + int bcwidth; + int pcwidth; + int show_counters; + uint32_t set_mask; /* enabled sets mask */ + uint32_t flags; /* request flags */ + uint32_t first; /* first rule to request */ + uint32_t last; /* last rule to request */ + uint32_t dcnt; /* number of dynamic states */ + ipfw_obj_ctlv *tstate; /* table state data */ +}; + +int resvd_set_number = RESVD_SET; + +int ipfw_socket = -1; + +#define CHECK_LENGTH(v, len) do { \ + if ((v) < (len)) \ + errx(EX_DATAERR, "Rule too long"); \ + } while (0) +/* + * Check if we have enough space in cmd buffer. Note that since + * first 8? u32 words are reserved by reserved header, full cmd + * buffer can't be used, so we need to protect from buffer overrun + * only. At the beginnig, cblen is less than actual buffer size by + * size of ipfw_insn_u32 instruction + 1 u32 work. This eliminates need + * for checking small instructions fitting in given range. 
/*
 * GET_UINT_ARG - parse the numeric argument at *av into @arg.
 *
 * Expands in a context where `av` (the remaining argument vector) is in
 * scope. @tok / @s_x identify the option being parsed and are used only
 * to build error messages via match_value().
 *
 * Behaviour:
 *   - a missing argument is a usage error (EX_USAGE);
 *   - the keyword "tablearg" (matched through _substrcmp) stores
 *     IP_FW_TARG into @arg and leaves the macro;
 *   - otherwise the argument must be a plain decimal number within
 *     [@min, @max] and must not be equal to IP_FW_TARG.
 *
 * errno is cleared before calling strtol(): strtol() only sets errno on
 * failure, so a stale ERANGE/EINVAL left behind by an earlier library
 * call would otherwise make a perfectly valid argument look invalid.
 * The character passed to isdigit() is converted to unsigned char, as
 * required by <ctype.h>.
 */
#define GET_UINT_ARG(arg, min, max, tok, s_x) do {			\
	if (!av[0])							\
		errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \
	if (_substrcmp(*av, "tablearg") == 0) {				\
		arg = IP_FW_TARG;					\
		break;							\
	}								\
									\
	{								\
		long _xval;						\
		char *end;						\
									\
		errno = 0;						\
		_xval = strtol(*av, &end, 10);				\
									\
		if (!isdigit((unsigned char)**av) || *end != '\0' ||	\
		    (_xval == 0 && errno == EINVAL))			\
			errx(EX_DATAERR, "%s: invalid argument: %s",	\
				match_value(s_x, tok), *av);		\
									\
		if (errno == ERANGE || _xval < min || _xval > max)	\
			errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \
				match_value(s_x, tok), min, max, *av);	\
									\
		if (_xval == IP_FW_TARG)				\
			errx(EX_DATAERR, "%s: illegal argument value: %s", \
				match_value(s_x, tok), *av);		\
		arg = _xval;						\
	}								\
} while (0)
/*
 * Symbolic DSCP names and their codepoint values.
 * Each IPTOS_DSCP_* constant is shifted right by two bits, so the value
 * stored in the table is the 6-bit codepoint shown (in binary) in the
 * comment next to the entry. "be" and "cs0" are two names for the same
 * value. The list ends at the entry whose name is NULL.
 */
struct _s_x f_ipdscp[] = {
	{ "af11", IPTOS_DSCP_AF11 >> 2 },	/* 001010 */
	{ "af12", IPTOS_DSCP_AF12 >> 2 },	/* 001100 */
	{ "af13", IPTOS_DSCP_AF13 >> 2 },	/* 001110 */
	{ "af21", IPTOS_DSCP_AF21 >> 2 },	/* 010010 */
	{ "af22", IPTOS_DSCP_AF22 >> 2 },	/* 010100 */
	{ "af23", IPTOS_DSCP_AF23 >> 2 },	/* 010110 */
	{ "af31", IPTOS_DSCP_AF31 >> 2 },	/* 011010 */
	{ "af32", IPTOS_DSCP_AF32 >> 2 },	/* 011100 */
	{ "af33", IPTOS_DSCP_AF33 >> 2 },	/* 011110 */
	{ "af41", IPTOS_DSCP_AF41 >> 2 },	/* 100010 */
	{ "af42", IPTOS_DSCP_AF42 >> 2 },	/* 100100 */
	{ "af43", IPTOS_DSCP_AF43 >> 2 },	/* 100110 */
	{ "be", IPTOS_DSCP_CS0 >> 2 },		/* 000000 */
	{ "ef", IPTOS_DSCP_EF >> 2 },		/* 101110 */
	{ "cs0", IPTOS_DSCP_CS0 >> 2 },		/* 000000 */
	{ "cs1", IPTOS_DSCP_CS1 >> 2 },		/* 001000 */
	{ "cs2", IPTOS_DSCP_CS2 >> 2 },		/* 010000 */
	{ "cs3", IPTOS_DSCP_CS3 >> 2 },		/* 011000 */
	{ "cs4", IPTOS_DSCP_CS4 >> 2 },		/* 100000 */
	{ "cs5", IPTOS_DSCP_CS5 >> 2 },		/* 101000 */
	{ "cs6", IPTOS_DSCP_CS6 >> 2 },		/* 110000 */
	{ "cs7", IPTOS_DSCP_CS7 >> 2 },		/* 111000 */
	{ NULL, 0 }
};
+ */ +#define IPPROTO_ETHERTYPE 0x1000 +static struct _s_x ether_types[] = { + /* + * Note, we cannot use "-:&/" in the names because they are field + * separators in the type specifications. Also, we use s = NULL as + * end-delimiter, because a type of 0 can be legal. + */ + { "ip", 0x0800 }, + { "ipv4", 0x0800 }, + { "ipv6", 0x86dd }, + { "arp", 0x0806 }, + { "rarp", 0x8035 }, + { "vlan", 0x8100 }, + { "loop", 0x9000 }, + { "trail", 0x1000 }, + { "at", 0x809b }, + { "atalk", 0x809b }, + { "aarp", 0x80f3 }, + { "pppoe_disc", 0x8863 }, + { "pppoe_sess", 0x8864 }, + { "ipx_8022", 0x00E0 }, + { "ipx_8023", 0x0000 }, + { "ipx_ii", 0x8137 }, + { "ipx_snap", 0x8137 }, + { "ipx", 0x8137 }, + { "ns", 0x0600 }, + { NULL, 0 } +}; + + +static struct _s_x rule_actions[] = { + { "accept", TOK_ACCEPT }, + { "pass", TOK_ACCEPT }, + { "allow", TOK_ACCEPT }, + { "permit", TOK_ACCEPT }, + { "count", TOK_COUNT }, + { "pipe", TOK_PIPE }, + { "queue", TOK_QUEUE }, + { "divert", TOK_DIVERT }, + { "tee", TOK_TEE }, + { "netgraph", TOK_NETGRAPH }, + { "ngtee", TOK_NGTEE }, + { "fwd", TOK_FORWARD }, + { "forward", TOK_FORWARD }, + { "skipto", TOK_SKIPTO }, + { "deny", TOK_DENY }, + { "drop", TOK_DENY }, + { "reject", TOK_REJECT }, + { "reset6", TOK_RESET6 }, + { "reset", TOK_RESET }, + { "unreach6", TOK_UNREACH6 }, + { "unreach", TOK_UNREACH }, + { "check-state", TOK_CHECKSTATE }, + { "//", TOK_COMMENT }, + { "nat", TOK_NAT }, + { "reass", TOK_REASS }, + { "setfib", TOK_SETFIB }, + { "setdscp", TOK_SETDSCP }, + { "call", TOK_CALL }, + { "return", TOK_RETURN }, + { NULL, 0 } /* terminator */ +}; + +static struct _s_x rule_action_params[] = { + { "altq", TOK_ALTQ }, + { "log", TOK_LOG }, + { "tag", TOK_TAG }, + { "untag", TOK_UNTAG }, + { NULL, 0 } /* terminator */ +}; + +/* + * The 'lookup' instruction accepts one of the following arguments. + * -1 is a terminator for the list. + * Arguments are passed as v[1] in O_DST_LOOKUP options. 
+ */ +static int lookup_key[] = { + TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, + TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; + +static struct _s_x rule_options[] = { + { "tagged", TOK_TAGGED }, + { "uid", TOK_UID }, + { "gid", TOK_GID }, + { "jail", TOK_JAIL }, + { "in", TOK_IN }, + { "limit", TOK_LIMIT }, + { "keep-state", TOK_KEEPSTATE }, + { "bridged", TOK_LAYER2 }, + { "layer2", TOK_LAYER2 }, + { "out", TOK_OUT }, + { "diverted", TOK_DIVERTED }, + { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, + { "diverted-output", TOK_DIVERTEDOUTPUT }, + { "xmit", TOK_XMIT }, + { "recv", TOK_RECV }, + { "via", TOK_VIA }, + { "fragment", TOK_FRAG }, + { "frag", TOK_FRAG }, + { "fib", TOK_FIB }, + { "ipoptions", TOK_IPOPTS }, + { "ipopts", TOK_IPOPTS }, + { "iplen", TOK_IPLEN }, + { "ipid", TOK_IPID }, + { "ipprecedence", TOK_IPPRECEDENCE }, + { "dscp", TOK_DSCP }, + { "iptos", TOK_IPTOS }, + { "ipttl", TOK_IPTTL }, + { "ipversion", TOK_IPVER }, + { "ipver", TOK_IPVER }, + { "estab", TOK_ESTAB }, + { "established", TOK_ESTAB }, + { "setup", TOK_SETUP }, + { "sockarg", TOK_SOCKARG }, + { "tcpdatalen", TOK_TCPDATALEN }, + { "tcpflags", TOK_TCPFLAGS }, + { "tcpflgs", TOK_TCPFLAGS }, + { "tcpoptions", TOK_TCPOPTS }, + { "tcpopts", TOK_TCPOPTS }, + { "tcpseq", TOK_TCPSEQ }, + { "tcpack", TOK_TCPACK }, + { "tcpwin", TOK_TCPWIN }, + { "icmptype", TOK_ICMPTYPES }, + { "icmptypes", TOK_ICMPTYPES }, + { "dst-ip", TOK_DSTIP }, + { "src-ip", TOK_SRCIP }, + { "dst-port", TOK_DSTPORT }, + { "src-port", TOK_SRCPORT }, + { "proto", TOK_PROTO }, + { "MAC", TOK_MAC }, + { "mac", TOK_MAC }, + { "mac-type", TOK_MACTYPE }, + { "verrevpath", TOK_VERREVPATH }, + { "versrcreach", TOK_VERSRCREACH }, + { "antispoof", TOK_ANTISPOOF }, + { "ipsec", TOK_IPSEC }, + { "icmp6type", TOK_ICMP6TYPES }, + { "icmp6types", TOK_ICMP6TYPES }, + { "ext6hdr", TOK_EXT6HDR}, + { "flow-id", TOK_FLOWID}, + { "ipv6", TOK_IPV6}, + { "ip6", TOK_IPV6}, + { "ipv4", TOK_IPV4}, + { "ip4", TOK_IPV4}, + { "dst-ipv6", TOK_DSTIP6}, + { 
"dst-ip6", TOK_DSTIP6}, + { "src-ipv6", TOK_SRCIP6}, + { "src-ip6", TOK_SRCIP6}, + { "lookup", TOK_LOOKUP}, + { "flow", TOK_FLOW}, + { "//", TOK_COMMENT }, + + { "not", TOK_NOT }, /* pseudo option */ + { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ + { "or", TOK_OR }, /* pseudo option */ + { "|", /* escape */ TOK_OR }, /* pseudo option */ + { "{", TOK_STARTBRACE }, /* pseudo option */ + { "(", TOK_STARTBRACE }, /* pseudo option */ + { "}", TOK_ENDBRACE }, /* pseudo option */ + { ")", TOK_ENDBRACE }, /* pseudo option */ + { NULL, 0 } /* terminator */ +}; + +void bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg); +static int ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, + ipfw_cfg_lheader **pcfg, size_t *psize); +static int ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, + ipfw_cfg_lheader *cfg, size_t sz, int ac, char **av); +static void ipfw_list_tifaces(void); + +/* + * Simple string buffer API. + * Used to simplify buffer passing between function and for + * transparent overrun handling. + */ + +/* + * Allocates new buffer of given size @sz. + * + * Returns 0 on success. + */ +int +bp_alloc(struct buf_pr *b, size_t size) +{ + memset(b, 0, sizeof(struct buf_pr)); + + if ((b->buf = calloc(1, size)) == NULL) + return (ENOMEM); + + b->ptr = b->buf; + b->size = size; + b->avail = b->size; + + return (0); +} + +void +bp_free(struct buf_pr *b) +{ + + free(b->buf); +} + +/* + * Flushes buffer so new writer start from beginning. + */ +void +bp_flush(struct buf_pr *b) +{ + + b->ptr = b->buf; + b->avail = b->size; +} + +/* + * Print message specified by @format and args. + * Automatically manage buffer space and transparently handle + * buffer overruns. + * + * Returns number of bytes that should have been printed. + */ +int +bprintf(struct buf_pr *b, char *format, ...) 
/*
 * Compare things like interface or table names.
 *
 * A shorter string always sorts before a longer one; strings of equal
 * length are ordered by strcmp(). This makes names with a numeric
 * suffix sort naturally ("em2" before "em10").
 *
 * Lengths are kept in size_t so that the comparison stays correct for
 * any string length (strlen() does not return an int).
 *
 * Returns 1 if @a is longer than @b, -1 if it is shorter, and the
 * result of strcmp(a, b) when both have the same length.
 */
int
stringnum_cmp(const char *a, const char *b)
{
	size_t la, lb;

	la = strlen(a);
	lb = strlen(b);

	if (la > lb)
		return (1);
	if (la < lb)
		return (-1);

	return (strcmp(a, b));
}
/*
 * do_set3 - pass ipfw control cmd to kernel
 * @optname: ipfw opcode; stored into op3->opcode before the call
 * @op3: pointer to option data, starting with the ip_fw3_opheader
 * @optlen: option length, handed to setsockopt() as the data length
 *
 * Assumes op3 header is already embedded.
 * Does nothing and returns 0 when co.test_only is set.
 * Calls setsockopt() with IP_FW3 as kernel-visible opcode.
 * Returns the setsockopt() result unchanged: 0 on success, otherwise
 * the failure value of setsockopt() with the cause left in errno
 * (the errno value itself is not returned).
 */
int
do_set3(int optname, ip_fw3_opheader *op3, uintptr_t optlen)
{

	if (co.test_only)
		return (0);

	/* -1 marks "not opened yet"; the descriptor is kept for later calls. */
	if (ipfw_socket == -1)
		ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
	/* err() does not return, so a failed socket() ends the program here. */
	if (ipfw_socket < 0)
		err(EX_UNAVAILABLE, "socket");

	op3->opcode = optname;

	return (setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, optlen));
}
+ */ +int +do_get3(int optname, ip_fw3_opheader *op3, size_t *optlen) +{ + int error; + + if (co.test_only) + return (0); + + if (ipfw_socket == -1) + ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + if (ipfw_socket < 0) + err(EX_UNAVAILABLE, "socket"); + + op3->opcode = optname; + + error = getsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, + (socklen_t *)optlen); + + return (error); +} + +/** + * match_token takes a table and a string, returns the value associated + * with the string (-1 in case of failure). + */ +int +match_token(struct _s_x *table, char *string) +{ + struct _s_x *pt; + uint i = strlen(string); + + for (pt = table ; i && pt->s != NULL ; pt++) + if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) + return pt->x; + return (-1); +} + +/** + * match_token takes a table and a string, returns the value associated + * with the string for the best match. + * + * Returns: + * value from @table for matched records + * -1 for non-matched records + * -2 if more than one records match @string. + */ +int +match_token_relaxed(struct _s_x *table, char *string) +{ + struct _s_x *pt, *m = NULL; + int i, c; + + i = strlen(string); + c = 0; + + for (pt = table ; i != 0 && pt->s != NULL ; pt++) { + if (strncmp(pt->s, string, i) != 0) + continue; + m = pt; + c++; + } + + if (c == 1) + return (m->x); + + return (c > 0 ? -2: -1); +} + +/** + * match_value takes a table and a value, returns the string associated + * with the value (NULL in case of failure). + */ +char const * +match_value(struct _s_x *p, int value) +{ + for (; p->s != NULL; p++) + if (p->x == value) + return p->s; + return NULL; +} + +size_t +concat_tokens(char *buf, size_t bufsize, struct _s_x *table, char *delimiter) +{ + struct _s_x *pt; + int l; + size_t sz; + + for (sz = 0, pt = table ; pt->s != NULL; pt++) { + l = snprintf(buf + sz, bufsize - sz, "%s%s", + (sz == 0) ? 
"" : delimiter, pt->s); + sz += l; + bufsize += l; + if (sz > bufsize) + return (bufsize); + } + + return (sz); +} + +/* + * helper function to process a set of flags and set bits in the + * appropriate masks. + */ +int +fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, + uint32_t *clear) +{ + char *q; /* points to the separator */ + int val; + uint32_t *which; /* mask we are working on */ + + while (p && *p) { + if (*p == '!') { + p++; + which = clear; + } else + which = set; + q = strchr(p, ','); + if (q) + *q++ = '\0'; + val = match_token(flags, p); + if (val <= 0) { + if (e != NULL) + *e = p; + return (-1); + } + *which |= (uint32_t)val; + p = q; + } + return (0); +} + +void +print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set) +{ + char const *comma = ""; + int i, l; + + for (i = 0; list[i].x != 0; i++) { + if ((set & list[i].x) == 0) + continue; + + set &= ~list[i].x; + l = snprintf(buf, sz, "%s%s", comma, list[i].s); + if (l >= sz) + return; + comma = ","; + buf += l; + sz -=l; + } +} + +/* + * _substrcmp takes two strings and returns 1 if they do not match, + * and 0 if they match exactly or the first string is a sub-string + * of the second. A warning is printed to stderr in the case that the + * first string is a sub-string of the second. + * + * This function will be removed in the future through the usual + * deprecation process. + */ +int +_substrcmp(const char *str1, const char* str2) +{ + + if (strncmp(str1, str2, strlen(str1)) != 0) + return 1; + + if (strlen(str1) != strlen(str2)) + warnx("DEPRECATED: '%s' matched '%s' as a sub-string", + str1, str2); + return 0; +} + +/* + * _substrcmp2 takes three strings and returns 1 if the first two do not match, + * and 0 if they match exactly or the second string is a sub-string + * of the first. A warning is printed to stderr in the case that the + * first string does not match the third. 
/*
 * _substrcmp2 - deprecated prefix matcher for keyword shortcuts.
 *
 * @str1 is the user supplied word, @str2 the shortest accepted prefix
 * and @str3 the canonical keyword (e.g. "by" and "bytes").
 *
 * Returns 1 when @str1 does not start with @str2.
 * Returns 0 when it does; in that case a deprecation warning is sent
 * to stderr unless @str1 is exactly @str3. Note that any word starting
 * with @str2 is accepted, not only real abbreviations of @str3.
 *
 * This function will be removed in the future through the usual
 * deprecation process.
 */
int
_substrcmp2(const char *str1, const char* str2, const char* str3)
{
	const size_t prefix_len = strlen(str2);

	if (strncmp(str1, str2, prefix_len) == 0) {
		/* accepted; complain unless it is the canonical spelling */
		if (strcmp(str1, str3) != 0)
			warnx("DEPRECATED: '%s' matched '%s'",
			    str1, str3);
		return 0;
	}

	return 1;
}
+ */ +static void +print_newports(struct buf_pr *bp, ipfw_insn_u16 *cmd, int proto, int opcode) +{ + uint16_t *p = cmd->ports; + int i; + char const *sep; + + if (opcode != 0) { + sep = match_value(_port_name, opcode); + if (sep == NULL) + sep = "???"; + bprintf(bp, " %s", sep); + } + sep = " "; + for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { + bprintf(bp, "%s", sep); + print_port(bp, proto, p[0]); + if (p[0] != p[1]) { + bprintf(bp, "-"); + print_port(bp, proto, p[1]); + } + sep = ","; + } +} + +/* + * Like strtol, but also translates service names into port numbers + * for some protocols. + * In particular: + * proto == -1 disables the protocol check; + * proto == IPPROTO_ETHERTYPE looks up an internal table + * proto == <some value in /etc/protocols> matches the values there. + * Returns *end == s in case the parameter is not found. + */ +static int +strtoport(char *s, char **end, int base, int proto) +{ + char *p, *buf; + char *s1; + int i; + + *end = s; /* default - not found */ + if (*s == '\0') + return 0; /* not found */ + + if (isdigit(*s)) + return strtol(s, end, base); + + /* + * find separator. '\\' escapes the next char. + */ + for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) + if (*s1 == '\\' && s1[1] != '\0') + s1++; + + buf = safe_calloc(s1 - s + 1, 1); + + /* + * copy into a buffer skipping backslashes + */ + for (p = s, i = 0; p != s1 ; p++) + if (*p != '\\') + buf[i++] = *p; + buf[i++] = '\0'; + + if (proto == IPPROTO_ETHERTYPE) { + i = match_token(ether_types, buf); + free(buf); + if (i != -1) { /* found */ + *end = s1; + return i; + } + } else { + struct protoent *pe = NULL; + struct servent *se; + + if (proto != 0) + pe = getprotobynumber(proto); + setservent(1); + se = getservbyname(buf, pe ? pe->p_name : NULL); + free(buf); + if (se != NULL) { + *end = s1; + return ntohs(se->s_port); + } + } + return 0; /* not found */ +} + +/* + * Fill the body of the command with the list of port ranges. 
+ */ +static int +fill_newports(ipfw_insn_u16 *cmd, char *av, int proto, int cblen) +{ + uint16_t a, b, *p = cmd->ports; + int i = 0; + char *s = av; + + while (*s) { + a = strtoport(av, &s, 0, proto); + if (s == av) /* empty or invalid argument */ + return (0); + + CHECK_LENGTH(cblen, i + 2); + + switch (*s) { + case '-': /* a range */ + av = s + 1; + b = strtoport(av, &s, 0, proto); + /* Reject expressions like '1-abc' or '1-2-3'. */ + if (s == av || (*s != ',' && *s != '\0')) + return (0); + p[0] = a; + p[1] = b; + break; + case ',': /* comma separated list */ + case '\0': + p[0] = p[1] = a; + break; + default: + warnx("port list: invalid separator <%c> in <%s>", + *s, av); + return (0); + } + + i++; + p += 2; + av = s + 1; + } + if (i > 0) { + if (i + 1 > F_LEN_MASK) + errx(EX_DATAERR, "too many ports/ranges\n"); + cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ + } + return (i); +} + +/* + * Fill the body of the command with the list of DiffServ codepoints. + */ +static void +fill_dscp(ipfw_insn *cmd, char *av, int cblen) +{ + uint32_t *low, *high; + char *s = av, *a; + int code; + + cmd->opcode = O_DSCP; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32) + 1; + + CHECK_CMDLEN; + + low = (uint32_t *)(cmd + 1); + high = low + 1; + + *low = 0; + *high = 0; + + while (s != NULL) { + a = strchr(s, ','); + + if (a != NULL) + *a++ = '\0'; + + if (isalpha(*s)) { + if ((code = match_token(f_ipdscp, s)) == -1) + errx(EX_DATAERR, "Unknown DSCP code"); + } else { + code = strtoul(s, NULL, 10); + if (code < 0 || code > 63) + errx(EX_DATAERR, "Invalid DSCP value"); + } + + if (code > 32) + *high |= 1 << (code - 32); + else + *low |= 1 << code; + + s = a; + } +} + +static struct _s_x icmpcodes[] = { + { "net", ICMP_UNREACH_NET }, + { "host", ICMP_UNREACH_HOST }, + { "protocol", ICMP_UNREACH_PROTOCOL }, + { "port", ICMP_UNREACH_PORT }, + { "needfrag", ICMP_UNREACH_NEEDFRAG }, + { "srcfail", ICMP_UNREACH_SRCFAIL }, + { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, + { 
"host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, + { "isolated", ICMP_UNREACH_ISOLATED }, + { "net-prohib", ICMP_UNREACH_NET_PROHIB }, + { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, + { "tosnet", ICMP_UNREACH_TOSNET }, + { "toshost", ICMP_UNREACH_TOSHOST }, + { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, + { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, + { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, + { NULL, 0 } +}; + +static void +fill_reject_code(u_short *codep, char *str) +{ + int val; + char *s; + + val = strtoul(str, &s, 0); + if (s == str || *s != '\0' || val >= 0x100) + val = match_token(icmpcodes, str); + if (val < 0) + errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); + *codep = val; + return; +} + +static void +print_reject_code(struct buf_pr *bp, uint16_t code) +{ + char const *s; + + if ((s = match_value(icmpcodes, code)) != NULL) + bprintf(bp, "unreach %s", s); + else + bprintf(bp, "unreach %u", code); +} + +/* + * Returns the number of bits set (from left) in a contiguous bitmask, + * or -1 if the mask is not contiguous. + * XXX this needs a proper fix. + * This effectively works on masks in big-endian (network) format. + * when compiled on little endian architectures. + * + * First bit is bit 7 of the first byte -- note, for MAC addresses, + * the first bit on the wire is bit 0 of the first byte. + * len is the max length in bits. + */ +int +contigmask(uint8_t *p, int len) +{ + int i, n; + + for (i=0; i<len ; i++) + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ + break; + for (n=i+1; n < len; n++) + if ( (p[n/8] & (1 << (7 - (n%8)))) != 0) + return -1; /* mask not contiguous */ + return i; +} + +/* + * print flags set/clear in the two bitmasks passed as parameters. + * There is a specialized check for f_tcpflags. 
+ */ +static void +print_flags(struct buf_pr *bp, char const *name, ipfw_insn *cmd, + struct _s_x *list) +{ + char const *comma = ""; + int i; + uint8_t set = cmd->arg1 & 0xff; + uint8_t clear = (cmd->arg1 >> 8) & 0xff; + + if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { + bprintf(bp, " setup"); + return; + } + + bprintf(bp, " %s ", name); + for (i=0; list[i].x != 0; i++) { + if (set & list[i].x) { + set &= ~list[i].x; + bprintf(bp, "%s%s", comma, list[i].s); + comma = ","; + } + if (clear & list[i].x) { + clear &= ~list[i].x; + bprintf(bp, "%s!%s", comma, list[i].s); + comma = ","; + } + } +} + + +/* + * Print the ip address contained in a command. + */ +static void +print_ip(struct buf_pr *bp, struct format_opts *fo, ipfw_insn_ip *cmd, + char const *s) +{ + struct hostent *he = NULL; + struct in_addr *ia; + uint32_t len = F_LEN((ipfw_insn *)cmd); + uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; + char *t; + + if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { + uint32_t d = a[1]; + const char *arg = "<invalid>"; + + if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) + arg = match_value(rule_options, lookup_key[d]); + t = table_search_ctlv(fo->tstate, ((ipfw_insn *)cmd)->arg1); + bprintf(bp, "%s lookup %s %s", cmd->o.len & F_NOT ? " not": "", + arg, t); + return; + } + bprintf(bp, "%s%s ", cmd->o.len & F_NOT ? 
" not": "", s); + + if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { + bprintf(bp, "me"); + return; + } + if (cmd->o.opcode == O_IP_SRC_LOOKUP || + cmd->o.opcode == O_IP_DST_LOOKUP) { + t = table_search_ctlv(fo->tstate, ((ipfw_insn *)cmd)->arg1); + bprintf(bp, "table(%s", t); + if (len == F_INSN_SIZE(ipfw_insn_u32)) + bprintf(bp, ",%u", *a); + bprintf(bp, ")"); + return; + } + if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { + uint32_t x, *map = (uint32_t *)&(cmd->mask); + int i, j; + char comma = '{'; + + x = cmd->o.arg1 - 1; + x = htonl( ~x ); + cmd->addr.s_addr = htonl(cmd->addr.s_addr); + bprintf(bp, "%s/%d", inet_ntoa(cmd->addr), + contigmask((uint8_t *)&x, 32)); + x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); + x &= 0xff; /* base */ + /* + * Print bits and ranges. + * Locate first bit set (i), then locate first bit unset (j). + * If we have 3+ consecutive bits set, then print them as a + * range, otherwise only print the initial bit and rescan. + */ + for (i=0; i < cmd->o.arg1; i++) + if (map[i/32] & (1<<(i & 31))) { + for (j=i+1; j < cmd->o.arg1; j++) + if (!(map[ j/32] & (1<<(j & 31)))) + break; + bprintf(bp, "%c%d", comma, i+x); + if (j>i+2) { /* range has at least 3 elements */ + bprintf(bp, "-%d", j-1+x); + i = j-1; + } + comma = ','; + } + bprintf(bp, "}"); + return; + } + /* + * len == 2 indicates a single IP, whereas lists of 1 or more + * addr/mask pairs have len = (2n+1). We convert len to n so we + * use that to count the number of entries. + */ + for (len = len / 2; len > 0; len--, a += 2) { + int mb = /* mask length */ + (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 
+ 32 : contigmask((uint8_t *)&(a[1]), 32); + if (mb == 32 && co.do_resolv) + he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); + if (he != NULL) /* resolved to name */ + bprintf(bp, "%s", he->h_name); + else if (mb == 0) /* any */ + bprintf(bp, "any"); + else { /* numeric IP followed by some kind of mask */ + ia = (struct in_addr *)&a[0]; + bprintf(bp, "%s", inet_ntoa(*ia)); + if (mb < 0) + bprintf(bp, ":%s", inet_ntoa(*ia ) ); + else if (mb < 32) + bprintf(bp, "/%d", mb); + } + if (len > 1) + bprintf(bp, ","); + } +} + +/* + * prints a MAC address/mask pair + */ +static void +print_mac(struct buf_pr *bp, uint8_t *addr, uint8_t *mask) +{ + int l = contigmask(mask, 48); + + if (l == 0) + bprintf(bp, " any"); + else { + bprintf(bp, " %02x:%02x:%02x:%02x:%02x:%02x", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + if (l == -1) + bprintf(bp, "&%02x:%02x:%02x:%02x:%02x:%02x", + mask[0], mask[1], mask[2], + mask[3], mask[4], mask[5]); + else if (l < 48) + bprintf(bp, "/%d", l); + } +} + +static void +fill_icmptypes(ipfw_insn_u32 *cmd, char *av) +{ + uint8_t type; + + cmd->d[0] = 0; + while (*av) { + if (*av == ',') + av++; + + type = strtoul(av, &av, 0); + + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ICMP type"); + + if (type > 31) + errx(EX_DATAERR, "ICMP type out of range"); + + cmd->d[0] |= 1 << type; + } + cmd->o.opcode = O_ICMPTYPE; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); +} + +static void +print_icmptypes(struct buf_pr *bp, ipfw_insn_u32 *cmd) +{ + int i; + char sep= ' '; + + bprintf(bp, " icmptypes"); + for (i = 0; i < 32; i++) { + if ( (cmd->d[0] & (1 << (i))) == 0) + continue; + bprintf(bp, "%c%d", sep, i); + sep = ','; + } +} + +static void +print_dscp(struct buf_pr *bp, ipfw_insn_u32 *cmd) +{ + int i, c; + uint32_t *v; + char sep= ' '; + const char *code; + + bprintf(bp, " dscp"); + i = 0; + c = 0; + v = cmd->d; + while (i < 64) { + if (*v & (1 << i)) { + if ((code = match_value(f_ipdscp, i)) != NULL) + bprintf(bp, 
"%c%s", sep, code); + else + bprintf(bp, "%c%d", sep, i); + sep = ','; + } + + if ((++i % 32) == 0) + v++; + } +} + +/* + * show_ipfw() prints the body of an ipfw rule. + * Because the standard rule has at least proto src_ip dst_ip, we use + * a helper function to produce these entries if not provided explicitly. + * The first argument is the list of fields we have, the second is + * the list of fields we want to be printed. + * + * Special cases if we have provided a MAC header: + * + if the rule does not contain IP addresses/ports, do not print them; + * + if the rule does not contain an IP proto, print "all" instead of "ip"; + * + * Once we have 'have_options', IP header fields are printed as options. + */ +#define HAVE_PROTO 0x0001 +#define HAVE_SRCIP 0x0002 +#define HAVE_DSTIP 0x0004 +#define HAVE_PROTO4 0x0008 +#define HAVE_PROTO6 0x0010 +#define HAVE_IP 0x0100 +#define HAVE_OPTIONS 0x8000 + +static void +show_prerequisites(struct buf_pr *bp, int *flags, int want, int cmd) +{ + (void)cmd; /* UNUSED */ + if (co.comment_only) + return; + if ( (*flags & HAVE_IP) == HAVE_IP) + *flags |= HAVE_OPTIONS; + + if ( !(*flags & HAVE_OPTIONS)) { + if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { + if ( (*flags & HAVE_PROTO4)) + bprintf(bp, " ip4"); + else if ( (*flags & HAVE_PROTO6)) + bprintf(bp, " ip6"); + else + bprintf(bp, " ip"); + } + if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) + bprintf(bp, " from any"); + if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) + bprintf(bp, " to any"); + } + *flags |= want; +} + +static void +show_static_rule(struct cmdline_opts *co, struct format_opts *fo, + struct buf_pr *bp, struct ip_fw_rule *rule, struct ip_fw_bcounter *cntr) +{ + static int twidth = 0; + int l; + ipfw_insn *cmd, *tagptr = NULL; + const char *comment = NULL; /* ptr to comment if we have one */ + int proto = 0; /* default */ + int flags = 0; /* prerequisites */ + ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ + ipfw_insn_altq *altqptr = 
NULL; /* set if we find an O_ALTQ */ + int or_block = 0; /* we are in an or block */ + uint32_t uval; + + if ((fo->set_mask & (1 << rule->set)) == 0) { + /* disabled mask */ + if (!co->show_sets) + return; + else + bprintf(bp, "# DISABLED "); + } + bprintf(bp, "%05u ", rule->rulenum); + + /* Print counters if enabled */ + if (fo->pcwidth > 0 || fo->bcwidth > 0) { + pr_u64(bp, &cntr->pcnt, fo->pcwidth); + pr_u64(bp, &cntr->bcnt, fo->bcwidth); + } + + if (co->do_time == 2) + bprintf(bp, "%10u ", cntr->timestamp); + else if (co->do_time == 1) { + char timestr[30]; + time_t t = (time_t)0; + + if (twidth == 0) { + strcpy(timestr, ctime(&t)); + *strchr(timestr, '\n') = '\0'; + twidth = strlen(timestr); + } + if (cntr->timestamp > 0) { + t = _long_to_time(cntr->timestamp); + + strcpy(timestr, ctime(&t)); + *strchr(timestr, '\n') = '\0'; + bprintf(bp, "%s ", timestr); + } else { + bprintf(bp, "%*s", twidth, " "); + } + } + + if (co->show_sets) + bprintf(bp, "set %d ", rule->set); + + /* + * print the optional "match probability" + */ + if (rule->cmd_len > 0) { + cmd = rule->cmd ; + if (cmd->opcode == O_PROB) { + ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; + double d = 1.0 * p->d[0]; + + d = (d / 0x7fffffff); + bprintf(bp, "prob %f ", d); + } + } + + /* + * first print actions + */ + for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); + l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { + switch(cmd->opcode) { + case O_CHECK_STATE: + bprintf(bp, "check-state"); + /* avoid printing anything else */ + flags = HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP; + break; + + case O_ACCEPT: + bprintf(bp, "allow"); + break; + + case O_COUNT: + bprintf(bp, "count"); + break; + + case O_DENY: + bprintf(bp, "deny"); + break; + + case O_REJECT: + if (cmd->arg1 == ICMP_REJECT_RST) + bprintf(bp, "reset"); + else if (cmd->arg1 == ICMP_UNREACH_HOST) + bprintf(bp, "reject"); + else + print_reject_code(bp, cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1 == ICMP6_UNREACH_RST) + 
bprintf(bp, "reset6"); + else + print_unreach6_code(cmd->arg1); + break; + + case O_SKIPTO: + bprint_uint_arg(bp, "skipto ", cmd->arg1); + break; + + case O_PIPE: + bprint_uint_arg(bp, "pipe ", cmd->arg1); + break; + + case O_QUEUE: + bprint_uint_arg(bp, "queue ", cmd->arg1); + break; + + case O_DIVERT: + bprint_uint_arg(bp, "divert ", cmd->arg1); + break; + + case O_TEE: + bprint_uint_arg(bp, "tee ", cmd->arg1); + break; + + case O_NETGRAPH: + bprint_uint_arg(bp, "netgraph ", cmd->arg1); + break; + + case O_NGTEE: + bprint_uint_arg(bp, "ngtee ", cmd->arg1); + break; + + case O_FORWARD_IP: + { + ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; + + if (s->sa.sin_addr.s_addr == INADDR_ANY) { + bprintf(bp, "fwd tablearg"); + } else { + bprintf(bp, "fwd %s",inet_ntoa(s->sa.sin_addr)); + } + if (s->sa.sin_port) + bprintf(bp, ",%d", s->sa.sin_port); + } + break; + + case O_FORWARD_IP6: + { + char buf[4 + INET6_ADDRSTRLEN + 1]; + ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd; + + bprintf(bp, "fwd %s", inet_ntop(AF_INET6, + &s->sa.sin6_addr, buf, sizeof(buf))); + if (s->sa.sin6_port) + bprintf(bp, ",%d", s->sa.sin6_port); + } + break; + + case O_LOG: /* O_LOG is printed last */ + logptr = (ipfw_insn_log *)cmd; + break; + + case O_ALTQ: /* O_ALTQ is printed after O_LOG */ + altqptr = (ipfw_insn_altq *)cmd; + break; + + case O_TAG: + tagptr = cmd; + break; + + case O_NAT: + if (cmd->arg1 != 0) + bprint_uint_arg(bp, "nat ", cmd->arg1); + else + bprintf(bp, "nat global"); + break; + + case O_SETFIB: + bprint_uint_arg(bp, "setfib ", cmd->arg1 & 0x7FFF); + break; + + case O_SETDSCP: + { + const char *code; + + if (cmd->arg1 == IP_FW_TARG) { + bprint_uint_arg(bp, "setdscp ", cmd->arg1); + break; + } + uval = cmd->arg1 & 0x3F; + if ((code = match_value(f_ipdscp, uval)) != NULL) + bprintf(bp, "setdscp %s", code); + else + bprint_uint_arg(bp, "setdscp ", uval); + } + break; + + case O_REASS: + bprintf(bp, "reass"); + break; + + case O_CALLRETURN: + if (cmd->len & F_NOT) + bprintf(bp, "return"); + 
else + bprint_uint_arg(bp, "call ", cmd->arg1); + break; + + default: + bprintf(bp, "** unrecognized action %d len %d ", + cmd->opcode, cmd->len); + } + } + if (logptr) { + if (logptr->max_log > 0) + bprintf(bp, " log logamount %d", logptr->max_log); + else + bprintf(bp, " log"); + } +#ifndef NO_ALTQ + if (altqptr) { + print_altq_cmd(bp, altqptr); + } +#endif + if (tagptr) { + if (tagptr->len & F_NOT) + bprint_uint_arg(bp, " untag ", tagptr->arg1); + else + bprint_uint_arg(bp, " tag ", tagptr->arg1); + } + + /* + * then print the body. + */ + for (l = rule->act_ofs, cmd = rule->cmd; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + if ((cmd->len & F_OR) || (cmd->len & F_NOT)) + continue; + if (cmd->opcode == O_IP4) { + flags |= HAVE_PROTO4; + break; + } else if (cmd->opcode == O_IP6) { + flags |= HAVE_PROTO6; + break; + } + } + if (rule->flags & IPFW_RULE_NOOPT) { /* empty rules before options */ + if (!co->do_compact) { + show_prerequisites(bp, &flags, HAVE_PROTO, 0); + bprintf(bp, " from any to any"); + } + flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | + HAVE_SRCIP | HAVE_DSTIP; + } + + if (co->comment_only) + comment = "..."; + + for (l = rule->act_ofs, cmd = rule->cmd; + l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { + /* useful alias */ + ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; + + if (co->comment_only) { + if (cmd->opcode != O_NOP) + continue; + bprintf(bp, " // %s\n", (char *)(cmd + 1)); + return; + } + + show_prerequisites(bp, &flags, 0, cmd->opcode); + + switch(cmd->opcode) { + case O_PROB: + break; /* done already */ + + case O_PROBE_STATE: + break; /* no need to print anything here */ + + case O_IP_SRC: + case O_IP_SRC_LOOKUP: + case O_IP_SRC_MASK: + case O_IP_SRC_ME: + case O_IP_SRC_SET: + show_prerequisites(bp, &flags, HAVE_PROTO, 0); + if (!(flags & HAVE_SRCIP)) + bprintf(bp, " from"); + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + print_ip(bp, fo, (ipfw_insn_ip *)cmd, + (flags & HAVE_OPTIONS) ? 
" src-ip" : ""); + flags |= HAVE_SRCIP; + break; + + case O_IP_DST: + case O_IP_DST_LOOKUP: + case O_IP_DST_MASK: + case O_IP_DST_ME: + case O_IP_DST_SET: + show_prerequisites(bp, &flags, HAVE_PROTO|HAVE_SRCIP, 0); + if (!(flags & HAVE_DSTIP)) + bprintf(bp, " to"); + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + print_ip(bp, fo, (ipfw_insn_ip *)cmd, + (flags & HAVE_OPTIONS) ? " dst-ip" : ""); + flags |= HAVE_DSTIP; + break; + + case O_IP6_SRC: + case O_IP6_SRC_MASK: + case O_IP6_SRC_ME: + show_prerequisites(bp, &flags, HAVE_PROTO, 0); + if (!(flags & HAVE_SRCIP)) + bprintf(bp, " from"); + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + print_ip6(bp, (ipfw_insn_ip6 *)cmd, + (flags & HAVE_OPTIONS) ? " src-ip6" : ""); + flags |= HAVE_SRCIP | HAVE_PROTO; + break; + + case O_IP6_DST: + case O_IP6_DST_MASK: + case O_IP6_DST_ME: + show_prerequisites(bp, &flags, HAVE_PROTO|HAVE_SRCIP, 0); + if (!(flags & HAVE_DSTIP)) + bprintf(bp, " to"); + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + print_ip6(bp, (ipfw_insn_ip6 *)cmd, + (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); + flags |= HAVE_DSTIP; + break; + + case O_FLOW6ID: + print_flow6id(bp, (ipfw_insn_u32 *) cmd ); + flags |= HAVE_OPTIONS; + break; + + case O_IP_DSTPORT: + show_prerequisites(bp, &flags, + HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP, 0); + case O_IP_SRCPORT: + if (flags & HAVE_DSTIP) + flags |= HAVE_IP; + show_prerequisites(bp, &flags, + HAVE_PROTO | HAVE_SRCIP, 0); + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + if (cmd->len & F_NOT) + bprintf(bp, " not"); + print_newports(bp, (ipfw_insn_u16 *)cmd, proto, + (flags & HAVE_OPTIONS) ? 
cmd->opcode : 0); + break; + + case O_PROTO: { + struct protoent *pe = NULL; + + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + if (cmd->len & F_NOT) + bprintf(bp, " not"); + proto = cmd->arg1; + pe = getprotobynumber(cmd->arg1); + if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && + !(flags & HAVE_PROTO)) + show_prerequisites(bp, &flags, + HAVE_PROTO | HAVE_IP | HAVE_SRCIP | + HAVE_DSTIP | HAVE_OPTIONS, 0); + if (flags & HAVE_OPTIONS) + bprintf(bp, " proto"); + if (pe) + bprintf(bp, " %s", pe->p_name); + else + bprintf(bp, " %u", cmd->arg1); + } + flags |= HAVE_PROTO; + break; + + default: /*options ... */ + if (!(cmd->len & (F_OR|F_NOT))) + if (((cmd->opcode == O_IP6) && + (flags & HAVE_PROTO6)) || + ((cmd->opcode == O_IP4) && + (flags & HAVE_PROTO4))) + break; + show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); + if ((cmd->len & F_OR) && !or_block) + bprintf(bp, " {"); + if (cmd->len & F_NOT && cmd->opcode != O_IN) + bprintf(bp, " not"); + switch(cmd->opcode) { + case O_MACADDR2: { + ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; + + bprintf(bp, " MAC"); + print_mac(bp, m->addr, m->mask); + print_mac(bp, m->addr + 6, m->mask + 6); + } + break; + + case O_MAC_TYPE: + print_newports(bp, (ipfw_insn_u16 *)cmd, + IPPROTO_ETHERTYPE, cmd->opcode); + break; + + + case O_FRAG: + bprintf(bp, " frag"); + break; + + case O_FIB: + bprintf(bp, " fib %u", cmd->arg1 ); + break; + case O_SOCKARG: + bprintf(bp, " sockarg"); + break; + + case O_IN: + bprintf(bp, cmd->len & F_NOT ? 
" out" : " in"); + break; + + case O_DIVERTED: + switch (cmd->arg1) { + case 3: + bprintf(bp, " diverted"); + break; + case 1: + bprintf(bp, " diverted-loopback"); + break; + case 2: + bprintf(bp, " diverted-output"); + break; + default: + bprintf(bp, " diverted-?<%u>", cmd->arg1); + break; + } + break; + + case O_LAYER2: + bprintf(bp, " layer2"); + break; + case O_XMIT: + case O_RECV: + case O_VIA: + { + char const *s, *t; + ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; + + if (cmd->opcode == O_XMIT) + s = "xmit"; + else if (cmd->opcode == O_RECV) + s = "recv"; + else /* if (cmd->opcode == O_VIA) */ + s = "via"; + if (cmdif->name[0] == '\0') + bprintf(bp, " %s %s", s, + inet_ntoa(cmdif->p.ip)); + else if (cmdif->name[0] == '\1') { + /* interface table */ + t = table_search_ctlv(fo->tstate, + cmdif->p.kidx); + bprintf(bp, " %s table(%s)", s, t); + } else + bprintf(bp, " %s %s", s, cmdif->name); + + break; + } + case O_IP_FLOW_LOOKUP: + { + char *t; + + t = table_search_ctlv(fo->tstate, cmd->arg1); + bprintf(bp, " flow table(%s", t); + if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) + bprintf(bp, ",%u", + ((ipfw_insn_u32 *)cmd)->d[0]); + bprintf(bp, ")"); + break; + } + case O_IPID: + if (F_LEN(cmd) == 1) + bprintf(bp, " ipid %u", cmd->arg1 ); + else + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, + O_IPID); + break; + + case O_IPTTL: + if (F_LEN(cmd) == 1) + bprintf(bp, " ipttl %u", cmd->arg1 ); + else + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, + O_IPTTL); + break; + + case O_IPVER: + bprintf(bp, " ipver %u", cmd->arg1 ); + break; + + case O_IPPRECEDENCE: + bprintf(bp, " ipprecedence %u", cmd->arg1 >> 5); + break; + + case O_DSCP: + print_dscp(bp, (ipfw_insn_u32 *)cmd); + break; + + case O_IPLEN: + if (F_LEN(cmd) == 1) + bprintf(bp, " iplen %u", cmd->arg1 ); + else + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, + O_IPLEN); + break; + + case O_IPOPT: + print_flags(bp, "ipoptions", cmd, f_ipopts); + break; + + case O_IPTOS: + print_flags(bp, "iptos", cmd, f_iptos); + 
break; + + case O_ICMPTYPE: + print_icmptypes(bp, (ipfw_insn_u32 *)cmd); + break; + + case O_ESTAB: + bprintf(bp, " established"); + break; + + case O_TCPDATALEN: + if (F_LEN(cmd) == 1) + bprintf(bp, " tcpdatalen %u", cmd->arg1 ); + else + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, + O_TCPDATALEN); + break; + + case O_TCPFLAGS: + print_flags(bp, "tcpflags", cmd, f_tcpflags); + break; + + case O_TCPOPTS: + print_flags(bp, "tcpoptions", cmd, f_tcpopts); + break; + + case O_TCPWIN: + if (F_LEN(cmd) == 1) + bprintf(bp, " tcpwin %u", cmd->arg1); + else + print_newports(bp, (ipfw_insn_u16 *)cmd, 0, + O_TCPWIN); + break; + + case O_TCPACK: + bprintf(bp, " tcpack %d", ntohl(cmd32->d[0])); + break; + + case O_TCPSEQ: + bprintf(bp, " tcpseq %d", ntohl(cmd32->d[0])); + break; + + case O_UID: + { + struct passwd *pwd = getpwuid(cmd32->d[0]); + + if (pwd) + bprintf(bp, " uid %s", pwd->pw_name); + else + bprintf(bp, " uid %u", cmd32->d[0]); + } + break; + + case O_GID: + { + struct group *grp = getgrgid(cmd32->d[0]); + + if (grp) + bprintf(bp, " gid %s", grp->gr_name); + else + bprintf(bp, " gid %u", cmd32->d[0]); + } + break; + + case O_JAIL: + bprintf(bp, " jail %d", cmd32->d[0]); + break; + + case O_VERREVPATH: + bprintf(bp, " verrevpath"); + break; + + case O_VERSRCREACH: + bprintf(bp, " versrcreach"); + break; + + case O_ANTISPOOF: + bprintf(bp, " antispoof"); + break; + + case O_IPSEC: + bprintf(bp, " ipsec"); + break; + + case O_NOP: + comment = (char *)(cmd + 1); + break; + + case O_KEEP_STATE: + bprintf(bp, " keep-state"); + break; + + case O_LIMIT: { + struct _s_x *p = limit_masks; + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + uint8_t x = c->limit_mask; + char const *comma = " "; + + bprintf(bp, " limit"); + for (; p->x != 0 ; p++) + if ((x & p->x) == p->x) { + x &= ~p->x; + bprintf(bp, "%s%s", comma,p->s); + comma = ","; + } + bprint_uint_arg(bp, " ", c->conn_limit); + break; + } + + case O_IP6: + bprintf(bp, " ip6"); + break; + + case O_IP4: + bprintf(bp, " 
ip4"); + break; + + case O_ICMP6TYPE: + print_icmp6types(bp, (ipfw_insn_u32 *)cmd); + break; + + case O_EXT_HDR: + print_ext6hdr(bp, (ipfw_insn *)cmd); + break; + + case O_TAGGED: + if (F_LEN(cmd) == 1) + bprint_uint_arg(bp, " tagged ", + cmd->arg1); + else + print_newports(bp, (ipfw_insn_u16 *)cmd, + 0, O_TAGGED); + break; + + default: + bprintf(bp, " [opcode %d len %d]", + cmd->opcode, cmd->len); + } + } + if (cmd->len & F_OR) { + bprintf(bp, " or"); + or_block = 1; + } else if (or_block) { + bprintf(bp, " }"); + or_block = 0; + } + } + show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP + | HAVE_IP, 0); + if (comment) + bprintf(bp, " // %s", comment); + bprintf(bp, "\n"); +} + +static void +show_dyn_state(struct cmdline_opts *co, struct format_opts *fo, + struct buf_pr *bp, ipfw_dyn_rule *d) +{ + struct protoent *pe; + struct in_addr a; + uint16_t rulenum; + char buf[INET6_ADDRSTRLEN]; + + if (!co->do_expired) { + if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) + return; + } + bcopy(&d->rule, &rulenum, sizeof(rulenum)); + bprintf(bp, "%05d", rulenum); + if (fo->pcwidth > 0 || fo->bcwidth > 0) { + bprintf(bp, " "); + pr_u64(bp, &d->pcnt, fo->pcwidth); + pr_u64(bp, &d->bcnt, fo->bcwidth); + bprintf(bp, "(%ds)", d->expire); + } + switch (d->dyn_type) { + case O_LIMIT_PARENT: + bprintf(bp, " PARENT %d", d->count); + break; + case O_LIMIT: + bprintf(bp, " LIMIT"); + break; + case O_KEEP_STATE: /* bidir, no mask */ + bprintf(bp, " STATE"); + break; + } + + if ((pe = getprotobynumber(d->id.proto)) != NULL) + bprintf(bp, " %s", pe->p_name); + else + bprintf(bp, " proto %u", d->id.proto); + + if (d->id.addr_type == 4) { + a.s_addr = htonl(d->id.src_ip); + bprintf(bp, " %s %d", inet_ntoa(a), d->id.src_port); + + a.s_addr = htonl(d->id.dst_ip); + bprintf(bp, " <-> %s %d", inet_ntoa(a), d->id.dst_port); + } else if (d->id.addr_type == 6) { + bprintf(bp, " %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, + sizeof(buf)), d->id.src_port); + bprintf(bp, " 
<-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, + buf, sizeof(buf)), d->id.dst_port); + } else + bprintf(bp, " UNKNOWN <-> UNKNOWN\n"); +} + +static int +do_range_cmd(int cmd, ipfw_range_tlv *rt) +{ + ipfw_range_header rh; + size_t sz; + + memset(&rh, 0, sizeof(rh)); + memcpy(&rh.range, rt, sizeof(*rt)); + rh.range.head.length = sizeof(*rt); + rh.range.head.type = IPFW_TLV_RANGE; + sz = sizeof(rh); + + if (do_get3(cmd, &rh.opheader, &sz) != 0) + return (-1); + /* Save number of matched objects */ + rt->new_set = rh.range.new_set; + return (0); +} + +/* + * This one handles all set-related commands + * ipfw set { show | enable | disable } + * ipfw set swap X Y + * ipfw set move X to Y + * ipfw set move rule X to Y + */ +void +ipfw_sets_handler(char *av[]) +{ + uint32_t masks[2]; + int i; + uint8_t cmd, rulenum; + ipfw_range_tlv rt; + char *msg; + size_t size; + + av++; + memset(&rt, 0, sizeof(rt)); + + if (av[0] == NULL) + errx(EX_USAGE, "set needs command"); + if (_substrcmp(*av, "show") == 0) { + struct format_opts fo; + ipfw_cfg_lheader *cfg; + + memset(&fo, 0, sizeof(fo)); + if (ipfw_get_config(&co, &fo, &cfg, &size) != 0) + err(EX_OSERR, "requesting config failed"); + + for (i = 0, msg = "disable"; i < RESVD_SET; i++) + if ((cfg->set_mask & (1<<i)) == 0) { + printf("%s %d", msg, i); + msg = ""; + } + msg = (cfg->set_mask != (uint32_t)-1) ? 
" enable" : "enable"; + for (i = 0; i < RESVD_SET; i++) + if ((cfg->set_mask & (1<<i)) != 0) { + printf("%s %d", msg, i); + msg = ""; + } + printf("\n"); + free(cfg); + } else if (_substrcmp(*av, "swap") == 0) { + av++; + if ( av[0] == NULL || av[1] == NULL ) + errx(EX_USAGE, "set swap needs 2 set numbers\n"); + rt.set = atoi(av[0]); + rt.new_set = atoi(av[1]); + if (!isdigit(*(av[0])) || rt.set > RESVD_SET) + errx(EX_DATAERR, "invalid set number %s\n", av[0]); + if (!isdigit(*(av[1])) || rt.new_set > RESVD_SET) + errx(EX_DATAERR, "invalid set number %s\n", av[1]); + i = do_range_cmd(IP_FW_SET_SWAP, &rt); + } else if (_substrcmp(*av, "move") == 0) { + av++; + if (av[0] && _substrcmp(*av, "rule") == 0) { + rt.flags = IPFW_RCFLAG_RANGE; /* move rules to new set */ + cmd = IP_FW_XMOVE; + av++; + } else + cmd = IP_FW_SET_MOVE; /* Move set to new one */ + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || + av[3] != NULL || _substrcmp(av[1], "to") != 0) + errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); + rulenum = atoi(av[0]); + rt.new_set = atoi(av[2]); + if (cmd == IP_FW_XMOVE) { + rt.start_rule = rulenum; + rt.end_rule = rulenum; + } else + rt.set = rulenum; + rt.new_set = atoi(av[2]); + if (!isdigit(*(av[0])) || (cmd == 3 && rt.set > RESVD_SET) || + (cmd == 2 && rt.start_rule == IPFW_DEFAULT_RULE) ) + errx(EX_DATAERR, "invalid source number %s\n", av[0]); + if (!isdigit(*(av[2])) || rt.new_set > RESVD_SET) + errx(EX_DATAERR, "invalid dest. set %s\n", av[1]); + i = do_range_cmd(cmd, &rt); + } else if (_substrcmp(*av, "disable") == 0 || + _substrcmp(*av, "enable") == 0 ) { + int which = _substrcmp(*av, "enable") == 0 ? 
1 : 0; + + av++; + masks[0] = masks[1] = 0; + + while (av[0]) { + if (isdigit(**av)) { + i = atoi(*av); + if (i < 0 || i > RESVD_SET) + errx(EX_DATAERR, + "invalid set number %d\n", i); + masks[which] |= (1<<i); + } else if (_substrcmp(*av, "disable") == 0) + which = 0; + else if (_substrcmp(*av, "enable") == 0) + which = 1; + else + errx(EX_DATAERR, + "invalid set command %s\n", *av); + av++; + } + if ( (masks[0] & masks[1]) != 0 ) + errx(EX_DATAERR, + "cannot enable and disable the same set\n"); + + rt.set = masks[0]; + rt.new_set = masks[1]; + i = do_range_cmd(IP_FW_SET_ENABLE, &rt); + if (i) + warn("set enable/disable: setsockopt(IP_FW_SET_ENABLE)"); + } else + errx(EX_USAGE, "invalid set command %s\n", *av); +} + +void +ipfw_sysctl_handler(char *av[], int which) +{ + av++; + + if (av[0] == NULL) { + warnx("missing keyword to enable/disable\n"); + } else if (_substrcmp(*av, "firewall") == 0) { + sysctlbyname("net.inet.ip.fw.enable", NULL, 0, + &which, sizeof(which)); + sysctlbyname("net.inet6.ip6.fw.enable", NULL, 0, + &which, sizeof(which)); + } else if (_substrcmp(*av, "one_pass") == 0) { + sysctlbyname("net.inet.ip.fw.one_pass", NULL, 0, + &which, sizeof(which)); + } else if (_substrcmp(*av, "debug") == 0) { + sysctlbyname("net.inet.ip.fw.debug", NULL, 0, + &which, sizeof(which)); + } else if (_substrcmp(*av, "verbose") == 0) { + sysctlbyname("net.inet.ip.fw.verbose", NULL, 0, + &which, sizeof(which)); + } else if (_substrcmp(*av, "dyn_keepalive") == 0) { + sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0, + &which, sizeof(which)); +#ifndef NO_ALTQ + } else if (_substrcmp(*av, "altq") == 0) { + altq_set_enabled(which); +#endif + } else { + warnx("unrecognize enable/disable keyword: %s\n", *av); + } +} + +typedef void state_cb(struct cmdline_opts *co, struct format_opts *fo, + void *arg, void *state); + +static void +prepare_format_dyn(struct cmdline_opts *co, struct format_opts *fo, + void *arg, void *_state) +{ + ipfw_dyn_rule *d; + int width; + uint8_t 
set; + + d = (ipfw_dyn_rule *)_state; + /* Count _ALL_ states */ + fo->dcnt++; + + if (fo->show_counters == 0) + return; + + if (co->use_set) { + /* skip states from another set */ + bcopy((char *)&d->rule + sizeof(uint16_t), &set, + sizeof(uint8_t)); + if (set != co->use_set - 1) + return; + } + + width = pr_u64(NULL, &d->pcnt, 0); + if (width > fo->pcwidth) + fo->pcwidth = width; + + width = pr_u64(NULL, &d->bcnt, 0); + if (width > fo->bcwidth) + fo->bcwidth = width; +} + +static int +foreach_state(struct cmdline_opts *co, struct format_opts *fo, + caddr_t base, size_t sz, state_cb dyn_bc, void *dyn_arg) +{ + int ttype; + state_cb *fptr; + void *farg; + ipfw_obj_tlv *tlv; + ipfw_obj_ctlv *ctlv; + + fptr = NULL; + ttype = 0; + + while (sz > 0) { + ctlv = (ipfw_obj_ctlv *)base; + switch (ctlv->head.type) { + case IPFW_TLV_DYNSTATE_LIST: + base += sizeof(*ctlv); + sz -= sizeof(*ctlv); + ttype = IPFW_TLV_DYN_ENT; + fptr = dyn_bc; + farg = dyn_arg; + break; + default: + return (sz); + } + + while (sz > 0) { + tlv = (ipfw_obj_tlv *)base; + if (tlv->type != ttype) + break; + + fptr(co, fo, farg, tlv + 1); + sz -= tlv->length; + base += tlv->length; + } + } + + return (sz); +} + +static void +prepare_format_opts(struct cmdline_opts *co, struct format_opts *fo, + ipfw_obj_tlv *rtlv, int rcnt, caddr_t dynbase, size_t dynsz) +{ + int bcwidth, pcwidth, width; + int n; + struct ip_fw_bcounter *cntr; + struct ip_fw_rule *r; + + bcwidth = 0; + pcwidth = 0; + if (fo->show_counters != 0) { + for (n = 0; n < rcnt; n++, + rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { + cntr = (struct ip_fw_bcounter *)(rtlv + 1); + r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); + /* skip rules from another set */ + if (co->use_set && r->set != co->use_set - 1) + continue; + + /* packet counter */ + width = pr_u64(NULL, &cntr->pcnt, 0); + if (width > pcwidth) + pcwidth = width; + + /* byte counter */ + width = pr_u64(NULL, &cntr->bcnt, 0); + if (width > bcwidth) + bcwidth = width; 
+ } + } + fo->bcwidth = bcwidth; + fo->pcwidth = pcwidth; + + fo->dcnt = 0; + if (co->do_dynamic && dynsz > 0) + foreach_state(co, fo, dynbase, dynsz, prepare_format_dyn, NULL); +} + +static int +list_static_range(struct cmdline_opts *co, struct format_opts *fo, + struct buf_pr *bp, ipfw_obj_tlv *rtlv, int rcnt) +{ + int n, seen; + struct ip_fw_rule *r; + struct ip_fw_bcounter *cntr; + int c = 0; + + for (n = seen = 0; n < rcnt; n++, + rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { + + if (fo->show_counters != 0) { + cntr = (struct ip_fw_bcounter *)(rtlv + 1); + r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); + } else { + cntr = NULL; + r = (struct ip_fw_rule *)(rtlv + 1); + } + if (r->rulenum > fo->last) + break; + if (co->use_set && r->set != co->use_set - 1) + continue; + if (r->rulenum >= fo->first && r->rulenum <= fo->last) { + show_static_rule(co, fo, bp, r, cntr); + printf("%s", bp->buf); + c += rtlv->length; + bp_flush(bp); + seen++; + } + } + + return (seen); +} + +static void +list_dyn_state(struct cmdline_opts *co, struct format_opts *fo, + void *_arg, void *_state) +{ + uint16_t rulenum; + uint8_t set; + ipfw_dyn_rule *d; + struct buf_pr *bp; + + d = (ipfw_dyn_rule *)_state; + bp = (struct buf_pr *)_arg; + + bcopy(&d->rule, &rulenum, sizeof(rulenum)); + if (rulenum > fo->last) + return; + if (co->use_set) { + bcopy((char *)&d->rule + sizeof(uint16_t), + &set, sizeof(uint8_t)); + if (set != co->use_set - 1) + return; + } + if (rulenum >= fo->first) { + show_dyn_state(co, fo, bp, d); + printf("%s\n", bp->buf); + bp_flush(bp); + } +} + +static int +list_dyn_range(struct cmdline_opts *co, struct format_opts *fo, + struct buf_pr *bp, caddr_t base, size_t sz) +{ + + sz = foreach_state(co, fo, base, sz, list_dyn_state, bp); + return (sz); +} + +void +ipfw_list(int ac, char *av[], int show_counters) +{ + ipfw_cfg_lheader *cfg; + struct format_opts sfo; + size_t sz; + int error; + int lac; + char **lav; + uint32_t rnum; + char *endptr; + + if 
(co.test_only) { + fprintf(stderr, "Testing only, list disabled\n"); + return; + } + if (co.do_pipe) { + dummynet_list(ac, av, show_counters); + return; + } + + ac--; + av++; + memset(&sfo, 0, sizeof(sfo)); + + /* Determine rule range to request */ + if (ac > 0) { + for (lac = ac, lav = av; lac != 0; lac--) { + rnum = strtoul(*lav++, &endptr, 10); + if (sfo.first == 0 || rnum < sfo.first) + sfo.first = rnum; + + if (*endptr == '-') + rnum = strtoul(endptr + 1, &endptr, 10); + if (sfo.last == 0 || rnum > sfo.last) + sfo.last = rnum; + } + } + + /* get configuraion from kernel */ + cfg = NULL; + sfo.show_counters = show_counters; + sfo.flags = IPFW_CFG_GET_STATIC; + if (co.do_dynamic != 0) + sfo.flags |= IPFW_CFG_GET_STATES; + if (sfo.show_counters != 0) + sfo.flags |= IPFW_CFG_GET_COUNTERS; + if (ipfw_get_config(&co, &sfo, &cfg, &sz) != 0) + err(EX_OSERR, "retrieving config failed"); + + error = ipfw_show_config(&co, &sfo, cfg, sz, ac, av); + + free(cfg); + + if (error != EX_OK) + exit(error); +} + +static int +ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, + ipfw_cfg_lheader *cfg, size_t sz, int ac, char *av[]) +{ + caddr_t dynbase; + size_t dynsz; + int rcnt; + int exitval = EX_OK; + int lac; + char **lav; + char *endptr; + size_t readsz; + struct buf_pr bp; + ipfw_obj_ctlv *ctlv, *tstate; + ipfw_obj_tlv *rbase; + + /* + * Handle tablenames TLV first, if any + */ + tstate = NULL; + rbase = NULL; + dynbase = NULL; + dynsz = 0; + readsz = sizeof(*cfg); + rcnt = 0; + + fo->set_mask = cfg->set_mask; + + ctlv = (ipfw_obj_ctlv *)(cfg + 1); + + if (cfg->flags & IPFW_CFG_GET_STATIC) { + /* We've requested static rules */ + if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { + fo->tstate = ctlv; + readsz += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + + ctlv->head.length); + } + + if (ctlv->head.type == IPFW_TLV_RULE_LIST) { + rbase = (ipfw_obj_tlv *)(ctlv + 1); + rcnt = ctlv->count; + readsz += ctlv->head.length; + ctlv = (ipfw_obj_ctlv 
*)((caddr_t)ctlv + + ctlv->head.length); + } + } + + if ((cfg->flags & IPFW_CFG_GET_STATES) && (readsz != sz)) { + /* We may have some dynamic states */ + dynsz = sz - readsz; + /* Skip empty header */ + if (dynsz != sizeof(ipfw_obj_ctlv)) + dynbase = (caddr_t)ctlv; + else + dynsz = 0; + } + + prepare_format_opts(co, fo, rbase, rcnt, dynbase, dynsz); + bp_alloc(&bp, 4096); + + /* if no rule numbers were specified, list all rules */ + if (ac == 0) { + fo->first = 0; + fo->last = IPFW_DEFAULT_RULE; + list_static_range(co, fo, &bp, rbase, rcnt); + + if (co->do_dynamic && dynsz > 0) { + printf("## Dynamic rules (%d %zu):\n", fo->dcnt, dynsz); + list_dyn_range(co, fo, &bp, dynbase, dynsz); + } + + bp_free(&bp); + return (EX_OK); + } + + /* display specific rules requested on command line */ + for (lac = ac, lav = av; lac != 0; lac--) { + /* convert command line rule # */ + fo->last = fo->first = strtoul(*lav++, &endptr, 10); + if (*endptr == '-') + fo->last = strtoul(endptr + 1, &endptr, 10); + if (*endptr) { + exitval = EX_USAGE; + warnx("invalid rule number: %s", *(lav - 1)); + continue; + } + + if (list_static_range(co, fo, &bp, rbase, rcnt) == 0) { + /* give precedence to other error(s) */ + if (exitval == EX_OK) + exitval = EX_UNAVAILABLE; + if (fo->first == fo->last) + warnx("rule %u does not exist", fo->first); + else + warnx("no rules in range %u-%u", + fo->first, fo->last); + } + } + + if (co->do_dynamic && dynsz > 0) { + printf("## Dynamic rules:\n"); + for (lac = ac, lav = av; lac != 0; lac--) { + fo->last = fo->first = strtoul(*lav++, &endptr, 10); + if (*endptr == '-') + fo->last = strtoul(endptr+1, &endptr, 10); + if (*endptr) + /* already warned */ + continue; + list_dyn_range(co, fo, &bp, dynbase, dynsz); + } + } + + bp_free(&bp); + return (exitval); +} + + +/* + * Retrieves current ipfw configuration of given type + * and stores its pointer to @pcfg. + * + * Caller is responsible for freeing @pcfg. + * + * Returns 0 on success. 
+ */ + +static int +ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, + ipfw_cfg_lheader **pcfg, size_t *psize) +{ + ipfw_cfg_lheader *cfg; + size_t sz; + int i; + + + if (co->test_only != 0) { + fprintf(stderr, "Testing only, list disabled\n"); + return (0); + } + + /* Start with some data size */ + sz = 4096; + cfg = NULL; + + for (i = 0; i < 16; i++) { + if (cfg != NULL) + free(cfg); + if ((cfg = calloc(1, sz)) == NULL) + return (ENOMEM); + + cfg->flags = fo->flags; + cfg->start_rule = fo->first; + cfg->end_rule = fo->last; + + if (do_get3(IP_FW_XGET, &cfg->opheader, &sz) != 0) { + if (errno != ENOMEM) { + free(cfg); + return (errno); + } + + /* Buffer size is not enough. Try to increase */ + sz = sz * 2; + if (sz < cfg->size) + sz = cfg->size; + continue; + } + + *pcfg = cfg; + *psize = sz; + return (0); + } + + free(cfg); + return (ENOMEM); +} + +static int +lookup_host (char *host, struct in_addr *ipaddr) +{ + struct hostent *he; + + if (!inet_aton(host, ipaddr)) { + if ((he = gethostbyname(host)) == NULL) + return(-1); + *ipaddr = *(struct in_addr *)he->h_addr_list[0]; + } + return(0); +} + +struct tidx { + ipfw_obj_ntlv *idx; + uint32_t count; + uint32_t size; + uint16_t counter; + uint8_t set; +}; + +static uint16_t +pack_table(struct tidx *tstate, char *name) +{ + int i; + ipfw_obj_ntlv *ntlv; + + if (table_check_name(name) != 0) + return (0); + + for (i = 0; i < tstate->count; i++) { + if (strcmp(tstate->idx[i].name, name) != 0) + continue; + if (tstate->idx[i].set != tstate->set) + continue; + + return (tstate->idx[i].idx); + } + + if (tstate->count + 1 > tstate->size) { + tstate->size += 4; + tstate->idx = realloc(tstate->idx, tstate->size * + sizeof(ipfw_obj_ntlv)); + if (tstate->idx == NULL) + return (0); + } + + ntlv = &tstate->idx[i]; + memset(ntlv, 0, sizeof(ipfw_obj_ntlv)); + strlcpy(ntlv->name, name, sizeof(ntlv->name)); + ntlv->head.type = IPFW_TLV_TBL_NAME; + ntlv->head.length = sizeof(ipfw_obj_ntlv); + ntlv->set = tstate->set; 
+ ntlv->idx = ++tstate->counter; + tstate->count++; + + return (ntlv->idx); +} + +static void +fill_table(ipfw_insn *cmd, char *av, uint8_t opcode, struct tidx *tstate) +{ + uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; + uint16_t uidx; + char *p; + + if ((p = strchr(av + 6, ')')) == NULL) + errx(EX_DATAERR, "forgotten parenthesis: '%s'", av); + *p = '\0'; + p = strchr(av + 6, ','); + if (p) + *p++ = '\0'; + + if ((uidx = pack_table(tstate, av + 6)) == 0) + errx(EX_DATAERR, "Invalid table name: %s", av + 6); + + cmd->opcode = opcode; + cmd->arg1 = uidx; + if (p) { + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + d[0] = strtoul(p, NULL, 0); + } else + cmd->len |= F_INSN_SIZE(ipfw_insn); +} + + +/* + * fills the addr and mask fields in the instruction as appropriate from av. + * Update length as appropriate. + * The following formats are allowed: + * me returns O_IP_*_ME + * 1.2.3.4 single IP address + * 1.2.3.4:5.6.7.8 address:mask + * 1.2.3.4/24 address/mask + * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet + * We can have multiple comma-separated address/mask entries. + */ +static void +fill_ip(ipfw_insn_ip *cmd, char *av, int cblen, struct tidx *tstate) +{ + int len = 0; + uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; + + cmd->o.len &= ~F_LEN_MASK; /* zero len */ + + if (_substrcmp(av, "any") == 0) + return; + + if (_substrcmp(av, "me") == 0) { + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return; + } + + if (strncmp(av, "table(", 6) == 0) { + fill_table(&cmd->o, av, O_IP_DST_LOOKUP, tstate); + return; + } + + while (av) { + /* + * After the address we can have '/' or ':' indicating a mask, + * ',' indicating another address follows, '{' indicating a + * set of addresses of unspecified size. 
+ */ + char *t = NULL, *p = strpbrk(av, "/:,{"); + int masklen; + char md, nd = '\0'; + + CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn) + 2 + len); + + if (p) { + md = *p; + *p++ = '\0'; + if ((t = strpbrk(p, ",{")) != NULL) { + nd = *t; + *t = '\0'; + } + } else + md = '\0'; + + if (lookup_host(av, (struct in_addr *)&d[0]) != 0) + errx(EX_NOHOST, "hostname ``%s'' unknown", av); + switch (md) { + case ':': + if (!inet_aton(p, (struct in_addr *)&d[1])) + errx(EX_DATAERR, "bad netmask ``%s''", p); + break; + case '/': + masklen = atoi(p); + if (masklen == 0) + d[1] = htonl(0); /* mask */ + else if (masklen > 32) + errx(EX_DATAERR, "bad width ``%s''", p); + else + d[1] = htonl(~0 << (32 - masklen)); + break; + case '{': /* no mask, assume /24 and put back the '{' */ + d[1] = htonl(~0 << (32 - 24)); + *(--p) = md; + break; + + case ',': /* single address plus continuation */ + *(--p) = md; + /* FALLTHROUGH */ + case 0: /* initialization value */ + default: + d[1] = htonl(~0); /* force /32 */ + break; + } + d[0] &= d[1]; /* mask base address with mask */ + if (t) + *t = nd; + /* find next separator */ + if (p) + p = strpbrk(p, ",{"); + if (p && *p == '{') { + /* + * We have a set of addresses. They are stored as follows: + * arg1 is the set size (powers of 2, 2..256) + * addr is the base address IN HOST FORMAT + * mask.. is an array of arg1 bits (rounded up to + * the next multiple of 32) with bits set + * for each host in the map. 
+ */ + uint32_t *map = (uint32_t *)&cmd->mask; + int low, high; + int i = contigmask((uint8_t *)&(d[1]), 32); + + if (len > 0) + errx(EX_DATAERR, "address set cannot be in a list"); + if (i < 24 || i > 31) + errx(EX_DATAERR, "invalid set with mask %d\n", i); + cmd->o.arg1 = 1<<(32-i); /* map length */ + d[0] = ntohl(d[0]); /* base addr in host format */ + cmd->o.opcode = O_IP_DST_SET; /* default */ + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; + for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) + map[i] = 0; /* clear map */ + + av = p + 1; + low = d[0] & 0xff; + high = low + cmd->o.arg1 - 1; + /* + * Here, i stores the previous value when we specify a range + * of addresses within a mask, e.g. 45-63. i = -1 means we + * have no previous value. + */ + i = -1; /* previous value in a range */ + while (isdigit(*av)) { + char *s; + int a = strtol(av, &s, 0); + + if (s == av) { /* no parameter */ + if (*av != '}') + errx(EX_DATAERR, "set not closed\n"); + if (i != -1) + errx(EX_DATAERR, "incomplete range %d-", i); + break; + } + if (a < low || a > high) + errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", + a, low, high); + a -= low; + if (i == -1) /* no previous in range */ + i = a; + else { /* check that range is valid */ + if (i > a) + errx(EX_DATAERR, "invalid range %d-%d", + i+low, a+low); + if (*s == '-') + errx(EX_DATAERR, "double '-' in range"); + } + for (; i <= a; i++) + map[i/32] |= 1<<(i & 31); + i = -1; + if (*s == '-') + i = a; + else if (*s == '}') + break; + av = s+1; + } + return; + } + av = p; + if (av) /* then *av must be a ',' */ + av++; + + /* Check this entry */ + if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ + /* + * 'any' turns the entire list into a NOP. + * 'not any' never matches, so it is removed from the + * list unless it is the only item, in which case we + * report an error. 
+ */ + if (cmd->o.len & F_NOT) { /* "not any" never matches */ + if (av == NULL && len == 0) /* only this entry */ + errx(EX_DATAERR, "not any never matches"); + } + /* else do nothing and skip this entry */ + return; + } + /* A single IP can be stored in an optimized format */ + if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); + return; + } + len += 2; /* two words... */ + d += 2; + } /* end while */ + if (len + 1 > F_LEN_MASK) + errx(EX_DATAERR, "address list too long"); + cmd->o.len |= len+1; +} + + +/* n2mask sets n bits of the mask */ +void +n2mask(struct in6_addr *mask, int n) +{ + static int minimask[9] = + { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; + u_char *p; + + memset(mask, 0, sizeof(struct in6_addr)); + p = (u_char *) mask; + for (; n > 0; p++, n -= 8) { + if (n >= 8) + *p = 0xff; + else + *p = minimask[n]; + } + return; +} + +static void +fill_flags_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, + struct _s_x *flags, char *p) +{ + char *e; + uint32_t set = 0, clear = 0; + + if (fill_flags(flags, p, &e, &set, &clear) != 0) + errx(EX_DATAERR, "invalid flag %s", e); + + cmd->opcode = opcode; + cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; + cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); +} + + +void +ipfw_delete(char *av[]) +{ + int i; + int exitval = EX_OK; + int do_set = 0; + ipfw_range_tlv rt; + + av++; + NEED1("missing rule specification"); + memset(&rt, 0, sizeof(rt)); + if ( *av && _substrcmp(*av, "set") == 0) { + /* Do not allow using the following syntax: + * ipfw set N delete set M + */ + if (co.use_set) + errx(EX_DATAERR, "invalid syntax"); + do_set = 1; /* delete set */ + av++; + } + + /* Rule number */ + while (*av && isdigit(**av)) { + i = atoi(*av); av++; + if (co.do_nat) { + exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); + if (exitval) { + exitval = EX_UNAVAILABLE; + warn("rule %u not available", i); + } + } else if (co.do_pipe) { + exitval = ipfw_delete_pipe(co.do_pipe, i); 
+ } else { + if (do_set != 0) { + rt.set = i & 31; + rt.flags = IPFW_RCFLAG_SET; + } else { + rt.start_rule = i & 0xffff; + rt.end_rule = i & 0xffff; + if (rt.start_rule == 0 && rt.end_rule == 0) + rt.flags |= IPFW_RCFLAG_ALL; + else + rt.flags |= IPFW_RCFLAG_RANGE; + if (co.use_set != 0) { + rt.set = co.use_set - 1; + rt.flags |= IPFW_RCFLAG_SET; + } + } + i = do_range_cmd(IP_FW_XDEL, &rt); + if (i != 0) { + exitval = EX_UNAVAILABLE; + warn("rule %u: setsockopt(IP_FW_XDEL)", + rt.start_rule); + } else if (rt.new_set == 0) { + exitval = EX_UNAVAILABLE; + if (rt.start_rule != rt.end_rule) + warnx("no rules rules in %u-%u range", + rt.start_rule, rt.end_rule); + else + warnx("rule %u not found", + rt.start_rule); + } + } + } + if (exitval != EX_OK) + exit(exitval); +} + + +/* + * fill the interface structure. We do not check the name as we can + * create interfaces dynamically, so checking them at insert time + * makes relatively little sense. + * Interface names containing '*', '?', or '[' are assumed to be shell + * patterns which match interfaces. + */ +static void +fill_iface(ipfw_insn_if *cmd, char *arg, int cblen, struct tidx *tstate) +{ + char *p; + uint16_t uidx; + + cmd->name[0] = '\0'; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); + + CHECK_CMDLEN; + + /* Parse the interface or address */ + if (strcmp(arg, "any") == 0) + cmd->o.len = 0; /* effectively ignore this command */ + else if (strncmp(arg, "table(", 6) == 0) { + if ((p = strchr(arg + 6, ')')) == NULL) + errx(EX_DATAERR, "forgotten parenthesis: '%s'", arg); + *p = '\0'; + p = strchr(arg + 6, ','); + if (p) + *p++ = '\0'; + if ((uidx = pack_table(tstate, arg + 6)) == 0) + errx(EX_DATAERR, "Invalid table name: %s", arg + 6); + + cmd->name[0] = '\1'; /* Special value indicating table */ + cmd->p.kidx = uidx; + } else if (!isdigit(*arg)) { + strlcpy(cmd->name, arg, sizeof(cmd->name)); + cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 
1 : 0; + } else if (!inet_aton(arg, &cmd->p.ip)) + errx(EX_DATAERR, "bad ip address ``%s''", arg); +} + +static void +get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) +{ + int i; + size_t l; + char *ap, *ptr, *optr; + struct ether_addr *mac; + const char *macset = "0123456789abcdefABCDEF:"; + + if (strcmp(p, "any") == 0) { + for (i = 0; i < ETHER_ADDR_LEN; i++) + addr[i] = mask[i] = 0; + return; + } + + optr = ptr = strdup(p); + if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { + l = strlen(ap); + if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) + errx(EX_DATAERR, "Incorrect MAC address"); + bcopy(mac, addr, ETHER_ADDR_LEN); + } else + errx(EX_DATAERR, "Incorrect MAC address"); + + if (ptr != NULL) { /* we have mask? */ + if (p[ptr - optr - 1] == '/') { /* mask len */ + long ml = strtol(ptr, &ap, 10); + if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) + errx(EX_DATAERR, "Incorrect mask length"); + for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) + mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); + } else { /* mask */ + l = strlen(ptr); + if (strspn(ptr, macset) != l || + (mac = ether_aton(ptr)) == NULL) + errx(EX_DATAERR, "Incorrect mask"); + bcopy(mac, mask, ETHER_ADDR_LEN); + } + } else { /* default mask: ff:ff:ff:ff:ff:ff */ + for (i = 0; i < ETHER_ADDR_LEN; i++) + mask[i] = 0xff; + } + for (i = 0; i < ETHER_ADDR_LEN; i++) + addr[i] &= mask[i]; + + free(optr); +} + +/* + * helper function, updates the pointer to cmd with the length + * of the current command, and also cleans up the first word of + * the new command in case it has been clobbered before. 
+ */ +static ipfw_insn * +next_cmd(ipfw_insn *cmd, int *len) +{ + *len -= F_LEN(cmd); + CHECK_LENGTH(*len, 0); + cmd += F_LEN(cmd); + bzero(cmd, sizeof(*cmd)); + return cmd; +} + +/* + * Takes arguments and copies them into a comment + */ +static void +fill_comment(ipfw_insn *cmd, char **av, int cblen) +{ + int i, l; + char *p = (char *)(cmd + 1); + + cmd->opcode = O_NOP; + cmd->len = (cmd->len & (F_NOT | F_OR)); + + /* Compute length of comment string. */ + for (i = 0, l = 0; av[i] != NULL; i++) + l += strlen(av[i]) + 1; + if (l == 0) + return; + if (l > 84) + errx(EX_DATAERR, + "comment too long (max 80 chars)"); + l = 1 + (l+3)/4; + cmd->len = (cmd->len & (F_NOT | F_OR)) | l; + CHECK_CMDLEN; + + for (i = 0; av[i] != NULL; i++) { + strcpy(p, av[i]); + p += strlen(av[i]); + *p++ = ' '; + } + *(--p) = '\0'; +} + +/* + * A function to fill simple commands of size 1. + * Existing flags are preserved. + */ +static void +fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) +{ + cmd->opcode = opcode; + cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; + cmd->arg1 = arg; +} + +/* + * Fetch and add the MAC address and type, with masks. This generates one or + * two microinstructions, and returns the pointer to the last one. 
+ */ +static ipfw_insn * +add_mac(ipfw_insn *cmd, char *av[], int cblen) +{ + ipfw_insn_mac *mac; + + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) + errx(EX_DATAERR, "MAC dst src"); + + cmd->opcode = O_MACADDR2; + cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); + CHECK_CMDLEN; + + mac = (ipfw_insn_mac *)cmd; + get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ + get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), + &(mac->mask[ETHER_ADDR_LEN])); /* src */ + return cmd; +} + +static ipfw_insn * +add_mactype(ipfw_insn *cmd, char *av, int cblen) +{ + if (!av) + errx(EX_DATAERR, "missing MAC type"); + if (strcmp(av, "any") != 0) { /* we have a non-null type */ + fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE, + cblen); + cmd->opcode = O_MAC_TYPE; + return cmd; + } else + return NULL; +} + +static ipfw_insn * +add_proto0(ipfw_insn *cmd, char *av, u_char *protop) +{ + struct protoent *pe; + char *ep; + int proto; + + proto = strtol(av, &ep, 10); + if (*ep != '\0' || proto <= 0) { + if ((pe = getprotobyname(av)) == NULL) + return NULL; + proto = pe->p_proto; + } + + fill_cmd(cmd, O_PROTO, 0, proto); + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_proto(ipfw_insn *cmd, char *av, u_char *protop) +{ + u_char proto = IPPROTO_IP; + + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) + ; /* do not set O_IP4 nor O_IP6 */ + else if (strcmp(av, "ip4") == 0) + /* explicit "just IPv4" rule */ + fill_cmd(cmd, O_IP4, 0, 0); + else if (strcmp(av, "ip6") == 0) { + /* explicit "just IPv6" rule */ + proto = IPPROTO_IPV6; + fill_cmd(cmd, O_IP6, 0, 0); + } else + return add_proto0(cmd, av, protop); + + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) +{ + u_char proto = IPPROTO_IP; + + if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) + ; /* do not set O_IP4 nor O_IP6 */ + else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) + /* explicit "just 
IPv4" rule */ + fill_cmd(cmd, O_IP4, 0, 0); + else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { + /* explicit "just IPv6" rule */ + proto = IPPROTO_IPV6; + fill_cmd(cmd, O_IP6, 0, 0); + } else + return add_proto0(cmd, av, protop); + + *protop = proto; + return cmd; +} + +static ipfw_insn * +add_srcip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) +{ + fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); + if (cmd->opcode == O_IP_DST_SET) /* set */ + cmd->opcode = O_IP_SRC_SET; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + cmd->opcode = O_IP_SRC_LOOKUP; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ + cmd->opcode = O_IP_SRC_ME; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ + cmd->opcode = O_IP_SRC; + else /* addr/mask */ + cmd->opcode = O_IP_SRC_MASK; + return cmd; +} + +static ipfw_insn * +add_dstip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) +{ + fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); + if (cmd->opcode == O_IP_DST_SET) /* set */ + ; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + ; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ + cmd->opcode = O_IP_DST_ME; + else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ + cmd->opcode = O_IP_DST; + else /* addr/mask */ + cmd->opcode = O_IP_DST_MASK; + return cmd; +} + +static struct _s_x f_reserved_keywords[] = { + { "altq", TOK_OR }, + { "//", TOK_OR }, + { "diverted", TOK_OR }, + { "dst-port", TOK_OR }, + { "src-port", TOK_OR }, + { "established", TOK_OR }, + { "keep-state", TOK_OR }, + { "frag", TOK_OR }, + { "icmptypes", TOK_OR }, + { "in", TOK_OR }, + { "out", TOK_OR }, + { "ip6", TOK_OR }, + { "any", TOK_OR }, + { "to", TOK_OR }, + { "via", TOK_OR }, + { "{", TOK_OR }, + { NULL, 0 } /* terminator */ +}; + +static ipfw_insn * +add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode, int cblen) +{ + + if (match_token(f_reserved_keywords, av) != -1) + return (NULL); + + if (fill_newports((ipfw_insn_u16 
*)cmd, av, proto, cblen)) { + /* XXX todo: check that we have a protocol with ports */ + cmd->opcode = opcode; + return cmd; + } + return NULL; +} + +static ipfw_insn * +add_src(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) +{ + struct in6_addr a; + char *host, *ch, buf[INET6_ADDRSTRLEN]; + ipfw_insn *ret = NULL; + int len; + + /* Copy first address in set if needed */ + if ((ch = strpbrk(av, "/,")) != NULL) { + len = ch - av; + strlcpy(buf, av, sizeof(buf)); + if (len < sizeof(buf)) + buf[len] = '\0'; + host = buf; + } else + host = av; + + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || + inet_pton(AF_INET6, host, &a) == 1) + ret = add_srcip6(cmd, av, cblen); + /* XXX: should check for IPv4, not !IPv6 */ + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || + inet_pton(AF_INET6, host, &a) != 1)) + ret = add_srcip(cmd, av, cblen, tstate); + if (ret == NULL && strcmp(av, "any") != 0) + ret = cmd; + + return ret; +} + +static ipfw_insn * +add_dst(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) +{ + struct in6_addr a; + char *host, *ch, buf[INET6_ADDRSTRLEN]; + ipfw_insn *ret = NULL; + int len; + + /* Copy first address in set if needed */ + if ((ch = strpbrk(av, "/,")) != NULL) { + len = ch - av; + strlcpy(buf, av, sizeof(buf)); + if (len < sizeof(buf)) + buf[len] = '\0'; + host = buf; + } else + host = av; + + if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || + inet_pton(AF_INET6, host, &a) == 1) + ret = add_dstip6(cmd, av, cblen); + /* XXX: should check for IPv4, not !IPv6 */ + if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || + inet_pton(AF_INET6, host, &a) != 1)) + ret = add_dstip(cmd, av, cblen, tstate); + if (ret == NULL && strcmp(av, "any") != 0) + ret = cmd; + + return ret; +} + +/* + * Parse arguments and assemble the microinstructions which make up a rule. + * Rules are added into the 'rulebuf' and then copied in the correct order + * into the actual rule. 
+ * + * The syntax for a rule starts with the action, followed by + * optional action parameters, and the various match patterns. + * In the assembled microcode, the first opcode must be an O_PROBE_STATE + * (generated if the rule includes a keep-state option), then the + * various match patterns, log/altq actions, and the actual action. + * + */ +void +compile_rule(char *av[], uint32_t *rbuf, int *rbufsize, struct tidx *tstate) +{ + /* + * rules are added into the 'rulebuf' and then copied in + * the correct order into the actual rule. + * Some things that need to go out of order (prob, action etc.) + * go into actbuf[]. + */ + static uint32_t actbuf[255], cmdbuf[255]; + int rblen, ablen, cblen; + + ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; + ipfw_insn *first_cmd; /* first match pattern */ + + struct ip_fw_rule *rule; + + /* + * various flags used to record that we entered some fields. + */ + ipfw_insn *have_state = NULL; /* check-state or keep-state */ + ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; + size_t len; + + int i; + + int open_par = 0; /* open parenthesis ( */ + + /* proto is here because it is used to fetch ports */ + u_char proto = IPPROTO_IP; /* default protocol */ + + double match_prob = 1; /* match probability, default is always match */ + + bzero(actbuf, sizeof(actbuf)); /* actions go here */ + bzero(cmdbuf, sizeof(cmdbuf)); + bzero(rbuf, *rbufsize); + + rule = (struct ip_fw_rule *)rbuf; + cmd = (ipfw_insn *)cmdbuf; + action = (ipfw_insn *)actbuf; + + rblen = *rbufsize / sizeof(uint32_t); + rblen -= sizeof(struct ip_fw_rule) / sizeof(uint32_t); + ablen = sizeof(actbuf) / sizeof(actbuf[0]); + cblen = sizeof(cmdbuf) / sizeof(cmdbuf[0]); + cblen -= F_INSN_SIZE(ipfw_insn_u32) + 1; + +#define CHECK_RBUFLEN(len) { CHECK_LENGTH(rblen, len); rblen -= len; } +#define CHECK_ACTLEN CHECK_LENGTH(ablen, action->len) + + av++; + + /* [rule N] -- Rule number optional */ + if (av[0] && isdigit(**av)) { + rule->rulenum = atoi(*av); + 
av++; + } + + /* [set N] -- set number (0..RESVD_SET), optional */ + if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { + int set = strtoul(av[1], NULL, 10); + if (set < 0 || set > RESVD_SET) + errx(EX_DATAERR, "illegal set %s", av[1]); + rule->set = set; + tstate->set = set; + av += 2; + } + + /* [prob D] -- match probability, optional */ + if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { + match_prob = strtod(av[1], NULL); + + if (match_prob <= 0 || match_prob > 1) + errx(EX_DATAERR, "illegal match prob. %s", av[1]); + av += 2; + } + + /* action -- mandatory */ + NEED1("missing action"); + i = match_token(rule_actions, *av); + av++; + action->len = 1; /* default */ + CHECK_ACTLEN; + switch(i) { + case TOK_CHECKSTATE: + have_state = action; + action->opcode = O_CHECK_STATE; + break; + + case TOK_ACCEPT: + action->opcode = O_ACCEPT; + break; + + case TOK_DENY: + action->opcode = O_DENY; + action->arg1 = 0; + break; + + case TOK_REJECT: + action->opcode = O_REJECT; + action->arg1 = ICMP_UNREACH_HOST; + break; + + case TOK_RESET: + action->opcode = O_REJECT; + action->arg1 = ICMP_REJECT_RST; + break; + + case TOK_RESET6: + action->opcode = O_UNREACH6; + action->arg1 = ICMP6_UNREACH_RST; + break; + + case TOK_UNREACH: + action->opcode = O_REJECT; + NEED1("missing reject code"); + fill_reject_code(&action->arg1, *av); + av++; + break; + + case TOK_UNREACH6: + action->opcode = O_UNREACH6; + NEED1("missing unreach code"); + fill_unreach6_code(&action->arg1, *av); + av++; + break; + + case TOK_COUNT: + action->opcode = O_COUNT; + break; + + case TOK_NAT: + action->opcode = O_NAT; + action->len = F_INSN_SIZE(ipfw_insn_nat); + CHECK_ACTLEN; + if (_substrcmp(*av, "global") == 0) { + action->arg1 = 0; + av++; + break; + } else + goto chkarg; + + case TOK_QUEUE: + action->opcode = O_QUEUE; + goto chkarg; + case TOK_PIPE: + action->opcode = O_PIPE; + goto chkarg; + case TOK_SKIPTO: + action->opcode = O_SKIPTO; + goto chkarg; + case TOK_NETGRAPH: + action->opcode = 
O_NETGRAPH; + goto chkarg; + case TOK_NGTEE: + action->opcode = O_NGTEE; + goto chkarg; + case TOK_DIVERT: + action->opcode = O_DIVERT; + goto chkarg; + case TOK_TEE: + action->opcode = O_TEE; + goto chkarg; + case TOK_CALL: + action->opcode = O_CALLRETURN; +chkarg: + if (!av[0]) + errx(EX_USAGE, "missing argument for %s", *(av - 1)); + if (isdigit(**av)) { + action->arg1 = strtoul(*av, NULL, 10); + if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG) + errx(EX_DATAERR, "illegal argument for %s", + *(av - 1)); + } else if (_substrcmp(*av, "tablearg") == 0) { + action->arg1 = IP_FW_TARG; + } else if (i == TOK_DIVERT || i == TOK_TEE) { + struct servent *s; + setservent(1); + s = getservbyname(av[0], "divert"); + if (s != NULL) + action->arg1 = ntohs(s->s_port); + else + errx(EX_DATAERR, "illegal divert/tee port"); + } else + errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); + av++; + break; + + case TOK_FORWARD: { + /* + * Locate the address-port separator (':' or ','). + * Could be one of the following: + * hostname:port + * IPv4 a.b.c.d,port + * IPv4 a.b.c.d:port + * IPv6 w:x:y::z,port + * The ':' can only be used with hostname and IPv4 address. + * XXX-BZ Should we also support [w:x:y::z]:port? + */ + struct sockaddr_storage result; + struct addrinfo *res; + char *s, *end; + int family; + u_short port_number; + + NEED1("missing forward address[:port]"); + + /* + * locate the address-port separator (':' or ',') + */ + s = strchr(*av, ','); + if (s == NULL) { + /* Distinguish between IPv4:port and IPv6 cases. */ + s = strchr(*av, ':'); + if (s && strchr(s+1, ':')) + s = NULL; /* no port */ + } + + port_number = 0; + if (s != NULL) { + /* Terminate host portion and set s to start of port. 
*/ + *(s++) = '\0'; + i = strtoport(s, &end, 0 /* base */, 0 /* proto */); + if (s == end) + errx(EX_DATAERR, + "illegal forwarding port ``%s''", s); + port_number = (u_short)i; + } + + if (_substrcmp(*av, "tablearg") == 0) { + family = PF_INET; + ((struct sockaddr_in*)&result)->sin_addr.s_addr = + INADDR_ANY; + } else { + /* + * Resolve the host name or address to a family and a + * network representation of the address. + */ + if (getaddrinfo(*av, NULL, NULL, &res)) + errx(EX_DATAERR, NULL); + /* Just use the first host in the answer. */ + family = res->ai_family; + memcpy(&result, res->ai_addr, res->ai_addrlen); + freeaddrinfo(res); + } + + if (family == PF_INET) { + ipfw_insn_sa *p = (ipfw_insn_sa *)action; + + action->opcode = O_FORWARD_IP; + action->len = F_INSN_SIZE(ipfw_insn_sa); + CHECK_ACTLEN; + + /* + * In the kernel we assume AF_INET and use only + * sin_port and sin_addr. Remember to set sin_len as + * the routing code seems to use it too. + */ + p->sa.sin_len = sizeof(struct sockaddr_in); + p->sa.sin_family = AF_INET; + p->sa.sin_port = port_number; + p->sa.sin_addr.s_addr = + ((struct sockaddr_in *)&result)->sin_addr.s_addr; + } else if (family == PF_INET6) { + ipfw_insn_sa6 *p = (ipfw_insn_sa6 *)action; + + action->opcode = O_FORWARD_IP6; + action->len = F_INSN_SIZE(ipfw_insn_sa6); + CHECK_ACTLEN; + + p->sa.sin6_len = sizeof(struct sockaddr_in6); + p->sa.sin6_family = AF_INET6; + p->sa.sin6_port = port_number; + p->sa.sin6_flowinfo = 0; + p->sa.sin6_scope_id = 0; + /* No table support for v6 yet. */ + bcopy(&((struct sockaddr_in6*)&result)->sin6_addr, + &p->sa.sin6_addr, sizeof(p->sa.sin6_addr)); + } else { + errx(EX_DATAERR, "Invalid address family in forward action"); + } + av++; + break; + } + case TOK_COMMENT: + /* pretend it is a 'count' rule followed by the comment */ + action->opcode = O_COUNT; + av--; /* go back... 
*/ + break; + + case TOK_SETFIB: + { + int numfibs; + size_t intsize = sizeof(int); + + action->opcode = O_SETFIB; + NEED1("missing fib number"); + if (_substrcmp(*av, "tablearg") == 0) { + action->arg1 = IP_FW_TARG; + } else { + action->arg1 = strtoul(*av, NULL, 10); + if (sysctlbyname("net.fibs", &numfibs, &intsize, + NULL, 0) == -1) + errx(EX_DATAERR, "fibs not suported.\n"); + if (action->arg1 >= numfibs) /* Temporary */ + errx(EX_DATAERR, "fib too large.\n"); + /* Add high-order bit to fib to make room for tablearg*/ + action->arg1 |= 0x8000; + } + av++; + break; + } + + case TOK_SETDSCP: + { + int code; + + action->opcode = O_SETDSCP; + NEED1("missing DSCP code"); + if (_substrcmp(*av, "tablearg") == 0) { + action->arg1 = IP_FW_TARG; + } else if (isalpha(*av[0])) { + if ((code = match_token(f_ipdscp, *av)) == -1) + errx(EX_DATAERR, "Unknown DSCP code"); + action->arg1 = code; + } else + action->arg1 = strtoul(*av, NULL, 10); + /* Add high-order bit to DSCP to make room for tablearg */ + if (action->arg1 != IP_FW_TARG) + action->arg1 |= 0x8000; + av++; + break; + } + + case TOK_REASS: + action->opcode = O_REASS; + break; + + case TOK_RETURN: + fill_cmd(action, O_CALLRETURN, F_NOT, 0); + break; + + default: + errx(EX_DATAERR, "invalid action %s\n", av[-1]); + } + action = next_cmd(action, &ablen); + + /* + * [altq queuename] -- altq tag, optional + * [log [logamount N]] -- log, optional + * + * If they exist, it go first in the cmdbuf, but then it is + * skipped in the copy section to the end of the buffer. 
+ */ + while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { + av++; + switch (i) { + case TOK_LOG: + { + ipfw_insn_log *c = (ipfw_insn_log *)cmd; + int l; + + if (have_log) + errx(EX_DATAERR, + "log cannot be specified more than once"); + have_log = (ipfw_insn *)c; + cmd->len = F_INSN_SIZE(ipfw_insn_log); + CHECK_CMDLEN; + cmd->opcode = O_LOG; + if (av[0] && _substrcmp(*av, "logamount") == 0) { + av++; + NEED1("logamount requires argument"); + l = atoi(*av); + if (l < 0) + errx(EX_DATAERR, + "logamount must be positive"); + c->max_log = l; + av++; + } else { + len = sizeof(c->max_log); + if (sysctlbyname("net.inet.ip.fw.verbose_limit", + &c->max_log, &len, NULL, 0) == -1) { + if (co.test_only) { + c->max_log = 0; + break; + } + errx(1, "sysctlbyname(\"%s\")", + "net.inet.ip.fw.verbose_limit"); + } + } + } + break; + +#ifndef NO_ALTQ + case TOK_ALTQ: + { + ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; + + NEED1("missing altq queue name"); + if (have_altq) + errx(EX_DATAERR, + "altq cannot be specified more than once"); + have_altq = (ipfw_insn *)a; + cmd->len = F_INSN_SIZE(ipfw_insn_altq); + CHECK_CMDLEN; + cmd->opcode = O_ALTQ; + a->qid = altq_name_to_qid(*av); + av++; + } + break; +#endif + + case TOK_TAG: + case TOK_UNTAG: { + uint16_t tag; + + if (have_tag) + errx(EX_USAGE, "tag and untag cannot be " + "specified more than once"); + GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, + rule_action_params); + have_tag = cmd; + fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 
0: F_NOT, tag); + av++; + break; + } + + default: + abort(); + } + cmd = next_cmd(cmd, &cblen); + } + + if (have_state) /* must be a check-state, we are done */ + goto done; + +#define OR_START(target) \ + if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ + if (open_par) \ + errx(EX_USAGE, "nested \"(\" not allowed\n"); \ + prev = NULL; \ + open_par = 1; \ + if ( (av[0])[1] == '\0') { \ + av++; \ + } else \ + (*av)++; \ + } \ + target: \ + + +#define CLOSE_PAR \ + if (open_par) { \ + if (av[0] && ( \ + strcmp(*av, ")") == 0 || \ + strcmp(*av, "}") == 0)) { \ + prev = NULL; \ + open_par = 0; \ + av++; \ + } else \ + errx(EX_USAGE, "missing \")\"\n"); \ + } + +#define NOT_BLOCK \ + if (av[0] && _substrcmp(*av, "not") == 0) { \ + if (cmd->len & F_NOT) \ + errx(EX_USAGE, "double \"not\" not allowed\n"); \ + cmd->len |= F_NOT; \ + av++; \ + } + +#define OR_BLOCK(target) \ + if (av[0] && _substrcmp(*av, "or") == 0) { \ + if (prev == NULL || open_par == 0) \ + errx(EX_DATAERR, "invalid OR block"); \ + prev->len |= F_OR; \ + av++; \ + goto target; \ + } \ + CLOSE_PAR; + + first_cmd = cmd; + +#if 0 + /* + * MAC addresses, optional. + * If we have this, we skip the part "proto from src to dst" + * and jump straight to the option parsing. 
+ */ + NOT_BLOCK; + NEED1("missing protocol"); + if (_substrcmp(*av, "MAC") == 0 || + _substrcmp(*av, "mac") == 0) { + av++; /* the "MAC" keyword */ + add_mac(cmd, av); /* exits in case of errors */ + cmd = next_cmd(cmd); + av += 2; /* dst-mac and src-mac */ + NOT_BLOCK; + NEED1("missing mac type"); + if (add_mactype(cmd, av[0])) + cmd = next_cmd(cmd); + av++; /* any or mac-type */ + goto read_options; + } +#endif + + /* + * protocol, mandatory + */ + OR_START(get_proto); + NOT_BLOCK; + NEED1("missing protocol"); + if (add_proto_compat(cmd, *av, &proto)) { + av++; + if (F_LEN(cmd) != 0) { + prev = cmd; + cmd = next_cmd(cmd, &cblen); + } + } else if (first_cmd != cmd) { + errx(EX_DATAERR, "invalid protocol ``%s''", *av); + } else + goto read_options; + OR_BLOCK(get_proto); + + /* + * "from", mandatory + */ + if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) + errx(EX_USAGE, "missing ``from''"); + av++; + + /* + * source IP, mandatory + */ + OR_START(source_ip); + NOT_BLOCK; /* optional "not" */ + NEED1("missing source address"); + if (add_src(cmd, *av, proto, cblen, tstate)) { + av++; + if (F_LEN(cmd) != 0) { /* ! any */ + prev = cmd; + cmd = next_cmd(cmd, &cblen); + } + } else + errx(EX_USAGE, "bad source address %s", *av); + OR_BLOCK(source_ip); + + /* + * source ports, optional + */ + NOT_BLOCK; /* optional "not" */ + if ( av[0] != NULL ) { + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { + av++; + if (F_LEN(cmd) != 0) + cmd = next_cmd(cmd, &cblen); + } + } + + /* + * "to", mandatory + */ + if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) + errx(EX_USAGE, "missing ``to''"); + av++; + + /* + * destination, mandatory + */ + OR_START(dest_ip); + NOT_BLOCK; /* optional "not" */ + NEED1("missing dst address"); + if (add_dst(cmd, *av, proto, cblen, tstate)) { + av++; + if (F_LEN(cmd) != 0) { /* ! 
any */ + prev = cmd; + cmd = next_cmd(cmd, &cblen); + } + } else + errx( EX_USAGE, "bad destination address %s", *av); + OR_BLOCK(dest_ip); + + /* + * dest. ports, optional + */ + NOT_BLOCK; /* optional "not" */ + if (av[0]) { + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { + av++; + if (F_LEN(cmd) != 0) + cmd = next_cmd(cmd, &cblen); + } + } + +read_options: + if (av[0] && first_cmd == cmd) { + /* + * nothing specified so far, store in the rule to ease + * printout later. + */ + rule->flags |= IPFW_RULE_NOOPT; + } + prev = NULL; + while ( av[0] != NULL ) { + char *s; + ipfw_insn_u32 *cmd32; /* alias for cmd */ + + s = *av; + cmd32 = (ipfw_insn_u32 *)cmd; + + if (*s == '!') { /* alternate syntax for NOT */ + if (cmd->len & F_NOT) + errx(EX_USAGE, "double \"not\" not allowed\n"); + cmd->len = F_NOT; + s++; + } + i = match_token(rule_options, s); + av++; + switch(i) { + case TOK_NOT: + if (cmd->len & F_NOT) + errx(EX_USAGE, "double \"not\" not allowed\n"); + cmd->len = F_NOT; + break; + + case TOK_OR: + if (open_par == 0 || prev == NULL) + errx(EX_USAGE, "invalid \"or\" block\n"); + prev->len |= F_OR; + break; + + case TOK_STARTBRACE: + if (open_par) + errx(EX_USAGE, "+nested \"(\" not allowed\n"); + open_par = 1; + break; + + case TOK_ENDBRACE: + if (!open_par) + errx(EX_USAGE, "+missing \")\"\n"); + open_par = 0; + prev = NULL; + break; + + case TOK_IN: + fill_cmd(cmd, O_IN, 0, 0); + break; + + case TOK_OUT: + cmd->len ^= F_NOT; /* toggle F_NOT */ + fill_cmd(cmd, O_IN, 0, 0); + break; + + case TOK_DIVERTED: + fill_cmd(cmd, O_DIVERTED, 0, 3); + break; + + case TOK_DIVERTEDLOOPBACK: + fill_cmd(cmd, O_DIVERTED, 0, 1); + break; + + case TOK_DIVERTEDOUTPUT: + fill_cmd(cmd, O_DIVERTED, 0, 2); + break; + + case TOK_FRAG: + fill_cmd(cmd, O_FRAG, 0, 0); + break; + + case TOK_LAYER2: + fill_cmd(cmd, O_LAYER2, 0, 0); + break; + + case TOK_XMIT: + case TOK_RECV: + case TOK_VIA: + NEED1("recv, xmit, via require interface name" + " or 
address"); + fill_iface((ipfw_insn_if *)cmd, av[0], cblen, tstate); + av++; + if (F_LEN(cmd) == 0) /* not a valid address */ + break; + if (i == TOK_XMIT) + cmd->opcode = O_XMIT; + else if (i == TOK_RECV) + cmd->opcode = O_RECV; + else if (i == TOK_VIA) + cmd->opcode = O_VIA; + break; + + case TOK_ICMPTYPES: + NEED1("icmptypes requires list of types"); + fill_icmptypes((ipfw_insn_u32 *)cmd, *av); + av++; + break; + + case TOK_ICMP6TYPES: + NEED1("icmptypes requires list of types"); + fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av, cblen); + av++; + break; + + case TOK_IPTTL: + NEED1("ipttl requires TTL"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_IPTTL, cblen)) + errx(EX_DATAERR, "invalid ipttl %s", *av); + } else + fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPID: + NEED1("ipid requires id"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_IPID, cblen)) + errx(EX_DATAERR, "invalid ipid %s", *av); + } else + fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPLEN: + NEED1("iplen requires length"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_IPLEN, cblen)) + errx(EX_DATAERR, "invalid ip len %s", *av); + } else + fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPVER: + NEED1("ipver requires version"); + fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_IPPRECEDENCE: + NEED1("ipprecedence requires value"); + fill_cmd(cmd, O_IPPRECEDENCE, 0, + (strtoul(*av, NULL, 0) & 7) << 5); + av++; + break; + + case TOK_DSCP: + NEED1("missing DSCP code"); + fill_dscp(cmd, *av, cblen); + av++; + break; + + case TOK_IPOPTS: + NEED1("missing argument for ipoptions"); + fill_flags_cmd(cmd, O_IPOPT, f_ipopts, *av); + av++; + break; + + case TOK_IPTOS: + NEED1("missing argument for iptos"); + fill_flags_cmd(cmd, O_IPTOS, f_iptos, *av); + av++; + break; + + case TOK_UID: + NEED1("uid requires argument"); + { + char *end; 
+ uid_t uid; + struct passwd *pwd; + + cmd->opcode = O_UID; + uid = strtoul(*av, &end, 0); + pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av); + if (pwd == NULL) + errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); + cmd32->d[0] = pwd->pw_uid; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + av++; + } + break; + + case TOK_GID: + NEED1("gid requires argument"); + { + char *end; + gid_t gid; + struct group *grp; + + cmd->opcode = O_GID; + gid = strtoul(*av, &end, 0); + grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av); + if (grp == NULL) + errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); + cmd32->d[0] = grp->gr_gid; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + av++; + } + break; + + case TOK_JAIL: + NEED1("jail requires argument"); + { + char *end; + int jid; + + cmd->opcode = O_JAIL; + jid = (int)strtol(*av, &end, 0); + if (jid < 0 || *end != '\0') + errx(EX_DATAERR, "jail requires prison ID"); + cmd32->d[0] = (uint32_t)jid; + cmd->len |= F_INSN_SIZE(ipfw_insn_u32); + av++; + } + break; + + case TOK_ESTAB: + fill_cmd(cmd, O_ESTAB, 0, 0); + break; + + case TOK_SETUP: + fill_cmd(cmd, O_TCPFLAGS, 0, + (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); + break; + + case TOK_TCPDATALEN: + NEED1("tcpdatalen requires length"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_TCPDATALEN, cblen)) + errx(EX_DATAERR, "invalid tcpdata len %s", *av); + } else + fill_cmd(cmd, O_TCPDATALEN, 0, + strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_TCPOPTS: + NEED1("missing argument for tcpoptions"); + fill_flags_cmd(cmd, O_TCPOPTS, f_tcpopts, *av); + av++; + break; + + case TOK_TCPSEQ: + case TOK_TCPACK: + NEED1("tcpseq/tcpack requires argument"); + cmd->len = F_INSN_SIZE(ipfw_insn_u32); + cmd->opcode = (i == TOK_TCPSEQ) ? 
O_TCPSEQ : O_TCPACK; + cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_TCPWIN: + NEED1("tcpwin requires length"); + if (strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_TCPWIN, cblen)) + errx(EX_DATAERR, "invalid tcpwin len %s", *av); + } else + fill_cmd(cmd, O_TCPWIN, 0, + strtoul(*av, NULL, 0)); + av++; + break; + + case TOK_TCPFLAGS: + NEED1("missing argument for tcpflags"); + cmd->opcode = O_TCPFLAGS; + fill_flags_cmd(cmd, O_TCPFLAGS, f_tcpflags, *av); + av++; + break; + + case TOK_KEEPSTATE: + if (open_par) + errx(EX_USAGE, "keep-state cannot be part " + "of an or block"); + if (have_state) + errx(EX_USAGE, "only one of keep-state " + "and limit is allowed"); + have_state = cmd; + fill_cmd(cmd, O_KEEP_STATE, 0, 0); + break; + + case TOK_LIMIT: { + ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + int val; + + if (open_par) + errx(EX_USAGE, + "limit cannot be part of an or block"); + if (have_state) + errx(EX_USAGE, "only one of keep-state and " + "limit is allowed"); + have_state = cmd; + + cmd->len = F_INSN_SIZE(ipfw_insn_limit); + CHECK_CMDLEN; + cmd->opcode = O_LIMIT; + c->limit_mask = c->conn_limit = 0; + + while ( av[0] != NULL ) { + if ((val = match_token(limit_masks, *av)) <= 0) + break; + c->limit_mask |= val; + av++; + } + + if (c->limit_mask == 0) + errx(EX_USAGE, "limit: missing limit mask"); + + GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, + TOK_LIMIT, rule_options); + + av++; + break; + } + + case TOK_PROTO: + NEED1("missing protocol"); + if (add_proto(cmd, *av, &proto)) { + av++; + } else + errx(EX_DATAERR, "invalid protocol ``%s''", + *av); + break; + + case TOK_SRCIP: + NEED1("missing source IP"); + if (add_srcip(cmd, *av, cblen, tstate)) { + av++; + } + break; + + case TOK_DSTIP: + NEED1("missing destination IP"); + if (add_dstip(cmd, *av, cblen, tstate)) { + av++; + } + break; + + case TOK_SRCIP6: + NEED1("missing source IP6"); + if (add_srcip6(cmd, *av, cblen)) { + av++; + } + break; + + case 
TOK_DSTIP6: + NEED1("missing destination IP6"); + if (add_dstip6(cmd, *av, cblen)) { + av++; + } + break; + + case TOK_SRCPORT: + NEED1("missing source port"); + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { + av++; + } else + errx(EX_DATAERR, "invalid source port %s", *av); + break; + + case TOK_DSTPORT: + NEED1("missing destination port"); + if (_substrcmp(*av, "any") == 0 || + add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { + av++; + } else + errx(EX_DATAERR, "invalid destination port %s", + *av); + break; + + case TOK_MAC: + if (add_mac(cmd, av, cblen)) + av += 2; + break; + + case TOK_MACTYPE: + NEED1("missing mac type"); + if (!add_mactype(cmd, *av, cblen)) + errx(EX_DATAERR, "invalid mac type %s", *av); + av++; + break; + + case TOK_VERREVPATH: + fill_cmd(cmd, O_VERREVPATH, 0, 0); + break; + + case TOK_VERSRCREACH: + fill_cmd(cmd, O_VERSRCREACH, 0, 0); + break; + + case TOK_ANTISPOOF: + fill_cmd(cmd, O_ANTISPOOF, 0, 0); + break; + + case TOK_IPSEC: + fill_cmd(cmd, O_IPSEC, 0, 0); + break; + + case TOK_IPV6: + fill_cmd(cmd, O_IP6, 0, 0); + break; + + case TOK_IPV4: + fill_cmd(cmd, O_IP4, 0, 0); + break; + + case TOK_EXT6HDR: + fill_ext6hdr( cmd, *av ); + av++; + break; + + case TOK_FLOWID: + if (proto != IPPROTO_IPV6 ) + errx( EX_USAGE, "flow-id filter is active " + "only for ipv6 protocol\n"); + fill_flow6( (ipfw_insn_u32 *) cmd, *av, cblen); + av++; + break; + + case TOK_COMMENT: + fill_comment(cmd, av, cblen); + av[0]=NULL; + break; + + case TOK_TAGGED: + if (av[0] && strpbrk(*av, "-,")) { + if (!add_ports(cmd, *av, 0, O_TAGGED, cblen)) + errx(EX_DATAERR, "tagged: invalid tag" + " list: %s", *av); + } + else { + uint16_t tag; + + GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, + TOK_TAGGED, rule_options); + fill_cmd(cmd, O_TAGGED, 0, tag); + } + av++; + break; + + case TOK_FIB: + NEED1("fib requires fib number"); + fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); + av++; + break; + case TOK_SOCKARG: + fill_cmd(cmd, 
O_SOCKARG, 0, 0); + break; + + case TOK_LOOKUP: { + ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; + int j; + + if (!av[0] || !av[1]) + errx(EX_USAGE, "format: lookup argument tablenum"); + cmd->opcode = O_IP_DST_LOOKUP; + cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; + i = match_token(rule_options, *av); + for (j = 0; lookup_key[j] >= 0 ; j++) { + if (i == lookup_key[j]) + break; + } + if (lookup_key[j] <= 0) + errx(EX_USAGE, "format: cannot lookup on %s", *av); + __PAST_END(c->d, 1) = j; // i converted to option + av++; + + if ((j = pack_table(tstate, *av)) == 0) + errx(EX_DATAERR, "Invalid table name: %s", *av); + + cmd->arg1 = j; + av++; + } + break; + case TOK_FLOW: + NEED1("missing table name"); + if (strncmp(*av, "table(", 6) != 0) + errx(EX_DATAERR, + "enclose table name into \"table()\""); + fill_table(cmd, *av, O_IP_FLOW_LOOKUP, tstate); + av++; + break; + + default: + errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); + } + if (F_LEN(cmd) > 0) { /* prepare to advance */ + prev = cmd; + cmd = next_cmd(cmd, &cblen); + } + } + +done: + /* + * Now copy stuff into the rule. + * If we have a keep-state option, the first instruction + * must be a PROBE_STATE (which is generated here). + * If we have a LOG option, it was stored as the first command, + * and now must be moved to the top of the action part. + */ + dst = (ipfw_insn *)rule->cmd; + + /* + * First thing to write into the command stream is the match probability. 
+ */ + if (match_prob != 1) { /* 1 means always match */ + dst->opcode = O_PROB; + dst->len = 2; + *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); + dst += dst->len; + } + + /* + * generate O_PROBE_STATE if necessary + */ + if (have_state && have_state->opcode != O_CHECK_STATE) { + fill_cmd(dst, O_PROBE_STATE, 0, 0); + dst = next_cmd(dst, &rblen); + } + + /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */ + for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { + i = F_LEN(src); + CHECK_RBUFLEN(i); + + switch (src->opcode) { + case O_LOG: + case O_KEEP_STATE: + case O_LIMIT: + case O_ALTQ: + case O_TAG: + break; + default: + bcopy(src, dst, i * sizeof(uint32_t)); + dst += i; + } + } + + /* + * put back the have_state command as last opcode + */ + if (have_state && have_state->opcode != O_CHECK_STATE) { + i = F_LEN(have_state); + CHECK_RBUFLEN(i); + bcopy(have_state, dst, i * sizeof(uint32_t)); + dst += i; + } + /* + * start action section + */ + rule->act_ofs = dst - rule->cmd; + + /* put back O_LOG, O_ALTQ, O_TAG if necessary */ + if (have_log) { + i = F_LEN(have_log); + CHECK_RBUFLEN(i); + bcopy(have_log, dst, i * sizeof(uint32_t)); + dst += i; + } + if (have_altq) { + i = F_LEN(have_altq); + CHECK_RBUFLEN(i); + bcopy(have_altq, dst, i * sizeof(uint32_t)); + dst += i; + } + if (have_tag) { + i = F_LEN(have_tag); + CHECK_RBUFLEN(i); + bcopy(have_tag, dst, i * sizeof(uint32_t)); + dst += i; + } + + /* + * copy all other actions + */ + for (src = (ipfw_insn *)actbuf; src != action; src += i) { + i = F_LEN(src); + CHECK_RBUFLEN(i); + bcopy(src, dst, i * sizeof(uint32_t)); + dst += i; + } + + rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd); + *rbufsize = (char *)dst - (char *)rule; +} + +/* + * Adds one or more rules to ipfw chain. 
+ * Data layout: + * Request: + * [ + * ip_fw3_opheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] (*2) (*3) + * ] + * Reply: + * [ + * ip_fw3_opheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] + * ] + * + * Rules in reply are modified to store their actual ruleset number. + * + * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending + * accoring to their idx field and there has to be no duplicates. + * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. + * (*3) Each ip_fw structure needs to be aligned to u64 boundary. + */ +void +ipfw_add(char *av[]) +{ + uint32_t rulebuf[1024]; + int rbufsize, default_off, tlen, rlen; + size_t sz; + struct tidx ts; + struct ip_fw_rule *rule; + caddr_t tbuf; + ip_fw3_opheader *op3; + ipfw_obj_ctlv *ctlv, *tstate; + + rbufsize = sizeof(rulebuf); + memset(rulebuf, 0, rbufsize); + memset(&ts, 0, sizeof(ts)); + + /* Optimize case with no tables */ + default_off = sizeof(ipfw_obj_ctlv) + sizeof(ip_fw3_opheader); + op3 = (ip_fw3_opheader *)rulebuf; + ctlv = (ipfw_obj_ctlv *)(op3 + 1); + rule = (struct ip_fw_rule *)(ctlv + 1); + rbufsize -= default_off; + + compile_rule(av, (uint32_t *)rule, &rbufsize, &ts); + /* Align rule size to u64 boundary */ + rlen = roundup2(rbufsize, sizeof(uint64_t)); + + tbuf = NULL; + sz = 0; + tstate = NULL; + if (ts.count != 0) { + /* Some tables. 
We have to alloc more data */ + tlen = ts.count * sizeof(ipfw_obj_ntlv); + sz = default_off + sizeof(ipfw_obj_ctlv) + tlen + rlen; + + if ((tbuf = calloc(1, sz)) == NULL) + err(EX_UNAVAILABLE, "malloc() failed for IP_FW_ADD"); + op3 = (ip_fw3_opheader *)tbuf; + /* Tables first */ + ctlv = (ipfw_obj_ctlv *)(op3 + 1); + ctlv->head.type = IPFW_TLV_TBLNAME_LIST; + ctlv->head.length = sizeof(ipfw_obj_ctlv) + tlen; + ctlv->count = ts.count; + ctlv->objsize = sizeof(ipfw_obj_ntlv); + memcpy(ctlv + 1, ts.idx, tlen); + table_sort_ctlv(ctlv); + tstate = ctlv; + /* Rule next */ + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + ctlv->head.type = IPFW_TLV_RULE_LIST; + ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; + ctlv->count = 1; + memcpy(ctlv + 1, rule, rbufsize); + } else { + /* Simply add header */ + sz = rlen + default_off; + memset(ctlv, 0, sizeof(*ctlv)); + ctlv->head.type = IPFW_TLV_RULE_LIST; + ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; + ctlv->count = 1; + } + + if (do_get3(IP_FW_XADD, op3, &sz) != 0) + err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_XADD"); + + if (!co.do_quiet) { + struct format_opts sfo; + struct buf_pr bp; + memset(&sfo, 0, sizeof(sfo)); + sfo.tstate = tstate; + sfo.set_mask = (uint32_t)(-1); + bp_alloc(&bp, 4096); + show_static_rule(&co, &sfo, &bp, rule, NULL); + printf("%s", bp.buf); + bp_free(&bp); + } + + if (tbuf != NULL) + free(tbuf); + + if (ts.idx != NULL) + free(ts.idx); +} + +/* + * clear the counters or the log counters. + * optname has the following values: + * 0 (zero both counters and logging) + * 1 (zero logging only) + */ +void +ipfw_zero(int ac, char *av[], int optname) +{ + ipfw_range_tlv rt; + uint32_t arg; + int failed = EX_OK; + char const *errstr; + char const *name = optname ? "RESETLOG" : "ZERO"; + + optname = optname ? 
IP_FW_XRESETLOG : IP_FW_XZERO; + memset(&rt, 0, sizeof(rt)); + + av++; ac--; + + if (ac == 0) { + /* clear all entries */ + rt.flags = IPFW_RCFLAG_ALL; + if (do_range_cmd(optname, &rt) < 0) + err(EX_UNAVAILABLE, "setsockopt(IP_FW_X%s)", name); + if (!co.do_quiet) + printf("%s.\n", optname == IP_FW_XZERO ? + "Accounting cleared":"Logging counts reset"); + + return; + } + + while (ac) { + /* Rule number */ + if (isdigit(**av)) { + arg = strtonum(*av, 0, 0xffff, &errstr); + if (errstr) + errx(EX_DATAERR, + "invalid rule number %s\n", *av); + rt.start_rule = arg; + rt.end_rule = arg; + rt.flags |= IPFW_RCFLAG_RANGE; + if (co.use_set != 0) { + rt.set = co.use_set - 1; + rt.flags |= IPFW_RCFLAG_SET; + } + if (do_range_cmd(optname, &rt) != 0) { + warn("rule %u: setsockopt(IP_FW_X%s)", + arg, name); + failed = EX_UNAVAILABLE; + } else if (rt.new_set == 0) { + printf("Entry %d not found\n", arg); + failed = EX_UNAVAILABLE; + } else if (!co.do_quiet) + printf("Entry %d %s.\n", arg, + optname == IP_FW_XZERO ? + "cleared" : "logging count reset"); + } else { + errx(EX_USAGE, "invalid rule number ``%s''", *av); + } + av++; ac--; + } + if (failed != EX_OK) + exit(failed); +} + +void +ipfw_flush(int force) +{ + ipfw_range_tlv rt; + + if (!force && !co.do_quiet) { /* need to ask user */ + int c; + + printf("Are you sure? 
[yn] "); + fflush(stdout); + do { + c = toupper(getc(stdin)); + while (c != '\n' && getc(stdin) != '\n') + if (feof(stdin)) + return; /* and do not flush */ + } while (c != 'Y' && c != 'N'); + printf("\n"); + if (c == 'N') /* user said no */ + return; + } + if (co.do_pipe) { + dummynet_flush(); + return; + } + /* `ipfw set N flush` - is the same that `ipfw delete set N` */ + memset(&rt, 0, sizeof(rt)); + if (co.use_set != 0) { + rt.set = co.use_set - 1; + rt.flags = IPFW_RCFLAG_SET; + } else + rt.flags = IPFW_RCFLAG_ALL; + if (do_range_cmd(IP_FW_XDEL, &rt) != 0) + err(EX_UNAVAILABLE, "setsockopt(IP_FW_XDEL)"); + if (!co.do_quiet) + printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules"); +} + +static struct _s_x intcmds[] = { + { "talist", TOK_TALIST }, + { "iflist", TOK_IFLIST }, + { "vlist", TOK_VLIST }, + { NULL, 0 } +}; + +void +ipfw_internal_handler(int ac, char *av[]) +{ + int tcmd; + + ac--; av++; + NEED1("internal cmd required"); + + if ((tcmd = match_token(intcmds, *av)) == -1) + errx(EX_USAGE, "invalid internal sub-cmd: %s", *av); + + switch (tcmd) { + case TOK_IFLIST: + ipfw_list_tifaces(); + break; + case TOK_TALIST: + ipfw_list_ta(ac, av); + break; + case TOK_VLIST: + ipfw_list_values(ac, av); + break; + } +} + +static int +ipfw_get_tracked_ifaces(ipfw_obj_lheader **polh) +{ + ipfw_obj_lheader req, *olh; + size_t sz; + + memset(&req, 0, sizeof(req)); + sz = sizeof(req); + + if (do_get3(IP_FW_XIFLIST, &req.opheader, &sz) != 0) { + if (errno != ENOMEM) + return (errno); + } + + sz = req.size; + if ((olh = calloc(1, sz)) == NULL) + return (ENOMEM); + + olh->size = sz; + if (do_get3(IP_FW_XIFLIST, &olh->opheader, &sz) != 0) { + free(olh); + return (errno); + } + + *polh = olh; + return (0); +} + +static int +ifinfo_cmp(const void *a, const void *b) +{ + ipfw_iface_info *ia, *ib; + + ia = (ipfw_iface_info *)a; + ib = (ipfw_iface_info *)b; + + return (stringnum_cmp(ia->ifname, ib->ifname)); +} + +/* + * Retrieves table list from kernel, + * optionally 
sorts it and calls requested function for each table. + * Returns 0 on success. + */ +static void +ipfw_list_tifaces() +{ + ipfw_obj_lheader *olh = NULL; + ipfw_iface_info *info; + int i, error; + + if ((error = ipfw_get_tracked_ifaces(&olh)) != 0) + err(EX_OSERR, "Unable to request ipfw tracked interface list"); + + + qsort(olh + 1, olh->count, olh->objsize, ifinfo_cmp); + + info = (ipfw_iface_info *)(olh + 1); + for (i = 0; i < olh->count; i++) { + if (info->flags & IPFW_IFFLAG_RESOLVED) + printf("%s ifindex: %d refcount: %u changes: %u\n", + info->ifname, info->ifindex, info->refcnt, + info->gencnt); + else + printf("%s ifindex: unresolved refcount: %u changes: %u\n", + info->ifname, info->refcnt, info->gencnt); + info = (ipfw_iface_info *)((caddr_t)info + olh->objsize); + } + + free(olh); +} + + + + diff --git a/example/ipfw/ipfw/ipfw2.h b/example/ipfw/ipfw/ipfw2.h new file mode 100644 index 0000000..8770534 --- /dev/null +++ b/example/ipfw/ipfw/ipfw2.h @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/ipfw2.h 272840 2014-10-09 19:32:35Z melifaro $ + */ + +/* + * Options that can be set on the command line. + * When reading commands from a file, a subset of the options can also + * be applied globally by specifying them before the file name. 
+ * After that, each line can contain its own option that changes + * the global value. + * XXX The context is not restored after each line. + */ + +struct cmdline_opts { + /* boolean options: */ + int do_value_as_ip; /* show table value as IP */ + int do_resolv; /* try to resolve all ip to names */ + int do_time; /* Show time stamps */ + int do_quiet; /* Be quiet in add and flush */ + int do_pipe; /* this cmd refers to a pipe/queue/sched */ + int do_nat; /* this cmd refers to a nat config */ + int do_dynamic; /* display dynamic rules */ + int do_expired; /* display expired dynamic rules */ + int do_compact; /* show rules in compact mode */ + int do_force; /* do not ask for confirmation */ + int show_sets; /* display the set each rule belongs to */ + int test_only; /* only check syntax */ + int comment_only; /* only print action and comment */ + int verbose; /* be verbose on some commands */ + + /* The options below can have multiple values. */ + + int do_sort; /* field to sort results (0 = no) */ + /* valid fields are 1 and above */ + + int use_set; /* work with specified set number */ + /* 0 means all sets, otherwise apply to set use_set - 1 */ + +}; + +extern struct cmdline_opts co; + +/* + * _s_x is a structure that stores a string <-> token pairs, used in + * various places in the parser. Entries are stored in arrays, + * with an entry with s=NULL as terminator. + * The search routines are match_token() and match_value(). + * Often, an element with x=0 contains an error string. 
+ * + */ +struct _s_x { + char const *s; + int x; +}; + +extern struct _s_x f_ipdscp[]; + +enum tokens { + TOK_NULL=0, + + TOK_OR, + TOK_NOT, + TOK_STARTBRACE, + TOK_ENDBRACE, + + TOK_ACCEPT, + TOK_COUNT, + TOK_PIPE, + TOK_LINK, + TOK_QUEUE, + TOK_FLOWSET, + TOK_SCHED, + TOK_DIVERT, + TOK_TEE, + TOK_NETGRAPH, + TOK_NGTEE, + TOK_FORWARD, + TOK_SKIPTO, + TOK_DENY, + TOK_REJECT, + TOK_RESET, + TOK_UNREACH, + TOK_CHECKSTATE, + TOK_NAT, + TOK_REASS, + TOK_CALL, + TOK_RETURN, + + TOK_ALTQ, + TOK_LOG, + TOK_TAG, + TOK_UNTAG, + + TOK_TAGGED, + TOK_UID, + TOK_GID, + TOK_JAIL, + TOK_IN, + TOK_LIMIT, + TOK_KEEPSTATE, + TOK_LAYER2, + TOK_OUT, + TOK_DIVERTED, + TOK_DIVERTEDLOOPBACK, + TOK_DIVERTEDOUTPUT, + TOK_XMIT, + TOK_RECV, + TOK_VIA, + TOK_FRAG, + TOK_IPOPTS, + TOK_IPLEN, + TOK_IPID, + TOK_IPPRECEDENCE, + TOK_DSCP, + TOK_IPTOS, + TOK_IPTTL, + TOK_IPVER, + TOK_ESTAB, + TOK_SETUP, + TOK_TCPDATALEN, + TOK_TCPFLAGS, + TOK_TCPOPTS, + TOK_TCPSEQ, + TOK_TCPACK, + TOK_TCPWIN, + TOK_ICMPTYPES, + TOK_MAC, + TOK_MACTYPE, + TOK_VERREVPATH, + TOK_VERSRCREACH, + TOK_ANTISPOOF, + TOK_IPSEC, + TOK_COMMENT, + + TOK_PLR, + TOK_NOERROR, + TOK_BUCKETS, + TOK_DSTIP, + TOK_SRCIP, + TOK_DSTPORT, + TOK_SRCPORT, + TOK_ALL, + TOK_MASK, + TOK_FLOW_MASK, + TOK_SCHED_MASK, + TOK_BW, + TOK_DELAY, + TOK_PROFILE, + TOK_BURST, + TOK_RED, + TOK_GRED, + TOK_ECN, + TOK_DROPTAIL, + TOK_PROTO, + /* dummynet tokens */ + TOK_WEIGHT, + TOK_LMAX, + TOK_PRI, + TOK_TYPE, + TOK_SLOTSIZE, + + TOK_IP, + TOK_IF, + TOK_ALOG, + TOK_DENY_INC, + TOK_SAME_PORTS, + TOK_UNREG_ONLY, + TOK_SKIP_GLOBAL, + TOK_RESET_ADDR, + TOK_ALIAS_REV, + TOK_PROXY_ONLY, + TOK_REDIR_ADDR, + TOK_REDIR_PORT, + TOK_REDIR_PROTO, + + TOK_IPV6, + TOK_FLOWID, + TOK_ICMP6TYPES, + TOK_EXT6HDR, + TOK_DSTIP6, + TOK_SRCIP6, + + TOK_IPV4, + TOK_UNREACH6, + TOK_RESET6, + + TOK_FIB, + TOK_SETFIB, + TOK_LOOKUP, + TOK_SOCKARG, + TOK_SETDSCP, + TOK_FLOW, + TOK_IFLIST, + /* Table tokens */ + TOK_CREATE, + TOK_DESTROY, + TOK_LIST, + TOK_INFO, + TOK_DETAIL, + 
TOK_MODIFY, + TOK_FLUSH, + TOK_SWAP, + TOK_ADD, + TOK_DEL, + TOK_VALTYPE, + TOK_ALGO, + TOK_TALIST, + TOK_ATOMIC, + TOK_LOCK, + TOK_UNLOCK, + TOK_VLIST, +}; + +/* + * the following macro returns an error message if we run out of + * arguments. + */ +#define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} +#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} + +struct buf_pr { + char *buf; /* allocated buffer */ + char *ptr; /* current pointer */ + size_t size; /* total buffer size */ + size_t avail; /* available storage */ + size_t needed; /* length needed */ +}; + +int pr_u64(struct buf_pr *bp, uint64_t *pd, int width); +int bp_alloc(struct buf_pr *b, size_t size); +void bp_free(struct buf_pr *b); +int bprintf(struct buf_pr *b, char *format, ...); + + +/* memory allocation support */ +void *safe_calloc(size_t number, size_t size); +void *safe_realloc(void *ptr, size_t size); + +/* string comparison functions used for historical compatibility */ +int _substrcmp(const char *str1, const char* str2); +int _substrcmp2(const char *str1, const char* str2, const char* str3); +int stringnum_cmp(const char *a, const char *b); + +/* utility functions */ +int match_token(struct _s_x *table, char *string); +int match_token_relaxed(struct _s_x *table, char *string); +char const *match_value(struct _s_x *p, int value); +size_t concat_tokens(char *buf, size_t bufsize, struct _s_x *table, + char *delimiter); +int fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, + uint32_t *clear); +void print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set); + +struct _ip_fw3_opheader; +int do_cmd(int optname, void *optval, uintptr_t optlen); +int do_set3(int optname, struct _ip_fw3_opheader *op3, uintptr_t optlen); +int do_get3(int optname, struct _ip_fw3_opheader *op3, size_t *optlen); + +struct in6_addr; +void n2mask(struct in6_addr *mask, int n); +int contigmask(uint8_t *p, int len); + +/* + * Forward declarations to avoid include way too many headers. 
+ * C does not allow duplicated typedefs, so we use the base struct + * that the typedef points to. + * Should the typedefs use a different type, the compiler will + * still detect the change when compiling the body of the + * functions involved, so we do not lose error checking. + */ +struct _ipfw_insn; +struct _ipfw_insn_altq; +struct _ipfw_insn_u32; +struct _ipfw_insn_ip6; +struct _ipfw_insn_icmp6; + +/* + * The reserved set numer. This is a constant in ip_fw.h + * but we store it in a variable so other files do not depend + * in that header just for one constant. + */ +extern int resvd_set_number; + +/* first-level command handlers */ +void ipfw_add(char *av[]); +void ipfw_show_nat(int ac, char **av); +void ipfw_config_pipe(int ac, char **av); +void ipfw_config_nat(int ac, char **av); +void ipfw_sets_handler(char *av[]); +void ipfw_table_handler(int ac, char *av[]); +void ipfw_sysctl_handler(char *av[], int which); +void ipfw_delete(char *av[]); +void ipfw_flush(int force); +void ipfw_zero(int ac, char *av[], int optname); +void ipfw_list(int ac, char *av[], int show_counters); +void ipfw_internal_handler(int ac, char *av[]); + +#ifdef PF +/* altq.c */ +void altq_set_enabled(int enabled); +u_int32_t altq_name_to_qid(const char *name); +void print_altq_cmd(struct buf_pr *bp, struct _ipfw_insn_altq *altqptr); +#else +#define NO_ALTQ +#endif + +/* dummynet.c */ +void dummynet_list(int ac, char *av[], int show_counters); +void dummynet_flush(void); +int ipfw_delete_pipe(int pipe_or_queue, int n); + +/* ipv6.c */ +void print_unreach6_code(uint16_t code); +void print_ip6(struct buf_pr *bp, struct _ipfw_insn_ip6 *cmd, char const *s); +void print_flow6id(struct buf_pr *bp, struct _ipfw_insn_u32 *cmd); +void print_icmp6types(struct buf_pr *bp, struct _ipfw_insn_u32 *cmd); +void print_ext6hdr(struct buf_pr *bp, struct _ipfw_insn *cmd ); + +struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av, int cblen); +struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, 
char *av, int cblen); + +void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av, int cblen); +void fill_unreach6_code(u_short *codep, char *str); +void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av, int cblen); +int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); + +/* tables.c */ +struct _ipfw_obj_ctlv; +char *table_search_ctlv(struct _ipfw_obj_ctlv *ctlv, uint16_t idx); +void table_sort_ctlv(struct _ipfw_obj_ctlv *ctlv); +int table_check_name(char *tablename); +void ipfw_list_ta(int ac, char *av[]); +void ipfw_list_values(int ac, char *av[]); + diff --git a/example/ipfw/ipfw/ipv6.c b/example/ipfw/ipfw/ipv6.c new file mode 100644 index 0000000..0871a88 --- /dev/null +++ b/example/ipfw/ipfw/ipv6.c @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/ipv6.c 270424 2014-08-23 17:37:18Z melifaro $ + * + * ipv6 support + */ + +#include <sys/types.h> +#include <sys/socket.h> + +#include "ipfw2.h" + +#include <err.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> + +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/icmp6.h> +#include <netinet/ip_fw.h> +#include <arpa/inet.h> + +#define CHECK_LENGTH(v, len) do { \ + if ((v) < (len)) \ + errx(EX_DATAERR, "Rule too long"); \ + } while (0) + +static struct _s_x icmp6codes[] = { + { "no-route", ICMP6_DST_UNREACH_NOROUTE }, + { "admin-prohib", ICMP6_DST_UNREACH_ADMIN }, + { "address", ICMP6_DST_UNREACH_ADDR }, + { "port", ICMP6_DST_UNREACH_NOPORT }, + { NULL, 0 } +}; + +void +fill_unreach6_code(u_short *codep, char *str) +{ + int val; + char *s; + + val = strtoul(str, &s, 0); + if (s == str || *s != '\0' || val >= 0x100) + val = match_token(icmp6codes, str); + if (val < 0) + errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str); + *codep = val; + return; +} + +void +print_unreach6_code(uint16_t code) +{ + char const *s = match_value(icmp6codes, code); + + if (s != NULL) + printf("unreach6 %s", s); + else + printf("unreach6 %u", code); +} + +/* + * Print the ip address contained in a command. + */ +void +print_ip6(struct buf_pr *bp, ipfw_insn_ip6 *cmd, char const *s) +{ + struct hostent *he = NULL; + int len = F_LEN((ipfw_insn *) cmd) - 1; + struct in6_addr *a = &(cmd->addr6); + char trad[255]; + + bprintf(bp, "%s%s ", cmd->o.len & F_NOT ? " not": "", s); + + if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) { + bprintf(bp, "me6"); + return; + } + if (cmd->o.opcode == O_IP6) { + bprintf(bp, " ip6"); + return; + } + + /* + * len == 4 indicates a single IP, whereas lists of 1 or more + * addr/mask pairs have len = (2n+1). 
We convert len to n so we + * use that to count the number of entries. + */ + + for (len = len / 4; len > 0; len -= 2, a += 2) { + int mb = /* mask length */ + (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ? + 128 : contigmask((uint8_t *)&(a[1]), 128); + + if (mb == 128 && co.do_resolv) + he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6); + if (he != NULL) /* resolved to name */ + bprintf(bp, "%s", he->h_name); + else if (mb == 0) /* any */ + bprintf(bp, "any"); + else { /* numeric IP followed by some kind of mask */ + if (inet_ntop(AF_INET6, a, trad, sizeof( trad ) ) == NULL) + bprintf(bp, "Error ntop in print_ip6\n"); + bprintf(bp, "%s", trad ); + if (mb < 0) /* XXX not really legal... */ + bprintf(bp, ":%s", + inet_ntop(AF_INET6, &a[1], trad, sizeof(trad))); + else if (mb < 128) + bprintf(bp, "/%d", mb); + } + if (len > 2) + bprintf(bp, ","); + } +} + +void +fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av, int cblen) +{ + uint8_t type; + + CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn_icmp6)); + + bzero(cmd, sizeof(*cmd)); + while (*av) { + if (*av == ',') + av++; + type = strtoul(av, &av, 0); + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ICMP6 type"); + /* + * XXX: shouldn't this be 0xFF? I can't see any reason why + * we shouldn't be able to filter all possiable values + * regardless of the ability of the rest of the kernel to do + * anything useful with them. 
+ */ + if (type > ICMP6_MAXTYPE) + errx(EX_DATAERR, "ICMP6 type out of range"); + cmd->d[type / 32] |= ( 1 << (type % 32)); + } + cmd->o.opcode = O_ICMP6TYPE; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6); +} + + +void +print_icmp6types(struct buf_pr *bp, ipfw_insn_u32 *cmd) +{ + int i, j; + char sep= ' '; + + bprintf(bp, " ip6 icmp6types"); + for (i = 0; i < 7; i++) + for (j=0; j < 32; ++j) { + if ( (cmd->d[i] & (1 << (j))) == 0) + continue; + bprintf(bp, "%c%d", sep, (i*32 + j)); + sep = ','; + } +} + +void +print_flow6id(struct buf_pr *bp, ipfw_insn_u32 *cmd) +{ + uint16_t i, limit = cmd->o.arg1; + char sep = ','; + + bprintf(bp, " flow-id "); + for( i=0; i < limit; ++i) { + if (i == limit - 1) + sep = ' '; + bprintf(bp, "%d%c", cmd->d[i], sep); + } +} + +/* structure and define for the extension header in ipv6 */ +static struct _s_x ext6hdrcodes[] = { + { "frag", EXT_FRAGMENT }, + { "hopopt", EXT_HOPOPTS }, + { "route", EXT_ROUTING }, + { "dstopt", EXT_DSTOPTS }, + { "ah", EXT_AH }, + { "esp", EXT_ESP }, + { "rthdr0", EXT_RTHDR0 }, + { "rthdr2", EXT_RTHDR2 }, + { NULL, 0 } +}; + +/* fills command for the extension header filtering */ +int +fill_ext6hdr( ipfw_insn *cmd, char *av) +{ + int tok; + char *s = av; + + cmd->arg1 = 0; + + while(s) { + av = strsep( &s, ",") ; + tok = match_token(ext6hdrcodes, av); + switch (tok) { + case EXT_FRAGMENT: + cmd->arg1 |= EXT_FRAGMENT; + break; + + case EXT_HOPOPTS: + cmd->arg1 |= EXT_HOPOPTS; + break; + + case EXT_ROUTING: + cmd->arg1 |= EXT_ROUTING; + break; + + case EXT_DSTOPTS: + cmd->arg1 |= EXT_DSTOPTS; + break; + + case EXT_AH: + cmd->arg1 |= EXT_AH; + break; + + case EXT_ESP: + cmd->arg1 |= EXT_ESP; + break; + + case EXT_RTHDR0: + cmd->arg1 |= EXT_RTHDR0; + break; + + case EXT_RTHDR2: + cmd->arg1 |= EXT_RTHDR2; + break; + + default: + errx( EX_DATAERR, "invalid option for ipv6 exten header" ); + break; + } + } + if (cmd->arg1 == 0 ) + return 0; + cmd->opcode = O_EXT_HDR; + cmd->len |= F_INSN_SIZE( ipfw_insn ); + return 
1; +} + +void +print_ext6hdr(struct buf_pr *bp, ipfw_insn *cmd ) +{ + char sep = ' '; + + bprintf(bp, " extension header:"); + if (cmd->arg1 & EXT_FRAGMENT ) { + bprintf(bp, "%cfragmentation", sep); + sep = ','; + } + if (cmd->arg1 & EXT_HOPOPTS ) { + bprintf(bp, "%chop options", sep); + sep = ','; + } + if (cmd->arg1 & EXT_ROUTING ) { + bprintf(bp, "%crouting options", sep); + sep = ','; + } + if (cmd->arg1 & EXT_RTHDR0 ) { + bprintf(bp, "%crthdr0", sep); + sep = ','; + } + if (cmd->arg1 & EXT_RTHDR2 ) { + bprintf(bp, "%crthdr2", sep); + sep = ','; + } + if (cmd->arg1 & EXT_DSTOPTS ) { + bprintf(bp, "%cdestination options", sep); + sep = ','; + } + if (cmd->arg1 & EXT_AH ) { + bprintf(bp, "%cauthentication header", sep); + sep = ','; + } + if (cmd->arg1 & EXT_ESP ) { + bprintf(bp, "%cencapsulated security payload", sep); + } +} + +/* Try to find ipv6 address by hostname */ +static int +lookup_host6 (char *host, struct in6_addr *ip6addr) +{ + struct hostent *he; + + if (!inet_pton(AF_INET6, host, ip6addr)) { + if ((he = gethostbyname2(host, AF_INET6)) == NULL) + return(-1); + memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr)); + } + return(0); +} + + +/* + * fill the addr and mask fields in the instruction as appropriate from av. + * Update length as appropriate. + * The following formats are allowed: + * any matches any IP6. Actually returns an empty instruction. + * me returns O_IP6_*_ME + * + * 03f1::234:123:0342 single IP6 addres + * 03f1::234:123:0342/24 address/mask + * 03f1::234:123:0342/24,03f1::234:123:0343/ List of address + * + * Set of address (as in ipv6) not supported because ipv6 address + * are typically random past the initial prefix. + * Return 1 on success, 0 on failure. + */ +static int +fill_ip6(ipfw_insn_ip6 *cmd, char *av, int cblen) +{ + int len = 0; + struct in6_addr *d = &(cmd->addr6); + /* + * Needed for multiple address. 
+ * Note d[1] points to struct in6_add r mask6 of cmd + */ + + cmd->o.len &= ~F_LEN_MASK; /* zero len */ + + if (strcmp(av, "any") == 0) + return (1); + + + if (strcmp(av, "me") == 0) { /* Set the data for "me" opt*/ + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return (1); + } + + if (strcmp(av, "me6") == 0) { /* Set the data for "me" opt*/ + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return (1); + } + + if (strncmp(av, "table(", 6) == 0) { + char *p = strchr(av + 6, ','); + uint32_t *dm = ((ipfw_insn_u32 *)cmd)->d; + + if (p) + *p++ = '\0'; + cmd->o.opcode = O_IP_DST_LOOKUP; + cmd->o.arg1 = strtoul(av + 6, NULL, 0); + if (p) { + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); + dm[0] = strtoul(p, NULL, 0); + } else + cmd->o.len |= F_INSN_SIZE(ipfw_insn); + return (1); + } + + av = strdup(av); + while (av) { + /* + * After the address we can have '/' indicating a mask, + * or ',' indicating another address follows. + */ + + char *p; + int masklen; + char md = '\0'; + + CHECK_LENGTH(cblen, 1 + len + 2 * F_INSN_SIZE(struct in6_addr)); + + if ((p = strpbrk(av, "/,")) ) { + md = *p; /* save the separator */ + *p = '\0'; /* terminate address string */ + p++; /* and skip past it */ + } + /* now p points to NULL, mask or next entry */ + + /* lookup stores address in *d as a side effect */ + if (lookup_host6(av, d) != 0) { + /* XXX: failed. Free memory and go */ + errx(EX_DATAERR, "bad address \"%s\"", av); + } + /* next, look at the mask, if any */ + masklen = (md == '/') ? atoi(p) : 128; + if (masklen > 128 || masklen < 0) + errx(EX_DATAERR, "bad width \"%s\''", p); + else + n2mask(&d[1], masklen); + + APPLY_MASK(d, &d[1]) /* mask base address with mask */ + + /* find next separator */ + + if (md == '/') { /* find separator past the mask */ + p = strpbrk(p, ","); + if (p != NULL) + p++; + } + av = p; + + /* Check this entry */ + if (masklen == 0) { + /* + * 'any' turns the entire list into a NOP. 
+ * 'not any' never matches, so it is removed from the + * list unless it is the only item, in which case we + * report an error. + */ + if (cmd->o.len & F_NOT && av == NULL && len == 0) + errx(EX_DATAERR, "not any never matches"); + continue; + } + + /* + * A single IP can be stored alone + */ + if (masklen == 128 && av == NULL && len == 0) { + len = F_INSN_SIZE(struct in6_addr); + break; + } + + /* Update length and pointer to arguments */ + len += F_INSN_SIZE(struct in6_addr)*2; + d += 2; + } /* end while */ + + /* + * Total length of the command, remember that 1 is the size of + * the base command. + */ + if (len + 1 > F_LEN_MASK) + errx(EX_DATAERR, "address list too long"); + cmd->o.len |= len+1; + free(av); + return (1); +} + +/* + * fills command for ipv6 flow-id filtering + * note that the 20 bit flow number is stored in a array of u_int32_t + * it's supported lists of flow-id, so in the o.arg1 we store how many + * additional flow-id we want to filter, the basic is 1 + */ +void +fill_flow6( ipfw_insn_u32 *cmd, char *av, int cblen) +{ + u_int32_t type; /* Current flow number */ + u_int16_t nflow = 0; /* Current flow index */ + char *s = av; + cmd->d[0] = 0; /* Initializing the base number*/ + + while (s) { + CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn_u32) + nflow + 1); + + av = strsep( &s, ",") ; + type = strtoul(av, &av, 0); + if (*av != ',' && *av != '\0') + errx(EX_DATAERR, "invalid ipv6 flow number %s", av); + if (type > 0xfffff) + errx(EX_DATAERR, "flow number out of range %s", av); + cmd->d[nflow] |= type; + nflow++; + } + if( nflow > 0 ) { + cmd->o.opcode = O_FLOW6ID; + cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow; + cmd->o.arg1 = nflow; + } + else { + errx(EX_DATAERR, "invalid ipv6 flow number %s", av); + } +} + +ipfw_insn * +add_srcip6(ipfw_insn *cmd, char *av, int cblen) +{ + + fill_ip6((ipfw_insn_ip6 *)cmd, av, cblen); + if (cmd->opcode == O_IP_DST_SET) /* set */ + cmd->opcode = O_IP_SRC_SET; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table 
*/ + cmd->opcode = O_IP_SRC_LOOKUP; + else if (F_LEN(cmd) == 0) { /* any */ + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ + cmd->opcode = O_IP6_SRC_ME; + } else if (F_LEN(cmd) == + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { + /* single IP, no mask*/ + cmd->opcode = O_IP6_SRC; + } else { /* addr/mask opt */ + cmd->opcode = O_IP6_SRC_MASK; + } + return cmd; +} + +ipfw_insn * +add_dstip6(ipfw_insn *cmd, char *av, int cblen) +{ + + fill_ip6((ipfw_insn_ip6 *)cmd, av, cblen); + if (cmd->opcode == O_IP_DST_SET) /* set */ + ; + else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ + ; + else if (F_LEN(cmd) == 0) { /* any */ + } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ + cmd->opcode = O_IP6_DST_ME; + } else if (F_LEN(cmd) == + (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { + /* single IP, no mask*/ + cmd->opcode = O_IP6_DST; + } else { /* addr/mask opt */ + cmd->opcode = O_IP6_DST_MASK; + } + return cmd; +} diff --git a/example/ipfw/ipfw/main.c b/example/ipfw/ipfw/main.c new file mode 100644 index 0000000..a8f5fed --- /dev/null +++ b/example/ipfw/ipfw/main.c @@ -0,0 +1,628 @@ +/* + * Copyright (c) 2002-2003,2010 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * Command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/main.c 272840 2014-10-09 19:32:35Z melifaro $ + */ + +#include <sys/wait.h> +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include "ipfw2.h" + +static void +help(void) +{ + fprintf(stderr, +"ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n" +"\tipfw [-abcdefhnNqStTv] <command>\n\n" +"where <command> is one of the following:\n\n" +"add [num] [set N] [prob x] RULE-BODY\n" +"{pipe|queue} N config PIPE-BODY\n" +"[pipe|queue] {zero|delete|show} [N{,N}]\n" +"nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n" +" reverse|proxy_only|redirect_addr linkspec|\n" +" redirect_port linkspec|redirect_proto linkspec}\n" +"set [disable N... enable N...] | move [rule] X to Y | swap X Y | show\n" +"set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n" +"table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n" +"table all {flush | list}\n" +"\n" +"RULE-BODY: check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n" +"ACTION: check-state | allow | count | deny | unreach{,6} CODE |\n" +" skipto N | {divert|tee} PORT | forward ADDR |\n" +" pipe N | queue N | nat N | setfib FIB | reass\n" +"PARAMS: [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n" +"ADDR: [ MAC dst src ether_type ] \n" +" [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n" +" [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n" +"IPADDR: [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n" +"IP6ADDR: [not] { any | me | me6 | ip6/bits | IP6LIST }\n" +"IP6LIST: { ip6 | ip6/bits }[,IP6LIST]\n" +"IPLIST: { ip | ip/bits | ip:mask }[,IPLIST]\n" +"OPTION_LIST: OPTION [OPTION_LIST]\n" +"OPTION: bridged | diverted | diverted-loopback | diverted-output |\n" +" {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} 
IP6ADDR |\n" +" {dst-port|src-port} LIST |\n" +" estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n" +" iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n" +" ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n" +" icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n" +" mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n" +" setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n" +" tcpdatalen LIST | verrevpath | versrcreach | antispoof\n" +); + + exit(0); +} + +/* + * Called with the arguments, including program name because getopt + * wants it to be present. + * Returns 0 if successful, 1 if empty command, errx() in case of errors. + * First thing we do is process parameters creating an argv[] array + * which includes the program name and a NULL entry at the end. + * If we are called with a single string, we split it on whitespace. + * Also, arguments with a trailing ',' are joined to the next one. + * The pointers (av[]) and data are in a single chunk of memory. + * av[0] points to the original program name, all other entries + * point into the allocated chunk. + */ +static int +ipfw_main(int oldac, char **oldav) +{ + int ch, ac; + const char *errstr; + char **av, **save_av; + int do_acct = 0; /* Show packet/byte count */ + int try_next = 0; /* set if pipe cmd not found */ + int av_size; /* compute the av size */ + char *av_p; /* used to build the av list */ + +#define WHITESP " \t\f\v\n\r" + if (oldac < 2) + return 1; /* need at least one argument */ + + if (oldac == 2) { + /* + * If we are called with one argument, try to split it into + * words for subsequent parsing. Spaces after a ',' are + * removed by copying the string in-place. + */ + char *arg = oldav[1]; /* The string is the first arg. 
*/ + int l = strlen(arg); + int copy = 0; /* 1 if we need to copy, 0 otherwise */ + int i, j; + + for (i = j = 0; i < l; i++) { + if (arg[i] == '#') /* comment marker */ + break; + if (copy) { + arg[j++] = arg[i]; + copy = !strchr("," WHITESP, arg[i]); + } else { + copy = !strchr(WHITESP, arg[i]); + if (copy) + arg[j++] = arg[i]; + } + } + if (!copy && j > 0) /* last char was a 'blank', remove it */ + j--; + l = j; /* the new argument length */ + arg[j++] = '\0'; + if (l == 0) /* empty string! */ + return 1; + + /* + * First, count number of arguments. Because of the previous + * processing, this is just the number of blanks plus 1. + */ + for (i = 0, ac = 1; i < l; i++) + if (strchr(WHITESP, arg[i]) != NULL) + ac++; + + /* + * Allocate the argument list structure as a single block + * of memory, containing pointers and the argument + * strings. We include one entry for the program name + * because getopt expects it, and a NULL at the end + * to simplify further parsing. + */ + ac++; /* add 1 for the program name */ + av_size = (ac+1) * sizeof(char *) + l + 1; + av = safe_calloc(av_size, 1); + + /* + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[]. For each one, + * j is the initial character, i is the one past the end. + */ + av_p = (char *)&av[ac+1]; + for (ac = 1, i = j = 0; i < l; i++) { + if (strchr(WHITESP, arg[i]) != NULL || i == l-1) { + if (i == l-1) + i++; + bcopy(arg+j, av_p, i-j); + av[ac] = av_p; + av_p += i-j; /* the length of the string */ + *av_p++ = '\0'; + ac++; + j = i + 1; + } + } + } else { + /* + * If an argument ends with ',' join with the next one. + */ + int first, i, l=0; + + /* + * Allocate the argument list structure as a single block + * of memory, containing both pointers and the argument + * strings. We include some space for the program name + * because getopt expects it. + * We add an extra pointer to the end of the array, + * to make simpler further parsing. 
+ */ + for (i=0; i<oldac; i++) + l += strlen(oldav[i]); + + av_size = (oldac+1) * sizeof(char *) + l + oldac; + av = safe_calloc(av_size, 1); + + /* + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[] + */ + av_p = (char *)&av[oldac+1]; + for (first = i = ac = 1, l = 0; i < oldac; i++) { + char *arg = oldav[i]; + int k = strlen(arg); + + l += k; + if (arg[k-1] != ',' || i == oldac-1) { + /* Time to copy. */ + av[ac] = av_p; + for (l=0; first <= i; first++) { + strcat(av_p, oldav[first]); + av_p += strlen(oldav[first]); + } + *av_p++ = '\0'; + ac++; + l = 0; + first = i+1; + } + } + } + + /* + * set the progname pointer to the original string + * and terminate the array with null + */ + av[0] = oldav[0]; + av[ac] = NULL; + + /* Set the force flag for non-interactive processes */ + if (!co.do_force) + co.do_force = !isatty(STDIN_FILENO); + +#ifdef EMULATE_SYSCTL /* sysctl emulation */ + if ( ac >= 2 && !strcmp(av[1], "sysctl")) { + char *s; + int i; + + if (ac != 3) { + printf( "sysctl emulation usage:\n" + " ipfw sysctl name[=value]\n" + " ipfw sysctl -a\n"); + return 0; + } + s = strchr(av[2], '='); + if (s == NULL) { + s = !strcmp(av[2], "-a") ? NULL : av[2]; + sysctlbyname(s, NULL, NULL, NULL, 0); + } else { /* ipfw sysctl x.y.z=value */ + /* assume an INT value, will extend later */ + if (s[1] == '\0') { + printf("ipfw sysctl: missing value\n\n"); + return 0; + } + *s = '\0'; + i = strtol(s+1, NULL, 0); + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); + } + return 0; + } +#endif + + /* Save arguments for final freeing of memory. 
*/ + save_av = av; + + optind = optreset = 1; /* restart getopt() */ + while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1) + switch (ch) { + case 'a': + do_acct = 1; + break; + + case 'b': + co.comment_only = 1; + co.do_compact = 1; + break; + + case 'c': + co.do_compact = 1; + break; + + case 'd': + co.do_dynamic = 1; + break; + + case 'e': + co.do_expired = 1; + break; + + case 'f': + co.do_force = 1; + break; + + case 'h': /* help */ + free(save_av); + help(); + break; /* NOTREACHED */ + + case 'i': + co.do_value_as_ip = 1; + break; + + case 'n': + co.test_only = 1; + break; + + case 'N': + co.do_resolv = 1; + break; + + case 'p': + errx(EX_USAGE, "An absolute pathname must be used " + "with -p option."); + /* NOTREACHED */ + + case 'q': + co.do_quiet = 1; + break; + + case 's': /* sort */ + co.do_sort = atoi(optarg); + break; + + case 'S': + co.show_sets = 1; + break; + + case 't': + co.do_time = 1; + break; + + case 'T': + co.do_time = 2; /* numeric timestamp */ + break; + + case 'v': /* verbose */ + co.verbose = 1; + break; + + default: + free(save_av); + return 1; + } + + ac -= optind; + av += optind; + NEED1("bad arguments, for usage summary ``ipfw''"); + + /* + * An undocumented behaviour of ipfw1 was to allow rule numbers first, + * e.g. "100 add allow ..." instead of "add 100 allow ...". + * In case, swap first and second argument to get the normal form. + */ + if (ac > 1 && isdigit(*av[0])) { + char *p = av[0]; + + av[0] = av[1]; + av[1] = p; + } + + /* + * Optional: pipe, queue or nat. 
+ */ + co.do_nat = 0; + co.do_pipe = 0; + co.use_set = 0; + if (!strncmp(*av, "nat", strlen(*av))) + co.do_nat = 1; + else if (!strncmp(*av, "pipe", strlen(*av))) + co.do_pipe = 1; + else if (_substrcmp(*av, "queue") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "flowset") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "sched") == 0) + co.do_pipe = 3; + else if (!strncmp(*av, "set", strlen(*av))) { + if (ac > 1 && isdigit(av[1][0])) { + co.use_set = strtonum(av[1], 0, resvd_set_number, + &errstr); + if (errstr) + errx(EX_DATAERR, + "invalid set number %s\n", av[1]); + ac -= 2; av += 2; co.use_set++; + } + } + + if (co.do_pipe || co.do_nat) { + ac--; + av++; + } + NEED1("missing command"); + + /* + * For pipes, queues and nats we normally say 'nat|pipe NN config' + * but the code is easier to parse as 'nat|pipe config NN' + * so we swap the two arguments. + */ + if ((co.do_pipe || co.do_nat) && ac > 1 && isdigit(*av[0])) { + char *p = av[0]; + + av[0] = av[1]; + av[1] = p; + } + + if (co.use_set == 0) { + if (_substrcmp(*av, "add") == 0) + ipfw_add(av); + else if (co.do_nat && _substrcmp(*av, "show") == 0) + ipfw_show_nat(ac, av); + else if (co.do_pipe && _substrcmp(*av, "config") == 0) + ipfw_config_pipe(ac, av); + else if (co.do_nat && _substrcmp(*av, "config") == 0) + ipfw_config_nat(ac, av); + else if (_substrcmp(*av, "set") == 0) + ipfw_sets_handler(av); + else if (_substrcmp(*av, "table") == 0) + ipfw_table_handler(ac, av); + else if (_substrcmp(*av, "enable") == 0) + ipfw_sysctl_handler(av, 1); + else if (_substrcmp(*av, "disable") == 0) + ipfw_sysctl_handler(av, 0); + else + try_next = 1; + } + + if (co.use_set || try_next) { + if (_substrcmp(*av, "delete") == 0) + ipfw_delete(av); + else if (_substrcmp(*av, "flush") == 0) + ipfw_flush(co.do_force); + else if (_substrcmp(*av, "zero") == 0) + ipfw_zero(ac, av, 0 /* IP_FW_ZERO */); + else if (_substrcmp(*av, "resetlog") == 0) + ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */); + else if (_substrcmp(*av, 
"print") == 0 || + _substrcmp(*av, "list") == 0) + ipfw_list(ac, av, do_acct); + else if (_substrcmp(*av, "show") == 0) + ipfw_list(ac, av, 1 /* show counters */); + else if (_substrcmp(*av, "table") == 0) + ipfw_table_handler(ac, av); + else if (_substrcmp(*av, "internal") == 0) + ipfw_internal_handler(ac, av); + else + errx(EX_USAGE, "bad command `%s'", *av); + } + + /* Free memory allocated in the argument parsing. */ + free(save_av); + return 0; +} + + +static void +ipfw_readfile(int ac, char *av[]) +{ +#define MAX_ARGS 32 + char buf[4096]; + char *progname = av[0]; /* original program name */ + const char *cmd = NULL; /* preprocessor name, if any */ + const char *filename = av[ac-1]; /* file to read */ + int c, lineno=0; + FILE *f = NULL; + pid_t preproc = 0; + + while ((c = getopt(ac, av, "cfNnp:qS")) != -1) { + switch(c) { + case 'c': + co.do_compact = 1; + break; + + case 'f': + co.do_force = 1; + break; + + case 'N': + co.do_resolv = 1; + break; + + case 'n': + co.test_only = 1; + break; + + case 'p': + /* + * ipfw -p cmd [args] filename + * + * We are done with getopt(). All arguments + * except the filename go to the preprocessor, + * so we need to do the following: + * - check that a filename is actually present; + * - advance av by optind-1 to skip arguments + * already processed; + * - decrease ac by optind, to remove the args + * already processed and the final filename; + * - set the last entry in av[] to NULL so + * popen() can detect the end of the array; + * - set optind=ac to let getopt() terminate. 
+ */ + if (optind == ac) + errx(EX_USAGE, "no filename argument"); + cmd = optarg; + av[ac-1] = NULL; + av += optind - 1; + ac -= optind; + optind = ac; + break; + + case 'q': + co.do_quiet = 1; + break; + + case 'S': + co.show_sets = 1; + break; + + default: + errx(EX_USAGE, "bad arguments, for usage" + " summary ``ipfw''"); + } + + } + + if (cmd == NULL && ac != optind + 1) + errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]); + + if ((f = fopen(filename, "r")) == NULL) + err(EX_UNAVAILABLE, "fopen: %s", filename); + + if (cmd != NULL) { /* pipe through preprocessor */ + int pipedes[2]; + + if (pipe(pipedes) == -1) + err(EX_OSERR, "cannot create pipe"); + + preproc = fork(); + if (preproc == -1) + err(EX_OSERR, "cannot fork"); + + if (preproc == 0) { + /* + * Child, will run the preprocessor with the + * file on stdin and the pipe on stdout. + */ + if (dup2(fileno(f), 0) == -1 + || dup2(pipedes[1], 1) == -1) + err(EX_OSERR, "dup2()"); + fclose(f); + close(pipedes[1]); + close(pipedes[0]); + execvp(cmd, av); + err(EX_OSERR, "execvp(%s) failed", cmd); + } else { /* parent, will reopen f as the pipe */ + fclose(f); + close(pipedes[1]); + if ((f = fdopen(pipedes[0], "r")) == NULL) { + int savederrno = errno; + + (void)kill(preproc, SIGTERM); + errno = savederrno; + err(EX_OSERR, "fdopen()"); + } + } + } + + while (fgets(buf, sizeof(buf), f)) { /* read commands */ + char linename[20]; + char *args[2]; + + lineno++; + snprintf(linename, sizeof(linename), "Line %d", lineno); + setprogname(linename); /* XXX */ + args[0] = progname; + args[1] = buf; + ipfw_main(2, args); + } + fclose(f); + if (cmd != NULL) { + int status; + + if (waitpid(preproc, &status, 0) == -1) + errx(EX_OSERR, "waitpid()"); + if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK) + errx(EX_UNAVAILABLE, + "preprocessor exited with status %d", + WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + errx(EX_UNAVAILABLE, + "preprocessor exited with signal %d", + WTERMSIG(status)); + } +} + 
+int +main(int ac, char *av[]) +{ +#if defined(_WIN32) && defined(TCC) + { + WSADATA wsaData; + int ret=0; + unsigned short wVersionRequested = MAKEWORD(2, 2); + ret = WSAStartup(wVersionRequested, &wsaData); + if (ret != 0) { + /* Tell the user that we could not find a usable */ + /* Winsock DLL. */ + printf("WSAStartup failed with error: %d\n", ret); + return 1; + } + } +#endif + /* + * If the last argument is an absolute pathname, interpret it + * as a file to be preprocessed. + */ + + if (ac > 1 && av[ac - 1][0] == '/') { + if (access(av[ac - 1], R_OK) == 0) + ipfw_readfile(ac, av); + else + err(EX_USAGE, "pathname: %s", av[ac - 1]); + } else { + if (ipfw_main(ac, av)) { + errx(EX_USAGE, + "usage: ipfw [options]\n" + "do \"ipfw -h\" or \"man ipfw\" for details"); + } + } + return EX_OK; +} diff --git a/example/ipfw/ipfw/nat.c b/example/ipfw/ipfw/nat.c new file mode 100644 index 0000000..dc2364f --- /dev/null +++ b/example/ipfw/ipfw/nat.c @@ -0,0 +1,1115 @@ +/* + * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Idea and grammar partially left from: + * Copyright (c) 1993 Daniel Boulet + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ * + * NEW command line interface for IP firewall facility + * + * $FreeBSD: head/sbin/ipfw/nat.c 272840 2014-10-09 19:32:35Z melifaro $ + * + * In-kernel nat support + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include "ipfw2.h" + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> /* def. of struct route */ +#include <netinet/in.h> +#include <netinet/ip_fw.h> +#include <arpa/inet.h> +#include <alias.h> + +typedef int (nat_cb_t)(struct nat44_cfg_nat *cfg, void *arg); +static void nat_show_cfg(struct nat44_cfg_nat *n, void *arg); +static void nat_show_log(struct nat44_cfg_nat *n, void *arg); +static int nat_show_data(struct nat44_cfg_nat *cfg, void *arg); +static int natname_cmp(const void *a, const void *b); +static int nat_foreach(nat_cb_t *f, void *arg, int sort); +static int nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh); + +static struct _s_x nat_params[] = { + { "ip", TOK_IP }, + { "if", TOK_IF }, + { "log", TOK_ALOG }, + { "deny_in", TOK_DENY_INC }, + { "same_ports", TOK_SAME_PORTS }, + { "unreg_only", TOK_UNREG_ONLY }, + { "skip_global", TOK_SKIP_GLOBAL }, + { "reset", TOK_RESET_ADDR }, + { "reverse", TOK_ALIAS_REV }, + { "proxy_only", TOK_PROXY_ONLY }, + { "redirect_addr", TOK_REDIR_ADDR }, + { "redirect_port", TOK_REDIR_PORT }, + { "redirect_proto", TOK_REDIR_PROTO }, + { NULL, 0 } /* terminator */ +}; + + +/* + * Search for interface with name "ifn", and fill n accordingly: + * + * n->ip ip address of interface "ifn" + * n->if_name copy of interface name "ifn" + */ +static void +set_addr_dynamic(const char *ifn, struct nat44_cfg_nat *n) +{ + size_t needed; + int mib[6]; + char *buf, *lim, *next; + struct if_msghdr *ifm; + struct ifa_msghdr *ifam; + struct sockaddr_dl *sdl; + struct sockaddr_in *sin; + int ifIndex, ifMTU; 
+ + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = AF_INET; + mib[4] = NET_RT_IFLIST; + mib[5] = 0; +/* + * Get interface data. + */ + if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) + err(1, "iflist-sysctl-estimate"); + buf = safe_calloc(1, needed); + if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) + err(1, "iflist-sysctl-get"); + lim = buf + needed; +/* + * Loop through interfaces until one with + * given name is found. This is done to + * find correct interface index for routing + * message processing. + */ + ifIndex = 0; + next = buf; + while (next < lim) { + ifm = (struct if_msghdr *)next; + next += ifm->ifm_msglen; + if (ifm->ifm_version != RTM_VERSION) { + if (co.verbose) + warnx("routing message version %d " + "not understood", ifm->ifm_version); + continue; + } + if (ifm->ifm_type == RTM_IFINFO) { + sdl = (struct sockaddr_dl *)(ifm + 1); + if (strlen(ifn) == sdl->sdl_nlen && + strncmp(ifn, sdl->sdl_data, sdl->sdl_nlen) == 0) { + ifIndex = ifm->ifm_index; + ifMTU = ifm->ifm_data.ifi_mtu; + break; + } + } + } + if (!ifIndex) + errx(1, "unknown interface name %s", ifn); +/* + * Get interface address. 
+ */ + sin = NULL; + while (next < lim) { + ifam = (struct ifa_msghdr *)next; + next += ifam->ifam_msglen; + if (ifam->ifam_version != RTM_VERSION) { + if (co.verbose) + warnx("routing message version %d " + "not understood", ifam->ifam_version); + continue; + } + if (ifam->ifam_type != RTM_NEWADDR) + break; + if (ifam->ifam_addrs & RTA_IFA) { + int i; + char *cp = (char *)(ifam + 1); + + for (i = 1; i < RTA_IFA; i <<= 1) { + if (ifam->ifam_addrs & i) + cp += SA_SIZE((struct sockaddr *)cp); + } + if (((struct sockaddr *)cp)->sa_family == AF_INET) { + sin = (struct sockaddr_in *)cp; + break; + } + } + } + if (sin == NULL) + errx(1, "%s: cannot get interface address", ifn); + + n->ip = sin->sin_addr; + strncpy(n->if_name, ifn, IF_NAMESIZE); + + free(buf); +} + +/* + * XXX - The following functions, macros and definitions come from natd.c: + * it would be better to move them outside natd.c, in a file + * (redirect_support.[ch]?) shared by ipfw and natd, but for now i can live + * with it. + */ + +/* + * Definition of a port range, and macros to deal with values. + * FORMAT: HI 16-bits == first port in range, 0 == all ports. + * LO 16-bits == number of ports in range + * NOTES: - Port values are not stored in network byte order. + */ + +#define port_range u_long + +#define GETLOPORT(x) ((x) >> 0x10) +#define GETNUMPORTS(x) ((x) & 0x0000ffff) +#define GETHIPORT(x) (GETLOPORT((x)) + GETNUMPORTS((x))) + +/* Set y to be the low-port value in port_range variable x. */ +#define SETLOPORT(x,y) ((x) = ((x) & 0x0000ffff) | ((y) << 0x10)) + +/* Set y to be the number of ports in port_range variable x. 
*/ +#define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y)) + +static void +StrToAddr (const char* str, struct in_addr* addr) +{ + struct hostent* hp; + + if (inet_aton (str, addr)) + return; + + hp = gethostbyname (str); + if (!hp) + errx (1, "unknown host %s", str); + + memcpy (addr, hp->h_addr, sizeof (struct in_addr)); +} + +static int +StrToPortRange (const char* str, const char* proto, port_range *portRange) +{ + char* sep; + struct servent* sp; + char* end; + u_short loPort; + u_short hiPort; + + /* First see if this is a service, return corresponding port if so. */ + sp = getservbyname (str,proto); + if (sp) { + SETLOPORT(*portRange, ntohs(sp->s_port)); + SETNUMPORTS(*portRange, 1); + return 0; + } + + /* Not a service, see if it's a single port or port range. */ + sep = strchr (str, '-'); + if (sep == NULL) { + SETLOPORT(*portRange, strtol(str, &end, 10)); + if (end != str) { + /* Single port. */ + SETNUMPORTS(*portRange, 1); + return 0; + } + + /* Error in port range field. */ + errx (EX_DATAERR, "%s/%s: unknown service", str, proto); + } + + /* Port range, get the values and sanity check. */ + sscanf (str, "%hu-%hu", &loPort, &hiPort); + SETLOPORT(*portRange, loPort); + SETNUMPORTS(*portRange, 0); /* Error by default */ + if (loPort <= hiPort) + SETNUMPORTS(*portRange, hiPort - loPort + 1); + + if (GETNUMPORTS(*portRange) == 0) + errx (EX_DATAERR, "invalid port range %s", str); + + return 0; +} + +static int +StrToProto (const char* str) +{ + if (!strcmp (str, "tcp")) + return IPPROTO_TCP; + + if (!strcmp (str, "udp")) + return IPPROTO_UDP; + + if (!strcmp (str, "sctp")) + return IPPROTO_SCTP; + errx (EX_DATAERR, "unknown protocol %s. 
Expected sctp, tcp or udp", str); +} + +static int +StrToAddrAndPortRange (const char* str, struct in_addr* addr, char* proto, + port_range *portRange) +{ + char* ptr; + + ptr = strchr (str, ':'); + if (!ptr) + errx (EX_DATAERR, "%s is missing port number", str); + + *ptr = '\0'; + ++ptr; + + StrToAddr (str, addr); + return StrToPortRange (ptr, proto, portRange); +} + +/* End of stuff taken from natd.c. */ + +/* + * The next 3 functions add support for the addr, port and proto redirect and + * their logic is loosely based on SetupAddressRedirect(), SetupPortRedirect() + * and SetupProtoRedirect() from natd.c. + * + * Every setup_* function fills at least one redirect entry + * (struct nat44_cfg_redir) and zero or more server pool entry + * (struct nat44_cfg_spool) in buf. + * + * The format of data in buf is: + * + * nat44_cfg_nat nat44_cfg_redir nat44_cfg_spool ...... nat44_cfg_spool + * + * ------------------------------------- ------------ + * | | .....X ..... | | | | ..... + * ------------------------------------- ...... ------------ + * ^ + * spool_cnt n=0 ...... n=(X-1) + * + * len points to the amount of available space in buf + * space counts the memory consumed by every function + * + * XXX - Every function get all the argv params so it + * has to check, in optional parameters, that the next + * args is a valid option for the redir entry and not + * another token. Only redir_port and redir_proto are + * affected by this. 
+ */ + +static int +estimate_redir_addr(int *ac, char ***av) +{ + size_t space = sizeof(struct nat44_cfg_redir); + char *sep = **av; + u_int c = 0; + + (void)ac; /* UNUSED */ + while ((sep = strchr(sep, ',')) != NULL) { + c++; + sep++; + } + + if (c > 0) + c++; + + space += c * sizeof(struct nat44_cfg_spool); + + return (space); +} + +static int +setup_redir_addr(char *buf, int *ac, char ***av) +{ + struct nat44_cfg_redir *r; + char *sep; + size_t space; + + r = (struct nat44_cfg_redir *)buf; + r->mode = REDIR_ADDR; + /* Skip nat44_cfg_redir at beginning of buf. */ + buf = &buf[sizeof(struct nat44_cfg_redir)]; + space = sizeof(struct nat44_cfg_redir); + + /* Extract local address. */ + if (strchr(**av, ',') != NULL) { + struct nat44_cfg_spool *spool; + + /* Setup LSNAT server pool. */ + r->laddr.s_addr = INADDR_NONE; + sep = strtok(**av, ","); + while (sep != NULL) { + spool = (struct nat44_cfg_spool *)buf; + space += sizeof(struct nat44_cfg_spool); + StrToAddr(sep, &spool->addr); + spool->port = ~0; + r->spool_cnt++; + /* Point to the next possible nat44_cfg_spool. */ + buf = &buf[sizeof(struct nat44_cfg_spool)]; + sep = strtok(NULL, ","); + } + } else + StrToAddr(**av, &r->laddr); + (*av)++; (*ac)--; + + /* Extract public address. */ + StrToAddr(**av, &r->paddr); + (*av)++; (*ac)--; + + return (space); +} + +static int +estimate_redir_port(int *ac, char ***av) +{ + size_t space = sizeof(struct nat44_cfg_redir); + char *sep = **av; + u_int c = 0; + + (void)ac; /* UNUSED */ + while ((sep = strchr(sep, ',')) != NULL) { + c++; + sep++; + } + + if (c > 0) + c++; + + space += c * sizeof(struct nat44_cfg_spool); + + return (space); +} + +static int +setup_redir_port(char *buf, int *ac, char ***av) +{ + struct nat44_cfg_redir *r; + char *sep, *protoName, *lsnat = NULL; + size_t space; + u_short numLocalPorts; + port_range portRange; + + numLocalPorts = 0; + + r = (struct nat44_cfg_redir *)buf; + r->mode = REDIR_PORT; + /* Skip nat44_cfg_redir at beginning of buf. 
*/ + buf = &buf[sizeof(struct nat44_cfg_redir)]; + space = sizeof(struct nat44_cfg_redir); + + /* + * Extract protocol. + */ + r->proto = StrToProto(**av); + protoName = **av; + (*av)++; (*ac)--; + + /* + * Extract local address. + */ + if (strchr(**av, ',') != NULL) { + r->laddr.s_addr = INADDR_NONE; + r->lport = ~0; + numLocalPorts = 1; + lsnat = **av; + } else { + /* + * The sctp nat does not allow the port numbers to be mapped to + * new port numbers. Therefore, no ports are to be specified + * in the target port field. + */ + if (r->proto == IPPROTO_SCTP) { + if (strchr(**av, ':')) + errx(EX_DATAERR, "redirect_port:" + "port numbers do not change in sctp, so do " + "not specify them as part of the target"); + else + StrToAddr(**av, &r->laddr); + } else { + if (StrToAddrAndPortRange(**av, &r->laddr, protoName, + &portRange) != 0) + errx(EX_DATAERR, "redirect_port: " + "invalid local port range"); + + r->lport = GETLOPORT(portRange); + numLocalPorts = GETNUMPORTS(portRange); + } + } + (*av)++; (*ac)--; + + /* + * Extract public port and optionally address. + */ + if (strchr(**av, ':') != NULL) { + if (StrToAddrAndPortRange(**av, &r->paddr, protoName, + &portRange) != 0) + errx(EX_DATAERR, "redirect_port: " + "invalid public port range"); + } else { + r->paddr.s_addr = INADDR_ANY; + if (StrToPortRange(**av, protoName, &portRange) != 0) + errx(EX_DATAERR, "redirect_port: " + "invalid public port range"); + } + + r->pport = GETLOPORT(portRange); + if (r->proto == IPPROTO_SCTP) { /* so the logic below still works */ + numLocalPorts = GETNUMPORTS(portRange); + r->lport = r->pport; + } + r->pport_cnt = GETNUMPORTS(portRange); + (*av)++; (*ac)--; + + /* + * Extract remote address and optionally port. + */ + /* + * NB: isdigit(**av) => we've to check that next parameter is really an + * option for this redirect entry, else stop here processing arg[cv]. 
+ */ + if (*ac != 0 && isdigit(***av)) { + if (strchr(**av, ':') != NULL) { + if (StrToAddrAndPortRange(**av, &r->raddr, protoName, + &portRange) != 0) + errx(EX_DATAERR, "redirect_port: " + "invalid remote port range"); + } else { + SETLOPORT(portRange, 0); + SETNUMPORTS(portRange, 1); + StrToAddr(**av, &r->raddr); + } + (*av)++; (*ac)--; + } else { + SETLOPORT(portRange, 0); + SETNUMPORTS(portRange, 1); + r->raddr.s_addr = INADDR_ANY; + } + r->rport = GETLOPORT(portRange); + r->rport_cnt = GETNUMPORTS(portRange); + + /* + * Make sure port ranges match up, then add the redirect ports. + */ + if (numLocalPorts != r->pport_cnt) + errx(EX_DATAERR, "redirect_port: " + "port ranges must be equal in size"); + + /* Remote port range is allowed to be '0' which means all ports. */ + if (r->rport_cnt != numLocalPorts && + (r->rport_cnt != 1 || r->rport != 0)) + errx(EX_DATAERR, "redirect_port: remote port must" + "be 0 or equal to local port range in size"); + + /* Setup LSNAT server pool. */ + if (lsnat != NULL) { + struct nat44_cfg_spool *spool; + + sep = strtok(lsnat, ","); + while (sep != NULL) { + spool = (struct nat44_cfg_spool *)buf; + space += sizeof(struct nat44_cfg_spool); + /* + * The sctp nat does not allow the port numbers to + * be mapped to new port numbers. Therefore, no ports + * are to be specified in the target port field. 
+ */ + if (r->proto == IPPROTO_SCTP) { + if (strchr (sep, ':')) { + errx(EX_DATAERR, "redirect_port:" + "port numbers do not change in " + "sctp, so do not specify them as " + "part of the target"); + } else { + StrToAddr(sep, &spool->addr); + spool->port = r->pport; + } + } else { + if (StrToAddrAndPortRange(sep, &spool->addr, + protoName, &portRange) != 0) + errx(EX_DATAERR, "redirect_port:" + "invalid local port range"); + if (GETNUMPORTS(portRange) != 1) + errx(EX_DATAERR, "redirect_port: " + "local port must be single in " + "this context"); + spool->port = GETLOPORT(portRange); + } + r->spool_cnt++; + /* Point to the next possible nat44_cfg_spool. */ + buf = &buf[sizeof(struct nat44_cfg_spool)]; + sep = strtok(NULL, ","); + } + } + + return (space); +} + +static int +setup_redir_proto(char *buf, int *ac, char ***av) +{ + struct nat44_cfg_redir *r; + struct protoent *protoent; + size_t space; + + r = (struct nat44_cfg_redir *)buf; + r->mode = REDIR_PROTO; + /* Skip nat44_cfg_redir at beginning of buf. */ + buf = &buf[sizeof(struct nat44_cfg_redir)]; + space = sizeof(struct nat44_cfg_redir); + + /* + * Extract protocol. + */ + protoent = getprotobyname(**av); + if (protoent == NULL) + errx(EX_DATAERR, "redirect_proto: unknown protocol %s", **av); + else + r->proto = protoent->p_proto; + + (*av)++; (*ac)--; + + /* + * Extract local address. + */ + StrToAddr(**av, &r->laddr); + + (*av)++; (*ac)--; + + /* + * Extract optional public address. + */ + if (*ac == 0) { + r->paddr.s_addr = INADDR_ANY; + r->raddr.s_addr = INADDR_ANY; + } else { + /* see above in setup_redir_port() */ + if (isdigit(***av)) { + StrToAddr(**av, &r->paddr); + (*av)++; (*ac)--; + + /* + * Extract optional remote address. 
+ */ + /* see above in setup_redir_port() */ + if (*ac != 0 && isdigit(***av)) { + StrToAddr(**av, &r->raddr); + (*av)++; (*ac)--; + } + } + } + + return (space); +} + +static void +nat_show_log(struct nat44_cfg_nat *n, void *arg) +{ + char *buf; + + buf = (char *)(n + 1); + if (buf[0] != '\0') + printf("nat %s: %s\n", n->name, buf); +} + +static void +nat_show_cfg(struct nat44_cfg_nat *n, void *arg) +{ + int i, cnt, flag, off; + struct nat44_cfg_redir *t; + struct nat44_cfg_spool *s; + caddr_t buf; + struct protoent *p; + + buf = (caddr_t)n; + flag = 1; + off = sizeof(*n); + printf("ipfw nat %s config", n->name); + if (strlen(n->if_name) != 0) + printf(" if %s", n->if_name); + else if (n->ip.s_addr != 0) + printf(" ip %s", inet_ntoa(n->ip)); + while (n->mode != 0) { + if (n->mode & PKT_ALIAS_LOG) { + printf(" log"); + n->mode &= ~PKT_ALIAS_LOG; + } else if (n->mode & PKT_ALIAS_DENY_INCOMING) { + printf(" deny_in"); + n->mode &= ~PKT_ALIAS_DENY_INCOMING; + } else if (n->mode & PKT_ALIAS_SAME_PORTS) { + printf(" same_ports"); + n->mode &= ~PKT_ALIAS_SAME_PORTS; + } else if (n->mode & PKT_ALIAS_SKIP_GLOBAL) { + printf(" skip_global"); + n->mode &= ~PKT_ALIAS_SKIP_GLOBAL; + } else if (n->mode & PKT_ALIAS_UNREGISTERED_ONLY) { + printf(" unreg_only"); + n->mode &= ~PKT_ALIAS_UNREGISTERED_ONLY; + } else if (n->mode & PKT_ALIAS_RESET_ON_ADDR_CHANGE) { + printf(" reset"); + n->mode &= ~PKT_ALIAS_RESET_ON_ADDR_CHANGE; + } else if (n->mode & PKT_ALIAS_REVERSE) { + printf(" reverse"); + n->mode &= ~PKT_ALIAS_REVERSE; + } else if (n->mode & PKT_ALIAS_PROXY_ONLY) { + printf(" proxy_only"); + n->mode &= ~PKT_ALIAS_PROXY_ONLY; + } + } + /* Print all the redirect's data configuration. 
*/ + for (cnt = 0; cnt < n->redir_cnt; cnt++) { + t = (struct nat44_cfg_redir *)&buf[off]; + off += sizeof(struct nat44_cfg_redir); + switch (t->mode) { + case REDIR_ADDR: + printf(" redirect_addr"); + if (t->spool_cnt == 0) + printf(" %s", inet_ntoa(t->laddr)); + else + for (i = 0; i < t->spool_cnt; i++) { + s = (struct nat44_cfg_spool *)&buf[off]; + if (i) + printf(","); + else + printf(" "); + printf("%s", inet_ntoa(s->addr)); + off += sizeof(struct nat44_cfg_spool); + } + printf(" %s", inet_ntoa(t->paddr)); + break; + case REDIR_PORT: + p = getprotobynumber(t->proto); + printf(" redirect_port %s ", p->p_name); + if (!t->spool_cnt) { + printf("%s:%u", inet_ntoa(t->laddr), t->lport); + if (t->pport_cnt > 1) + printf("-%u", t->lport + + t->pport_cnt - 1); + } else + for (i=0; i < t->spool_cnt; i++) { + s = (struct nat44_cfg_spool *)&buf[off]; + if (i) + printf(","); + printf("%s:%u", inet_ntoa(s->addr), + s->port); + off += sizeof(struct nat44_cfg_spool); + } + + printf(" "); + if (t->paddr.s_addr) + printf("%s:", inet_ntoa(t->paddr)); + printf("%u", t->pport); + if (!t->spool_cnt && t->pport_cnt > 1) + printf("-%u", t->pport + t->pport_cnt - 1); + + if (t->raddr.s_addr) { + printf(" %s", inet_ntoa(t->raddr)); + if (t->rport) { + printf(":%u", t->rport); + if (!t->spool_cnt && t->rport_cnt > 1) + printf("-%u", t->rport + + t->rport_cnt - 1); + } + } + break; + case REDIR_PROTO: + p = getprotobynumber(t->proto); + printf(" redirect_proto %s %s", p->p_name, + inet_ntoa(t->laddr)); + if (t->paddr.s_addr != 0) { + printf(" %s", inet_ntoa(t->paddr)); + if (t->raddr.s_addr) + printf(" %s", inet_ntoa(t->raddr)); + } + break; + default: + errx(EX_DATAERR, "unknown redir mode"); + break; + } + } + printf("\n"); +} + +void +ipfw_config_nat(int ac, char **av) +{ + ipfw_obj_header *oh; + struct nat44_cfg_nat *n; /* Nat instance configuration. */ + int i, off, tok, ac1; + char *id, *buf, **av1, *end; + size_t len; + + av++; + ac--; + /* Nat id. 
*/ + if (ac == 0) + errx(EX_DATAERR, "missing nat id"); + id = *av; + i = (int)strtol(id, &end, 0); + if (i <= 0 || *end != '\0') + errx(EX_DATAERR, "illegal nat id: %s", id); + av++; + ac--; + if (ac == 0) + errx(EX_DATAERR, "missing option"); + + len = sizeof(*oh) + sizeof(*n); + ac1 = ac; + av1 = av; + while (ac1 > 0) { + tok = match_token(nat_params, *av1); + ac1--; + av1++; + switch (tok) { + case TOK_IP: + case TOK_IF: + ac1--; + av1++; + break; + case TOK_ALOG: + case TOK_DENY_INC: + case TOK_SAME_PORTS: + case TOK_SKIP_GLOBAL: + case TOK_UNREG_ONLY: + case TOK_RESET_ADDR: + case TOK_ALIAS_REV: + case TOK_PROXY_ONLY: + break; + case TOK_REDIR_ADDR: + if (ac1 < 2) + errx(EX_DATAERR, "redirect_addr: " + "not enough arguments"); + len += estimate_redir_addr(&ac1, &av1); + av1 += 2; + ac1 -= 2; + break; + case TOK_REDIR_PORT: + if (ac1 < 3) + errx(EX_DATAERR, "redirect_port: " + "not enough arguments"); + av1++; + ac1--; + len += estimate_redir_port(&ac1, &av1); + av1 += 2; + ac1 -= 2; + /* Skip optional remoteIP/port */ + if (ac1 != 0 && isdigit(**av1)) { + av1++; + ac1--; + } + break; + case TOK_REDIR_PROTO: + if (ac1 < 2) + errx(EX_DATAERR, "redirect_proto: " + "not enough arguments"); + len += sizeof(struct nat44_cfg_redir); + av1 += 2; + ac1 -= 2; + /* Skip optional remoteIP/port */ + if (ac1 != 0 && isdigit(**av1)) { + av1++; + ac1--; + } + if (ac1 != 0 && isdigit(**av1)) { + av1++; + ac1--; + } + break; + default: + errx(EX_DATAERR, "unrecognised option ``%s''", av1[-1]); + } + } + + if ((buf = malloc(len)) == NULL) + errx(EX_OSERR, "malloc failed"); + + /* Offset in buf: save space for header at the beginning. 
*/ + off = sizeof(*oh) + sizeof(*n); + memset(buf, 0, len); + oh = (ipfw_obj_header *)buf; + n = (struct nat44_cfg_nat *)(oh + 1); + oh->ntlv.head.length = sizeof(oh->ntlv); + snprintf(oh->ntlv.name, sizeof(oh->ntlv.name), "%d", i); + snprintf(n->name, sizeof(n->name), "%d", i); + + while (ac > 0) { + tok = match_token(nat_params, *av); + ac--; + av++; + switch (tok) { + case TOK_IP: + if (ac == 0) + errx(EX_DATAERR, "missing option"); + if (!inet_aton(av[0], &(n->ip))) + errx(EX_DATAERR, "bad ip address ``%s''", + av[0]); + ac--; + av++; + break; + case TOK_IF: + if (ac == 0) + errx(EX_DATAERR, "missing option"); + set_addr_dynamic(av[0], n); + ac--; + av++; + break; + case TOK_ALOG: + n->mode |= PKT_ALIAS_LOG; + break; + case TOK_DENY_INC: + n->mode |= PKT_ALIAS_DENY_INCOMING; + break; + case TOK_SAME_PORTS: + n->mode |= PKT_ALIAS_SAME_PORTS; + break; + case TOK_UNREG_ONLY: + n->mode |= PKT_ALIAS_UNREGISTERED_ONLY; + break; + case TOK_SKIP_GLOBAL: + n->mode |= PKT_ALIAS_SKIP_GLOBAL; + break; + case TOK_RESET_ADDR: + n->mode |= PKT_ALIAS_RESET_ON_ADDR_CHANGE; + break; + case TOK_ALIAS_REV: + n->mode |= PKT_ALIAS_REVERSE; + break; + case TOK_PROXY_ONLY: + n->mode |= PKT_ALIAS_PROXY_ONLY; + break; + /* + * All the setup_redir_* functions work directly in + * the final buffer, see above for details. + */ + case TOK_REDIR_ADDR: + case TOK_REDIR_PORT: + case TOK_REDIR_PROTO: + switch (tok) { + case TOK_REDIR_ADDR: + i = setup_redir_addr(&buf[off], &ac, &av); + break; + case TOK_REDIR_PORT: + i = setup_redir_port(&buf[off], &ac, &av); + break; + case TOK_REDIR_PROTO: + i = setup_redir_proto(&buf[off], &ac, &av); + break; + } + n->redir_cnt++; + off += i; + break; + } + } + + i = do_set3(IP_FW_NAT44_XCONFIG, &oh->opheader, len); + if (i != 0) + err(1, "setsockopt(%s)", "IP_FW_NAT44_XCONFIG"); + + if (!co.do_quiet) { + /* After every modification, we show the resultant rule. 
*/ + int _ac = 3; + const char *_av[] = {"show", "config", id}; + ipfw_show_nat(_ac, (char **)(void *)_av); + } +} + +struct nat_list_arg { + uint16_t cmd; + int is_all; +}; + +static int +nat_show_data(struct nat44_cfg_nat *cfg, void *arg) +{ + struct nat_list_arg *nla; + ipfw_obj_header *oh; + + nla = (struct nat_list_arg *)arg; + + switch (nla->cmd) { + case IP_FW_NAT44_XGETCONFIG: + if (nat_get_cmd(cfg->name, nla->cmd, &oh) != 0) { + warnx("Error getting nat instance %s info", cfg->name); + break; + } + nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); + free(oh); + break; + case IP_FW_NAT44_XGETLOG: + if (nat_get_cmd(cfg->name, nla->cmd, &oh) == 0) { + nat_show_log((struct nat44_cfg_nat *)(oh + 1), NULL); + free(oh); + break; + } + /* Handle error */ + if (nla->is_all != 0 && errno == ENOENT) + break; + warn("Error getting nat instance %s info", cfg->name); + break; + } + + return (0); +} + +/* + * Compare nat names. + * Honor number comparison. + */ +static int +natname_cmp(const void *a, const void *b) +{ + struct nat44_cfg_nat *ia, *ib; + + ia = (struct nat44_cfg_nat *)a; + ib = (struct nat44_cfg_nat *)b; + + return (stringnum_cmp(ia->name, ib->name)); +} + +/* + * Retrieves nat list from kernel, + * optionally sorts it and calls requested function for each table. + * Returns 0 on success. 
+ */
+static int
+nat_foreach(nat_cb_t *f, void *arg, int sort)
+{
+	ipfw_obj_lheader *olh;
+	struct nat44_cfg_nat *cfg;
+	size_t sz;
+	int i, saved_errno;
+
+	/* Start with reasonable default */
+	sz = sizeof(*olh) + 16 * sizeof(struct nat44_cfg_nat);
+
+	for (;;) {
+		if ((olh = calloc(1, sz)) == NULL)
+			return (ENOMEM);
+
+		olh->size = sz;
+		if (do_get3(IP_FW_NAT44_LIST_NAT, &olh->opheader, &sz) != 0) {
+			/*
+			 * Take what we need from the reply *before* freeing
+			 * it: the retry size used to be read from olh after
+			 * free(olh) (use after free), and free() is allowed
+			 * to change errno.
+			 * NOTE(review): olh->size is presumably rewritten
+			 * with the required size on ENOMEM -- confirm
+			 * against do_get3()/the kernel side.
+			 */
+			saved_errno = errno;
+			sz = olh->size;
+			free(olh);
+			if (saved_errno == ENOMEM)
+				continue;
+			return (saved_errno);
+		}
+
+		if (sort != 0)
+			qsort(olh + 1, olh->count, olh->objsize, natname_cmp);
+
+		/* Records are olh->objsize bytes apart, right after olh. */
+		cfg = (struct nat44_cfg_nat*)(olh + 1);
+		for (i = 0; i < olh->count; i++) {
+			(void)f(cfg, arg); /* Ignore errors for now */
+			cfg = (struct nat44_cfg_nat *)((caddr_t)cfg +
+			    olh->objsize);
+		}
+
+		free(olh);
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Sends request @cmd for nat instance @name and hands the reply back in a
+ * freshly allocated buffer.  When the request fails with ENOMEM it is
+ * retried with the size found in the reply's cfg->size.
+ *
+ * On success stores the buffer in @ooh (the caller frees it) and
+ * returns 0.  Otherwise returns an errno value and leaves @ooh untouched.
+ */
+static int
+nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh)
+{
+	ipfw_obj_header *oh;
+	struct nat44_cfg_nat *cfg;
+	size_t sz;
+	int saved_errno;
+
+	/* Start with reasonable default */
+	sz = sizeof(*oh) + sizeof(*cfg) + 128;
+
+	for (;;) {
+		if ((oh = calloc(1, sz)) == NULL)
+			return (ENOMEM);
+		cfg = (struct nat44_cfg_nat *)(oh + 1);
+		oh->ntlv.head.length = sizeof(oh->ntlv);
+		strlcpy(oh->ntlv.name, name, sizeof(oh->ntlv.name));
+		strlcpy(cfg->name, name, sizeof(cfg->name));
+
+		if (do_get3(cmd, &oh->opheader, &sz) != 0) {
+			/* free() may clobber errno; latch it first. */
+			saved_errno = errno;
+			sz = cfg->size;
+			free(oh);
+			if (saved_errno == ENOMEM)
+				continue;
+			return (saved_errno);
+		}
+
+		*ooh = oh;
+		break;
+	}
+
+	return (0);
+}
+
+void
+ipfw_show_nat(int ac, char **av)
+{
+	ipfw_obj_header *oh;
+	char *name;
+	int cmd;
+	struct nat_list_arg nla;
+
+	ac--;
+	av++;
+
+	if (co.test_only)
+		return;
+
+	/* Parse parameters.
*/ + cmd = 0; /* XXX: Change to IP_FW_NAT44_XGETLOG @ MFC */ + name = NULL; + for ( ; ac != 0; ac--, av++) { + if (!strncmp(av[0], "config", strlen(av[0]))) { + cmd = IP_FW_NAT44_XGETCONFIG; + continue; + } + if (strcmp(av[0], "log") == 0) { + cmd = IP_FW_NAT44_XGETLOG; + continue; + } + if (name != NULL) + err(EX_USAGE,"only one instance name may be specified"); + name = av[0]; + } + + if (cmd == 0) + errx(EX_USAGE, "Please specify action. Available: config,log"); + + if (name == NULL) { + memset(&nla, 0, sizeof(nla)); + nla.cmd = cmd; + nla.is_all = 1; + nat_foreach(nat_show_data, &nla, 1); + } else { + if (nat_get_cmd(name, cmd, &oh) != 0) + err(EX_OSERR, "Error getting nat %s instance info", name); + nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); + free(oh); + } +} + diff --git a/example/ipfw/ipfw/tables.c b/example/ipfw/ipfw/tables.c new file mode 100644 index 0000000..e75b59a --- /dev/null +++ b/example/ipfw/ipfw/tables.c @@ -0,0 +1,2013 @@ +/* + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * in-kernel ipfw tables support. 
+ * + * $FreeBSD: head/sbin/ipfw/tables.c 273241 2014-10-17 20:47:55Z melifaro $ + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> + +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip_fw.h> +#include <arpa/inet.h> + +#include "ipfw2.h" + +static void table_modify_record(ipfw_obj_header *oh, int ac, char *av[], + int add, int quiet, int update, int atomic); +static int table_flush(ipfw_obj_header *oh); +static int table_destroy(ipfw_obj_header *oh); +static int table_do_create(ipfw_obj_header *oh, ipfw_xtable_info *i); +static int table_do_modify(ipfw_obj_header *oh, ipfw_xtable_info *i); +static int table_do_swap(ipfw_obj_header *oh, char *second); +static void table_create(ipfw_obj_header *oh, int ac, char *av[]); +static void table_modify(ipfw_obj_header *oh, int ac, char *av[]); +static void table_lookup(ipfw_obj_header *oh, int ac, char *av[]); +static void table_lock(ipfw_obj_header *oh, int lock); +static int table_swap(ipfw_obj_header *oh, char *second); +static int table_get_info(ipfw_obj_header *oh, ipfw_xtable_info *i); +static int table_show_info(ipfw_xtable_info *i, void *arg); +static void table_fill_ntlv(ipfw_obj_ntlv *ntlv, char *name, uint32_t set, + uint16_t uidx); + +static int table_flush_one(ipfw_xtable_info *i, void *arg); +static int table_show_one(ipfw_xtable_info *i, void *arg); +static int table_do_get_list(ipfw_xtable_info *i, ipfw_obj_header **poh); +static void table_show_list(ipfw_obj_header *oh, int need_header); +static void table_show_entry(ipfw_xtable_info *i, ipfw_obj_tentry *tent); + +static void tentry_fill_key(ipfw_obj_header *oh, ipfw_obj_tentry *tent, + char *key, int add, uint8_t *ptype, uint32_t *pvmask, ipfw_xtable_info *xi); +static void tentry_fill_value(ipfw_obj_header *oh, 
ipfw_obj_tentry *tent, + char *arg, uint8_t type, uint32_t vmask); +static void table_show_value(char *buf, size_t bufsize, ipfw_table_value *v, + uint32_t vmask, int print_ip); + +typedef int (table_cb_t)(ipfw_xtable_info *i, void *arg); +static int tables_foreach(table_cb_t *f, void *arg, int sort); + +#ifndef s6_addr32 +#define s6_addr32 __u6_addr.__u6_addr32 +#endif + +static struct _s_x tabletypes[] = { + { "addr", IPFW_TABLE_ADDR }, + { "iface", IPFW_TABLE_INTERFACE }, + { "number", IPFW_TABLE_NUMBER }, + { "flow", IPFW_TABLE_FLOW }, + { NULL, 0 } +}; + +static struct _s_x tablevaltypes[] = { + { "skipto", IPFW_VTYPE_SKIPTO }, + { "pipe", IPFW_VTYPE_PIPE }, + { "fib", IPFW_VTYPE_FIB }, + { "nat", IPFW_VTYPE_NAT }, + { "dscp", IPFW_VTYPE_DSCP }, + { "tag", IPFW_VTYPE_TAG }, + { "divert", IPFW_VTYPE_DIVERT }, + { "netgraph", IPFW_VTYPE_NETGRAPH }, + { "limit", IPFW_VTYPE_LIMIT }, + { "ipv4", IPFW_VTYPE_NH4 }, + { "ipv6", IPFW_VTYPE_NH6 }, + { NULL, 0 } +}; + +static struct _s_x tablecmds[] = { + { "add", TOK_ADD }, + { "delete", TOK_DEL }, + { "create", TOK_CREATE }, + { "destroy", TOK_DESTROY }, + { "flush", TOK_FLUSH }, + { "modify", TOK_MODIFY }, + { "swap", TOK_SWAP }, + { "info", TOK_INFO }, + { "detail", TOK_DETAIL }, + { "list", TOK_LIST }, + { "lookup", TOK_LOOKUP }, + { "atomic", TOK_ATOMIC }, + { "lock", TOK_LOCK }, + { "unlock", TOK_UNLOCK }, + { NULL, 0 } +}; + +static int +lookup_host (char *host, struct in_addr *ipaddr) +{ + struct hostent *he; + + if (!inet_aton(host, ipaddr)) { + if ((he = gethostbyname(host)) == NULL) + return(-1); + *ipaddr = *(struct in_addr *)he->h_addr_list[0]; + } + return(0); +} + +static int +get_token(struct _s_x *table, char *string, char *errbase) +{ + int tcmd; + + if ((tcmd = match_token_relaxed(table, string)) < 0) + errx(EX_USAGE, "%s %s %s", + (tcmd == 0) ? "invalid" : "ambiguous", errbase, string); + + return (tcmd); +} + +/* + * This one handles all table-related commands + * ipfw table NAME create ... 
+ * ipfw table NAME modify ... + * ipfw table NAME destroy + * ipfw table NAME swap NAME + * ipfw table NAME lock + * ipfw table NAME unlock + * ipfw table NAME add addr[/masklen] [value] + * ipfw table NAME add [addr[/masklen] value] [addr[/masklen] value] .. + * ipfw table NAME delete addr[/masklen] [addr[/masklen]] .. + * ipfw table NAME lookup addr + * ipfw table {NAME | all} flush + * ipfw table {NAME | all} list + * ipfw table {NAME | all} info + * ipfw table {NAME | all} detail + */ +void +ipfw_table_handler(int ac, char *av[]) +{ + int do_add, is_all; + int atomic, error, tcmd; + ipfw_xtable_info i; + ipfw_obj_header oh; + char *tablename; + uint32_t set; + void *arg; + + memset(&oh, 0, sizeof(oh)); + is_all = 0; + if (co.use_set != 0) + set = co.use_set - 1; + else + set = 0; + + ac--; av++; + NEED1("table needs name"); + tablename = *av; + + if (table_check_name(tablename) == 0) { + table_fill_ntlv(&oh.ntlv, *av, set, 1); + oh.idx = 1; + } else { + if (strcmp(tablename, "all") == 0) + is_all = 1; + else + errx(EX_USAGE, "table name %s is invalid", tablename); + } + ac--; av++; + NEED1("table needs command"); + + tcmd = get_token(tablecmds, *av, "table command"); + /* Check if atomic operation was requested */ + atomic = 0; + if (tcmd == TOK_ATOMIC) { + ac--; av++; + NEED1("atomic needs command"); + tcmd = get_token(tablecmds, *av, "table command"); + switch (tcmd) { + case TOK_ADD: + break; + default: + errx(EX_USAGE, "atomic is not compatible with %s", *av); + } + atomic = 1; + } + + switch (tcmd) { + case TOK_LIST: + case TOK_INFO: + case TOK_DETAIL: + case TOK_FLUSH: + break; + default: + if (is_all != 0) + errx(EX_USAGE, "table name required"); + } + + switch (tcmd) { + case TOK_ADD: + case TOK_DEL: + do_add = **av == 'a'; + ac--; av++; + table_modify_record(&oh, ac, av, do_add, co.do_quiet, + co.do_quiet, atomic); + break; + case TOK_CREATE: + ac--; av++; + table_create(&oh, ac, av); + break; + case TOK_MODIFY: + ac--; av++; + table_modify(&oh, ac, 
av); + break; + case TOK_DESTROY: + if (table_destroy(&oh) != 0) + err(EX_OSERR, "failed to destroy table %s", tablename); + break; + case TOK_FLUSH: + if (is_all == 0) { + if ((error = table_flush(&oh)) != 0) + err(EX_OSERR, "failed to flush table %s info", + tablename); + } else { + error = tables_foreach(table_flush_one, &oh, 1); + if (error != 0) + err(EX_OSERR, "failed to flush tables list"); + } + break; + case TOK_SWAP: + ac--; av++; + NEED1("second table name required"); + table_swap(&oh, *av); + break; + case TOK_LOCK: + case TOK_UNLOCK: + table_lock(&oh, (tcmd == TOK_LOCK)); + break; + case TOK_DETAIL: + case TOK_INFO: + arg = (tcmd == TOK_DETAIL) ? (void *)1 : NULL; + if (is_all == 0) { + if ((error = table_get_info(&oh, &i)) != 0) + err(EX_OSERR, "failed to request table info"); + table_show_info(&i, arg); + } else { + error = tables_foreach(table_show_info, arg, 1); + if (error != 0) + err(EX_OSERR, "failed to request tables list"); + } + break; + case TOK_LIST: + if (is_all == 0) { + ipfw_xtable_info i; + if ((error = table_get_info(&oh, &i)) != 0) + err(EX_OSERR, "failed to request table info"); + table_show_one(&i, NULL); + } else { + error = tables_foreach(table_show_one, NULL, 1); + if (error != 0) + err(EX_OSERR, "failed to request tables list"); + } + break; + case TOK_LOOKUP: + ac--; av++; + table_lookup(&oh, ac, av); + break; + } +} + +static void +table_fill_ntlv(ipfw_obj_ntlv *ntlv, char *name, uint32_t set, uint16_t uidx) +{ + + ntlv->head.type = IPFW_TLV_TBL_NAME; + ntlv->head.length = sizeof(ipfw_obj_ntlv); + ntlv->idx = uidx; + ntlv->set = set; + strlcpy(ntlv->name, name, sizeof(ntlv->name)); +} + +static void +table_fill_objheader(ipfw_obj_header *oh, ipfw_xtable_info *i) +{ + + oh->idx = 1; + table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1); +} + +static struct _s_x tablenewcmds[] = { + { "type", TOK_TYPE }, + { "valtype", TOK_VALTYPE }, + { "algo", TOK_ALGO }, + { "limit", TOK_LIMIT }, + { "locked", TOK_LOCK }, + { NULL, 0 } +}; + 
+static struct _s_x flowtypecmds[] = { + { "src-ip", IPFW_TFFLAG_SRCIP }, + { "proto", IPFW_TFFLAG_PROTO }, + { "src-port", IPFW_TFFLAG_SRCPORT }, + { "dst-ip", IPFW_TFFLAG_DSTIP }, + { "dst-port", IPFW_TFFLAG_DSTPORT }, + { NULL, 0 } +}; + +int +table_parse_type(uint8_t ttype, char *p, uint8_t *tflags) +{ + uint32_t fset, fclear; + char *e; + + /* Parse type options */ + switch(ttype) { + case IPFW_TABLE_FLOW: + fset = fclear = 0; + if (fill_flags(flowtypecmds, p, &e, &fset, &fclear) != 0) + errx(EX_USAGE, + "unable to parse flow option %s", e); + *tflags = fset; + break; + default: + return (EX_USAGE); + } + + return (0); +} + +void +table_print_type(char *tbuf, size_t size, uint8_t type, uint8_t tflags) +{ + const char *tname; + int l; + + if ((tname = match_value(tabletypes, type)) == NULL) + tname = "unknown"; + + l = snprintf(tbuf, size, "%s", tname); + tbuf += l; + size -= l; + + switch(type) { + case IPFW_TABLE_FLOW: + if (tflags != 0) { + *tbuf++ = ':'; + l--; + print_flags_buffer(tbuf, size, flowtypecmds, tflags); + } + break; + } +} + +/* + * Creates new table + * + * ipfw table NAME create [ type { addr | iface | number | flow } ] + * [ algo algoname ] + */ +static void +table_create(ipfw_obj_header *oh, int ac, char *av[]) +{ + ipfw_xtable_info xi; + int error, tcmd, val; + uint32_t fset, fclear; + size_t sz; + char *e, *p; + char tbuf[128]; + + sz = sizeof(tbuf); + memset(&xi, 0, sizeof(xi)); + + while (ac > 0) { + tcmd = get_token(tablenewcmds, *av, "option"); + ac--; av++; + + switch (tcmd) { + case TOK_LIMIT: + NEED1("limit value required"); + xi.limit = strtol(*av, NULL, 10); + ac--; av++; + break; + case TOK_TYPE: + NEED1("table type required"); + /* Type may have suboptions after ':' */ + if ((p = strchr(*av, ':')) != NULL) + *p++ = '\0'; + val = match_token(tabletypes, *av); + if (val == -1) { + concat_tokens(tbuf, sizeof(tbuf), tabletypes, + ", "); + errx(EX_USAGE, + "Unknown tabletype: %s. 
Supported: %s", + *av, tbuf); + } + xi.type = val; + if (p != NULL) { + error = table_parse_type(val, p, &xi.tflags); + if (error != 0) + errx(EX_USAGE, + "Unsupported suboptions: %s", p); + } + ac--; av++; + break; + case TOK_VALTYPE: + NEED1("table value type required"); + fset = fclear = 0; + val = fill_flags(tablevaltypes, *av, &e, &fset, &fclear); + if (val != -1) { + xi.vmask = fset; + ac--; av++; + break; + } + concat_tokens(tbuf, sizeof(tbuf), tablevaltypes, ", "); + errx(EX_USAGE, "Unknown value type: %s. Supported: %s", + e, tbuf); + break; + case TOK_ALGO: + NEED1("table algorithm name required"); + if (strlen(*av) > sizeof(xi.algoname)) + errx(EX_USAGE, "algorithm name too long"); + strlcpy(xi.algoname, *av, sizeof(xi.algoname)); + ac--; av++; + break; + case TOK_LOCK: + xi.flags |= IPFW_TGFLAGS_LOCKED; + break; + } + } + + /* Set some defaults to preserve compability */ + if (xi.algoname[0] == '\0' && xi.type == 0) + xi.type = IPFW_TABLE_ADDR; + if (xi.vmask == 0) + xi.vmask = IPFW_VTYPE_LEGACY; + + if ((error = table_do_create(oh, &xi)) != 0) + err(EX_OSERR, "Table creation failed"); +} + +/* + * Creates new table + * + * Request: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success. 
+ */ +static int +table_do_create(ipfw_obj_header *oh, ipfw_xtable_info *i) +{ + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)]; + int error; + + memcpy(tbuf, oh, sizeof(*oh)); + memcpy(tbuf + sizeof(*oh), i, sizeof(*i)); + oh = (ipfw_obj_header *)tbuf; + + error = do_set3(IP_FW_TABLE_XCREATE, &oh->opheader, sizeof(tbuf)); + + return (error); +} + +/* + * Modifies existing table + * + * ipfw table NAME modify [ limit number ] + */ +static void +table_modify(ipfw_obj_header *oh, int ac, char *av[]) +{ + ipfw_xtable_info xi; + int tcmd; + size_t sz; + char tbuf[128]; + + sz = sizeof(tbuf); + memset(&xi, 0, sizeof(xi)); + + while (ac > 0) { + tcmd = get_token(tablenewcmds, *av, "option"); + ac--; av++; + + switch (tcmd) { + case TOK_LIMIT: + NEED1("limit value required"); + xi.limit = strtol(*av, NULL, 10); + xi.mflags |= IPFW_TMFLAGS_LIMIT; + ac--; av++; + break; + default: + errx(EX_USAGE, "cmd is not supported for modificatiob"); + } + } + + if (table_do_modify(oh, &xi) != 0) + err(EX_OSERR, "Table modification failed"); +} + +/* + * Modifies existing table. + * + * Request: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success. + */ +static int +table_do_modify(ipfw_obj_header *oh, ipfw_xtable_info *i) +{ + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)]; + int error; + + memcpy(tbuf, oh, sizeof(*oh)); + memcpy(tbuf + sizeof(*oh), i, sizeof(*i)); + oh = (ipfw_obj_header *)tbuf; + + error = do_set3(IP_FW_TABLE_XMODIFY, &oh->opheader, sizeof(tbuf)); + + return (error); +} + +/* + * Locks or unlocks given table + */ +static void +table_lock(ipfw_obj_header *oh, int lock) +{ + ipfw_xtable_info xi; + + memset(&xi, 0, sizeof(xi)); + + xi.mflags |= IPFW_TMFLAGS_LOCK; + xi.flags |= (lock != 0) ? IPFW_TGFLAGS_LOCKED : 0; + + if (table_do_modify(oh, &xi) != 0) + err(EX_OSERR, "Table %s failed", lock != 0 ? "lock" : "unlock"); +} + +/* + * Destroys given table specified by @oh->ntlv. + * Returns 0 on success. 
+ */ +static int +table_destroy(ipfw_obj_header *oh) +{ + + if (do_set3(IP_FW_TABLE_XDESTROY, &oh->opheader, sizeof(*oh)) != 0) + return (-1); + + return (0); +} + +/* + * Flushes given table specified by @oh->ntlv. + * Returns 0 on success. + */ +static int +table_flush(ipfw_obj_header *oh) +{ + + if (do_set3(IP_FW_TABLE_XFLUSH, &oh->opheader, sizeof(*oh)) != 0) + return (-1); + + return (0); +} + +static int +table_do_swap(ipfw_obj_header *oh, char *second) +{ + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ntlv)]; + int error; + + memset(tbuf, 0, sizeof(tbuf)); + memcpy(tbuf, oh, sizeof(*oh)); + oh = (ipfw_obj_header *)tbuf; + table_fill_ntlv((ipfw_obj_ntlv *)(oh + 1), second, oh->ntlv.set, 1); + + error = do_set3(IP_FW_TABLE_XSWAP, &oh->opheader, sizeof(tbuf)); + + return (error); +} + +/* + * Swaps given table with @second one. + */ +static int +table_swap(ipfw_obj_header *oh, char *second) +{ + int error; + + if (table_check_name(second) != 0) + errx(EX_USAGE, "table name %s is invalid", second); + + error = table_do_swap(oh, second); + + switch (error) { + case EINVAL: + errx(EX_USAGE, "Unable to swap table: check types"); + case EFBIG: + errx(EX_USAGE, "Unable to swap table: check limits"); + } + + return (0); +} + + +/* + * Retrieves table in given table specified by @oh->ntlv. + * it inside @i. + * Returns 0 on success. 
+ */ +static int +table_get_info(ipfw_obj_header *oh, ipfw_xtable_info *i) +{ + char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)]; + size_t sz; + + sz = sizeof(tbuf); + memset(tbuf, 0, sizeof(tbuf)); + memcpy(tbuf, oh, sizeof(*oh)); + oh = (ipfw_obj_header *)tbuf; + + if (do_get3(IP_FW_TABLE_XINFO, &oh->opheader, &sz) != 0) + return (errno); + + if (sz < sizeof(tbuf)) + return (EINVAL); + + *i = *(ipfw_xtable_info *)(oh + 1); + + return (0); +} + +static struct _s_x tablealgoclass[] = { + { "hash", IPFW_TACLASS_HASH }, + { "array", IPFW_TACLASS_ARRAY }, + { "radix", IPFW_TACLASS_RADIX }, + { NULL, 0 } +}; + +struct ta_cldata { + uint8_t taclass; + uint8_t spare4; + uint16_t itemsize; + uint16_t itemsize6; + uint32_t size; + uint32_t count; +}; + +/* + * Print global/per-AF table @i algorithm info. + */ +static void +table_show_tainfo(ipfw_xtable_info *i, struct ta_cldata *d, + const char *af, const char *taclass) +{ + + switch (d->taclass) { + case IPFW_TACLASS_HASH: + case IPFW_TACLASS_ARRAY: + printf(" %salgorithm %s info\n", af, taclass); + if (d->itemsize == d->itemsize6) + printf(" size: %u items: %u itemsize: %u\n", + d->size, d->count, d->itemsize); + else + printf(" size: %u items: %u " + "itemsize4: %u itemsize6: %u\n", + d->size, d->count, + d->itemsize, d->itemsize6); + break; + case IPFW_TACLASS_RADIX: + printf(" %salgorithm %s info\n", af, taclass); + if (d->itemsize == d->itemsize6) + printf(" items: %u itemsize: %u\n", + d->count, d->itemsize); + else + printf(" items: %u " + "itemsize4: %u itemsize6: %u\n", + d->count, d->itemsize, d->itemsize6); + break; + default: + printf(" algo class: %s\n", taclass); + } +} + +static void +table_print_valheader(char *buf, size_t bufsize, uint32_t vmask) +{ + + if (vmask == IPFW_VTYPE_LEGACY) { + snprintf(buf, bufsize, "legacy"); + return; + } + + print_flags_buffer(buf, bufsize, tablevaltypes, vmask); +} + +/* + * Prints table info struct @i in human-readable form. 
+ */ +static int +table_show_info(ipfw_xtable_info *i, void *arg) +{ + const char *vtype; + ipfw_ta_tinfo *tainfo; + int afdata, afitem; + struct ta_cldata d; + char ttype[64], tvtype[64]; + + table_print_type(ttype, sizeof(ttype), i->type, i->tflags); + table_print_valheader(tvtype, sizeof(tvtype), i->vmask); + + printf("--- table(%s), set(%u) ---\n", i->tablename, i->set); + if ((i->flags & IPFW_TGFLAGS_LOCKED) != 0) + printf(" kindex: %d, type: %s, locked\n", i->kidx, ttype); + else + printf(" kindex: %d, type: %s\n", i->kidx, ttype); + printf(" references: %u, valtype: %s\n", i->refcnt, tvtype); + printf(" algorithm: %s\n", i->algoname); + printf(" items: %u, size: %u\n", i->count, i->size); + if (i->limit > 0) + printf(" limit: %u\n", i->limit); + + /* Print algo-specific info if requested & set */ + if (arg == NULL) + return (0); + + if ((i->ta_info.flags & IPFW_TATFLAGS_DATA) == 0) + return (0); + tainfo = &i->ta_info; + + afdata = 0; + afitem = 0; + if (tainfo->flags & IPFW_TATFLAGS_AFDATA) + afdata = 1; + if (tainfo->flags & IPFW_TATFLAGS_AFITEM) + afitem = 1; + + memset(&d, 0, sizeof(d)); + d.taclass = tainfo->taclass4; + d.size = tainfo->size4; + d.count = tainfo->count4; + d.itemsize = tainfo->itemsize4; + if (afdata == 0 && afitem != 0) + d.itemsize6 = tainfo->itemsize6; + else + d.itemsize6 = d.itemsize; + if ((vtype = match_value(tablealgoclass, d.taclass)) == NULL) + vtype = "unknown"; + + if (afdata == 0) { + table_show_tainfo(i, &d, "", vtype); + } else { + table_show_tainfo(i, &d, "IPv4 ", vtype); + memset(&d, 0, sizeof(d)); + d.taclass = tainfo->taclass6; + if ((vtype = match_value(tablealgoclass, d.taclass)) == NULL) + vtype = "unknown"; + d.size = tainfo->size6; + d.count = tainfo->count6; + d.itemsize = tainfo->itemsize6; + d.itemsize6 = d.itemsize; + table_show_tainfo(i, &d, "IPv6 ", vtype); + } + + return (0); +} + + +/* + * Function wrappers which can be used either + * as is or as foreach function parameter. 
+ */ + +static int +table_show_one(ipfw_xtable_info *i, void *arg) +{ + ipfw_obj_header *oh = NULL; // XXX uninitialized + int error; + + if ((error = table_do_get_list(i, &oh)) != 0) { + err(EX_OSERR, "Error requesting table %s list", i->tablename); + return (error); + } + + table_show_list(oh, 1); + + free(oh); + return (0); +} + +static int +table_flush_one(ipfw_xtable_info *i, void *arg) +{ + ipfw_obj_header *oh; + + oh = (ipfw_obj_header *)arg; + + table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1); + + return (table_flush(oh)); +} + +static int +table_do_modify_record(int cmd, ipfw_obj_header *oh, + ipfw_obj_tentry *tent, int count, int atomic) +{ + ipfw_obj_ctlv *ctlv; + ipfw_obj_tentry *tent_base; + caddr_t pbuf; + char xbuf[sizeof(*oh) + sizeof(ipfw_obj_ctlv) + sizeof(*tent)]; + int error, i; + size_t sz; + + sz = sizeof(*ctlv) + sizeof(*tent) * count; + if (count == 1) { + memset(xbuf, 0, sizeof(xbuf)); + pbuf = xbuf; + } else { + if ((pbuf = calloc(1, sizeof(*oh) + sz)) == NULL) + return (ENOMEM); + } + + memcpy(pbuf, oh, sizeof(*oh)); + oh = (ipfw_obj_header *)pbuf; + oh->opheader.version = 1; + + ctlv = (ipfw_obj_ctlv *)(oh + 1); + ctlv->count = count; + ctlv->head.length = sz; + if (atomic != 0) + ctlv->flags |= IPFW_CTF_ATOMIC; + + tent_base = tent; + memcpy(ctlv + 1, tent, sizeof(*tent) * count); + tent = (ipfw_obj_tentry *)(ctlv + 1); + for (i = 0; i < count; i++, tent++) { + tent->head.length = sizeof(ipfw_obj_tentry); + tent->idx = oh->idx; + } + + sz += sizeof(*oh); + error = do_get3(cmd, &oh->opheader, &sz); + tent = (ipfw_obj_tentry *)(ctlv + 1); + /* Copy result back to provided buffer */ + memcpy(tent_base, ctlv + 1, sizeof(*tent) * count); + + if (pbuf != xbuf) + free(pbuf); + + return (error); +} + +static void +table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add, + int quiet, int update, int atomic) +{ + ipfw_obj_tentry *ptent, tent, *tent_buf; + ipfw_xtable_info xi; + uint8_t type; + uint32_t vmask; + int cmd, count, 
error, i, ignored; + char *texterr, *etxt, *px; + + if (ac == 0) + errx(EX_USAGE, "address required"); + + if (add != 0) { + cmd = IP_FW_TABLE_XADD; + texterr = "Adding record failed"; + } else { + cmd = IP_FW_TABLE_XDEL; + texterr = "Deleting record failed"; + } + + /* + * Calculate number of entries: + * Assume [key val] x N for add + * and + * key x N for delete + */ + count = (add != 0) ? ac / 2 + 1 : ac; + + if (count <= 1) { + /* Adding single entry with/without value */ + memset(&tent, 0, sizeof(tent)); + tent_buf = &tent; + } else { + + if ((tent_buf = calloc(count, sizeof(tent))) == NULL) + errx(EX_OSERR, + "Unable to allocate memory for all entries"); + } + ptent = tent_buf; + + memset(&xi, 0, sizeof(xi)); + count = 0; + while (ac > 0) { + tentry_fill_key(oh, ptent, *av, add, &type, &vmask, &xi); + + /* + * compability layer: auto-create table if not exists + */ + if (xi.tablename[0] == '\0') { + xi.type = type; + xi.vmask = vmask; + strlcpy(xi.tablename, oh->ntlv.name, + sizeof(xi.tablename)); + fprintf(stderr, "DEPRECATED: inserting data info " + "non-existent table %s. (auto-created)\n", + xi.tablename); + table_do_create(oh, &xi); + } + + oh->ntlv.type = type; + ac--; av++; + + if (add != 0 && ac > 0) { + tentry_fill_value(oh, ptent, *av, type, vmask); + ac--; av++; + } + + if (update != 0) + ptent->head.flags |= IPFW_TF_UPDATE; + + count++; + ptent++; + } + + error = table_do_modify_record(cmd, oh, tent_buf, count, atomic); + + quiet = 0; + + /* + * Compatibility stuff: do not yell on duplicate keys or + * failed deletions. 
+ */ + if (error == 0 || (error == EEXIST && add != 0) || + (error == ENOENT && add == 0)) { + if (quiet != 0) { + if (tent_buf != &tent) + free(tent_buf); + return; + } + } + + /* Report results back */ + ptent = tent_buf; + for (i = 0; i < count; ptent++, i++) { + ignored = 0; + switch (ptent->result) { + case IPFW_TR_ADDED: + px = "added"; + break; + case IPFW_TR_DELETED: + px = "deleted"; + break; + case IPFW_TR_UPDATED: + px = "updated"; + break; + case IPFW_TR_LIMIT: + px = "limit"; + ignored = 1; + break; + case IPFW_TR_ERROR: + px = "error"; + ignored = 1; + break; + case IPFW_TR_NOTFOUND: + px = "notfound"; + ignored = 1; + break; + case IPFW_TR_EXISTS: + px = "exists"; + ignored = 1; + break; + case IPFW_TR_IGNORED: + px = "ignored"; + ignored = 1; + break; + default: + px = "unknown"; + ignored = 1; + } + + if (error != 0 && atomic != 0 && ignored == 0) + printf("%s(reverted): ", px); + else + printf("%s: ", px); + + table_show_entry(&xi, ptent); + } + + if (tent_buf != &tent) + free(tent_buf); + + if (error == 0) + return; + /* Get real OS error */ + error = errno; + + /* Try to provide more human-readable error */ + switch (error) { + case EEXIST: + etxt = "record already exists"; + break; + case EFBIG: + etxt = "limit hit"; + break; + case ESRCH: + etxt = "table not found"; + break; + case ENOENT: + etxt = "record not found"; + break; + case EACCES: + etxt = "table is locked"; + break; + default: + etxt = strerror(error); + } + + errx(EX_OSERR, "%s: %s", texterr, etxt); +} + +static int +table_do_lookup(ipfw_obj_header *oh, char *key, ipfw_xtable_info *xi, + ipfw_obj_tentry *xtent) +{ + char xbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_obj_tentry)]; + ipfw_obj_tentry *tent; + uint8_t type; + uint32_t vmask; + size_t sz; + + memcpy(xbuf, oh, sizeof(*oh)); + oh = (ipfw_obj_header *)xbuf; + tent = (ipfw_obj_tentry *)(oh + 1); + + memset(tent, 0, sizeof(*tent)); + tent->head.length = sizeof(*tent); + tent->idx = 1; + + tentry_fill_key(oh, tent, key, 0, 
&type, &vmask, xi); + oh->ntlv.type = type; + + sz = sizeof(xbuf); + if (do_get3(IP_FW_TABLE_XFIND, &oh->opheader, &sz) != 0) + return (errno); + + if (sz < sizeof(xbuf)) + return (EINVAL); + + *xtent = *tent; + + return (0); +} + +static void +table_lookup(ipfw_obj_header *oh, int ac, char *av[]) +{ + ipfw_obj_tentry xtent; + ipfw_xtable_info xi; + char key[64]; + int error; + + if (ac == 0) + errx(EX_USAGE, "address required"); + + strlcpy(key, *av, sizeof(key)); + + memset(&xi, 0, sizeof(xi)); + error = table_do_lookup(oh, key, &xi, &xtent); + + switch (error) { + case 0: + break; + case ESRCH: + errx(EX_UNAVAILABLE, "Table %s not found", oh->ntlv.name); + case ENOENT: + errx(EX_UNAVAILABLE, "Entry %s not found", *av); + case ENOTSUP: + errx(EX_UNAVAILABLE, "Table %s algo does not support " + "\"lookup\" method", oh->ntlv.name); + default: + err(EX_OSERR, "getsockopt(IP_FW_TABLE_XFIND)"); + } + + table_show_entry(&xi, &xtent); +} + +static void +tentry_fill_key_type(char *arg, ipfw_obj_tentry *tentry, uint8_t type, + uint8_t tflags) +{ + char *p, *pp; + int mask, af; + struct in6_addr *paddr, tmp; + struct tflow_entry *tfe; + uint32_t key, *pkey; + uint16_t port; + struct protoent *pent; + struct servent *sent; + int masklen; + + mask = 0; // XXX uninitialized ? + masklen = 0; + af = 0; + paddr = (struct in6_addr *)&tentry->k; + + switch (type) { + case IPFW_TABLE_ADDR: + /* Remove / if exists */ + if ((p = strchr(arg, '/')) != NULL) { + *p = '\0'; + mask = atoi(p + 1); + } + + if (inet_pton(AF_INET, arg, paddr) == 1) { + if (p != NULL && mask > 32) + errx(EX_DATAERR, "bad IPv4 mask width: %s", + p + 1); + + masklen = p ? mask : 32; + af = AF_INET; + } else if (inet_pton(AF_INET6, arg, paddr) == 1) { + if (IN6_IS_ADDR_V4COMPAT(paddr)) + errx(EX_DATAERR, + "Use IPv4 instead of v4-compatible"); + if (p != NULL && mask > 128) + errx(EX_DATAERR, "bad IPv6 mask width: %s", + p + 1); + + masklen = p ? 
mask : 128; + af = AF_INET6; + } else { + /* Assume FQDN */ + if (lookup_host(arg, (struct in_addr *)paddr) != 0) + errx(EX_NOHOST, "hostname ``%s'' unknown", arg); + + masklen = 32; + type = IPFW_TABLE_ADDR; + af = AF_INET; + } + break; + case IPFW_TABLE_INTERFACE: + /* Assume interface name. Copy significant data only */ + mask = MIN(strlen(arg), IF_NAMESIZE - 1); + memcpy(paddr, arg, mask); + /* Set mask to exact match */ + masklen = 8 * IF_NAMESIZE; + break; + case IPFW_TABLE_NUMBER: + /* Port or any other key */ + key = strtol(arg, &p, 10); + if (*p != '\0') + errx(EX_DATAERR, "Invalid number: %s", arg); + + pkey = (uint32_t *)paddr; + *pkey = key; + masklen = 32; + break; + case IPFW_TABLE_FLOW: + /* Assume [src-ip][,proto][,src-port][,dst-ip][,dst-port] */ + tfe = &tentry->k.flow; + af = 0; + + /* Handle <ipv4|ipv6> */ + if ((tflags & IPFW_TFFLAG_SRCIP) != 0) { + if ((p = strchr(arg, ',')) != NULL) + *p++ = '\0'; + /* Determine family using temporary storage */ + if (inet_pton(AF_INET, arg, &tmp) == 1) { + if (af != 0 && af != AF_INET) + errx(EX_DATAERR, + "Inconsistent address family\n"); + af = AF_INET; + memcpy(&tfe->a.a4.sip, &tmp, 4); + } else if (inet_pton(AF_INET6, arg, &tmp) == 1) { + if (af != 0 && af != AF_INET6) + errx(EX_DATAERR, + "Inconsistent address family\n"); + af = AF_INET6; + memcpy(&tfe->a.a6.sip6, &tmp, 16); + } + + arg = p; + } + + /* Handle <proto-num|proto-name> */ + if ((tflags & IPFW_TFFLAG_PROTO) != 0) { + if (arg == NULL) + errx(EX_DATAERR, "invalid key: proto missing"); + if ((p = strchr(arg, ',')) != NULL) + *p++ = '\0'; + + key = strtol(arg, &pp, 10); + if (*pp != '\0') { + if ((pent = getprotobyname(arg)) == NULL) + errx(EX_DATAERR, "Unknown proto: %s", + arg); + else + key = pent->p_proto; + } + + if (key > 255) + errx(EX_DATAERR, "Bad protocol number: %u",key); + + tfe->proto = key; + + arg = p; + } + + /* Handle <port-num|service-name> */ + if ((tflags & IPFW_TFFLAG_SRCPORT) != 0) { + if (arg == NULL) + errx(EX_DATAERR, 
"invalid key: src port missing"); + if ((p = strchr(arg, ',')) != NULL) + *p++ = '\0'; + + if ((port = htons(strtol(arg, NULL, 10))) == 0) { + if ((sent = getservbyname(arg, NULL)) == NULL) + errx(EX_DATAERR, "Unknown service: %s", + arg); + else + key = sent->s_port; + } + + tfe->sport = port; + + arg = p; + } + + /* Handle <ipv4|ipv6>*/ + if ((tflags & IPFW_TFFLAG_DSTIP) != 0) { + if (arg == NULL) + errx(EX_DATAERR, "invalid key: dst ip missing"); + if ((p = strchr(arg, ',')) != NULL) + *p++ = '\0'; + /* Determine family using temporary storage */ + if (inet_pton(AF_INET, arg, &tmp) == 1) { + if (af != 0 && af != AF_INET) + errx(EX_DATAERR, + "Inconsistent address family"); + af = AF_INET; + memcpy(&tfe->a.a4.dip, &tmp, 4); + } else if (inet_pton(AF_INET6, arg, &tmp) == 1) { + if (af != 0 && af != AF_INET6) + errx(EX_DATAERR, + "Inconsistent address family"); + af = AF_INET6; + memcpy(&tfe->a.a6.dip6, &tmp, 16); + } + + arg = p; + } + + /* Handle <port-num|service-name> */ + if ((tflags & IPFW_TFFLAG_DSTPORT) != 0) { + if (arg == NULL) + errx(EX_DATAERR, "invalid key: dst port missing"); + if ((p = strchr(arg, ',')) != NULL) + *p++ = '\0'; + + if ((port = htons(strtol(arg, NULL, 10))) == 0) { + if ((sent = getservbyname(arg, NULL)) == NULL) + errx(EX_DATAERR, "Unknown service: %s", + arg); + else + key = sent->s_port; + } + + tfe->dport = port; + + arg = p; + } + + tfe->af = af; + + break; + + default: + errx(EX_DATAERR, "Unsupported table type: %d", type); + } + + tentry->subtype = af; + tentry->masklen = masklen; +} + +static void +tentry_fill_key(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *key, + int add, uint8_t *ptype, uint32_t *pvmask, ipfw_xtable_info *xi) +{ + uint8_t type, tflags; + uint32_t vmask; + int error; + char *del; + + type = 0; + tflags = 0; + vmask = 0; + + if (xi->tablename[0] == '\0') + error = table_get_info(oh, xi); + else + error = 0; + + if (error == 0) { + /* Table found. 
*/ + type = xi->type; + tflags = xi->tflags; + vmask = xi->vmask; + } else { + if (error != ESRCH) + errx(EX_OSERR, "Error requesting table %s info", + oh->ntlv.name); + if (add == 0) + errx(EX_DATAERR, "Table %s does not exist", + oh->ntlv.name); + /* + * Table does not exist. + * Compability layer: try to interpret data as ADDR + * before failing. + */ + if ((del = strchr(key, '/')) != NULL) + *del = '\0'; + if (inet_pton(AF_INET, key, &tent->k.addr6) == 1 || + inet_pton(AF_INET6, key, &tent->k.addr6) == 1) { + /* OK Prepare and send */ + type = IPFW_TABLE_ADDR; + vmask = IPFW_VTYPE_LEGACY; + } else { + /* Inknown key */ + errx(EX_USAGE, "Table %s does not exist, cannot guess " + "key '%s' type", oh->ntlv.name, key); + } + if (del != NULL) + *del = '/'; + } + + tentry_fill_key_type(key, tent, type, tflags); + + *ptype = type; + *pvmask = vmask; +} + +static void +set_legacy_value(uint32_t val, ipfw_table_value *v) +{ + v->tag = val; + v->pipe = val; + v->divert = val; + v->skipto = val; + v->netgraph = val; + v->fib = val; + v->nat = val; + v->nh4 = val; + v->dscp = (uint8_t)val; + v->limit = val; +} + +static void +tentry_fill_value(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *arg, + uint8_t type, uint32_t vmask) +{ + uint32_t a4, flag, val, vm; + ipfw_table_value *v; + uint32_t i; + int dval; + char *comma, *e, *etype, *n, *p; + + v = &tent->v.value; + vm = vmask; + + /* Compat layer: keep old behavior for legacy value types */ + if (vmask == IPFW_VTYPE_LEGACY) { + /* Try to interpret as number first */ + val = strtoul(arg, &p, 0); + if (*p == '\0') { + set_legacy_value(val, v); + return; + } + if (inet_pton(AF_INET, arg, &val) == 1) { + set_legacy_value(ntohl(val), v); + return; + } + /* Try hostname */ + if (lookup_host(arg, (struct in_addr *)&val) == 0) { + set_legacy_value(val, v); + return; + } + errx(EX_OSERR, "Unable to parse value %s", arg); + } + + /* + * Shorthands: handle single value if vmask consists + * of numbers only. 
e.g.: + * vmask = "fib,skipto" -> treat input "1" as "1,1" + */ + + n = arg; + etype = NULL; + for (i = 1; i < (1 << 31); i *= 2) { + if ((flag = (vmask & i)) == 0) + continue; + vmask &= ~flag; + + if ((comma = strchr(n, ',')) != NULL) + *comma = '\0'; + + switch (flag) { + case IPFW_VTYPE_TAG: + v->tag = strtol(n, &e, 10); + if (*e != '\0') + etype = "tag"; + break; + case IPFW_VTYPE_PIPE: + v->pipe = strtol(n, &e, 10); + if (*e != '\0') + etype = "pipe"; + break; + case IPFW_VTYPE_DIVERT: + v->divert = strtol(n, &e, 10); + if (*e != '\0') + etype = "divert"; + break; + case IPFW_VTYPE_SKIPTO: + v->skipto = strtol(n, &e, 10); + if (*e != '\0') + etype = "skipto"; + break; + case IPFW_VTYPE_NETGRAPH: + v->netgraph = strtol(n, &e, 10); + if (*e != '\0') + etype = "netgraph"; + break; + case IPFW_VTYPE_FIB: + v->fib = strtol(n, &e, 10); + if (*e != '\0') + etype = "fib"; + break; + case IPFW_VTYPE_NAT: + v->nat = strtol(n, &e, 10); + if (*e != '\0') + etype = "nat"; + break; + case IPFW_VTYPE_LIMIT: + v->limit = strtol(n, &e, 10); + if (*e != '\0') + etype = "limit"; + break; + case IPFW_VTYPE_NH4: + if (strchr(n, '.') != NULL && + inet_pton(AF_INET, n, &a4) == 1) { + v->nh4 = ntohl(a4); + break; + } + if (lookup_host(n, (struct in_addr *)&v->nh4) == 0) + break; + etype = "ipv4"; + break; + case IPFW_VTYPE_DSCP: + if (isalpha(*n)) { + if ((dval = match_token(f_ipdscp, n)) != -1) { + v->dscp = dval; + break; + } else + etype = "DSCP code"; + } else { + v->dscp = strtol(n, &e, 10); + if (v->dscp > 63 || *e != '\0') + etype = "DSCP value"; + } + break; + case IPFW_VTYPE_NH6: + if (strchr(n, ':') != NULL && + inet_pton(AF_INET6, n, &v->nh6) == 1) + break; + etype = "ipv6"; + break; + } + + if (etype != NULL) + errx(EX_USAGE, "Unable to parse %s as %s", n, etype); + + if (comma != NULL) + *comma++ = ','; + + if ((n = comma) != NULL) + continue; + + /* End of input. 
*/ + if (vmask != 0) + errx(EX_USAGE, "Not enough fields inside value"); + } +} + +/* + * Compare table names. + * Honor number comparison. + */ +static int +tablename_cmp(const void *a, const void *b) +{ + ipfw_xtable_info *ia, *ib; + + ia = (ipfw_xtable_info *)a; + ib = (ipfw_xtable_info *)b; + + return (stringnum_cmp(ia->tablename, ib->tablename)); +} + +/* + * Retrieves table list from kernel, + * optionally sorts it and calls requested function for each table. + * Returns 0 on success. + */ +static int +tables_foreach(table_cb_t *f, void *arg, int sort) +{ + ipfw_obj_lheader *olh; + ipfw_xtable_info *info; + size_t sz; + int i, error; + + /* Start with reasonable default */ + sz = sizeof(*olh) + 16 * sizeof(ipfw_xtable_info); + + for (;;) { + if ((olh = calloc(1, sz)) == NULL) + return (ENOMEM); + + olh->size = sz; + if (do_get3(IP_FW_TABLES_XLIST, &olh->opheader, &sz) != 0) { + sz = olh->size; + free(olh); + if (errno != ENOMEM) + return (errno); + continue; + } + + if (sort != 0) + qsort(olh + 1, olh->count, olh->objsize, tablename_cmp); + + info = (ipfw_xtable_info *)(olh + 1); + for (i = 0; i < olh->count; i++) { + error = f(info, arg); /* Ignore errors for now */ + info = (ipfw_xtable_info *)((caddr_t)info + olh->objsize); + } + + free(olh); + break; + } + + return (0); +} + + +/* + * Retrieves all entries for given table @i in + * eXtended format. Allocate buffer large enough + * to store result. Called needs to free it later. + * + * Returns 0 on success. 
+ */ +static int +table_do_get_list(ipfw_xtable_info *i, ipfw_obj_header **poh) +{ + ipfw_obj_header *oh; + size_t sz; + int c; + + sz = 0; + oh = NULL; + for (c = 0; c < 8; c++) { + if (sz < i->size) + sz = i->size + 44; + if (oh != NULL) + free(oh); + if ((oh = calloc(1, sz)) == NULL) + continue; + table_fill_objheader(oh, i); + oh->opheader.version = 1; /* Current version */ + if (do_get3(IP_FW_TABLE_XLIST, &oh->opheader, &sz) == 0) { + *poh = oh; + return (0); + } + + if (errno != ENOMEM) + break; + } + free(oh); + + return (errno); +} + +/* + * Shows all entries from @oh in human-readable format + */ +static void +table_show_list(ipfw_obj_header *oh, int need_header) +{ + ipfw_obj_tentry *tent; + uint32_t count; + ipfw_xtable_info *i; + + i = (ipfw_xtable_info *)(oh + 1); + tent = (ipfw_obj_tentry *)(i + 1); + + if (need_header) + printf("--- table(%s), set(%u) ---\n", i->tablename, i->set); + + count = i->count; + while (count > 0) { + table_show_entry(i, tent); + tent = (ipfw_obj_tentry *)((caddr_t)tent + tent->head.length); + count--; + } +} + +static void +table_show_value(char *buf, size_t bufsize, ipfw_table_value *v, + uint32_t vmask, int print_ip) +{ + uint32_t flag, i, l; + size_t sz; + struct in_addr a4; + char abuf[INET6_ADDRSTRLEN]; + + sz = bufsize; + + /* + * Some shorthands for printing values: + * legacy assumes all values are equal, so keep the first one. 
+ */ + if (vmask == IPFW_VTYPE_LEGACY) { + if (print_ip != 0) { + flag = htonl(v->tag); + inet_ntop(AF_INET, &flag, buf, sz); + } else + snprintf(buf, sz, "%u", v->tag); + return; + } + + for (i = 1; i < (1 << 31); i *= 2) { + if ((flag = (vmask & i)) == 0) + continue; + l = 0; + + switch (flag) { + case IPFW_VTYPE_TAG: + l = snprintf(buf, sz, "%u,", v->tag); + break; + case IPFW_VTYPE_PIPE: + l = snprintf(buf, sz, "%u,", v->pipe); + break; + case IPFW_VTYPE_DIVERT: + l = snprintf(buf, sz, "%d,", v->divert); + break; + case IPFW_VTYPE_SKIPTO: + l = snprintf(buf, sz, "%d,", v->skipto); + break; + case IPFW_VTYPE_NETGRAPH: + l = snprintf(buf, sz, "%u,", v->netgraph); + break; + case IPFW_VTYPE_FIB: + l = snprintf(buf, sz, "%u,", v->fib); + break; + case IPFW_VTYPE_NAT: + l = snprintf(buf, sz, "%u,", v->nat); + break; + case IPFW_VTYPE_LIMIT: + l = snprintf(buf, sz, "%u,", v->limit); + break; + case IPFW_VTYPE_NH4: + a4.s_addr = htonl(v->nh4); + inet_ntop(AF_INET, &a4, abuf, sizeof(abuf)); + l = snprintf(buf, sz, "%s,", abuf); + break; + case IPFW_VTYPE_DSCP: + l = snprintf(buf, sz, "%d,", v->dscp); + break; + case IPFW_VTYPE_NH6: + inet_ntop(AF_INET6, &v->nh6, abuf, sizeof(abuf)); + l = snprintf(buf, sz, "%s,", abuf); + break; + } + + buf += l; + sz -= l; + } + + if (sz != bufsize) + *(buf - 1) = '\0'; +} + +static void +table_show_entry(ipfw_xtable_info *i, ipfw_obj_tentry *tent) +{ + char *comma, tbuf[128], pval[128]; + void *paddr; + struct tflow_entry *tfe; + + table_show_value(pval, sizeof(pval), &tent->v.value, i->vmask, + co.do_value_as_ip); + + switch (i->type) { + case IPFW_TABLE_ADDR: + /* IPv4 or IPv6 prefixes */ + inet_ntop(tent->subtype, &tent->k, tbuf, sizeof(tbuf)); + printf("%s/%u %s\n", tbuf, tent->masklen, pval); + break; + case IPFW_TABLE_INTERFACE: + /* Interface names */ + printf("%s %s\n", tent->k.iface, pval); + break; + case IPFW_TABLE_NUMBER: + /* numbers */ + printf("%u %s\n", tent->k.key, pval); + break; + case IPFW_TABLE_FLOW: + /* flows 
*/ + tfe = &tent->k.flow; + comma = ""; + + if ((i->tflags & IPFW_TFFLAG_SRCIP) != 0) { + if (tfe->af == AF_INET) + paddr = &tfe->a.a4.sip; + else + paddr = &tfe->a.a6.sip6; + + inet_ntop(tfe->af, paddr, tbuf, sizeof(tbuf)); + printf("%s%s", comma, tbuf); + comma = ","; + } + + if ((i->tflags & IPFW_TFFLAG_PROTO) != 0) { + printf("%s%d", comma, tfe->proto); + comma = ","; + } + + if ((i->tflags & IPFW_TFFLAG_SRCPORT) != 0) { + printf("%s%d", comma, ntohs(tfe->sport)); + comma = ","; + } + if ((i->tflags & IPFW_TFFLAG_DSTIP) != 0) { + if (tfe->af == AF_INET) + paddr = &tfe->a.a4.dip; + else + paddr = &tfe->a.a6.dip6; + + inet_ntop(tfe->af, paddr, tbuf, sizeof(tbuf)); + printf("%s%s", comma, tbuf); + comma = ","; + } + + if ((i->tflags & IPFW_TFFLAG_DSTPORT) != 0) { + printf("%s%d", comma, ntohs(tfe->dport)); + comma = ","; + } + + printf(" %s\n", pval); + } +} + +static int +table_do_get_stdlist(uint16_t opcode, ipfw_obj_lheader **polh) +{ + ipfw_obj_lheader req, *olh; + size_t sz; + + memset(&req, 0, sizeof(req)); + sz = sizeof(req); + + if (do_get3(opcode, &req.opheader, &sz) != 0) + if (errno != ENOMEM) + return (errno); + + sz = req.size; + if ((olh = calloc(1, sz)) == NULL) + return (ENOMEM); + + olh->size = sz; + if (do_get3(opcode, &olh->opheader, &sz) != 0) { + free(olh); + return (errno); + } + + *polh = olh; + return (0); +} + +static int +table_do_get_algolist(ipfw_obj_lheader **polh) +{ + + return (table_do_get_stdlist(IP_FW_TABLES_ALIST, polh)); +} + +static int +table_do_get_vlist(ipfw_obj_lheader **polh) +{ + + return (table_do_get_stdlist(IP_FW_TABLE_VLIST, polh)); +} + +void +ipfw_list_ta(int ac, char *av[]) +{ + ipfw_obj_lheader *olh; + ipfw_ta_info *info; + int error, i; + const char *atype; + + error = table_do_get_algolist(&olh); + if (error != 0) + err(EX_OSERR, "Unable to request algorithm list"); + + info = (ipfw_ta_info *)(olh + 1); + for (i = 0; i < olh->count; i++) { + if ((atype = match_value(tabletypes, info->type)) == NULL) + atype = 
"unknown"; + printf("--- %s ---\n", info->algoname); + printf(" type: %s\n refcount: %u\n", atype, info->refcnt); + + info = (ipfw_ta_info *)((caddr_t)info + olh->objsize); + } + + free(olh); +} + + +/* Copy of current kernel table_value structure */ +struct _table_value { + uint32_t tag; /* O_TAG/O_TAGGED */ + uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t divert; /* O_DIVERT/O_TEE */ + uint16_t skipto; /* skipto, CALLRET */ + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ + uint32_t fib; /* O_SETFIB */ + uint32_t nat; /* O_NAT */ + uint32_t nh4; + uint8_t dscp; + uint8_t spare0[3]; + /* -- 32 bytes -- */ + struct in6_addr nh6; + uint32_t limit; /* O_LIMIT */ + uint32_t spare1; + uint64_t refcnt; /* Number of references */ +}; + +int +compare_values(const void *_a, const void *_b) +{ + struct _table_value *a, *b; + + a = (struct _table_value *)_a; + b = (struct _table_value *)_b; + + if (a->spare1 < b->spare1) + return (-1); + else if (a->spare1 > b->spare1) + return (1); + + return (0); +} + +void +ipfw_list_values(int ac, char *av[]) +{ + ipfw_obj_lheader *olh; + struct _table_value *v; + int error, i; + uint32_t vmask; + char buf[128]; + + error = table_do_get_vlist(&olh); + if (error != 0) + err(EX_OSERR, "Unable to request value list"); + + vmask = 0x7FFFFFFF; /* Similar to IPFW_VTYPE_LEGACY */ + + table_print_valheader(buf, sizeof(buf), vmask); + printf("HEADER: %s\n", buf); + v = (struct _table_value *)(olh + 1); + qsort(v, olh->count, olh->objsize, compare_values); + for (i = 0; i < olh->count; i++) { + table_show_value(buf, sizeof(buf), (ipfw_table_value *)v, + vmask, 0); + printf("[%u] refs=%lu %s\n", v->spare1, (u_long)v->refcnt, buf); + v = (struct _table_value *)((caddr_t)v + olh->objsize); + } + + free(olh); +} + +int +compare_ntlv(const void *_a, const void *_b) +{ + ipfw_obj_ntlv *a, *b; + + a = (ipfw_obj_ntlv *)_a; + b = (ipfw_obj_ntlv *)_b; + + if (a->set < b->set) + return (-1); + else if (a->set > b->set) + return (1); + + if (a->idx < b->idx) + 
return (-1); + else if (a->idx > b->idx) + return (1); + + return (0); +} + +int +compare_kntlv(const void *k, const void *v) +{ + ipfw_obj_ntlv *ntlv; + uint16_t key; + + key = *((uint16_t *)k); + ntlv = (ipfw_obj_ntlv *)v; + + if (key < ntlv->idx) + return (-1); + else if (key > ntlv->idx) + return (1); + + return (0); +} + +/* + * Finds table name in @ctlv by @idx. + * Uses the following facts: + * 1) All TLVs are the same size + * 2) Kernel implementation provides already sorted list. + * + * Returns table name or NULL. + */ +char * +table_search_ctlv(ipfw_obj_ctlv *ctlv, uint16_t idx) +{ + ipfw_obj_ntlv *ntlv; + + ntlv = bsearch(&idx, (ctlv + 1), ctlv->count, ctlv->objsize, + compare_kntlv); + + if (ntlv != 0) + return (ntlv->name); + + return (NULL); +} + +void +table_sort_ctlv(ipfw_obj_ctlv *ctlv) +{ + + qsort(ctlv + 1, ctlv->count, ctlv->objsize, compare_ntlv); +} + +int +table_check_name(char *tablename) +{ + int c, i, l; + + /* + * Check if tablename is null-terminated and contains + * valid symbols only. Valid mask is: + * [a-zA-Z0-9\-_\.]{1,63} + */ + l = strlen(tablename); + if (l == 0 || l >= 64) + return (EINVAL); + for (i = 0; i < l; i++) { + c = tablename[i]; + if (isalpha(c) || isdigit(c) || c == '_' || + c == '-' || c == '.') + continue; + return (EINVAL); + } + + /* Restrict some 'special' names */ + if (strcmp(tablename, "all") == 0) + return (EINVAL); + + return (0); +} + diff --git a/example/ipfw/sys/net/pfil.h b/example/ipfw/sys/net/pfil.h new file mode 100644 index 0000000..6fa0c25 --- /dev/null +++ b/example/ipfw/sys/net/pfil.h @@ -0,0 +1,148 @@ +/* $FreeBSD: head/sys/net/pfil.h 254777 2013-08-24 12:03:24Z andre $ */ +/* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ + +/*- + * Copyright (c) 1996 Matthew R. Green + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_PFIL_H_ +#define _NET_PFIL_H_ + +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/_lock.h> +#include <sys/_mutex.h> +#include <sys/lock.h> +#include <sys/rmlock.h> + +struct mbuf; +struct ifnet; +struct inpcb; + +typedef int (*pfil_func_t)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); + +/* + * The packet filter hooks are designed for anything to call them to + * possibly intercept the packet. Multiple filter hooks are chained + * together and after each other in the specified order. 
+ */ +struct packet_filter_hook { + TAILQ_ENTRY(packet_filter_hook) pfil_chain; + pfil_func_t pfil_func; + void *pfil_arg; +}; + +#define PFIL_IN 0x00000001 +#define PFIL_OUT 0x00000002 +#define PFIL_WAITOK 0x00000004 +#define PFIL_ALL (PFIL_IN|PFIL_OUT) + +typedef TAILQ_HEAD(pfil_chain, packet_filter_hook) pfil_chain_t; + +#define PFIL_TYPE_AF 1 /* key is AF_* type */ +#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ + +#define PFIL_FLAG_PRIVATE_LOCK 0x01 /* Personal lock instead of global */ + +/* + * A pfil head is created by each protocol or packet intercept point. + * For packet is then run through the hook chain for inspection. + */ +struct pfil_head { + pfil_chain_t ph_in; + pfil_chain_t ph_out; + int ph_type; + int ph_nhooks; +#if defined( __linux__ ) || defined( _WIN32 ) + rwlock_t ph_mtx; +#else + struct rmlock *ph_plock; /* Pointer to the used lock */ + struct rmlock ph_lock; /* Private lock storage */ + int flags; +#endif + union { + u_long phu_val; + void *phu_ptr; + } ph_un; +#define ph_af ph_un.phu_val +#define ph_ifnet ph_un.phu_ptr + LIST_ENTRY(pfil_head) ph_list; +}; + +/* Public functions for pfil hook management by packet filters. */ +struct pfil_head *pfil_head_get(int, u_long); +int pfil_add_hook(pfil_func_t, void *, int, struct pfil_head *); +int pfil_remove_hook(pfil_func_t, void *, int, struct pfil_head *); +#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) + +/* Public functions to run the packet inspection by protocols. */ +int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, + int, struct inpcb *inp); + +/* Public functions for pfil head management by protocols. */ +int pfil_head_register(struct pfil_head *); +int pfil_head_unregister(struct pfil_head *); + +/* Public pfil locking functions for self managed locks by packet filters. 
*/ +struct rm_priotracker; /* Do not require including rmlock header */ +int pfil_try_rlock(struct pfil_head *, struct rm_priotracker *); +void pfil_rlock(struct pfil_head *, struct rm_priotracker *); +void pfil_runlock(struct pfil_head *, struct rm_priotracker *); +void pfil_wlock(struct pfil_head *); +void pfil_wunlock(struct pfil_head *); +int pfil_wowned(struct pfil_head *ph); + +/* Internal pfil locking functions. */ +#define PFIL_LOCK_INIT_REAL(l, t) \ + rm_init_flags(l, "PFil " t " rmlock", RM_RECURSE) +#define PFIL_LOCK_DESTROY_REAL(l) \ + rm_destroy(l) +#define PFIL_LOCK_INIT(p) do { \ + if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) { \ + PFIL_LOCK_INIT_REAL(&(p)->ph_lock, "private"); \ + (p)->ph_plock = &(p)->ph_lock; \ + } else \ + (p)->ph_plock = &V_pfil_lock; \ +} while (0) +#define PFIL_LOCK_DESTROY(p) do { \ + if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) \ + PFIL_LOCK_DESTROY_REAL((p)->ph_plock); \ +} while (0) + +#define PFIL_TRY_RLOCK(p, t) rm_try_rlock((p)->ph_plock, (t)) +#define PFIL_RLOCK(p, t) rm_rlock((p)->ph_plock, (t)) +#define PFIL_WLOCK(p) rm_wlock((p)->ph_plock) +#define PFIL_RUNLOCK(p, t) rm_runlock((p)->ph_plock, (t)) +#define PFIL_WUNLOCK(p) rm_wunlock((p)->ph_plock) +#define PFIL_WOWNED(p) rm_wowned((p)->ph_plock) + +/* Internal locking macros for global/vnet pfil_head_list. */ +#define PFIL_HEADLIST_LOCK() mtx_lock(&pfil_global_lock) +#define PFIL_HEADLIST_UNLOCK() mtx_unlock(&pfil_global_lock) + +#endif /* _NET_PFIL_H_ */ diff --git a/example/ipfw/sys/net/radix.c b/example/ipfw/sys/net/radix.c new file mode 100644 index 0000000..b423662 --- /dev/null +++ b/example/ipfw/sys/net/radix.c @@ -0,0 +1,1208 @@ +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)radix.c 8.5 (Berkeley) 5/19/95 + * $FreeBSD: head/sys/net/radix.c 272385 2014-10-01 21:24:58Z melifaro $ + */ + +/* + * Routines to build and maintain radix trees for routing lookups. + */ +#include <sys/param.h> +#ifdef _KERNEL +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/syslog.h> +#include <net/radix.h> +#include "opt_mpath.h" +#ifdef RADIX_MPATH +#include <net/radix_mpath.h> +#endif +#else /* !_KERNEL */ +#include <stdio.h> +#include <strings.h> +#include <stdlib.h> +#define log(x, arg...) 
fprintf(stderr, ## arg) +#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) +#define min(a, b) ((a) < (b) ? (a) : (b) ) +#include <net/radix.h> +#endif /* !_KERNEL */ + +static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, + walktree_f_t *f, void *w); +static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); +static struct radix_node + *rn_insert(void *, struct radix_node_head *, int *, + struct radix_node [2]), + *rn_newpair(void *, int, struct radix_node[2]), + *rn_search(void *, struct radix_node *), + *rn_search_m(void *, struct radix_node *, void *); + +static void rn_detachhead_internal(void **head); +static int rn_inithead_internal(void **head, int off); + +#define RADIX_MAX_KEY_LEN 32 + +static char rn_zeros[RADIX_MAX_KEY_LEN]; +static char rn_ones[RADIX_MAX_KEY_LEN] = { + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, +}; + + +static int rn_lexobetter(void *m_arg, void *n_arg); +static struct radix_mask * + rn_new_radix_mask(struct radix_node *tt, + struct radix_mask *next); +static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, + int skip); + +/* + * The data structure for the keys is a radix tree with one way + * branching removed. The index rn_bit at an internal node n represents a bit + * position to be tested. The tree is arranged so that all descendants + * of a node n have keys whose bits all agree up to position rn_bit - 1. + * (We say the index of n is rn_bit.) + * + * There is at least one descendant which has a one bit at position rn_bit, + * and at least one with a zero there. + * + * A route is determined by a pair of key and mask. We require that the + * bit-wise logical AND of the key and mask be the key. + * We define the index of a route associated with the mask to be + * the first bit number in the mask where 0 occurs (with bit number 0 + * representing the highest order bit). 
+ * + * We say a mask is normal if every bit is 0, past the index of the mask. + * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, + * and m is a normal mask, then the route applies to every descendant of n. + * If the index(m) < rn_bit, this implies the trailing last few bits of k + * before bit b are all 0, (and hence consequently true of every descendant + * of n), so the route applies to all descendants of the node as well. + * + * Similar logic shows that a non-normal mask m such that + * index(m) <= index(n) could potentially apply to many children of n. + * Thus, for each non-host route, we attach its mask to a list at an internal + * node as high in the tree as we can go. + * + * The present version of the code makes use of normal routes in short- + * circuiting an explicit mask and compare operation when testing whether + * a key satisfies a normal route, and also in remembering the unique leaf + * that governs a subtree. + */ + +/* + * Most of the functions in this code assume that the key/mask arguments + * are sockaddr-like structures, where the first byte is a u_char + * indicating the size of the entire structure. + * + * To make the assumption more explicit, we use the LEN() macro to access + * this field. It is safe to pass an expression with side effects + * to LEN() as the argument is evaluated only once. + * We cast the result to int as this is the dominant usage. + */ +#define LEN(x) ( (int) (*(const u_char *)(x)) ) + +/* + * XXX THIS NEEDS TO BE FIXED + * In the code, pointers to keys and masks are passed as either + * 'void *' (because callers used to pass pointers of various kinds), or + * 'caddr_t' (which is fine for pointer arithmetic, but not very + * clean when you dereference it to access data). Furthermore, caddr_t + * is really 'char *', while the natural type to operate on keys and + * masks would be 'u_char'. 
This mismatch requires a lot of casts and + * intermediate variables to adapt types that clutter the code. + */ + +/* + * Search a node in the tree matching the key. + */ +static struct radix_node * +rn_search(void *v_arg, struct radix_node *head) +{ + struct radix_node *x; + caddr_t v; + + for (x = head, v = v_arg; x->rn_bit >= 0;) { + if (x->rn_bmask & v[x->rn_offset]) + x = x->rn_right; + else + x = x->rn_left; + } + return (x); +} + +/* + * Same as above, but with an additional mask. + * XXX note this function is used only once. + */ +static struct radix_node * +rn_search_m(void *v_arg, struct radix_node *head, void *m_arg) +{ + struct radix_node *x; + caddr_t v = v_arg, m = m_arg; + + for (x = head; x->rn_bit >= 0;) { + if ((x->rn_bmask & m[x->rn_offset]) && + (x->rn_bmask & v[x->rn_offset])) + x = x->rn_right; + else + x = x->rn_left; + } + return (x); +} + +int +rn_refines(void *m_arg, void *n_arg) +{ + caddr_t m = m_arg, n = n_arg; + caddr_t lim, lim2 = lim = n + LEN(n); + int longer = LEN(n++) - LEN(m++); + int masks_are_equal = 1; + + if (longer > 0) + lim -= longer; + while (n < lim) { + if (*n & ~(*m)) + return (0); + if (*n++ != *m++) + masks_are_equal = 0; + } + while (n < lim2) + if (*n++) + return (0); + if (masks_are_equal && (longer < 0)) + for (lim2 = m - longer; m < lim2; ) + if (*m++) + return (1); + return (!masks_are_equal); +} + +/* + * Search for exact match in given @head. + * Assume host bits are cleared in @v_arg if @m_arg is not NULL. + * Note that prefixes with /32 or /128 masks are treated differently + * from host routes. 
+ */ +struct radix_node * +rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) +{ + struct radix_node *x; + caddr_t netmask; + + if (m_arg != NULL) { + /* + * Most common case: search exact prefix/mask + */ + x = rn_addmask(m_arg, head->rnh_masks, 1, + head->rnh_treetop->rn_offset); + if (x == NULL) + return (NULL); + netmask = x->rn_key; + + x = rn_match(v_arg, head); + + while (x != NULL && x->rn_mask != netmask) + x = x->rn_dupedkey; + + return (x); + } + + /* + * Search for host address. + */ + if ((x = rn_match(v_arg, head)) == NULL) + return (NULL); + + /* Check if found key is the same */ + if (LEN(x->rn_key) != LEN(v_arg) || bcmp(x->rn_key, v_arg, LEN(v_arg))) + return (NULL); + + /* Check if this is not host route */ + if (x->rn_mask != NULL) + return (NULL); + + return (x); +} + +static int +rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip) +{ + char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; + char *cplim; + int length = min(LEN(cp), LEN(cp2)); + + if (cp3 == NULL) + cp3 = rn_ones; + else + length = min(length, LEN(cp3)); + cplim = cp + length; cp3 += skip; cp2 += skip; + for (cp += skip; cp < cplim; cp++, cp2++, cp3++) + if ((*cp ^ *cp2) & *cp3) + return (0); + return (1); +} + +/* + * Search for longest-prefix match in given @head + */ +struct radix_node * +rn_match(void *v_arg, struct radix_node_head *head) +{ + caddr_t v = v_arg; + struct radix_node *t = head->rnh_treetop, *x; + caddr_t cp = v, cp2; + caddr_t cplim; + struct radix_node *saved_t, *top = t; + int off = t->rn_offset, vlen = LEN(cp), matched_off; + int test, b, rn_bit; + + /* + * Open code rn_search(v, top) to avoid overhead of extra + * subroutine call. + */ + for (; t->rn_bit >= 0; ) { + if (t->rn_bmask & cp[t->rn_offset]) + t = t->rn_right; + else + t = t->rn_left; + } + /* + * See if we match exactly as a host destination + * or at least learn how many bits match, for normal mask finesse. 
+ * + * It doesn't hurt us to limit how many bytes to check + * to the length of the mask, since if it matches we had a genuine + * match and the leaf we have is the most specific one anyway; + * if it didn't match with a shorter length it would fail + * with a long one. This wins big for class B&C netmasks which + * are probably the most common case... + */ + if (t->rn_mask) + vlen = *(u_char *)t->rn_mask; + cp += off; cp2 = t->rn_key + off; cplim = v + vlen; + for (; cp < cplim; cp++, cp2++) + if (*cp != *cp2) + goto on1; + /* + * This extra grot is in case we are explicitly asked + * to look up the default. Ugh! + * + * Never return the root node itself, it seems to cause a + * lot of confusion. + */ + if (t->rn_flags & RNF_ROOT) + t = t->rn_dupedkey; + return (t); +on1: + test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ + for (b = 7; (test >>= 1) > 0;) + b--; + matched_off = cp - v; + b += matched_off << 3; + rn_bit = -1 - b; + /* + * If there is a host route in a duped-key chain, it will be first. + */ + if ((saved_t = t)->rn_mask == 0) + t = t->rn_dupedkey; + for (; t; t = t->rn_dupedkey) + /* + * Even if we don't match exactly as a host, + * we may match if the leaf we wound up at is + * a route to a net. + */ + if (t->rn_flags & RNF_NORMAL) { + if (rn_bit <= t->rn_bit) + return (t); + } else if (rn_satisfies_leaf(v, t, matched_off)) + return (t); + t = saved_t; + /* start searching up the tree */ + do { + struct radix_mask *m; + t = t->rn_parent; + m = t->rn_mklist; + /* + * If non-contiguous masks ever become important + * we can restore the masking and open coding of + * the search and satisfaction test and put the + * calculation of "off" back before the "do". 
+ */ + while (m) { + if (m->rm_flags & RNF_NORMAL) { + if (rn_bit <= m->rm_bit) + return (m->rm_leaf); + } else { + off = min(t->rn_offset, matched_off); + x = rn_search_m(v, t, m->rm_mask); + while (x && x->rn_mask != m->rm_mask) + x = x->rn_dupedkey; + if (x && rn_satisfies_leaf(v, x, off)) + return (x); + } + m = m->rm_mklist; + } + } while (t != top); + return (0); +} + +#ifdef RN_DEBUG +int rn_nodenum; +struct radix_node *rn_clist; +int rn_saveinfo; +int rn_debug = 1; +#endif + +/* + * Whenever we add a new leaf to the tree, we also add a parent node, + * so we allocate them as an array of two elements: the first one must be + * the leaf (see RNTORT() in route.c), the second one is the parent. + * This routine initializes the relevant fields of the nodes, so that + * the leaf is the left child of the parent node, and both nodes have + * (almost) all fields filled as appropriate. + * (XXX some fields are left unset, see the '#if 0' section). + * The function returns a pointer to the parent node. + */ + +static struct radix_node * +rn_newpair(void *v, int b, struct radix_node nodes[2]) +{ + struct radix_node *tt = nodes, *t = tt + 1; + t->rn_bit = b; + t->rn_bmask = 0x80 >> (b & 7); + t->rn_left = tt; + t->rn_offset = b >> 3; + +#if 0 /* XXX perhaps we should fill these fields as well. 
*/ + t->rn_parent = t->rn_right = NULL; + + tt->rn_mask = NULL; + tt->rn_dupedkey = NULL; + tt->rn_bmask = 0; +#endif + tt->rn_bit = -1; + tt->rn_key = (caddr_t)v; + tt->rn_parent = t; + tt->rn_flags = t->rn_flags = RNF_ACTIVE; + tt->rn_mklist = t->rn_mklist = 0; +#ifdef RN_DEBUG + tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; + tt->rn_ybro = rn_clist; + rn_clist = tt; +#endif + return (t); +} + +static struct radix_node * +rn_insert(void *v_arg, struct radix_node_head *head, int *dupentry, + struct radix_node nodes[2]) +{ + caddr_t v = v_arg; + struct radix_node *top = head->rnh_treetop; + int head_off = top->rn_offset, vlen = LEN(v); + struct radix_node *t = rn_search(v_arg, top); + caddr_t cp = v + head_off; + int b; + struct radix_node *p, *tt, *x; + /* + * Find first bit at which v and t->rn_key differ + */ + caddr_t cp2 = t->rn_key + head_off; + int cmp_res; + caddr_t cplim = v + vlen; + + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return (t); +on1: + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + + x = top; + cp = v; + do { + p = x; + if (cp[x->rn_offset] & x->rn_bmask) + x = x->rn_right; + else + x = x->rn_left; + } while (b > (unsigned) x->rn_bit); + /* x->rn_bit < b && x->rn_bit >= 0 */ +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); +#endif + t = rn_newpair(v_arg, b, nodes); + tt = t->rn_left; + if ((cp[p->rn_offset] & p->rn_bmask) == 0) + p->rn_left = t; + else + p->rn_right = t; + x->rn_parent = t; + t->rn_parent = p; /* frees x, p as temp vars below */ + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + t->rn_right = x; + } else { + t->rn_right = tt; + t->rn_left = x; + } +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); +#endif + return (tt); +} + +struct radix_node * +rn_addmask(void *n_arg, struct radix_node_head *maskhead, int search, int skip) 
+{ + unsigned char *netmask = n_arg; + unsigned char *cp, *cplim; + struct radix_node *x; + int b = 0, mlen, j; + int maskduplicated, isnormal; + struct radix_node *saved_x; + unsigned char addmask_key[RADIX_MAX_KEY_LEN]; + + if ((mlen = LEN(netmask)) > RADIX_MAX_KEY_LEN) + mlen = RADIX_MAX_KEY_LEN; + if (skip == 0) + skip = 1; + if (mlen <= skip) + return (maskhead->rnh_nodes); + + bzero(addmask_key, RADIX_MAX_KEY_LEN); + if (skip > 1) + bcopy(rn_ones + 1, addmask_key + 1, skip - 1); + bcopy(netmask + skip, addmask_key + skip, mlen - skip); + /* + * Trim trailing zeroes. + */ + for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) + cp--; + mlen = cp - addmask_key; + if (mlen <= skip) + return (maskhead->rnh_nodes); + *addmask_key = mlen; + x = rn_search(addmask_key, maskhead->rnh_treetop); + if (bcmp(addmask_key, x->rn_key, mlen) != 0) + x = 0; + if (x || search) + return (x); + R_Zalloc(x, struct radix_node *, RADIX_MAX_KEY_LEN + 2 * sizeof (*x)); + if ((saved_x = x) == 0) + return (0); + netmask = cp = (unsigned char *)(x + 2); + bcopy(addmask_key, cp, mlen); + x = rn_insert(cp, maskhead, &maskduplicated, x); + if (maskduplicated) { + log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); + Free(saved_x); + return (x); + } + /* + * Calculate index of mask, and check for normalcy. + * First find the first byte with a 0 bit, then if there are + * more bits left (remember we already trimmed the trailing 0's), + * the bits should be contiguous, otherwise we have got + * a non-contiguous mask. 
+ */ +#define CONTIG(_c) (((~(_c) + 1) & (_c)) == (unsigned char)(~(_c) + 1)) + cplim = netmask + mlen; + isnormal = 1; + for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) + cp++; + if (cp != cplim) { + for (j = 0x80; (j & *cp) != 0; j >>= 1) + b++; + if (!CONTIG(*cp) || cp != (cplim - 1)) + isnormal = 0; + } + b += (cp - netmask) << 3; + x->rn_bit = -1 - b; + if (isnormal) + x->rn_flags |= RNF_NORMAL; + return (x); +} + +static int /* XXX: arbitrary ordering for non-contiguous masks */ +rn_lexobetter(void *m_arg, void *n_arg) +{ + u_char *mp = m_arg, *np = n_arg, *lim; + + if (LEN(mp) > LEN(np)) + return (1); /* not really, but need to check longer one first */ + if (LEN(mp) == LEN(np)) + for (lim = mp + LEN(mp); mp < lim;) + if (*mp++ > *np++) + return (1); + return (0); +} + +static struct radix_mask * +rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next) +{ + struct radix_mask *m; + + R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); + if (m == NULL) { + log(LOG_ERR, "Failed to allocate route mask\n"); + return (0); + } + bzero(m, sizeof(*m)); + m->rm_bit = tt->rn_bit; + m->rm_flags = tt->rn_flags; + if (tt->rn_flags & RNF_NORMAL) + m->rm_leaf = tt; + else + m->rm_mask = tt->rn_mask; + m->rm_mklist = next; + tt->rn_mklist = m; + return (m); +} + +struct radix_node * +rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, + struct radix_node treenodes[2]) +{ + caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; + struct radix_node *t, *x = 0, *tt; + struct radix_node *saved_tt, *top = head->rnh_treetop; + short b = 0, b_leaf = 0; + int keyduplicated; + caddr_t mmask; + struct radix_mask *m, **mp; + + /* + * In dealing with non-contiguous masks, there may be + * many different routes which have the same mask. + * We will find it useful to have a unique pointer to + * the mask to speed avoiding duplicate references at + * nodes and possibly save time in calculating indices. 
+ */ + if (netmask) { + x = rn_addmask(netmask, head->rnh_masks, 0, top->rn_offset); + if (x == NULL) + return (0); + b_leaf = x->rn_bit; + b = -1 - x->rn_bit; + netmask = x->rn_key; + } + /* + * Deal with duplicated keys: attach node to previous instance + */ + saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); + if (keyduplicated) { + for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { +#ifdef RADIX_MPATH + /* permit multipath, if enabled for the family */ + if (rn_mpath_capable(head) && netmask == tt->rn_mask) { + /* + * go down to the end of multipaths, so that + * new entry goes into the end of rn_dupedkey + * chain. + */ + do { + t = tt; + tt = tt->rn_dupedkey; + } while (tt && t->rn_mask == tt->rn_mask); + break; + } +#endif + if (tt->rn_mask == netmask) + return (0); + if (netmask == 0 || + (tt->rn_mask && + ((b_leaf < tt->rn_bit) /* index(netmask) > node */ + || rn_refines(netmask, tt->rn_mask) + || rn_lexobetter(netmask, tt->rn_mask)))) + break; + } + /* + * If the mask is not duplicated, we wouldn't + * find it among possible duplicate key entries + * anyway, so the above test doesn't hurt. + * + * We sort the masks for a duplicated key the same way as + * in a masklist -- most specific to least specific. + * This may require the unfortunate nuisance of relocating + * the head of the list. + * + * We also reverse, or doubly link the list through the + * parent pointer. 
+ */ + if (tt == saved_tt) { + struct radix_node *xx = x; + /* link in at head of list */ + (tt = treenodes)->rn_dupedkey = t; + tt->rn_flags = t->rn_flags; + tt->rn_parent = x = t->rn_parent; + t->rn_parent = tt; /* parent */ + if (x->rn_left == t) + x->rn_left = tt; + else + x->rn_right = tt; + saved_tt = tt; x = xx; + } else { + (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; + t->rn_dupedkey = tt; + tt->rn_parent = t; /* parent */ + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = tt; /* parent */ + } +#ifdef RN_DEBUG + t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; +#endif + tt->rn_key = (caddr_t) v; + tt->rn_bit = -1; + tt->rn_flags = RNF_ACTIVE; + } + /* + * Put mask in tree. + */ + if (netmask) { + tt->rn_mask = netmask; + tt->rn_bit = x->rn_bit; + tt->rn_flags |= x->rn_flags & RNF_NORMAL; + } + t = saved_tt->rn_parent; + if (keyduplicated) + goto on2; + b_leaf = -1 - t->rn_bit; + if (t->rn_right == saved_tt) + x = t->rn_left; + else + x = t->rn_right; + /* Promote general routes from below */ + if (x->rn_bit < 0) { + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) + if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { + *mp = m = rn_new_radix_mask(x, 0); + if (m) + mp = &m->rm_mklist; + } + } else if (x->rn_mklist) { + /* + * Skip over masks whose index is > that of new node + */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m->rm_bit >= b_leaf) + break; + t->rn_mklist = m; *mp = 0; + } +on2: + /* Add new route to highest possible ancestor's list */ + if ((netmask == 0) || (b > t->rn_bit )) + return (tt); /* can't lift at all */ + b_leaf = tt->rn_bit; + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + /* + * Search through routes associated with node to + * insert new route according to index. + * Need same criteria as when sorting dupedkeys to avoid + * double loop on deletion. 
+ */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { + if (m->rm_bit < b_leaf) + continue; + if (m->rm_bit > b_leaf) + break; + if (m->rm_flags & RNF_NORMAL) { + mmask = m->rm_leaf->rn_mask; + if (tt->rn_flags & RNF_NORMAL) { +#if !defined(RADIX_MPATH) + log(LOG_ERR, + "Non-unique normal route, mask not entered\n"); +#endif + return (tt); + } + } else + mmask = m->rm_mask; + if (mmask == netmask) { + m->rm_refs++; + tt->rn_mklist = m; + return (tt); + } + if (rn_refines(netmask, mmask) + || rn_lexobetter(netmask, mmask)) + break; + } + *mp = rn_new_radix_mask(tt, *mp); + return (tt); +} + +struct radix_node * +rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) +{ + struct radix_node *t, *p, *x, *tt; + struct radix_mask *m, *saved_m, **mp; + struct radix_node *dupedkey, *saved_tt, *top; + caddr_t v, netmask; + int b, head_off, vlen; + + v = v_arg; + netmask = netmask_arg; + x = head->rnh_treetop; + tt = rn_search(v, x); + head_off = x->rn_offset; + vlen = LEN(v); + saved_tt = tt; + top = x; + if (tt == 0 || + bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) + return (0); + /* + * Delete our route from mask lists. 
+ */ + if (netmask) { + x = rn_addmask(netmask, head->rnh_masks, 1, head_off); + if (x == NULL) + return (0); + netmask = x->rn_key; + while (tt->rn_mask != netmask) + if ((tt = tt->rn_dupedkey) == 0) + return (0); + } + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) + goto on1; + if (tt->rn_flags & RNF_NORMAL) { + if (m->rm_leaf != tt || m->rm_refs > 0) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + return (0); /* dangling ref could cause disaster */ + } + } else { + if (m->rm_mask != tt->rn_mask) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + goto on1; + } + if (--m->rm_refs >= 0) + goto on1; + } + b = -1 - tt->rn_bit; + t = saved_tt->rn_parent; + if (b > t->rn_bit) + goto on1; /* Wasn't lifted at all */ + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m == saved_m) { + *mp = m->rm_mklist; + Free(m); + break; + } + if (m == 0) { + log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); + if (tt->rn_flags & RNF_NORMAL) + return (0); /* Dangling ref to us */ + } +on1: + /* + * Eliminate us from tree + */ + if (tt->rn_flags & RNF_ROOT) + return (0); +#ifdef RN_DEBUG + /* Get us out of the creation list */ + for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} + if (t) t->rn_ybro = tt->rn_ybro; +#endif + t = tt->rn_parent; + dupedkey = saved_tt->rn_dupedkey; + if (dupedkey) { + /* + * Here, tt is the deletion target and + * saved_tt is the head of the dupekey chain. 
+ */ + if (tt == saved_tt) { + /* remove from head of chain */ + x = dupedkey; x->rn_parent = t; + if (t->rn_left == tt) + t->rn_left = x; + else + t->rn_right = x; + } else { + /* find node in front of tt on the chain */ + for (x = p = saved_tt; p && p->rn_dupedkey != tt;) + p = p->rn_dupedkey; + if (p) { + p->rn_dupedkey = tt->rn_dupedkey; + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = p; + /* parent */ + } else log(LOG_ERR, "rn_delete: couldn't find us\n"); + } + t = tt + 1; + if (t->rn_flags & RNF_ACTIVE) { +#ifndef RN_DEBUG + *++x = *t; + p = t->rn_parent; +#else + b = t->rn_info; + *++x = *t; + t->rn_info = b; + p = t->rn_parent; +#endif + if (p->rn_left == t) + p->rn_left = x; + else + p->rn_right = x; + x->rn_left->rn_parent = x; + x->rn_right->rn_parent = x; + } + goto out; + } + if (t->rn_left == tt) + x = t->rn_right; + else + x = t->rn_left; + p = t->rn_parent; + if (p->rn_right == t) + p->rn_right = x; + else + p->rn_left = x; + x->rn_parent = p; + /* + * Demote routes attached to us. + */ + if (t->rn_mklist) { + if (x->rn_bit >= 0) { + for (mp = &x->rn_mklist; (m = *mp);) + mp = &m->rm_mklist; + *mp = t->rn_mklist; + } else { + /* If there are any key,mask pairs in a sibling + duped-key chain, some subset will appear sorted + in the same order attached to our mklist */ + for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) + if (m == x->rn_mklist) { + struct radix_mask *mm = m->rm_mklist; + x->rn_mklist = 0; + if (--(m->rm_refs) < 0) + Free(m); + m = mm; + } + if (m) + log(LOG_ERR, + "rn_delete: Orphaned Mask %p at %p\n", + m, x); + } + } + /* + * We may be holding an active internal node in the tree. 
+ */ + x = tt + 1; + if (t != x) { +#ifndef RN_DEBUG + *t = *x; +#else + b = t->rn_info; + *t = *x; + t->rn_info = b; +#endif + t->rn_left->rn_parent = t; + t->rn_right->rn_parent = t; + p = x->rn_parent; + if (p->rn_left == x) + p->rn_left = t; + else + p->rn_right = t; + } +out: + tt->rn_flags &= ~RNF_ACTIVE; + tt[1].rn_flags &= ~RNF_ACTIVE; + return (tt); +} + +/* + * This is the same as rn_walktree() except for the parameters and the + * exit. + */ +static int +rn_walktree_from(struct radix_node_head *h, void *a, void *m, + walktree_f_t *f, void *w) +{ + int error; + struct radix_node *base, *next; + u_char *xa = (u_char *)a; + u_char *xm = (u_char *)m; + struct radix_node *rn, *last = NULL; /* shut up gcc */ + int stopping = 0; + int lastb; + + KASSERT(m != NULL, ("%s: mask needs to be specified", __func__)); + + /* + * rn_search_m is sort-of-open-coded here. We cannot use the + * function because we need to keep track of the last node seen. + */ + /* printf("about to search\n"); */ + for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { + last = rn; + /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", + rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ + if (!(rn->rn_bmask & xm[rn->rn_offset])) { + break; + } + if (rn->rn_bmask & xa[rn->rn_offset]) { + rn = rn->rn_right; + } else { + rn = rn->rn_left; + } + } + /* printf("done searching\n"); */ + + /* + * Two cases: either we stepped off the end of our mask, + * in which case last == rn, or we reached a leaf, in which + * case we want to start from the leaf. + */ + if (rn->rn_bit >= 0) + rn = last; + lastb = last->rn_bit; + + /* printf("rn %p, lastb %d\n", rn, lastb);*/ + + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
+ */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + + while (!stopping) { + /* printf("node %p (%d)\n", rn, rn->rn_bit); */ + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && !(rn->rn_flags & RNF_ROOT)) { + rn = rn->rn_parent; + + /* if went up beyond last, stop */ + if (rn->rn_bit <= lastb) { + stopping = 1; + /* printf("up too far\n"); */ + /* + * XXX we should jump to the 'Process leaves' + * part, because the values of 'rn' and 'next' + * we compute will not be used. Not a big deal + * because this loop will terminate, but it is + * inefficient and hard to understand! + */ + } + } + + /* + * At the top of the tree, no need to traverse the right + * half, prevent the traversal of the entire tree in the + * case of default route. + */ + if (rn->rn_parent->rn_flags & RNF_ROOT) + stopping = 1; + + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base) != 0) { + base = rn->rn_dupedkey; + /* printf("leaf %p\n", rn); */ + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + + if (rn->rn_flags & RNF_ROOT) { + /* printf("root, stopping"); */ + stopping = 1; + } + + } + return (0); +} + +static int +rn_walktree(struct radix_node_head *h, walktree_f_t *f, void *w) +{ + int error; + struct radix_node *base, *next; + struct radix_node *rn = h->rnh_treetop; + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
+ */ + + /* First time through node, go left */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + for (;;) { + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && (rn->rn_flags & RNF_ROOT) == 0) + rn = rn->rn_parent; + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base)) { + base = rn->rn_dupedkey; + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + if (rn->rn_flags & RNF_ROOT) + return (0); + } + /* NOTREACHED */ +} + +/* + * Allocate and initialize an empty tree. This has 3 nodes, which are + * part of the radix_node_head (in the order <left,root,right>) and are + * marked RNF_ROOT so they cannot be freed. + * The leaves have all-zero and all-one keys, with significant + * bits starting at 'off'. + * Return 1 on success, 0 on error. + */ +static int +rn_inithead_internal(void **head, int off) +{ + struct radix_node_head *rnh; + struct radix_node *t, *tt, *ttt; + if (*head) + return (1); + R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); + if (rnh == 0) + return (0); + *head = rnh; + t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); + ttt = rnh->rnh_nodes + 2; + t->rn_right = ttt; + t->rn_parent = t; + tt = t->rn_left; /* ... 
which in turn is rnh->rnh_nodes */ + tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; + tt->rn_bit = -1 - off; + *ttt = *tt; + ttt->rn_key = rn_ones; + rnh->rnh_addaddr = rn_addroute; + rnh->rnh_deladdr = rn_delete; + rnh->rnh_matchaddr = rn_match; + rnh->rnh_lookup = rn_lookup; + rnh->rnh_walktree = rn_walktree; + rnh->rnh_walktree_from = rn_walktree_from; + rnh->rnh_treetop = t; + return (1); +} + +static void +rn_detachhead_internal(void **head) +{ + struct radix_node_head *rnh; + + KASSERT((head != NULL && *head != NULL), + ("%s: head already freed", __func__)); + rnh = *head; + + /* Free <left,root,right> nodes. */ + Free(rnh); + + *head = NULL; +} + +int +rn_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + if (*head != NULL) + return (1); + + if (rn_inithead_internal(head, off) == 0) + return (0); + + rnh = (struct radix_node_head *)(*head); + + if (rn_inithead_internal((void **)&rnh->rnh_masks, 0) == 0) { + rn_detachhead_internal(head); + return (0); + } + + return (1); +} + +static int +rn_freeentry(struct radix_node *rn, void *arg) +{ + struct radix_node_head * const rnh = arg; + struct radix_node *x; + + x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); + if (x != NULL) + Free(x); + return (0); +} + +int +rn_detachhead(void **head) +{ + struct radix_node_head *rnh; + + KASSERT((head != NULL && *head != NULL), + ("%s: head already freed", __func__)); + + rnh = *head; + + rn_walktree(rnh->rnh_masks, rn_freeentry, rnh->rnh_masks); + rn_detachhead_internal((void **)&rnh->rnh_masks); + rn_detachhead_internal(head); + return (1); +} + diff --git a/example/ipfw/sys/net/radix.h b/example/ipfw/sys/net/radix.h new file mode 100644 index 0000000..43742fa --- /dev/null +++ b/example/ipfw/sys/net/radix.h @@ -0,0 +1,168 @@ +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)radix.h 8.2 (Berkeley) 10/31/94 + * $FreeBSD: head/sys/net/radix.h 262758 2014-03-04 23:55:04Z gnn $ + */ + +#ifndef _RADIX_H_ +#define _RADIX_H_ + +#ifdef _KERNEL +#include <sys/_lock.h> +#include <sys/_mutex.h> +#include <sys/_rwlock.h> +#endif + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_RTABLE); +#endif + +/* + * Radix search tree node layout. 
+ */ + +struct radix_node { + struct radix_mask *rn_mklist; /* list of masks contained in subtree */ + struct radix_node *rn_parent; /* parent */ + short rn_bit; /* bit offset; -1-index(netmask) */ + char rn_bmask; /* node: mask for bit test*/ + u_char rn_flags; /* enumerated next */ +#define RNF_NORMAL 1 /* leaf contains normal route */ +#define RNF_ROOT 2 /* leaf is root leaf for tree */ +#define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ + union { + struct { /* leaf only data: */ + caddr_t rn_Key; /* object of search */ + caddr_t rn_Mask; /* netmask, if present */ + struct radix_node *rn_Dupedkey; + } rn_leaf; + struct { /* node only data: */ + int rn_Off; /* where to start compare */ + struct radix_node *rn_L;/* progeny */ + struct radix_node *rn_R;/* progeny */ + } rn_node; + } rn_u; +#ifdef RN_DEBUG + int rn_info; + struct radix_node *rn_twin; + struct radix_node *rn_ybro; +#endif +}; + +#define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey +#define rn_key rn_u.rn_leaf.rn_Key +#define rn_mask rn_u.rn_leaf.rn_Mask +#define rn_offset rn_u.rn_node.rn_Off +#define rn_left rn_u.rn_node.rn_L +#define rn_right rn_u.rn_node.rn_R + +/* + * Annotations to tree concerning potential routes applying to subtrees. + */ + +struct radix_mask { + short rm_bit; /* bit offset; -1-index(netmask) */ + char rm_unused; /* cf. rn_bmask */ + u_char rm_flags; /* cf. rn_flags */ + struct radix_mask *rm_mklist; /* more masks to try */ + union { + caddr_t rmu_mask; /* the mask */ + struct radix_node *rmu_leaf; /* for normal routes */ + } rm_rmu; + int rm_refs; /* # of references to this struct */ +}; + +#define rm_mask rm_rmu.rmu_mask +#define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ + +typedef int walktree_f_t(struct radix_node *, void *); + +struct radix_node_head { + struct radix_node *rnh_treetop; + u_int rnh_gen; /* generation counter */ + int rnh_multipath; /* multipath capable ? 
*/ + struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchaddr) /* longest match for sockaddr */ + (void *v, struct radix_node_head *head); + struct radix_node *(*rnh_lookup) /*exact match for sockaddr*/ + (void *v, void *mask, struct radix_node_head *head); + int (*rnh_walktree) /* traverse tree */ + (struct radix_node_head *head, walktree_f_t *f, void *w); + int (*rnh_walktree_from) /* traverse tree below a */ + (struct radix_node_head *head, void *a, void *m, + walktree_f_t *f, void *w); + void (*rnh_close) /* do something when the last ref drops */ + (struct radix_node *rn, struct radix_node_head *head); + struct radix_node rnh_nodes[3]; /* empty tree for common case */ + struct radix_node_head *rnh_masks; /* Storage for our masks */ +#ifdef _KERNEL + struct rwlock rnh_lock; /* locks entire radix tree */ +#endif +}; + +/* + * Allocation wrappers. R_Malloc/R_Zalloc assign the new pointer to p + * (cast to type t); R_Zalloc returns zeroed memory. The free macros + * carry their own trailing semicolon. + */ +#ifndef _KERNEL +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) +#define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) +#define R_Free(p) free((char *)p); +/* + * radix.c releases memory through Free(), which was only defined for + * _KERNEL builds. Offer the same spelling to userland builds, unless + * the build environment already supplies its own Free. 
+ */ +#ifndef Free +#define Free(p) R_Free(p) +#endif +#else +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) +#define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) +#define Free(p) free((caddr_t)p, M_RTABLE); + +#define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ + rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) +#define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) + + +#define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) 
+#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) +#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) +#endif /* _KERNEL */ + +int rn_inithead(void **, int); +int rn_detachhead(void **); +int rn_refines(void *, void *); +struct radix_node + *rn_addmask(void *, struct radix_node_head *, int, int), + *rn_addroute (void *, void *, struct radix_node_head *, + struct radix_node [2]), + *rn_delete(void *, void *, struct radix_node_head *), + *rn_lookup (void *v_arg, void *m_arg, + struct radix_node_head *head), + *rn_match(void *, struct radix_node_head *); + +#endif /* _RADIX_H_ */ diff --git a/example/ipfw/sys/netgraph/ng_ipfw.h b/example/ipfw/sys/netgraph/ng_ipfw.h new file mode 100644 index 0000000..c60426e --- /dev/null +++ b/example/ipfw/sys/netgraph/ng_ipfw.h @@ -0,0 +1,33 @@ +/*- + * Copyright 2005, Gleb Smirnoff <glebius@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netgraph/ng_ipfw.h 201124 2009-12-28 12:29:13Z luigi $ + */ + +#ifndef _NG_IPFW_H +#define _NG_IPFW_H +#define NG_IPFW_NODE_TYPE "ipfw" +#define NGM_IPFW_COOKIE 1105988990 +#endif /* _NG_IPFW_H */ diff --git a/example/ipfw/sys/netinet/in_cksum.c b/example/ipfw/sys/netinet/in_cksum.c new file mode 100644 index 0000000..8d95ce5 --- /dev/null +++ b/example/ipfw/sys/netinet/in_cksum.c @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netinet/in_cksum.c 238941 2012-07-31 08:04:49Z luigi $"); + +#include <sys/param.h> +#include <sys/mbuf.h> + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ +/* + * ADDCARRY(x): end-around carry step -- subtracts 65535 from x in place + * when x exceeds 65535. + * REDUCE: loads sum into l_util, adds the two u_short halves s[0] and + * s[1] back into sum and applies ADDCARRY; it expects locals named sum + * and l_util in the expanding function. + */ +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} +/* + * Sum up to len bytes of the mbuf chain starting at m as 16-bit words + * and return the complemented result masked to 16 bits. 
+ * Empty mbufs are skipped; a word that straddles two mbufs or starts on + * an odd address is handled through s_util and the byte_swapped flag. + * mlen == -1 marks that one odd byte is parked in s_util.c[0]. + * If the chain ends before len bytes were consumed, the message + * cksum: out of data is printed and the partial sum is still returned. + */ +int +in_cksum(struct mbuf *m, int len) +{ + register u_short *w; + register int sum = 0; + register int mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + union { + u_short s[2]; + long l; + } l_util; +/* NOTE(review): REDUCE only adds l_util.s[0] and s[1]; with a long wider than 32 bits this presumably relies on those two halves holding the low-order bits -- confirm for big-endian LP64 targets. */ + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. 
+ */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; /* never read past the requested length */ + len -= mlen; + /* + * Force to even boundary. + */ + if ((1 & (uintptr_t) w) && (mlen > 0)) { + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. + */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; /* undo the subtraction of the failed loop test */ + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; /* undo the subtraction of the failed loop test */ + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; /* park the odd trailing byte for the next mbuf */ + } + if (len) + printf("cksum: out of data\n"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/example/ipfw/sys/netinet/ip_dummynet.h b/example/ipfw/sys/netinet/ip_dummynet.h new file mode 100644 index 0000000..3378e82 --- /dev/null +++ b/example/ipfw/sys/netinet/ip_dummynet.h @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. 
+ * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netinet/ip_dummynet.h 266941 2014-06-01 07:28:24Z hiren $ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of the kernel-userland API for dummynet. + * + * Setsockopt() and getsockopt() pass a batch of objects, each + * of them starting with a "struct dn_id" which should fully identify + * the object and its relation with others in the sequence. + * The first object in each request should have + * type= DN_CMD_*, id = DN_API_VERSION. + * For other objects, type and subtype specify the object, len indicates + * the total length including the header, and 'id' identifies the specific + * object. 
+ * + * Most objects are numbered with an identifier in the range 1..65535. + * DN_MAX_ID indicates the first value outside the range. + */ + +#define DN_API_VERSION 12500000 +#define DN_MAX_ID 0x10000 + +struct dn_id { + uint16_t len; /* total obj len including this header */ + uint8_t type; + uint8_t subtype; + uint32_t id; /* generic id */ +}; + +/* + * These values are in the type field of struct dn_id. + * To preserve the ABI, never rearrange the list or delete + * entries with the exception of DN_LAST + */ +enum { + DN_NONE = 0, + DN_LINK = 1, + DN_FS, + DN_SCH, + DN_SCH_I, + DN_QUEUE, + DN_DELAY_LINE, + DN_PROFILE, + DN_FLOW, /* struct dn_flow */ + DN_TEXT, /* opaque text is the object */ + + DN_CMD_CONFIG = 0x80, /* objects follow */ + DN_CMD_DELETE, /* subtype + list of entries */ + DN_CMD_GET, /* subtype + list of entries */ + DN_CMD_FLUSH, + /* for compatibility with FreeBSD 7.2/8 */ + DN_COMPAT_PIPE, + DN_COMPAT_QUEUE, + DN_GET_COMPAT, + + /* special commands for emulation of sysctl variables */ + DN_SYSCTL_GET, + DN_SYSCTL_SET, + + DN_LAST, +}; + +enum { /* subtype for schedulers, flowset and the like */ + DN_SCHED_UNKNOWN = 0, + DN_SCHED_FIFO = 1, + DN_SCHED_WF2QP = 2, + /* others are in individual modules */ +}; + +enum { /* user flags */ + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ + DN_NOERROR = 0x0002, /* do not report errors */ + DN_QHT_HASH = 0x0004, /* qht is a hash table */ + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ + DN_IS_RED = 0x0020, + DN_IS_GENTLE_RED= 0x0040, + DN_IS_ECN = 0x0080, + DN_PIPE_CMD = 0x1000, /* pipe config... */ +}; + +/* + * link template. + */ +struct dn_link { + struct dn_id oid; + + /* + * Userland sets bw and delay in bits/s and milliseconds. + * The kernel converts this back and forth to bits/tick and ticks. + * XXX what about burst ? + */ + int32_t link_nr; + int bandwidth; /* bit/s or bits/tick. 
*/ + int delay; /* ms and ticks */ + uint64_t burst; /* scaled. bits*Hz XXX */ +}; + +/* + * A flowset, which is a template for flows. Contains parameters + * from the command line: id, target scheduler, queue sizes, plr, + * flow masks, buckets for the flow hash, and possibly scheduler- + * specific parameters (weight, quantum and so on). + */ +struct dn_fs { + struct dn_id oid; + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ + int qsize; /* queue size in slots or bytes */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t buckets; /* buckets used for the queue hash table */ + + struct ipfw_flow_id flow_mask; + uint32_t sched_nr; /* the scheduler we attach to */ + /* generic scheduler parameters. Leave them at -1 if unset. + * Now we use 0: weight, 1: lmax, 2: priority + */ + int par[4]; + + /* RED/GRED parameters. + * weight and probabilities are in the range 0..1 represented + * in fixed point arithmetic with SCALE_RED fractional bits + * (SCALE shifts left by SCALE_RED, SCALE_VAL shifts back). + */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + +}; + +/* + * dn_flow collects flow_id and stats for queues and scheduler + * instances, and is used to pass this information to userland. + * oid.type/oid.subtype describe the object, oid.id is the number + * of the parent object. 
+ */ +struct dn_flow { + struct dn_id oid; + struct ipfw_flow_id fid; + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; /* drop counter (presumably packets -- confirm) */ +}; + + +/* + * Scheduler template, mostly indicating the name, number, + * sched_mask and buckets. 
+ */ +struct dn_sch { + struct dn_id oid; + uint32_t sched_nr; /* N, scheduler number */ + uint32_t buckets; /* number of buckets for the instances */ + uint32_t flags; /* have_mask, ... */ + + char name[16]; /* null terminated */ + /* mask to select the appropriate scheduler instance */ + struct ipfw_flow_id sched_mask; /* M */ +}; + + +/* A delay profile is attached to a link. + * Note that a profile, as any other object, cannot be longer than 2^16 + */ +#define ED_MAX_SAMPLES_NO 1024 +struct dn_profile { + struct dn_id oid; + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int link_nr; + int loss_level; + int _bandwidth; // XXX use link bandwidth? unused ? + int samples_no; /* actual len of samples[] */ + int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ +}; + + + +/* + * Overall structure of dummynet + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE (bad name). + +A QUEUE defines a classifier, which groups packets into flows +according to a 'mask', puts them into independent queues (one +per flow) with configurable size and queue management policy, +and passes flows to a scheduler: + + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ + +Many QUEUE objects can connect to the same scheduler, each +QUEUE object can have its own set of parameters. + +In turn, the SCHEDuler 'forks' multiple instances according +to a 'sched_mask', each instance manages its own set of queues +and transmits on a private instance of a configurable LINK. 
+ +A PIPE is a simplified version of the above, where there +is no flow_mask, and each scheduler instance handles a single queue. + +The following data structures (visible from userland) describe +the objects used by dummynet: + + + dn_link, contains the main configuration parameters related + to delay and bandwidth; + + dn_profile describes a delay profile; + + dn_flow describes the flow status (flow id, statistics) + + + dn_sch describes a scheduler + + dn_fs describes a flowset (msk, weight, queue parameters) + + * + */ + +#endif /* _IP_DUMMYNET_H */ diff --git a/example/ipfw/sys/netinet/ip_fw.h b/example/ipfw/sys/netinet/ip_fw.h new file mode 100644 index 0000000..7ec6f87 --- /dev/null +++ b/example/ipfw/sys/netinet/ip_fw.h @@ -0,0 +1,1009 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netinet/ip_fw.h 273035 2014-10-13 13:49:28Z melifaro $ + */ + +#ifndef _IPFW2_H +#define _IPFW2_H + +/* + * The default rule number. By the design of ip_fw, the default rule + * is the last one, so its number can also serve as the highest number + * allowed for a rule. The ip_fw code relies on both meanings of this + * constant. + */ +#define IPFW_DEFAULT_RULE 65535 + +#define RESVD_SET 31 /*set for default and persistent rules*/ +#define IPFW_MAX_SETS 32 /* Number of sets supported by ipfw*/ + +/* + * Maximum and default number of ipfw tables. + */ +#define IPFW_TABLES_MAX 65535 +#define IPFW_TABLES_DEFAULT 128 + +/* + * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit + * argument between 1 and 65534. The value 0 (IP_FW_TARG) is used + * to represent the 'tablearg' value, i.e. to indicate the use of the + * 'tablearg' result of the most recent table() lookup. + * Note that 16 bits is only a historical limit, resulting from + * the use of a 16-bit field for that value. In reality, we can have + * 2^32 pipes, queues, tag values and so on. 
+ */ +#define IPFW_ARG_MIN 1 +#define IPFW_ARG_MAX 65534 +#define IP_FW_TABLEARG 65535 /* Compat value for old clients */ +#define IP_FW_TARG 0 /* Current tablearg value */ + +/* + * Number of entries in the call stack of the call/return commands. + * The call stack is currently a uint16_t array with rule numbers. 
+ */ +#define IPFW_CALLSTACK_SIZE 16 + +/* IP_FW3 header/opcodes */ +typedef struct _ip_fw3_opheader { + uint16_t opcode; /* Operation opcode */ + uint16_t version; /* Opcode version */ + uint16_t reserved[2]; /* Align to 64-bit boundary */ +} ip_fw3_opheader; + +/* IP_FW3 opcodes */ +#define IP_FW_TABLE_XADD 86 /* add entry */ +#define IP_FW_TABLE_XDEL 87 /* delete entry */ +#define IP_FW_TABLE_XGETSIZE 88 /* get table size (deprecated) */ +#define IP_FW_TABLE_XLIST 89 /* list table contents */ +#define IP_FW_TABLE_XDESTROY 90 /* destroy table */ +#define IP_FW_TABLES_XLIST 92 /* list all tables */ +#define IP_FW_TABLE_XINFO 93 /* request info for one table */ +#define IP_FW_TABLE_XFLUSH 94 /* flush table data */ +#define IP_FW_TABLE_XCREATE 95 /* create new table */ +#define IP_FW_TABLE_XMODIFY 96 /* modify existing table */ +#define IP_FW_XGET 97 /* Retrieve configuration */ +#define IP_FW_XADD 98 /* add rule */ +#define IP_FW_XDEL 99 /* del rule */ +#define IP_FW_XMOVE 100 /* move rules to different set */ +#define IP_FW_XZERO 101 /* clear accounting */ +#define IP_FW_XRESETLOG 102 /* zero rules logs */ +#define IP_FW_SET_SWAP 103 /* Swap between 2 sets */ +#define IP_FW_SET_MOVE 104 /* Move one set to another one */ +#define IP_FW_SET_ENABLE 105 /* Enable/disable sets */ +#define IP_FW_TABLE_XFIND 106 /* finds an entry */ +#define IP_FW_XIFLIST 107 /* list tracked interfaces */ +#define IP_FW_TABLES_ALIST 108 /* list table algorithms */ +#define IP_FW_TABLE_XSWAP 109 /* swap two tables */ +#define IP_FW_TABLE_VLIST 110 /* dump table value hash */ + +#define IP_FW_NAT44_XCONFIG 111 /* Create/modify NAT44 instance */ +#define IP_FW_NAT44_DESTROY 112 /* Destroys NAT44 instance */ +#define IP_FW_NAT44_XGETCONFIG 113 /* Get NAT44 instance config */ +#define IP_FW_NAT44_LIST_NAT 114 /* List all NAT44 instances */ +#define IP_FW_NAT44_XGETLOG 115 /* Get log from NAT44 instance */ + +#define IP_FW_DUMP_SOPTCODES 116 /* Dump available sopts/versions */ + +/* + * The 
kernel representation of ipfw rules is made of a list of + * 'instructions' (for all practical purposes equivalent to BPF + * instructions), which specify which fields of the packet + * (or its metadata) should be analysed. + * + * Each instruction is stored in a structure which begins with + * "ipfw_insn", and can contain extra fields depending on the + * instruction type (listed below). + * Note that the code is written so that individual instructions + * have a size which is a multiple of 32 bits. This means that, if + * such structures contain pointers or other 64-bit entities, + * (there is just one instance now) they may end up unaligned on + * 64-bit architectures, so they must be handled with care. + * + * "enum ipfw_opcodes" are the opcodes supported. We can have up + * to 256 different opcodes. When adding new opcodes, they should + * be appended to the end of the opcode list before O_LAST_OPCODE; + * this will prevent the ABI from being broken; otherwise users + * will have to recompile ipfw(8) when they update the kernel. 
+ */ + +enum ipfw_opcodes { /* arguments (4 byte each) */ + O_NOP, + + O_IP_SRC, /* u32 = IP */ + O_IP_SRC_MASK, /* ip = IP/mask */ + O_IP_SRC_ME, /* none */ + O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_DST, /* u32 = IP */ + O_IP_DST_MASK, /* ip = IP/mask */ + O_IP_DST_ME, /* none */ + O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ + + O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ + O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ + O_PROTO, /* arg1=protocol */ + + O_MACADDR2, /* 2 mac addr:mask */ + O_MAC_TYPE, /* same as srcport */ + + O_LAYER2, /* none */ + O_IN, /* none */ + O_FRAG, /* none */ + + O_RECV, /* none */ + O_XMIT, /* none */ + O_VIA, /* none */ + + O_IPOPT, /* arg1 = 2*u8 bitmap */ + O_IPLEN, /* arg1 = len */ + O_IPID, /* arg1 = id */ + + O_IPTOS, /* arg1 = id */ + O_IPPRECEDENCE, /* arg1 = precedence << 5 */ + O_IPTTL, /* arg1 = TTL */ + + O_IPVER, /* arg1 = version */ + O_UID, /* u32 = id */ + O_GID, /* u32 = id */ + O_ESTAB, /* none (tcp established) */ + O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ + O_TCPWIN, /* arg1 = desired win */ + O_TCPSEQ, /* u32 = desired seq. */ + O_TCPACK, /* u32 = desired seq. */ + O_ICMPTYPE, /* u32 = icmp bitmap */ + O_TCPOPTS, /* arg1 = 2*u8 bitmap */ + + O_VERREVPATH, /* none */ + O_VERSRCREACH, /* none */ + + O_PROBE_STATE, /* none */ + O_KEEP_STATE, /* none */ + O_LIMIT, /* ipfw_insn_limit */ + O_LIMIT_PARENT, /* dyn_type, not an opcode. */ + + /* + * These are really 'actions'. + */ + + O_LOG, /* ipfw_insn_log */ + O_PROB, /* u32 = match probability */ + + O_CHECK_STATE, /* none */ + O_ACCEPT, /* none */ + O_DENY, /* none */ + O_REJECT, /* arg1=icmp arg (same as deny) */ + O_COUNT, /* none */ + O_SKIPTO, /* arg1=next rule number */ + O_PIPE, /* arg1=pipe number */ + O_QUEUE, /* arg1=queue number */ + O_DIVERT, /* arg1=port number */ + O_TEE, /* arg1=port number */ + O_FORWARD_IP, /* fwd sockaddr */ + O_FORWARD_MAC, /* fwd mac */ + O_NAT, /* nope */ + O_REASS, /* none */ + + /* + * More opcodes. 
+ */ + O_IPSEC, /* has ipsec history */ + O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ + O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ + O_ANTISPOOF, /* none */ + O_JAIL, /* u32 = id */ + O_ALTQ, /* u32 = altq classif. qid */ + O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ + O_TCPDATALEN, /* arg1 = tcp data len */ + O_IP6_SRC, /* address without mask */ + O_IP6_SRC_ME, /* my addresses */ + O_IP6_SRC_MASK, /* address with the mask */ + O_IP6_DST, + O_IP6_DST_ME, + O_IP6_DST_MASK, + O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ + O_ICMP6TYPE, /* icmp6 packet type filtering */ + O_EXT_HDR, /* filtering for ipv6 extension header */ + O_IP6, + + /* + * actions for ng_ipfw + */ + O_NETGRAPH, /* send to ng_ipfw */ + O_NGTEE, /* copy to ng_ipfw */ + + O_IP4, + + O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ + + O_TAG, /* arg1=tag number */ + O_TAGGED, /* arg1=tag number */ + + O_SETFIB, /* arg1=FIB number */ + O_FIB, /* arg1=FIB desired fib number */ + + O_SOCKARG, /* socket argument */ + + O_CALLRETURN, /* arg1=called rule number */ + + O_FORWARD_IP6, /* fwd sockaddr_in6 */ + + O_DSCP, /* 2 u32 = DSCP mask */ + O_SETDSCP, /* arg1=DSCP value */ + O_IP_FLOW_LOOKUP, /* arg1=table number, u32=value */ + + O_LAST_OPCODE /* not an opcode! */ +}; + + +/* + * The extension header are filtered only for presence using a bit + * vector with a flag for each header. + */ +#define EXT_FRAGMENT 0x1 +#define EXT_HOPOPTS 0x2 +#define EXT_ROUTING 0x4 +#define EXT_AH 0x8 +#define EXT_ESP 0x10 +#define EXT_DSTOPTS 0x20 +#define EXT_RTHDR0 0x40 +#define EXT_RTHDR2 0x80 + +/* + * Template for instructions. + * + * ipfw_insn is used for all instructions which require no operands, + * a single 16-bit value (arg1), or a couple of 8-bit values. + * + * For other instructions which require different/larger arguments + * we have derived structures, ipfw_insn_*. + * + * The size of the instruction (in 32-bit words) is in the low + * 6 bits of "len". 
The 2 remaining bits are used to implement + * NOT and OR on individual instructions. Given a type, you can + * compute the length to be put in "len" using F_INSN_SIZE(t) + * + * F_NOT negates the match result of the instruction. + * + * F_OR is used to build or blocks. By default, instructions + * are evaluated as part of a logical AND. An "or" block + * { X or Y or Z } contains F_OR set in all but the last + * instruction of the block. A match will cause the code + * to skip past the last instruction of the block. + * + * NOTA BENE: in a couple of places we assume that + * sizeof(ipfw_insn) == sizeof(u_int32_t) + * this needs to be fixed. + * + */ +typedef struct _ipfw_insn { /* template for instructions */ + u_int8_t opcode; + u_int8_t len; /* number of 32-bit words */ +#define F_NOT 0x80 +#define F_OR 0x40 +#define F_LEN_MASK 0x3f +#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) + + u_int16_t arg1; +} ipfw_insn; + +/* + * The F_INSN_SIZE(type) computes the size, in 4-byte words, of + * a given type. + */ +#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) + +/* + * This is used to store an array of 16-bit entries (ports etc.) + */ +typedef struct _ipfw_insn_u16 { + ipfw_insn o; + u_int16_t ports[2]; /* there may be more */ +} ipfw_insn_u16; + +/* + * This is used to store an array of 32-bit entries + * (uid, single IPv4 addresses etc.) + */ +typedef struct _ipfw_insn_u32 { + ipfw_insn o; + u_int32_t d[1]; /* one or more */ +} ipfw_insn_u32; + +/* + * This is used to store IP addr-mask pairs. + */ +typedef struct _ipfw_insn_ip { + ipfw_insn o; + struct in_addr addr; + struct in_addr mask; +} ipfw_insn_ip; + +/* + * This is used to forward to a given address (ip). + */ +typedef struct _ipfw_insn_sa { + ipfw_insn o; + struct sockaddr_in sa; +} ipfw_insn_sa; + +/* + * This is used to forward to a given address (ipv6). + */ +typedef struct _ipfw_insn_sa6 { + ipfw_insn o; + struct sockaddr_in6 sa; +} ipfw_insn_sa6; + +/* + * This is used for MAC addr-mask pairs. 
+ */ +typedef struct _ipfw_insn_mac { + ipfw_insn o; + u_char addr[12]; /* dst[6] + src[6] */ + u_char mask[12]; /* dst[6] + src[6] */ +} ipfw_insn_mac; + +/* + * This is used for interface match rules (recv xx, xmit xx). + */ +typedef struct _ipfw_insn_if { + ipfw_insn o; + union { + struct in_addr ip; + int glob; + uint16_t kidx; + } p; + char name[IFNAMSIZ]; +} ipfw_insn_if; + +/* + * This is used for storing an altq queue id number. + */ +typedef struct _ipfw_insn_altq { + ipfw_insn o; + u_int32_t qid; +} ipfw_insn_altq; + +/* + * This is used for limit rules. + */ +typedef struct _ipfw_insn_limit { + ipfw_insn o; + u_int8_t _pad; + u_int8_t limit_mask; /* combination of DYN_* below */ +#define DYN_SRC_ADDR 0x1 +#define DYN_SRC_PORT 0x2 +#define DYN_DST_ADDR 0x4 +#define DYN_DST_PORT 0x8 + + u_int16_t conn_limit; +} ipfw_insn_limit; + +/* + * This is used for log instructions. + */ +typedef struct _ipfw_insn_log { + ipfw_insn o; + u_int32_t max_log; /* how many do we log -- 0 = all */ + u_int32_t log_left; /* how many left to log */ +} ipfw_insn_log; + +/* Legacy NAT structures, compat only */ +#ifndef _KERNEL +/* + * Data structures required by both ipfw(8) and ipfw(4) but not part of the + * management API are protected by IPFW_INTERNAL. + */ +#ifdef IPFW_INTERNAL +/* Server pool support (LSNAT). */ +struct cfg_spool { + LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ + struct in_addr addr; + u_short port; +}; +#endif + +/* Redirect modes id. */ +#define REDIR_ADDR 0x01 +#define REDIR_PORT 0x02 +#define REDIR_PROTO 0x04 + +#ifdef IPFW_INTERNAL +/* Nat redirect configuration. 
*/ +struct cfg_redir { + LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ + u_int16_t mode; /* type of redirect mode */ + struct in_addr laddr; /* local ip address */ + struct in_addr paddr; /* public ip address */ + struct in_addr raddr; /* remote ip address */ + u_short lport; /* local port */ + u_short pport; /* public port */ + u_short rport; /* remote port */ + u_short pport_cnt; /* number of public ports */ + u_short rport_cnt; /* number of remote ports */ + int proto; /* protocol: tcp/udp */ + struct alias_link **alink; + /* num of entry in spool chain */ + u_int16_t spool_cnt; + /* chain of spool instances */ + LIST_HEAD(spool_chain, cfg_spool) spool_chain; +}; +#endif + +#ifdef IPFW_INTERNAL +/* Nat configuration data struct. */ +struct cfg_nat { + /* chain of nat instances */ + LIST_ENTRY(cfg_nat) _next; + int id; /* nat id */ + struct in_addr ip; /* nat ip address */ + char if_name[IF_NAMESIZE]; /* interface name */ + int mode; /* aliasing mode */ + struct libalias *lib; /* libalias instance */ + /* number of entry in spool chain */ + int redir_cnt; + /* chain of redir instances */ + LIST_HEAD(redir_chain, cfg_redir) redir_chain; +}; +#endif + +#define SOF_NAT sizeof(struct cfg_nat) +#define SOF_REDIR sizeof(struct cfg_redir) +#define SOF_SPOOL sizeof(struct cfg_spool) + +#endif /* ifndef _KERNEL */ + + +struct nat44_cfg_spool { + struct in_addr addr; + uint16_t port; + uint16_t spare; +}; +#define NAT44_REDIR_ADDR 0x01 +#define NAT44_REDIR_PORT 0x02 +#define NAT44_REDIR_PROTO 0x04 + +/* Nat redirect configuration. 
*/ +struct nat44_cfg_redir { + struct in_addr laddr; /* local ip address */ + struct in_addr paddr; /* public ip address */ + struct in_addr raddr; /* remote ip address */ + uint16_t lport; /* local port */ + uint16_t pport; /* public port */ + uint16_t rport; /* remote port */ + uint16_t pport_cnt; /* number of public ports */ + uint16_t rport_cnt; /* number of remote ports */ + uint16_t mode; /* type of redirect mode */ + uint16_t spool_cnt; /* num of entry in spool chain */ + uint16_t spare; + uint32_t proto; /* protocol: tcp/udp */ +}; + +/* Nat configuration data struct. */ +struct nat44_cfg_nat { + char name[64]; /* nat name */ + char if_name[64]; /* interface name */ + uint32_t size; /* structure size incl. redirs */ + struct in_addr ip; /* nat IPv4 address */ + uint32_t mode; /* aliasing mode */ + uint32_t redir_cnt; /* number of entry in spool chain */ +}; + +/* Nat command. */ +typedef struct _ipfw_insn_nat { + ipfw_insn o; + struct cfg_nat *nat; +} ipfw_insn_nat; + +/* Apply ipv6 mask on ipv6 addr */ +#define APPLY_MASK(addr,mask) \ + (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ + (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ + (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ + (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; + +/* Structure for ipv6 */ +typedef struct _ipfw_insn_ip6 { + ipfw_insn o; + struct in6_addr addr6; + struct in6_addr mask6; +} ipfw_insn_ip6; + +/* Used to support icmp6 types */ +typedef struct _ipfw_insn_icmp6 { + ipfw_insn o; + uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h + * define ICMP6_MAXTYPE + * as follows: n = ICMP6_MAXTYPE/32 + 1 + * Actually is 203 + */ +} ipfw_insn_icmp6; + +/* + * Here we have the structure representing an ipfw rule.
+ * + * Layout: + * struct ip_fw_rule + * [ counter block, size = rule->cntr_len ] + * [ one or more instructions, size = rule->cmd_len * 4 ] + * + * It starts with a general area (with link fields). + * Counter block may be next (if rule->cntr_len > 0), + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. rule->cmd_len represents + * the total instructions length in u32 words, while act_ofs represents + * rule action offset in u32 words. + * + * When assembling instruction, remember the following: + * + * + if a rule has a "keep-state" (or "limit") option, then the + * first instruction (at r->cmd) MUST BE an O_PROBE_STATE + * + if a rule has a "log" option, then the first action + * (at ACTION_PTR(r)) MUST be O_LOG + * + if a rule has an "altq" option, it comes after "log" + * + if a rule has an O_TAG option, it comes after "log" and "altq" + * + * + * All structures (excluding instructions) are u64-aligned. + * Please keep this. + */ + +struct ip_fw_rule { + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t spare; + uint8_t set; /* rule set (0..31) */ + uint8_t flags; /* rule flags */ + uint32_t rulenum; /* rule number */ + uint32_t id; /* rule id */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; +#define IPFW_RULE_NOOPT 0x01 /* Has no options in body */ + +/* Unaligned version */ + +/* Base ipfw rule counter block.
*/ +struct ip_fw_bcounter { + uint16_t size; /* Size of counter block, bytes */ + uint8_t flags; /* flags for given block */ + uint8_t spare; + uint32_t timestamp; /* tv_sec of last match */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ +}; + + +#ifndef _KERNEL +/* + * Legacy rule format + */ +struct ip_fw { + struct ip_fw *x_next; /* linked list of rules */ + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + uint8_t _pad; /* padding */ + uint32_t id; /* rule id */ + + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; +#endif + +#define ACTION_PTR(rule) \ + (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) + +#define RULESIZE(rule) (sizeof(*(rule)) + (rule)->cmd_len * 4 - 4) + + +#if 1 // should be moved to in.h +/* + * This structure is used as a flow mask and a flow id for various + * parts of the code. + * addr_type is used in userland and kernel to mark the address type. + * fib is used in the kernel to record the fib in use. + * _flags is used in the kernel to store tcp flags for dynamic rules. + */ +struct ipfw_flow_id { + uint32_t dst_ip; + uint32_t src_ip; + uint16_t dst_port; + uint16_t src_port; + uint8_t fib; + uint8_t proto; + uint8_t _flags; /* protocol-specific flags */ + uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + uint32_t flow_id6; + uint32_t extra; /* queue/pipe or frag_id */ +}; +#endif + +#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) + +/* + * Dynamic ipfw rule. 
+ */ +typedef struct _ipfw_dyn_rule ipfw_dyn_rule; + +struct _ipfw_dyn_rule { + ipfw_dyn_rule *next; /* linked list of rules. */ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ipfw_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. + */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ +#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ + +/* + * These are used for lookup tables. 
+ */ + +#define IPFW_TABLE_ADDR 1 /* Table for holding IPv4/IPv6 prefixes */ +#define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ +#define IPFW_TABLE_NUMBER 3 /* Table for holding ports/uid/gid/etc */ +#define IPFW_TABLE_FLOW 4 /* Table for holding flow data */ +#define IPFW_TABLE_MAXTYPE 4 /* Maximum valid number */ + +#define IPFW_TABLE_CIDR IPFW_TABLE_ADDR /* compat */ + +/* Value types */ +#define IPFW_VTYPE_LEGACY 0xFFFFFFFF /* All data is filled in */ +#define IPFW_VTYPE_SKIPTO 0x00000001 /* skipto/call/callreturn */ +#define IPFW_VTYPE_PIPE 0x00000002 /* pipe/queue */ +#define IPFW_VTYPE_FIB 0x00000004 /* setfib */ +#define IPFW_VTYPE_NAT 0x00000008 /* nat */ +#define IPFW_VTYPE_DSCP 0x00000010 /* dscp */ +#define IPFW_VTYPE_TAG 0x00000020 /* tag/untag */ +#define IPFW_VTYPE_DIVERT 0x00000040 /* divert/tee */ +#define IPFW_VTYPE_NETGRAPH 0x00000080 /* netgraph/ngtee */ +#define IPFW_VTYPE_LIMIT 0x00000100 /* limit */ +#define IPFW_VTYPE_NH4 0x00000200 /* IPv4 nexthop */ +#define IPFW_VTYPE_NH6 0x00000400 /* IPv6 nexthop */ + +typedef struct _ipfw_table_entry { + in_addr_t addr; /* network address */ + u_int32_t value; /* value */ + u_int16_t tbl; /* table number */ + u_int8_t masklen; /* mask length */ +} ipfw_table_entry; + +typedef struct _ipfw_table_xentry { + uint16_t len; /* Total entry length */ + uint8_t type; /* entry type */ + uint8_t masklen; /* mask length */ + uint16_t tbl; /* table number */ + uint16_t flags; /* record flags */ + uint32_t value; /* value */ + union { + /* Longest field needs to be aligned by 4-byte boundary */ + struct in6_addr addr6; /* IPv6 address */ + char iface[IF_NAMESIZE]; /* interface name */ + } k; +} ipfw_table_xentry; +#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ + +typedef struct _ipfw_table { + u_int32_t size; /* size of entries in bytes */ + u_int32_t cnt; /* # of entries */ + u_int16_t tbl; /* table number */ + ipfw_table_entry ent[0]; /* entries */ +} ipfw_table; + +typedef struct 
_ipfw_xtable { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t size; /* size of entries in bytes */ + uint32_t cnt; /* # of entries */ + uint16_t tbl; /* table number */ + uint8_t type; /* table type */ + ipfw_table_xentry xent[0]; /* entries */ +} ipfw_xtable; + +typedef struct _ipfw_obj_tlv { + uint16_t type; /* TLV type */ + uint16_t flags; /* TLV-specific flags */ + uint32_t length; /* Total length, aligned to u64 */ +} ipfw_obj_tlv; +#define IPFW_TLV_TBL_NAME 1 +#define IPFW_TLV_TBLNAME_LIST 2 +#define IPFW_TLV_RULE_LIST 3 +#define IPFW_TLV_DYNSTATE_LIST 4 +#define IPFW_TLV_TBL_ENT 5 +#define IPFW_TLV_DYN_ENT 6 +#define IPFW_TLV_RULE_ENT 7 +#define IPFW_TLV_TBLENT_LIST 8 +#define IPFW_TLV_RANGE 9 + +/* Object name TLV */ +typedef struct _ipfw_obj_ntlv { + ipfw_obj_tlv head; /* TLV header */ + uint16_t idx; /* Name index */ + uint8_t spare; /* unused */ + uint8_t type; /* object type, if applicable */ + uint32_t set; /* set, if applicable */ + char name[64]; /* Null-terminated name */ +} ipfw_obj_ntlv; + +/* IPv4/IPv6 L4 flow description */ +struct tflow_entry { + uint8_t af; + uint8_t proto; + uint16_t spare; + uint16_t sport; + uint16_t dport; + union { + struct { + struct in_addr sip; + struct in_addr dip; + } a4; + struct { + struct in6_addr sip6; + struct in6_addr dip6; + } a6; + } a; +}; + +typedef struct _ipfw_table_value { + uint32_t tag; /* O_TAG/O_TAGGED */ + uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t divert; /* O_DIVERT/O_TEE */ + uint16_t skipto; /* skipto, CALLRET */ + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ + uint32_t fib; /* O_SETFIB */ + uint32_t nat; /* O_NAT */ + uint32_t nh4; + uint8_t dscp; + uint8_t spare0[3]; + struct in6_addr nh6; + uint32_t limit; /* O_LIMIT */ + uint32_t spare1; + uint64_t reserved; +} ipfw_table_value; + +/* Table entry TLV */ +typedef struct _ipfw_obj_tentry { + ipfw_obj_tlv head; /* TLV header */ + uint8_t subtype; /* subtype (IPv4,IPv6) */ + uint8_t masklen; /* mask length */ + uint8_t result; /* 
request result */ + uint8_t spare0; + uint16_t idx; /* Table name index */ + uint16_t spare1; + union { + /* Longest field needs to be aligned by 8-byte boundary */ + struct in_addr addr; /* IPv4 address */ + uint32_t key; /* uid/gid/port */ + struct in6_addr addr6; /* IPv6 address */ + char iface[IF_NAMESIZE]; /* interface name */ + struct tflow_entry flow; + } k; + union { + ipfw_table_value value; /* value data */ + uint32_t kidx; /* value kernel index */ + } v; +} ipfw_obj_tentry; +#define IPFW_TF_UPDATE 0x01 /* Update record if exists */ +/* Container TLV */ +#define IPFW_CTF_ATOMIC 0x01 /* Perform atomic operation */ +/* Operation results */ +#define IPFW_TR_IGNORED 0 /* Entry was ignored (rollback) */ +#define IPFW_TR_ADDED 1 /* Entry was successfully added */ +#define IPFW_TR_UPDATED 2 /* Entry was successfully updated */ +#define IPFW_TR_DELETED 3 /* Entry was successfully deleted */ +#define IPFW_TR_LIMIT 4 /* Entry was ignored (limit) */ +#define IPFW_TR_NOTFOUND 5 /* Entry was not found */ +#define IPFW_TR_EXISTS 6 /* Entry already exists */ +#define IPFW_TR_ERROR 7 /* Request has failed (unknown) */ + +typedef struct _ipfw_obj_dyntlv { + ipfw_obj_tlv head; + ipfw_dyn_rule state; +} ipfw_obj_dyntlv; +#define IPFW_DF_LAST 0x01 /* Last state in chain */ + +/* Container TLVs */ +typedef struct _ipfw_obj_ctlv { + ipfw_obj_tlv head; /* TLV header */ + uint32_t count; /* Number of sub-TLVs */ + uint16_t objsize; /* Single object size */ + uint8_t version; /* TLV version */ + uint8_t flags; /* TLV-specific flags */ +} ipfw_obj_ctlv; + +/* Range TLV */ +typedef struct _ipfw_range_tlv { + ipfw_obj_tlv head; /* TLV header */ + uint32_t flags; /* Range flags */ + uint16_t start_rule; /* Range start */ + uint16_t end_rule; /* Range end */ + uint32_t set; /* Range set to match */ + uint32_t new_set; /* New set to move/swap to */ +} ipfw_range_tlv; +#define IPFW_RCFLAG_RANGE 0x01 /* rule range is set */ +#define IPFW_RCFLAG_ALL 0x02 /* match ALL rules */ +#define
IPFW_RCFLAG_SET 0x04 /* match rules in given set */ +/* User-settable flags */ +#define IPFW_RCFLAG_USER (IPFW_RCFLAG_RANGE | IPFW_RCFLAG_ALL | \ + IPFW_RCFLAG_SET) +/* Internally used flags */ +#define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip default rule */ + +typedef struct _ipfw_ta_tinfo { + uint32_t flags; /* Format flags */ + uint32_t spare; + uint8_t taclass4; /* algorithm class */ + uint8_t spare4; + uint16_t itemsize4; /* item size in runtime */ + uint32_t size4; /* runtime structure size */ + uint32_t count4; /* number of items in runtime */ + uint8_t taclass6; /* algorithm class */ + uint8_t spare6; + uint16_t itemsize6; /* item size in runtime */ + uint32_t size6; /* runtime structure size */ + uint32_t count6; /* number of items in runtime */ +} ipfw_ta_tinfo; +#define IPFW_TACLASS_HASH 1 /* algo is based on hash */ +#define IPFW_TACLASS_ARRAY 2 /* algo is based on array */ +#define IPFW_TACLASS_RADIX 3 /* algo is based on radix tree */ + +#define IPFW_TATFLAGS_DATA 0x0001 /* Has data filled in */ +#define IPFW_TATFLAGS_AFDATA 0x0002 /* Separate data per AF */ +#define IPFW_TATFLAGS_AFITEM 0x0004 /* diff. items per AF */ + +typedef struct _ipfw_xtable_info { + uint8_t type; /* table type (addr,iface,..)
*/ + uint8_t tflags; /* type flags */ + uint16_t mflags; /* modification flags */ + uint16_t flags; /* generic table flags */ + uint16_t spare[3]; + uint32_t vmask; /* bitmask with value types */ + uint32_t set; /* set table is in */ + uint32_t kidx; /* kernel index */ + uint32_t refcnt; /* number of references */ + uint32_t count; /* Number of records */ + uint32_t size; /* Total size of records(export)*/ + uint32_t limit; /* Max number of records */ + char tablename[64]; /* table name */ + char algoname[64]; /* algorithm name */ + ipfw_ta_tinfo ta_info; /* additional algo stats */ +} ipfw_xtable_info; +/* Generic table flags */ +#define IPFW_TGFLAGS_LOCKED 0x01 /* Table is locked from changes */ +/* Table type-specific flags */ +#define IPFW_TFFLAG_SRCIP 0x01 +#define IPFW_TFFLAG_DSTIP 0x02 +#define IPFW_TFFLAG_SRCPORT 0x04 +#define IPFW_TFFLAG_DSTPORT 0x08 +#define IPFW_TFFLAG_PROTO 0x10 +/* Table modification flags */ +#define IPFW_TMFLAGS_LIMIT 0x0002 /* Change limit value */ +#define IPFW_TMFLAGS_LOCK 0x0004 /* Change table lock state */ + +typedef struct _ipfw_iface_info { + char ifname[64]; /* interface name */ + uint32_t ifindex; /* interface index */ + uint32_t flags; /* flags */ + uint32_t refcnt; /* number of references */ + uint32_t gencnt; /* number of changes */ + uint64_t spare; +} ipfw_iface_info; +#define IPFW_IFFLAG_RESOLVED 0x01 /* Interface exists */ + +typedef struct _ipfw_ta_info { + char algoname[64]; /* algorithm name */ + uint32_t type; /* lookup type */ + uint32_t flags; + uint32_t refcnt; + uint32_t spare0; + uint64_t spare1; +} ipfw_ta_info; + +#define IPFW_OBJTYPE_TABLE 1 +typedef struct _ipfw_obj_header { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t spare; + uint16_t idx; /* object name index */ + uint8_t objtype; /* object type */ + uint8_t objsubtype; /* object subtype */ + ipfw_obj_ntlv ntlv; /* object name tlv */ +} ipfw_obj_header; + +typedef struct _ipfw_obj_lheader { + ip_fw3_opheader opheader; /* IP_FW3 opcode */
+ uint32_t set_mask; /* disabled set mask */ + uint32_t count; /* Total objects count */ + uint32_t size; /* Total size (incl. header) */ + uint32_t objsize; /* Size of one object */ +} ipfw_obj_lheader; + +#define IPFW_CFG_GET_STATIC 0x01 +#define IPFW_CFG_GET_STATES 0x02 +#define IPFW_CFG_GET_COUNTERS 0x04 +typedef struct _ipfw_cfg_lheader { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t set_mask; /* enabled set mask */ + uint32_t spare; + uint32_t flags; /* Request flags */ + uint32_t size; /* needed buffer size */ + uint32_t start_rule; + uint32_t end_rule; +} ipfw_cfg_lheader; + +typedef struct _ipfw_range_header { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + ipfw_range_tlv range; +} ipfw_range_header; + +typedef struct _ipfw_sopt_info { + uint16_t opcode; + uint8_t version; + uint8_t dir; + uint8_t spare; + uint64_t refcnt; +} ipfw_sopt_info; + +#endif /* _IPFW2_H */ diff --git a/example/ipfw/sys/netinet/tcp.h b/example/ipfw/sys/netinet/tcp.h new file mode 100644 index 0000000..29c4313 --- /dev/null +++ b/example/ipfw/sys/netinet/tcp.h @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: head/sys/netinet/tcp.h 246210 2013-02-01 15:32:20Z jhb $ + */ + +#ifndef _NETINET_TCP_H_ +#define _NETINET_TCP_H_ + +#include <sys/cdefs.h> +#include <sys/types.h> + +#if __BSD_VISIBLE + +typedef u_int32_t tcp_seq; + +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ + +/* + * TCP header. + * Per RFC 793, September, 1981. 
+ */ +struct tcphdr { + u_short th_sport; /* source port */ + u_short th_dport; /* destination port */ + tcp_seq th_seq; /* sequence number */ + tcp_seq th_ack; /* acknowledgement number */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_char th_x2:4, /* (unused) */ + th_off:4; /* data offset */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_char th_off:4, /* data offset */ + th_x2:4; /* (unused) */ +#endif + u_char th_flags; +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) +#define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" + + u_short th_win; /* window */ + u_short th_sum; /* checksum */ + u_short th_urp; /* urgent pointer */ +}; + +#define TCPOPT_EOL 0 +#define TCPOLEN_EOL 1 +#define TCPOPT_PAD 0 /* padding after EOL */ +#define TCPOLEN_PAD 1 +#define TCPOPT_NOP 1 +#define TCPOLEN_NOP 1 +#define TCPOPT_MAXSEG 2 +#define TCPOLEN_MAXSEG 4 +#define TCPOPT_WINDOW 3 +#define TCPOLEN_WINDOW 3 +#define TCPOPT_SACK_PERMITTED 4 +#define TCPOLEN_SACK_PERMITTED 2 +#define TCPOPT_SACK 5 +#define TCPOLEN_SACKHDR 2 +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ +#define TCPOPT_TIMESTAMP 8 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ +#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ +#define TCPOLEN_SIGNATURE 18 + +/* Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ +#define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ + + +/* + * The default maximum segment size (MSS) to be used for new TCP connections + * when path MTU discovery is not enabled. + * + * RFC879 derives the default MSS from the largest datagram size hosts are + * minimally required to handle directly or through IP reassembly minus the + * size of the IP and TCP header. 
With IPv6 the minimum MTU is specified + * in RFC2460. + * + * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) + * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) + * + * We use explicit numerical definition here to avoid header pollution. + */ +#define TCP_MSS 536 +#define TCP6_MSS 1220 + +/* + * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS + * option. Allowing low values of MSS can consume significant resources and + * be used to mount a resource exhaustion attack. + * Connections requesting lower MSS values will be rounded up to this value + * and the IP_DF flag will be cleared to allow fragmentation along the path. + * + * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting + * it to "0" disables the minmss check. + * + * The default value is fine for TCP across the Internet's smallest official + * link MTU (256 bytes for AX.25 packet radio). However, a connection is very + * unlikely to come across such low MTU interfaces these days (anno domini 2003). + */ +#define TCP_MINMSS 216 + +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ + +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ + +#define TCP_MAXBURST 4 /* maximum segments in a burst */ + +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + /* max space left for options */ +#endif /* __BSD_VISIBLE */ + +/* + * User-settable options (used with setsockopt). These are discrete + * values and are not masked together. Some values appear to be + * bitmasks for historical reasons. 
+ */ +#define TCP_NODELAY 1 /* don't delay send to coalesce packets */ +#if __BSD_VISIBLE +#define TCP_MAXSEG 2 /* set maximum segment size */ +#define TCP_NOPUSH 4 /* don't push last block of write */ +#define TCP_NOOPT 8 /* don't use TCP options */ +#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ +#define TCP_INFO 32 /* retrieve tcp_info structure */ +#define TCP_CONGESTION 64 /* get/set congestion control algorithm */ +#define TCP_KEEPINIT 128 /* N, time to establish connection */ +#define TCP_KEEPIDLE 256 /* L,N,X start keepalives after this period */ +#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ +#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ + +/* Start of reserved space for third-party user-settable options. */ +#define TCP_VENDOR SO_VENDOR + +#define TCP_CA_NAME_MAX 16 /* max congestion control name length */ + +#define TCPI_OPT_TIMESTAMPS 0x01 +#define TCPI_OPT_SACK 0x02 +#define TCPI_OPT_WSCALE 0x04 +#define TCPI_OPT_ECN 0x08 +#define TCPI_OPT_TOE 0x10 + +/* + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits + * the caller to query certain information about the state of a TCP + * connection. We provide an overlapping set of fields with the Linux + * implementation, but since this is a fixed size structure, room has been + * left for growth. In order to maximize potential future compatibility with + * the Linux API, the same variable names and order have been adopted, and + * padding left to make room for omitted fields in case they are added later. + * + * XXX: This is currently an unstable ABI/API, in that it is expected to + * change. + */ +struct tcp_info { + u_int8_t tcpi_state; /* TCP FSM state. */ + u_int8_t __tcpi_ca_state; + u_int8_t __tcpi_retransmits; + u_int8_t __tcpi_probes; + u_int8_t __tcpi_backoff; + u_int8_t tcpi_options; /* Options enabled on conn. */ + u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ + tcpi_rcv_wscale:4; /* RFC1323 recv shift value.
*/ + + u_int32_t tcpi_rto; /* Retransmission timeout (usec). */ + u_int32_t __tcpi_ato; + u_int32_t tcpi_snd_mss; /* Max segment size for send. */ + u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ + + u_int32_t __tcpi_unacked; + u_int32_t __tcpi_sacked; + u_int32_t __tcpi_lost; + u_int32_t __tcpi_retrans; + u_int32_t __tcpi_fackets; + + /* Times; measurements in usecs. */ + u_int32_t __tcpi_last_data_sent; + u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ + u_int32_t tcpi_last_data_recv; /* Time since last recv data. */ + u_int32_t __tcpi_last_ack_recv; + + /* Metrics; variable units. */ + u_int32_t __tcpi_pmtu; + u_int32_t __tcpi_rcv_ssthresh; + u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ + u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ + u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ + u_int32_t __tcpi_advmss; + u_int32_t __tcpi_reordering; + + u_int32_t __tcpi_rcv_rtt; + u_int32_t tcpi_rcv_space; /* Advertised recv window. */ + + /* FreeBSD extensions to tcp_info. */ + u_int32_t tcpi_snd_wnd; /* Advertised send window. */ + u_int32_t tcpi_snd_bwnd; /* No longer used. */ + u_int32_t tcpi_snd_nxt; /* Next egress seqno */ + u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ + u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ + u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ + u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ + u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ + + /* Padding to grow without breaking ABI. */ + u_int32_t __tcpi_pad[26]; /* Padding. */ +}; +#endif + +#endif /* !_NETINET_TCP_H_ */ diff --git a/example/ipfw/sys/netinet/udp.h b/example/ipfw/sys/netinet/udp.h new file mode 100644 index 0000000..c4e6e08 --- /dev/null +++ b/example/ipfw/sys/netinet/udp.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: head/sys/netinet/udp.h 246210 2013-02-01 15:32:20Z jhb $ + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * UDP protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +/* + * User-settable options (used with setsockopt). 
+ */ +#define UDP_ENCAP 1 + +/* Start of reserved space for third-party user-settable options. */ +#define UDP_VENDOR SO_VENDOR + +/* + * UDP Encapsulation of IPsec Packets options. + */ +/* Encapsulation types. */ +#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ +#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ + +/* Default ESP in UDP encapsulation port. */ +#define UDP_ENCAP_ESPINUDP_PORT 500 + +/* Maximum UDP fragment size for ESP over UDP. */ +#define UDP_ENCAP_ESPINUDP_MAXFRAGLEN 552 + +#endif diff --git a/example/ipfw/sys/netpfil/ipfw/dn_heap.c b/example/ipfw/sys/netpfil/ipfw/dn_heap.c new file mode 100644 index 0000000..b47bd28 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_heap.c @@ -0,0 +1,552 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, used in dummynet + * + * $FreeBSD: head/sys/netpfil/ipfw/dn_heap.c 240494 2012-09-14 11:51:49Z glebius $ + */ + +#include <sys/cdefs.h> +#include <sys/param.h> +#ifdef _KERNEL +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/dn_heap.c 240494 2012-09-14 11:51:49Z glebius $"); +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <netpfil/ipfw/dn_heap.h> +#ifndef log +#define log(x, arg...) +#endif + +#else /* !_KERNEL */ + +#include <stdio.h> +#include <dn_test.h> +#include <strings.h> +#include <stdlib.h> + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) +#define MALLOC_DEFINE(a, b, c) +static void *my_malloc(int s) { return malloc(s); } +static void my_free(void *p) { free(p); } +#define malloc(s, t, w) my_malloc(s) +#define free(p, t) my_free(p) +#endif /* !_KERNEL */ + +static MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 16 entries. 
+ * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_resize(struct dn_heap *h, unsigned int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) /* have enough room */ + return 0; +#if 1 /* round to the next power of 2 */ + new_size |= new_size >> 1; + new_size |= new_size >> 2; + new_size |= new_size >> 4; + new_size |= new_size >> 8; + new_size |= new_size >> 16; +#else + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; +#endif + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); + if (p == NULL) { + printf("--- %s, resize %d failed\n", __func__, new_size ); + return 1; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DN_HEAP); + } + h->p = p; + h->size = new_size; + return 0; +} + +int +heap_init(struct dn_heap *h, int size, int ofs) +{ + if (heap_resize(h, size)) + return 1; + h->elements = 0; + h->ofs = ofs; + return 0; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If ofs > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ + } while (0) +/* + * RESET_OFFSET is used for sanity checks. It sets ofs + * to an invalid value. 
+ */ +#define RESET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ + } while (0) + +int +heap_insert(struct dn_heap *h, uint64_t key1, void *p) +{ + int son = h->elements; + + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); + if (p == NULL) { /* data already there, set starting point */ + son = key1; + } else { /* insert new element at the end, possibly resize */ + son = h->elements; + if (son == h->size) /* need resize... */ + // XXX expand by 16 or so + if (heap_resize(h, h->elements+16) ) + return 1; /* failure... */ + h->p[son].object = p; + h->p[son].key = key1; + h->elements++; + } + /* make sure that son >= father along the path */ + while (son > 0) { + int father = HEAP_FATHER(son); + struct dn_heap_entry tmp; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp); + SET_OFFSET(h, son); + son = father; + } + SET_OFFSET(h, son); + return 0; +} + +/* + * Remove the top element from the heap, or obj if obj != NULL (which requires h->ofs > 0 so that the index stored inside obj can be read back). An empty heap is reported with printf and left untouched. + */ +void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1; + + if (max < 0) { + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); + return; + } + if (obj == NULL) + father = 0; /* default: move up smallest child */ + else { /* extract specific element, index is at offset */ + if (h->ofs <= 0) + panic("%s: extract from middle not set on %p\n", + __FUNCTION__, h); + father = *((int32_t *)((char *)obj + h->ofs)); /* same type SET_OFFSET stores */ + if (father < 0 || father >= h->elements) { + panic("%s: father %d out of bound 0..%d\n", + __FUNCTION__, father, max); + } + } + /* + * below, father is the index of the empty element, which + * we replace at each step with the smallest child until we + * reach the bottom level. + */ + // XXX why removing RESET_OFFSET increases runtime by 10% ?
+ RESET_OFFSET(h, father); + while ( (child = HEAP_LEFT(father)) <= max ) { + if (child != max && + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child++; /* take right child, otherwise left */ + h->p[father] = h->p[child]; + SET_OFFSET(h, father); + father = child; + } + h->elements--; + if (father != max) { + /* + * Fill hole with last entry and bubble up, + * reusing the insert code + */ + h->p[father] = h->p[max]; + heap_insert(h, father, NULL); + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! + */ +static void +heap_move(struct dn_heap *h, uint64_t new_key, void *object) +{ + int temp, i, max = h->elements-1; + struct dn_heap_entry *p, buf; + + if (h->ofs <= 0) + panic("cannot move items on this heap"); + p = h->p; /* shortcut */ + + i = *((int32_t *)((char *)object + h->ofs)); + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ + p[i].key = new_key; + for (; i>0 && + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); + i = temp ) { /* bubble up */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } + } else { /* must move down */ + p[i].key = new_key; + while ( (temp = HEAP_LEFT(i)) <= max ) { + /* found left child */ + if (temp != max && + DN_KEY_LT(p[temp+1].key, p[temp].key)) + temp++; /* select child with min key */ + if (DN_KEY_LT(p[temp].key, new_key)) { + /* go down */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } else + break; + i = temp; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries.
+ */ +static void +heapify(struct dn_heap *h) +{ + int i; + + for (i = 0; i < h->elements; i++ ) + heap_insert(h, i , NULL); /* p == NULL: bubble up the entry already at index i */ +} +/* Call fn(object, arg) on each entry. An entry for which fn returns HEAP_SCAN_DEL is dropped (the last entry is moved into its slot and examined next); HEAP_SCAN_END stops the scan. If anything was dropped the array is re-heapified. Returns the number of dropped entries. */ +int +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), + uintptr_t arg) +{ + int i, ret, found; + + for (i = found = 0 ; i < h->elements ;) { + ret = fn(h->p[i].object, arg); + if (ret & HEAP_SCAN_DEL) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (ret & HEAP_SCAN_END) + break; + } + if (found) + heapify(h); + return found; +} + +/* + * Clean up the heap: free the entry array (if one was allocated) and zero the descriptor. The struct dn_heap itself is not freed here. + */ +void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DN_HEAP); + bzero(h, sizeof(*h) ); +} + +/* + * hash table support. + */ + +struct dn_ht { + int buckets; /* how many buckets, really buckets - 1*/ + int entries; /* how many entries */ + int ofs; /* offset of link field */ + uint32_t (*hash)(uintptr_t, int, void *arg); /* hash(key, flags, arg) */ + int (*match)(void *_el, uintptr_t key, int, void *); /* match(obj, key, flags, arg), nonzero on match */ + void *(*newh)(uintptr_t, int, void *); /* newh(key, flags, arg), optional, creates the object on insert */ + void **ht; /* bucket heads */ +}; +/* + * Initialize, allocating bucket pointers inline. + * Recycle previous record if possible. + * If the 'newh' function is not supplied, we assume that the + * key passed to ht_find is the same object to be stored in. + */ +struct dn_ht * +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, + uint32_t (*h)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)) +{ + int l; + + /* + * Notes about rounding bucket size to a power of two. + * Given the original bucket size, we compute the nearest lower and + * higher power of two, minus 1 (respectively b_min and b_max) because + * this value will be used to do an AND with the index returned + * by hash function. + * To choose between these two values, the original bucket size is + * compared with b_min. If the original size is greater than 4/3 b_min, + * we round the bucket size to b_max, else to b_min.
+ * This ratio tries to round to the nearest power of two, favouring + * the greater size if the difference between the two powers is relatively + * big. + * Rounding the bucket size to a power of two avoids the use of + * a modulo when calculating the correct bucket. + * The ht->buckets variable stores the bucket size - 1 so that we can simply + * do an AND between the index returned by hash function and ht->buckets + * instead of a modulo. + */ + int b_min; /* min buckets */ + int b_max; /* max buckets */ + int b_ori; /* original buckets */ + + if (h == NULL || match == NULL) { + printf("--- missing hash or match function"); + return NULL; + } + if (buckets < 1 || buckets > 65536) + return NULL; + + b_ori = buckets; + /* calculate next power of 2, - 1*/ + buckets |= buckets >> 1; + buckets |= buckets >> 2; + buckets |= buckets >> 4; + buckets |= buckets >> 8; + buckets |= buckets >> 16; + + b_max = buckets; /* Next power of 2, minus 1 */ + b_min = buckets >> 1; /* Previous power of 2, minus 1 */ + + /* Calculate the 'nearest' bucket size */ + if (b_min * 4000 / 3000 < b_ori) + buckets = b_max; + else + buckets = b_min; + + if (ht) { /* see if we can reuse */ + if (buckets <= ht->buckets) { + ht->buckets = buckets; + } else { + /* free pointers if not allocated inline */ + if (ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + ht = NULL; + } + } + if (ht == NULL) { + /* Allocate buckets + 1 entries because buckets is used to + * do the AND with the index returned by hash function + */ + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); + } + if (ht) { + ht->ht = (void **)(ht + 1); + ht->buckets = buckets; + ht->ofs = ofs; + ht->hash = h; + ht->match = match; + ht->newh = newh; + } + return ht; +} + +/* dummy callback for dn_ht_free to unlink all */ +static int +do_del(void *obj, void *arg) +{ + return DNHT_SCAN_DEL; +} +/* With DNHT_REMOVE, only unlink every element (the table itself is kept); otherwise free the bucket array if it is not inline, then the table. A NULL table is ignored. */ +void +dn_ht_free(struct dn_ht *ht, int flags) +{ + if (ht == NULL) + return; + if (flags & DNHT_REMOVE) { +
(void)dn_ht_scan(ht, do_del, NULL); + } else { + if (ht->ht && ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + } +} +/* Number of entries currently in the table; 0 for a NULL table. */ +int +dn_ht_entries(struct dn_ht *ht) +{ + return ht ? ht->entries : 0; +} + +/* Lookup and optionally create or delete an element. Returns the object found (or newly inserted), NULL otherwise. With DNHT_REMOVE a found object is unlinked and still returned. With DNHT_INSERT a missing object is obtained from newh() (or the key itself is used when newh is NULL) and linked at the head of its bucket. */ +void * +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) +{ + int i; + void **pp, *p; + + if (ht == NULL) /* easy on an empty hash */ + return NULL; + i = (ht->buckets == 1) ? 0 : + (ht->hash(key, flags, arg) & ht->buckets); /* NOTE(review): ht->buckets is stored as a mask (count - 1), so == 1 matches a two-bucket table and pins it to bucket 0 -- confirm this is intended */ + + for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { + if (flags & DNHT_MATCH_PTR) { + if (key == (uintptr_t)p) + break; + } else if (ht->match(p, key, flags, arg)) /* found match */ + break; + } + if (p) { + if (flags & DNHT_REMOVE) { + /* link in the next element */ + *pp = *(void **)((char *)p + ht->ofs); + *(void **)((char *)p + ht->ofs) = NULL; + ht->entries--; + } + } else if (flags & DNHT_INSERT) { + // printf("%s before calling new, bucket %d ofs %d\n", + // __FUNCTION__, i, ht->ofs); + p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; + // printf("%s newh returns %p\n", __FUNCTION__, p); + if (p) { + ht->entries++; + *(void **)((char *)p + ht->ofs) = ht->ht[i]; + ht->ht[i] = p; + } + } + return p; +} + +/* + * do a scan with the option to delete the object. Extract next before + * running the callback because the element may be destroyed there. + */ +int +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + for (i = 0; i <= ht->buckets; i++) { + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return (ret & DNHT_COPY_ERR) ?
-1 : found; + } + } + return found; +} + +/* + * Similar to dn_ht_scan(), except that the scan is performed only + * in the bucket '*bucket'. If that number is out of range (negative or + * larger than ht->buckets) it is reset to 0 and bucket 0 is scanned. + * If the callback returns DNHT_SCAN_END the scan of the bucket stops at + * once. Entries for which the callback returns DNHT_SCAN_DEL are unlinked. + * Returns the number of unlinked entries (0 if ht or fn is NULL). + */ +int +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), + void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + if (*bucket < 0 || *bucket > ht->buckets) + *bucket = 0; + i = *bucket; + + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + return found; +} diff --git a/example/ipfw/sys/netpfil/ipfw/dn_heap.h b/example/ipfw/sys/netpfil/ipfw/dn_heap.h new file mode 100644 index 0000000..2b44d8e --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_heap.h @@ -0,0 +1,192 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, header file + * + * $FreeBSD: head/sys/netpfil/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $ + */ + +#ifndef _IP_DN_HEAP_H +#define _IP_DN_HEAP_H +/* Key comparisons, done on the difference of the two keys cast to int64_t. */ +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) + +/* + * This module implements a binary heap supporting random extraction. + * + * A heap entry contains an uint64_t key and a pointer to object. + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' + * + * The heap is a struct dn_heap plus a dynamically allocated + * array of dn_heap_entry entries. 'size' represents the size of + * the array, 'elements' counts the entries in use. The topmost + * element has the smallest key. + * The heap supports ordered insert, and extract from the top. + * To extract an object from the middle of the heap, the object + * must reserve an 'int32_t' to store the position of the object + * in the heap itself, and the location of this field must be + * passed as an argument to heap_init() -- use -1 if the feature + * is not used.
+ */ +struct dn_heap_entry { + uint64_t key; /* sorting key, smallest comes first */ + void *object; /* object pointer */ +}; + +struct dn_heap { + int size; /* the size of the array */ + int elements; /* elements in use */ + int ofs; /* offset in the object of heap index */ + struct dn_heap_entry *p; /* array of "size" entries */ +}; + +enum { + HEAP_SCAN_DEL = 1, + HEAP_SCAN_END = 2, +}; + +/* + * heap_init() reinitializes the heap setting the size and the offset + * of the index for random extraction (use -1 if not used). + * The 'elements' counter is set to 0. + * + * SET_HEAP_OFS() indicates where, in the object, the index + * for random extractions from the heap is stored. + * + * heap_free() frees the memory associated to a heap. + * + * heap_insert() adds a key-pointer pair to the heap + * + * HEAP_TOP() returns a pointer to the top element of the heap, + * but makes no checks on its existence (XXX should we change ?) + * + * heap_extract() removes the entry at the top (or the given object); it returns nothing, + * so the key and the object pointer should have been read before. + * + * heap_scan() invokes a callback on each entry of the heap. + * The callback can return a combination of HEAP_SCAN_DEL and + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must + * be removed, and HEAP_SCAN_END means to terminate the scan. + * heap_scan() returns the number of elements removed. + * Because the order is not guaranteed, we should use heap_scan() + * only as a last resort mechanism.
+ */ +#define HEAP_TOP(h) ((h)->p) +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) +int heap_init(struct dn_heap *h, int size, int ofs); +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); +void heap_extract(struct dn_heap *h, void *obj); +void heap_free(struct dn_heap *h); +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); + +/*------------------------------------------------------ + * This module implements a generic hash table with support for + * running callbacks on the entire table. To avoid allocating + * memory during hash table operations, objects must reserve + * space for a link field. XXX if the heap is moderately full, + * an SLIST suffices, and we can tolerate the cost of a hash + * computation on each removal. + * + * dn_ht_init() initializes the table, setting the number of + * buckets, the offset of the link field, the main callbacks. + * Callbacks are: + * + * hash(key, flags, arg) called to return a bucket index. + * match(obj, key, flags, arg) called to determine if key + * matches the current 'obj' in the heap + * newh(key, flags, arg) optional, used to allocate a new + * object during insertions. + * + * dn_ht_free() frees the heap or unlink elements. + * DNHT_REMOVE unlink elements, 0 frees the heap. + * You need two calls to do both. + * + * dn_ht_find() is the main lookup function, which can also be + * used to insert or delete elements in the hash table. + * The final 'arg' is passed to all callbacks. + * + * dn_ht_scan() is used to invoke a callback on all entries of + * the heap, or possibly on just one bucket. The callback + * is invoked with a pointer to the object, and must return + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the + * removal of the object from the heap and the end of the + * scan, respectively. + * + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans + * only the specific bucket of the table. 
The bucket is an in-out + * parameter and returns a valid bucket number if the original + * is invalid. + * + * A combination of flags can be used to modify the operation + * of the dn_ht_find(), and of the callbacks: + * + * DNHT_KEY_IS_OBJ means the key is the object pointer. + * It is usually of interest for the hash and match functions. + * + * DNHT_MATCH_PTR during a lookup, match pointers instead + * of calling match(). Normally used when removing specific + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used + * by the match function. + * + * DNHT_INSERT insert the element if not found. + * Calls newh() to allocate a new object unless + * DNHT_KEY_IS_OBJ is set. + * + * DNHT_UNIQUE only insert if object not found. + * XXX should it imply DNHT_INSERT ? + * + * DNHT_REMOVE remove objects if we find them. + */ +struct dn_ht; /* should be opaque */ + +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, + uint32_t (*hash)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)); +void dn_ht_free(struct dn_ht *, int flags); + +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); +int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); +int dn_ht_entries(struct dn_ht *); + +enum { /* flags values.
+ * first two are returned by the scan callback to indicate + * to delete the matching element or to end the scan + */ + DNHT_SCAN_DEL = 0x0001, + DNHT_SCAN_END = 0x0002, + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ + DNHT_INSERT = 0x0010, /* insert if not found */ + DNHT_UNIQUE = 0x0020, /* report error if already there */ + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ + DNHT_COPY_ERR = 0x0080, /* error during a copy */ +}; + +#endif /* _IP_DN_HEAP_H */ diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched.h b/example/ipfw/sys/netpfil/ipfw/dn_sched.h new file mode 100644 index 0000000..a81a9c0 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The API to write a packet scheduling algorithm for dummynet. + * + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched.h 258467 2013-11-22 05:02:37Z luigi $ + */ + +#ifndef _DN_SCHED_H +#define _DN_SCHED_H + +#define DN_MULTIQUEUE 0x01 +/* + * Descriptor for a scheduling algorithm. + * Contains all function pointers for a given scheduler + * This is typically created when a module is loaded, and stored + * in a global list of schedulers. + */ +struct dn_alg { + uint32_t type; /* the scheduler type */ + const char *name; /* scheduler name */ + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ + + /* + * The following define the size of 3 optional data structures + * that may need to be allocated at runtime, and are appended + * to each of the base data structures: scheduler, sched.inst, + * and queue. We don't have a per-flowset structure. + */ + /* + parameters attached to the template, e.g. + * default queue sizes, weights, quantum size, and so on; + */ + size_t schk_datalen; + + /* + per-instance parameters, such as timestamps, + * containers for queues, etc; + */ + size_t si_datalen; + + size_t q_datalen; /* per-queue parameters (e.g. S,F) */ + + /* + * Methods implemented by the scheduler: + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. + * q is NULL for !MULTIQUEUE. + * Return 0 on success, 1 on drop (packet consumed anyways). 
+ * Note that q should be interpreted only as a hint + * on the flow that the mbuf belongs to: while a + * scheduler will normally enqueue m into q, it is ok + * to leave q alone and put the mbuf elsewhere. + * This function is called in two cases: + * - when a new packet arrives to the scheduler; + * - when a scheduler is reconfigured. In this case the + * call is issued by the new_queue callback, with a + * non empty queue (q) and m pointing to the first + * mbuf in the queue. For this reason, the function + * should internally check for (m != q->mq.head) + * before calling dn_enqueue(). + * + * dequeue Called when scheduler instance 's' can + * dequeue a packet. Return NULL if none are available. + * XXX what about non work-conserving ? + * + * config called on 'sched X config ...', normally writes + * in the area of size sch_arg + * + * destroy called on 'sched delete', frees everything + * in sch_arg (other parts are handled by more specific + * functions) + * + * new_sched called when a new instance is created, e.g. + * to create the local queue for !MULTIQUEUE, set V or + * copy parameters for WFQ, and so on. + * + * free_sched called when deleting an instance, cleans + * extra data in the per-instance area. + * + * new_fsk called when a flowset is linked to a scheduler, + * e.g. to validate parameters such as weights etc. + * free_fsk when a flowset is unlinked from a scheduler. + * (probably unnecessary) + * + * new_queue called to set the per-queue parameters, + * e.g. S and F, adjust sum of weights in the parent, etc. + * + * The new_queue callback is normally called when + * creating a new queue. In some cases (such as a + * scheduler change or reconfiguration) it can be called + * with a non empty queue. + * In case of a non empty queue, the new_queue callback may + * need to call the enqueue function. In this case, + * the callback should eventually call enqueue() passing + * as m the first element in the queue.
+ * + * free_queue actions related to a queue removal, e.g. undo + * all the above. If the queue has data in it, also remove + * from the scheduler. This can e.g. happen during a reconfigure. + */ + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*dequeue)(struct dn_sch_inst *); + + int (*config)(struct dn_schk *); + int (*destroy)(struct dn_schk*); + int (*new_sched)(struct dn_sch_inst *); + int (*free_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *f); + int (*free_fsk)(struct dn_fsk *f); + int (*new_queue)(struct dn_queue *q); + int (*free_queue)(struct dn_queue *q); + + /* run-time fields */ + int ref_count; /* XXX number of instances in the system */ + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ +}; + +/* MSVC does not support initializers so we need this ugly macro */ +#ifdef _WIN32 +#define _SI(fld) +#else +#define _SI(fld) fld +#endif + +/* + * Additionally, dummynet exports some functions and macros + * to be used by schedulers: + */ + +void dn_free_pkts(struct mbuf *mnext); +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); +/* bound a variable between min and max */ +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); + +/* + * Extract the head of a queue, update stats. Must be the very last + * thing done on a dequeue as the queue itself may go away. 
+ */ +static __inline struct mbuf* +dn_dequeue(struct dn_queue *q) +{ + struct mbuf *m = q->mq.head; + if (m == NULL) + return NULL; + q->mq.head = m->m_nextpkt; + q->mq.count--; + + /* Update stats for the queue */ + q->ni.length--; + q->ni.len_bytes -= m->m_pkthdr.len; + if (q->_si) { + q->_si->ni.length--; + q->_si->ni.len_bytes -= m->m_pkthdr.len; + } + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + return m; +} + +int dn_sched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ + static moduledata_t name##_mod = { \ + #name, dn_sched_modevent, dnsched \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3) +#endif /* _DN_SCHED_H */ diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c new file mode 100644 index 0000000..1119221 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_fifo.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_fifo.c 240494 2012-09-14 11:51:49Z glebius $ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +/* + * This file implements a FIFO scheduler for a single queue. + * The queue is allocated as part of the scheduler instance, + * and there is a single flowset is in the template which stores + * queue size and policy. + * Enqueue and dequeue use the default library functions. + */ +static int +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) +{ + /* XXX if called with q != NULL and m=NULL, this is a + * re-enqueue from an existing scheduler, which we should + * handle. 
+ */ + return dn_enqueue((struct dn_queue *)(si+1), m, 0); +} + +static struct mbuf * +fifo_dequeue(struct dn_sch_inst *si) +{ + return dn_dequeue((struct dn_queue *)(si + 1)); +} + +static int +fifo_new_sched(struct dn_sch_inst *si) +{ + /* This scheduler instance contains the queue */ + struct dn_queue *q = (struct dn_queue *)(si + 1); + + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = si; + q->fs = si->sched->fs; + return 0; +} + +static int +fifo_free_sched(struct dn_sch_inst *si) +{ + struct dn_queue *q = (struct dn_queue *)(si + 1); + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); + return 0; +} + +/* + * FIFO scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fifo_desc = { + _SI( .type = ) DN_SCHED_FIFO, + _SI( .name = ) "FIFO", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct dn_queue), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fifo_enqueue, + _SI( .dequeue = ) fifo_dequeue, + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) fifo_new_sched, + _SI( .free_sched = ) fifo_free_sched, + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, +}; + +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c new file mode 100644 index 0000000..f0ca44e --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_prio.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_prio.c 240494 2012-09-14 11:51:49Z glebius $ + */ +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +#define DN_SCHED_PRIO 5 //XXX + +#if !defined(_KERNEL) || !defined(__linux__) +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif + +#ifdef __MIPSEL__ +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif + +/* Size of the array of queues pointers. 
*/ +#define BITMAP_T unsigned long +#define MAXPRIO (sizeof(BITMAP_T) * 8) + +/* + * The scheduler instance contains an array of pointers to queues, + * one for each priority, and a bitmap listing backlogged queues. + */ +struct prio_si { + BITMAP_T bitmap; /* array bitmap */ + struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ +}; + +/* + * If a queue with the same priority is already backlogged, use + * that one instead of the queue passed as argument. + */ +static int +prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + int prio = q->fs->fs.par[0]; + + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else { /* use the existing queue */ + q = si->q_array[prio]; + } + if (dn_enqueue(q, m, 0)) + return 1; + return 0; +} + +/* + * Packets are dequeued only from the highest priority queue. + * The function ffs() return the lowest bit in the bitmap that rapresent + * the array index (-1) which contains the pointer to the highest priority + * queue. + * After the dequeue, if this queue become empty, it is index is removed + * from the bitmap. 
+ * Scheduler is idle if the bitmap is empty + * + * NOTE: highest priority is 0, lowest is sched->max_prio_q + */ +static struct mbuf * +prio_dequeue(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + int prio; + + if (si->bitmap == 0) /* scheduler idle */ + return NULL; + + prio = ffs(si->bitmap) - 1; + + /* Take the highest priority queue in the scheduler */ + q = si->q_array[prio]; + // assert(q) + + m = dn_dequeue(q); + if (q->mq.head == NULL) { + /* Queue is now empty, remove from scheduler + * and mark it + */ + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return m; +} + +static int +prio_new_sched(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + + bzero(si->q_array, sizeof(si->q_array)); + si->bitmap = 0; + + return 0; +} + +static int +prio_new_fsk(struct dn_fsk *fs) +{ + /* Check if the prioritiy is between 0 and MAXPRIO-1 */ + ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); + return 0; +} + +static int +prio_new_queue(struct dn_queue *q) +{ + struct prio_si *si = (struct prio_si *)(q->_si + 1); + int prio = q->fs->fs.par[0]; + struct dn_queue *oldq; + + q->ni.oid.subtype = DN_SCHED_PRIO; + + if (q->mq.head == NULL) + return 0; + + /* Queue already full, must insert in the scheduler or append + * mbufs to existing queue. This partly duplicates prio_enqueue + */ + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else if ( (oldq = si->q_array[prio]) != q) { + /* must append to the existing queue. + * can simply append q->mq.head to q2->... 
+ * and add the counters to those of q2 + */ + oldq->mq.tail->m_nextpkt = q->mq.head; + oldq->mq.tail = q->mq.tail; + oldq->ni.length += q->ni.length; + q->ni.length = 0; + oldq->ni.len_bytes += q->ni.len_bytes; + q->ni.len_bytes = 0; + q->mq.tail = q->mq.head = NULL; + } + return 0; +} + +static int +prio_free_queue(struct dn_queue *q) +{ + int prio = q->fs->fs.par[0]; + struct prio_si *si = (struct prio_si *)(q->_si + 1); + + if (si->q_array[prio] == q) { + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return 0; +} + + +static struct dn_alg prio_desc = { + _SI( .type = ) DN_SCHED_PRIO, + _SI( .name = ) "PRIO", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct prio_si), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) prio_enqueue, + _SI( .dequeue = ) prio_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) prio_new_sched, + _SI( .free_sched = ) NULL, + + _SI( .new_fsk = ) prio_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) prio_new_queue, + _SI( .free_queue = ) prio_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c new file mode 100644 index 0000000..5cc5901 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_qfq.c @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_qfq.c 240494 2012-09-14 11:51:49Z glebius $ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +#ifdef QFQ_DEBUG +struct qfq_sched; +static void dump_sched(struct qfq_sched *q, const char *msg); +#define NO(x) x +#else +#define NO(x) +#endif +#define DN_SCHED_QFQ 4 // XXX Where? +typedef unsigned long bitmap; + +/* + * bitmaps ops are critical. Some linux versions have __fls + * and the bitmap ops. 
Some machines have ffs + */ +#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) +int fls(unsigned int n) +{ + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; +} +#endif + +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} +#endif + +#if !defined(_KERNEL) || !defined(__linux__) +#ifdef QFQ_DEBUG +int test_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + return *p & (1<<ix); +} +void __set_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + *p |= (1<<ix); +} +void __clear_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + *p &= ~(1<<ix); +} +#else /* !QFQ_DEBUG */ +/* XXX do we have fast version, or leave it to the compiler ? */ +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif /* !QFQ_DEBUG */ +#endif /* !__linux__ */ + +#ifdef __MIPSEL__ +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif + +/*-------------------------------------------*/ +/* + +Virtual time computations. + +S, F and V are all computed in fixed point arithmetic with +FRAC_BITS decimal bits. + + QFQ_MAX_INDEX is the maximum index allowed for a group. We need + one bit per index. + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. + The layout of the bits is as below: + + [ MTU_SHIFT ][ FRAC_BITS ] + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] + ^.__grp->index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + +The max group index corresponds to Lmax/w_min, where +Lmax=1<<MTU_SHIFT, w_min = 1 . +From this, and knowing how many groups (MAX_INDEX) we want, +we can derive the shift corresponding to each group. 
+ +Because we often need to compute + F = S + len/w_i and V = V + len/wsum +instead of storing w_i store the value + inv_w = (1<<FRAC_BITS)/w_i +so we can do F = S + len * inv_w * wsum. +We use W_TOT in the formulas so we can easily move between +static and adaptive weight sum. + +The per-scheduler-instance data contain all the data structures +for the scheduler: bitmaps and bucket lists. + + */ +/* + * Maximum number of consecutive slots occupied by backlogged classes + * inside a group. This is approx lmax/lmin + 5. + * XXX check because it poses constraints on MAX_INDEX + */ +#define QFQ_MAX_SLOTS 32 +/* + * Shifts used for class<->group mapping. Class weights are + * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the + * group with the smallest index that can support the L_i / r_i + * configured for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + * + * When computing the group index, we do (len<<FP_SHIFT)/weight, + * then compute an FLS (which is like a log2()), and if the result + * is below the MAX_INDEX region we use 0 (which is the same as + * using a larger len). + */ +#define QFQ_MAX_INDEX 19 +#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */ + +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) +#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) +//#define IWSUM (q->i_wsum) +#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM) + +#define FRAC_BITS 30 /* fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) + +#define QFQ_MTU_SHIFT 11 /* log2(max_len) */ +#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) + +/* + * Possible group states, also indexes for the bitmaps array in + * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3 + */ +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; + +struct qfq_group; +/* + * additional queue info. Some of this info should come from + * the flowset, we copy them here for faster processing. 
+ * This is an overlay of the struct dn_queue + */ +struct qfq_class { + struct dn_queue _q; + uint64_t S, F; /* flow timestamps (exact) */ + struct qfq_class *next; /* Link for the slot list. */ + + /* group we belong to. In principle we would need the index, + * which is log_2(lmax/weight), but we never reference it + * directly, only the group. + */ + struct qfq_group *grp; + + /* these are copied from the flowset. */ + uint32_t inv_w; /* ONE_FP/weight */ + uint32_t lmax; /* Max packet size for this flow. */ +}; + +/* Group descriptor, see the paper for details. + * Basically this contains the bucket lists + */ +struct qfq_group { + uint64_t S, F; /* group timestamps (approx). */ + unsigned int slot_shift; /* Slot shift. */ + unsigned int index; /* Group index. */ + unsigned int front; /* Index of the front slot. */ + bitmap full_slots; /* non-empty slots */ + + /* Array of lists of active classes. */ + struct qfq_class *slots[QFQ_MAX_SLOTS]; +}; + +/* scheduler instance descriptor. */ +struct qfq_sched { + uint64_t V; /* Precise virtual time. */ + uint32_t wsum; /* weight sum */ + NO(uint32_t i_wsum; /* ONE_FP/w_sum */ + uint32_t _queued; /* debugging */ + uint32_t loops; /* debugging */) + bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ +}; + +/*---- support functions ----------------------------*/ + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(uint64_t a, uint64_t b) +{ + return (int64_t)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. 
*/ +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = ffs(bitmap) - 1; // zero-based + return &q->groups[index]; +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) +{ + uint64_t slot_size = (uint64_t)maxlen *inv_w; + unsigned long size_map; + int index = 0; + + size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); + if (!size_map) + goto out; + + index = __fls(size_map) + 1; // basically a log_2() + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; + +out: + ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); + return index; +} +/*---- end support functions ----*/ + +/*-------- API calls --------------------------------*/ +/* + * Validate and copy parameters from flowset. + */ +static int +qfq_new_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + int i; + uint32_t w; /* approximated weight */ + + /* import parameters from the flowset. They should be correct + * already. + */ + w = _q->fs->fs.par[0]; + cl->lmax = _q->fs->fs.par[1]; + if (!w || w > QFQ_MAX_WEIGHT) { + w = 1; + D("rounding weight to 1"); + } + cl->inv_w = ONE_FP/w; + w = ONE_FP/cl->inv_w; + if (q->wsum + w > QFQ_MAX_WSUM) + return EINVAL; + + i = qfq_calc_index(cl->inv_w, cl->lmax); + cl->grp = &q->groups[i]; + q->wsum += w; + // XXX cl->S = q->V; ? 
+ // XXX compute q->i_wsum + return 0; +} + +/* remove an empty queue */ +static int +qfq_free_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + if (cl->inv_w) { + q->wsum -= ONE_FP/cl->inv_w; + cl->inv_w = 0; /* reset weight to avoid run twice */ + } + return 0; +} + +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long +mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static inline unsigned int +qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void +qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static inline void +qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_finish)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... 
+ } + * + */ +static inline void +qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) +{ + unsigned long mask, vslot, old_vslot; + + vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static inline void +qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) +{ + uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + cl->next = grp->slots[i]; + grp->slots[i] = cl; + __set_bit(slot, &grp->full_slots); +} + +/* + * remove the entry from the slot + */ +static inline void +qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class **h = &grp->slots[grp->front]; + + *h = (*h)->next; + if (!*h) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static inline struct qfq_class * +qfq_slot_scan(struct qfq_group *grp) +{ + int i; + + ND("grp %d full %x", grp->index, grp->full_slots); + if (!grp->full_slots) + return NULL; + + i = ffs(grp->full_slots) - 1; // zero-based + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return grp->slots[grp->front]; +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? 
+ * Here too we should make sure that i is less than 32 + */ +static inline void +qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + + +static inline void +qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) +{ + bitmap ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + struct qfq_group *grp; + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q, old_V); + } +} + +/* + * Updates the class, returns true if also the group needs to be updated. + */ +static inline int +qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl) +{ + + cl->S = cl->F; + if (cl->_q.mq.head == NULL) { + qfq_front_slot_remove(grp); + } else { + unsigned int len; + uint64_t roundedS; + + len = cl->_q.mq.head->m_pkthdr.len; + cl->F = cl->S + (uint64_t)len * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (roundedS == grp->S) + return 0; + + qfq_front_slot_remove(grp); + qfq_slot_insert(grp, cl, roundedS); + } + return 1; +} + +static struct mbuf * +qfq_dequeue(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl; + struct mbuf *m; + uint64_t old_V; + + NO(q->loops++;) + if (!q->bitmaps[ER]) { + NO(if (q->queued) + dump_sched(q, "start dequeue");) + return NULL; + } + + grp = qfq_ffs(q, q->bitmaps[ER]); + + cl = grp->slots[grp->front]; + /* extract from the first bucket in the bucket list */ + m = dn_dequeue(&cl->_q); + + if (!m) { + D("BUG/* non-workconserving leaf */"); + return NULL; + } + NO(q->queued--;) + old_V = q->V; + q->V += (uint64_t)m->m_pkthdr.len * IWSUM; + ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); + + if (qfq_update_class(q, grp, cl)) { + uint64_t old_F = grp->F; + cl = 
qfq_slot_scan(grp); + if (!cl) { /* group gone, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + // grp->S = grp->F + 1; // XXX debugging only + } else { + uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + goto skip_unblock; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + /* remove from ER and put in the new set */ + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + /* we need to unblock even if the group has gone away */ + qfq_unblock_groups(q, grp->index, old_F); + } + +skip_unblock: + qfq_update_eligible(q, old_V); + NO(if (!q->bitmaps[ER] && q->queued) + dump_sched(q, "end dequeue");) + + return m; +} + +/* + * Assign a reasonable start time for a new flow k in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in ER. So, if we have groups in ER, set S to + * the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. 
+ */ +static inline void +qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint64_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else { /* timestamp is not stale */ + cl->S = cl->F; + } +} + +static int +qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl = (struct qfq_class *)_q; + uint64_t roundedS; + int s; + + NO(q->loops++;) + DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, + _q, cl->inv_w, cl->grp->index); + /* XXX verify that the packet obeys the parameters */ + if (m != _q->mq.head) { + if (dn_enqueue(_q, m, 0)) /* packet was dropped */ + return 1; + NO(q->queued++;) + if (m != _q->mq.head) + return 0; + } + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); /* adjust start time */ + /* compute new finish time and rounded start. */ + cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. 
+ */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, cl->S)) + goto skip_update; + /* create a slot for this cl->S */ + qfq_slot_rotate(q, grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + ND("new state %d 0x%x", s, q->bitmaps[s]); + ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); +skip_update: + qfq_slot_insert(grp, cl, roundedS); + + return 0; +} + + +#if 0 +static inline void +qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl, struct qfq_class **pprev) +{ + unsigned int i, offset; + uint64_t roundedS; + + roundedS = qfq_round_down(cl->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; + +#ifdef notyet + if (!pprev) { + pprev = &grp->slots[i]; + while (*pprev && *pprev != cl) + pprev = &(*pprev)->next; + } +#endif + + *pprev = cl->next; + if (!grp->slots[i]) + __clear_bit(offset, &grp->full_slots); +} + +/* + * called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. + * XXX description to be completed. + */ +static void +qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, + struct qfq_class **pprev) +{ + struct qfq_group *grp = &q->groups[cl->index]; + unsigned long mask; + uint64_t roundedS; + int s; + + cl->F = cl->S; // not needed if the class goes away. + qfq_slot_remove(q, grp, cl, pprev); + + if (!grp->full_slots) { + /* nothing left in the group, remove from all sets. 
+ * Do ER last because if we were blocking other groups + * we must unblock them. + */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (!grp->slots[grp->front]) { + cl = qfq_slot_scan(grp); + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } + qfq_update_eligible(q, q->V); +} +#endif + +static int +qfq_new_fsk(struct dn_fsk *f) +{ + ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); + ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); + ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); + return 0; +} + +/* + * initialize a new scheduler instance + */ +static int +qfq_new_sched(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + int i; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - + (QFQ_MAX_INDEX - i); + } + return 0; +} + +/* + * QFQ scheduler descriptor + */ +static struct dn_alg qfq_desc = { + _SI( .type = ) DN_SCHED_QFQ, + _SI( .name = ) "QFQ", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct qfq_sched), + _SI( .q_datalen = ) sizeof(struct qfq_class) - 
sizeof(struct dn_queue), + + _SI( .enqueue = ) qfq_enqueue, + _SI( .dequeue = ) qfq_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) qfq_new_sched, + _SI( .free_sched = ) NULL, + _SI( .new_fsk = ) qfq_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) qfq_new_queue, + _SI( .free_queue = ) qfq_free_queue, +}; + +DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); + +#ifdef QFQ_DEBUG +static void +dump_groups(struct qfq_sched *q, uint32_t mask) +{ + int i, j; + + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { + struct qfq_group *g = &q->groups[i]; + + if (0 == (mask & (1<<i))) + continue; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + if (g->slots[j]) + D(" bucket %d %p", j, g->slots[j]); + } + D("full_slots 0x%x", g->full_slots); + D(" %2d S 0x%20llx F 0x%llx %c", i, + g->S, g->F, + mask & (1<<i) ? '1' : '0'); + } +} + +static void +dump_sched(struct qfq_sched *q, const char *msg) +{ + D("--- in %s: ---", msg); + ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V); + D(" ER 0x%08x", q->bitmaps[ER]); + D(" EB 0x%08x", q->bitmaps[EB]); + D(" IR 0x%08x", q->bitmaps[IR]); + D(" IB 0x%08x", q->bitmaps[IB]); + dump_groups(q, 0xffffffff); +}; +#endif /* QFQ_DEBUG */ diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c new file mode 100644 index 0000000..28edb29 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_rr.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_rr.c 240494 2012-09-14 11:51:49Z glebius $ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +#define DN_SCHED_RR 3 // XXX Where? 
+ +struct rr_queue { + struct dn_queue q; /* Standard queue */ + int status; /* 1: queue is in the list */ + int credit; /* Number of bytes to transmit */ + int quantum; /* quantum * C */ + struct rr_queue *qnext; /* */ +}; + +/* struct rr_schk contains global config parameters + * and is right after dn_schk + */ +struct rr_schk { + int min_q; /* Min quantum */ + int max_q; /* Max quantum */ + int q_bytes; /* Bytes per quantum */ +}; + +/* per-instance round robin list, right after dn_sch_inst */ +struct rr_si { + struct rr_queue *head, *tail; /* Pointer to current queue */ +}; + +/* Append a queue to the rr list */ +static inline void +rr_append(struct rr_queue *q, struct rr_si *si) +{ + q->status = 1; /* mark as in-rr_list */ + q->credit = q->quantum; /* initialize credit */ + + /* append to the tail */ + if (si->head == NULL) + si->head = q; + else + si->tail->qnext = q; + si->tail = q; /* advance the tail pointer */ + q->qnext = si->head; /* make it circular */ +} + +/* Remove the head queue from circular list. */ +static inline void +rr_remove_head(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + si->head->status = 0; + + if (si->head == si->tail) { + si->head = si->tail = NULL; + return; + } + + si->head = si->head->qnext; + si->tail->qnext = si->head; +} + +/* Remove a queue from circular list. 
+ * XXX see if ti can be merge with remove_queue() + */ +static inline void +remove_queue_q(struct rr_queue *q, struct rr_si *si) +{ + struct rr_queue *prev; + + if (q->status != 1) + return; + if (q == si->head) { + rr_remove_head(si); + return; + } + + for (prev = si->head; prev; prev = prev->qnext) { + if (prev->qnext != q) + continue; + prev->qnext = q->qnext; + if (q == si->tail) + si->tail = prev; + q->status = 0; + break; + } +} + + +static inline void +next_pointer(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + + si->head = si->head->qnext; + si->tail = si->tail->qnext; +} + +static int +rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct rr_si *si; + struct rr_queue *rrq; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) + return 0; + } + + /* If reach this point, queue q was idle */ + si = (struct rr_si *)(_si + 1); + rrq = (struct rr_queue *)q; + + if (rrq->status == 1) /* Queue is already in the queue list */ + return 0; + + /* Insert the queue in the queue list */ + rr_append(rrq, si); + + return 0; +} + +static struct mbuf * +rr_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct rr_si *si = (struct rr_si *)(_si + 1); + struct rr_queue *rrq; + uint64_t len; + + while ( (rrq = si->head) ) { + struct mbuf *m = rrq->q.mq.head; + if ( m == NULL) { + /* empty queue, remove from list */ + rr_remove_head(si); + continue; + } + len = m->m_pkthdr.len; + + if (len > rrq->credit) { + /* Packet too big */ + rrq->credit += rrq->quantum; + /* Try next queue */ + next_pointer(si); + } else { + rrq->credit -= len; + return dn_dequeue(&rrq->q); + } + } + + /* no packet to dequeue*/ + return NULL; +} + +static int +rr_config(struct dn_schk *_schk) +{ + struct rr_schk *schk = (struct rr_schk *)(_schk + 1); + ND("called"); + + /* use reasonable quantums (64..2k bytes, default 1500) */ + schk->min_q = 64; + 
schk->max_q = 2048; + schk->q_bytes = 1500; /* quantum */ + + return 0; +} + +static int +rr_new_sched(struct dn_sch_inst *_si) +{ + struct rr_si *si = (struct rr_si *)(_si + 1); + + ND("called"); + si->head = si->tail = NULL; + + return 0; +} + +static int +rr_free_sched(struct dn_sch_inst *_si) +{ + ND("called"); + /* Nothing to do? */ + return 0; +} + +static int +rr_new_fsk(struct dn_fsk *fs) +{ + struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); + /* par[0] is the weight, par[1] is the quantum step */ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 65536, "RR weight"); + ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, + schk->min_q, schk->max_q, "RR quantum"); + return 0; +} + +static int +rr_new_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_RR; + + q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; + ND("called, q->quantum %d", q->quantum); + q->credit = q->quantum; + q->status = 0; + + if (_q->mq.head != NULL) { + /* Queue NOT empty, insert in the queue list */ + rr_append(q, (struct rr_si *)(_q->_si + 1)); + } + return 0; +} + +static int +rr_free_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + ND("called"); + if (q->status == 1) { + struct rr_si *si = (struct rr_si *)(_q->_si + 1); + remove_queue_q(q, si); + } + return 0; +} + +/* + * RR scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. 
+ */
+static struct dn_alg rr_desc = {
+	_SI( .type = ) DN_SCHED_RR,
+	_SI( .name = ) "RR",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	_SI( .schk_datalen = ) sizeof(struct rr_schk),	/* rr_config()/rr_new_fsk() use a struct rr_schk at (dn_schk + 1) */
+	_SI( .si_datalen = ) sizeof(struct rr_si),	/* head/tail of the round robin list, at (dn_sch_inst + 1) */
+	_SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),	/* rr_queue fields that follow the embedded dn_queue */
+
+	_SI( .enqueue = ) rr_enqueue,
+	_SI( .dequeue = ) rr_dequeue,
+
+	_SI( .config = ) rr_config,
+	_SI( .destroy = ) NULL,
+	_SI( .new_sched = ) rr_new_sched,
+	_SI( .free_sched = ) rr_free_sched,
+	_SI( .new_fsk = ) rr_new_fsk,
+	_SI( .free_fsk = ) NULL,
+	_SI( .new_queue = ) rr_new_queue,
+	_SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c b/example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c
new file mode 100644
index 0000000..c07f4c7
--- /dev/null
+++ b/example/ipfw/sys/netpfil/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD: head/sys/netpfil/ipfw/dn_sched_wf2q.c 240494 2012-09-14 11:51:49Z glebius $
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64	/* larger of two 64-bit values, compared via the signed difference; arguments are evaluated twice */
+#define MAX64(x,y)	((((int64_t)((y) - (x))) > 0) ? (y) : (x))	/* whole expansion parenthesised so it is safe inside larger expressions */
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits.
We want FRAC_BITS >> WMAX_BITS to avoid too large + * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w + * using an unsigned 32-bit division, and to avoid wraparounds we need + * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 + * As an example + * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 + */ +#ifndef FRAC_BITS +#define FRAC_BITS 28 /* shift for fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) +#endif + +/* + * Private information for the scheduler instance: + * sch_heap (key is Finish time) returns the next queue to serve + * ne_heap (key is Start time) stores not-eligible queues + * idle_heap (key=start/finish time) stores idle flows. It must + * support extract-from-middle. + * A flow is only in 1 of the three heaps. + * XXX todo: use a more efficient data structure, e.g. a tree sorted + * by F with min_subtree(S) in each node + */ +struct wf2qp_si { + struct dn_heap sch_heap; /* top extract - key Finish time */ + struct dn_heap ne_heap; /* top extract - key Start time */ + struct dn_heap idle_heap; /* random extract - key Start=Finish time */ + uint64_t V; /* virtual time */ + uint32_t inv_wsum; /* inverse of sum of weights */ + uint32_t wsum; /* sum of weights */ +}; + +struct wf2qp_queue { + struct dn_queue _q; + uint64_t S, F; /* start time, finish time */ + uint32_t inv_w; /* ONE_FP / weight */ + int32_t heap_pos; /* position (index) of struct in heap */ +}; + +/* + * This file implements a WF2Q+ scheduler as it has been in dummynet + * since 2000. + * The scheduler supports per-flow queues and has O(log N) complexity. + * + * WF2Q+ needs to drain entries from the idle heap so that we + * can keep the sum of weights up to date. We can do it whenever + * we get a chance, or periodically, or following some other + * strategy. The function idle_check() drains at most N elements + * from the idle heap. 
+ */ +static void +idle_check(struct wf2qp_si *si, int n, int force) +{ + struct dn_heap *h = &si->idle_heap; + while (n-- > 0 && h->elements > 0 && + (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { + struct dn_queue *q = HEAP_TOP(h)->object; + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + + heap_extract(h, NULL); + /* XXX to let the flowset delete the queue we should + * mark it as 'unused' by the scheduler. + */ + alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ + si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + } +} + +static int +wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct dn_fsk *fs = q->fs; + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct wf2qp_queue *alg_fq; + uint64_t len = m->m_pkthdr.len; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) /* queue was already busy */ + return 0; + } + + /* If reach this point, queue q was idle */ + alg_fq = (struct wf2qp_queue *)q; + + if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { + /* F<S means timestamps are invalid ->brand new queue. */ + alg_fq->S = si->V; /* init start time */ + si->wsum += fs->fs.par[0]; /* add weight of new queue. */ + si->inv_wsum = ONE_FP/si->wsum; + } else { /* if it was idle then it was in the idle heap */ + heap_extract(&si->idle_heap, q); + alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ + } + alg_fq->F = alg_fq->S + len * alg_fq->inv_w; + + /* if nothing is backlogged, make sure this flow is eligible */ + if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) + si->V = MAX64(alg_fq->S, si->V); + + /* + * Look at eligibility. A flow is not eligibile if S>V (when + * this happens, it means that there is some other flow already + * scheduled for the same pipe, so the sch_heap cannot be + * empty). If the flow is not eligible we just store it in the + * ne_heap. 
Otherwise, we store in the sch_heap. + * Note that for all flows in sch_heap (SCH), S_i <= V, + * and for all flows in ne_heap (NEH), S_i > V. + * So when we need to compute max(V, min(S_i)) forall i in + * SCH+NEH, we only need to look into NEH. + */ + if (DN_KEY_LT(si->V, alg_fq->S)) { + /* S>V means flow Not eligible. */ + if (si->sch_heap.elements == 0) + D("++ ouch! not eligible but empty scheduler!"); + heap_insert(&si->ne_heap, alg_fq->S, q); + } else { + heap_insert(&si->sch_heap, alg_fq->F, q); + } + return 0; +} + +/* XXX invariant: sch > 0 || V >= min(S in neh) */ +static struct mbuf * +wf2qp_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + struct dn_heap *sch = &si->sch_heap; + struct dn_heap *neh = &si->ne_heap; + struct wf2qp_queue *alg_fq; + + if (sch->elements == 0 && neh->elements == 0) { + /* we have nothing to do. We could kill the idle heap + * altogether and reset V + */ + idle_check(si, 0x7fffffff, 1); + si->V = 0; + si->wsum = 0; /* should be set already */ + return NULL; /* quick return if nothing to do */ + } + idle_check(si, 1, 0); /* drain something from the idle heap */ + + /* make sure at least one element is eligible, bumping V + * and moving entries that have become eligible. + * We need to repeat the first part twice, before and + * after extracting the candidate, or enqueue() will + * find the data structure in a wrong state. + */ + m = NULL; + for(;;) { + /* + * Compute V = max(V, min(S_i)). Remember that all elements + * in sch have by definition S_i <= V so if sch is not empty, + * V is surely the max and we must not update it. Conversely, + * if sch is empty we only need to look at neh. 
+ * We don't need to move the queues, as it will be done at the + * next enqueue + */ + if (sch->elements == 0 && neh->elements > 0) { + si->V = MAX64(si->V, HEAP_TOP(neh)->key); + } + while (neh->elements > 0 && + DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { + q = HEAP_TOP(neh)->object; + alg_fq = (struct wf2qp_queue *)q; + heap_extract(neh, NULL); + heap_insert(sch, alg_fq->F, q); + } + if (m) /* pkt found in previous iteration */ + break; + /* ok we have at least one eligible pkt */ + q = HEAP_TOP(sch)->object; + alg_fq = (struct wf2qp_queue *)q; + m = dn_dequeue(q); + heap_extract(sch, NULL); /* Remove queue from heap. */ + si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; + alg_fq->S = alg_fq->F; /* Update start time. */ + if (q->mq.head == 0) { /* not backlogged any more. */ + heap_insert(&si->idle_heap, alg_fq->F, q); + } else { /* Still backlogged. */ + /* Update F, store in neh or sch */ + uint64_t len = q->mq.head->m_pkthdr.len; + alg_fq->F += len * alg_fq->inv_w; + if (DN_KEY_LEQ(alg_fq->S, si->V)) { + heap_insert(sch, alg_fq->F, q); + } else { + heap_insert(neh, alg_fq->S, q); + } + } + } + return m; +} + +static int +wf2qp_new_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + int ofs = offsetof(struct wf2qp_queue, heap_pos); + + /* all heaps support extract from middle */ + if (heap_init(&si->idle_heap, 16, ofs) || + heap_init(&si->sch_heap, 16, ofs) || + heap_init(&si->ne_heap, 16, ofs)) { + heap_free(&si->ne_heap); + heap_free(&si->sch_heap); + heap_free(&si->idle_heap); + return ENOMEM; + } + return 0; +} + +static int +wf2qp_free_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + + heap_free(&si->sch_heap); + heap_free(&si->ne_heap); + heap_free(&si->idle_heap); + + return 0; +} + +static int +wf2qp_new_fsk(struct dn_fsk *fs) +{ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 100, "WF2Q+ weight"); + return 0; +} + +static int +wf2qp_new_queue(struct dn_queue *_q) +{ + struct 
wf2qp_queue *q = (struct wf2qp_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_WF2QP; + q->F = 0; /* not strictly necessary */ + q->S = q->F + 1; /* mark timestamp as invalid. */ + q->inv_w = ONE_FP / _q->fs->fs.par[0]; + if (_q->mq.head != NULL) { + wf2qp_enqueue(_q->_si, _q, _q->mq.head); + } + return 0; +} + +/* + * Called when the infrastructure removes a queue (e.g. flowset + * is reconfigured). Nothing to do if we did not 'own' the queue, + * otherwise remove it from the right heap and adjust the sum + * of weights. + */ +static int +wf2qp_free_queue(struct dn_queue *q) +{ + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); + + if (alg_fq->S >= alg_fq->F + 1) + return 0; /* nothing to do, not in any heap */ + si->wsum -= q->fs->fs.par[0]; + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + + /* extract from the heap. XXX TODO we may need to adjust V + * to make sure the invariants hold. + */ + if (q->mq.head == NULL) { + heap_extract(&si->idle_heap, q); + } else if (DN_KEY_LT(si->V, alg_fq->S)) { + heap_extract(&si->ne_heap, q); + } else { + heap_extract(&si->sch_heap, q); + } + return 0; +} + +/* + * WF2Q+ scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. 
+ */ +static struct dn_alg wf2qp_desc = { + _SI( .type = ) DN_SCHED_WF2QP, + _SI( .name = ) "WF2Q+", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct wf2qp_si), + _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - + sizeof(struct dn_queue), + + _SI( .enqueue = ) wf2qp_enqueue, + _SI( .dequeue = ) wf2qp_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) wf2qp_new_sched, + _SI( .free_sched = ) wf2qp_free_sched, + + _SI( .new_fsk = ) wf2qp_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) wf2qp_new_queue, + _SI( .free_queue = ) wf2qp_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c b/example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c new file mode 100644 index 0000000..753331f --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_dn_glue.c @@ -0,0 +1,846 @@ +/*- + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD: head/sys/netpfil/ipfw/ip_dn_glue.c 266955 2014-06-01 20:19:17Z hiren $ + * + * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 + */ + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/time.h> +#include <sys/taskqueue.h> +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> + +/* FREEBSD7.2 ip_dummynet.h r191715*/ + +struct dn_heap_entry7 { + int64_t key; /* sorting key. 
Topmost element is smallest one */ + void *object; /* object pointer */ +}; + +struct dn_heap7 { + int size; + int elements; + int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry7 *p; /* really an array of "size" entries */ +}; + +/* Common to 7.2 and 8 */ +struct dn_flow_set { + SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DNOLD_HAVE_FLOW_MASK 0x0001 +#define DNOLD_IS_RED 0x0002 +#define DNOLD_IS_GENTLE_RED 0x0004 +#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ +#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ +#define DNOLD_IS_PIPE 0x4000 +#define DNOLD_IS_QUEUE 0x8000 + + struct dn_pipe7 *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue7 **rq; /* array of rq_size entries */ + + u_int32_t last_expired ; /* do not expire too frequently */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 
; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; +SLIST_HEAD(dn_flow_set_head, dn_flow_set); + +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 +struct dn_flow_queue7 { + struct dn_flow_queue7 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + u_long numbytes; + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe7 { /* a pipe */ + SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. 
*/ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + int numbytes; + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + + /* + * When the tx clock come from an interface (if_name[0] != '\0'), its name + * is stored below, whereas the ifp is filled when the rule is configured. + */ + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ +}; +SLIST_HEAD(dn_pipe_head7, dn_pipe7); + + +/* FREEBSD8 ip_dummynet.h r196045 */ +struct dn_flow_queue8 { + struct dn_flow_queue8 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + uint64_t numbytes ; /* credit for transmission (dynamic queues) */ + int64_t extra_bits; /* extra bits simulating unavailable channel */ + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. 
(scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + int64_t idle_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe8 { /* a pipe */ + SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + /* Same as in dn_flow_queue, numbytes can become large */ + int64_t numbytes; /* bits I can transmit (more or less). 
*/ + uint64_t burst; /* burst size, scaled: bits * hz */ + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + int64_t idle_time; /* start of pipe idle time */ + + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ + + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int loss_level; + int samples_no; + int *samples; +}; + +#define ED_MAX_SAMPLES_NO 1024 +struct dn_pipe_max8 { + struct dn_pipe8 pipe; + int samples[ED_MAX_SAMPLES_NO]; +}; +SLIST_HEAD(dn_pipe_head8, dn_pipe8); + +/* + * Changes from 7.2 to 8: + * dn_pipe: + * numbytes from int to int64_t + * add burst (int64_t) + * add idle_time (int64_t) + * add profile + * add struct dn_pipe_max + * add flag DN_HAS_PROFILE + * + * dn_flow_queue + * numbytes from u_long to int64_t + * add extra_bits (int64_t) + * q_time from u_int32_t to int64_t and name idle_time + * + * dn_flow_set unchanged + * + */ + +/* NOTE:XXX copied from dummynet.c */ +#define O_NEXT(p, len) ((void *)((char *)p + len)) +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + + +static size_t pipesize7 = sizeof(struct dn_pipe7); +static size_t pipesize8 = sizeof(struct dn_pipe8); +static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); + +/* Indicate 'ipfw' version + * 1: from FreeBSD 7.2 + * 0: from FreeBSD 8 + * -1: unknown (for now is unused) + * + * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives + * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown, + * it is suppose to be the FreeBSD 8 
version. + */ +static int is7 = 0; + +static int +convertflags2new(int src) +{ + int dst = 0; + + if (src & DNOLD_HAVE_FLOW_MASK) + dst |= DN_HAVE_MASK; + if (src & DNOLD_QSIZE_IS_BYTES) + dst |= DN_QSIZE_BYTES; + if (src & DNOLD_NOERROR) + dst |= DN_NOERROR; + if (src & DNOLD_IS_RED) + dst |= DN_IS_RED; + if (src & DNOLD_IS_GENTLE_RED) + dst |= DN_IS_GENTLE_RED; + if (src & DNOLD_HAS_PROFILE) + dst |= DN_HAS_PROFILE; + + return dst; +} + +static int +convertflags2old(int src) +{ + int dst = 0; + + if (src & DN_HAVE_MASK) + dst |= DNOLD_HAVE_FLOW_MASK; + if (src & DN_IS_RED) + dst |= DNOLD_IS_RED; + if (src & DN_IS_GENTLE_RED) + dst |= DNOLD_IS_GENTLE_RED; + if (src & DN_NOERROR) + dst |= DNOLD_NOERROR; + if (src & DN_HAS_PROFILE) + dst |= DNOLD_HAS_PROFILE; + if (src & DN_QSIZE_BYTES) + dst |= DNOLD_QSIZE_IS_BYTES; + + return dst; +} + +static int +dn_compat_del(void *v) +{ + struct dn_pipe7 *p = (struct dn_pipe7 *) v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + + /* XXX DN_API_VERSION ??? */ + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + + if (is7) { + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL; + } else { + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) + return EINVAL; + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) + return EINVAL; + } + + if (p->pipe_nr != 0) { /* pipe x delete */ + cmd.a[0] = p->pipe_nr; + cmd.oid.subtype = DN_LINK; + } else { /* queue x delete */ + cmd.oid.subtype = DN_FS; + cmd.a[0] = (is7) ? 
p->fs.fs_nr : p8->fs.fs_nr; + } + + return do_config(&cmd, cmd.oid.len); +} + +static int +dn_compat_config_queue(struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + struct dn_flow_set *f; + + if (is7) + f = &p7->fs; + else + f = &p8->fs; + + fs->fs_nr = f->fs_nr; + fs->sched_nr = f->parent_nr; + fs->flow_mask = f->flow_mask; + fs->buckets = f->rq_size; + fs->qsize = f->qsize; + fs->plr = f->plr; + fs->par[0] = f->weight; + fs->flags = convertflags2new(f->flags_fs); + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { + fs->w_q = f->w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->max_p; + } + + return 0; +} + +static int +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, + struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + int i = p7->pipe_nr; + + sch->sched_nr = i; + sch->oid.subtype = 0; + p->link_nr = i; + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Common to 7 and 8 */ + p->bandwidth = p7->bandwidth; + p->delay = p7->delay; + if (!is7) { + /* FreeBSD 8 has burst */ + p->burst = p8->burst; + } + + /* fill the fifo flowset */ + dn_compat_config_queue(fs, v); + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Move scheduler related parameter from fs to sch */ + sch->buckets = fs->buckets; /*XXX*/ + fs->buckets = 0; + if (fs->flags & DN_HAVE_MASK) { + sch->flags |= DN_HAVE_MASK; + fs->flags &= ~DN_HAVE_MASK; + sch->sched_mask = fs->flow_mask; + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); + } + + return 0; +} + +static int +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, + void *v) +{ + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); + + pf->link_nr = p->link_nr; + pf->loss_level = p8->loss_level; +// pf->bandwidth = p->bandwidth; //XXX bandwidth 
redundant? + pf->samples_no = p8->samples_no; + strncpy(pf->name, p8->name,sizeof(pf->name)); + bcopy(p8->samples, pf->samples, sizeof(pf->samples)); + + return 0; +} + +/* + * If p->pipe_nr != 0 the command is 'pipe x config', so need to create + * the three main struct, else only a flowset is created + */ +static int +dn_compat_configure(void *v) +{ + struct dn_id *buf = NULL, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + int lmax; + int error; + + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + int i; /* number of object to configure */ + + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); + + base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK|M_ZERO); + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + /* pipe_nr is the same in p7 and p8 */ + i = p7->pipe_nr; + if (i != 0) { /* pipe config */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + error = dn_compat_config_pipe(sch, p, fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + if (!is7 && p8->samples_no > 0) { + /* Add profiles*/ + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + error = dn_compat_config_profile(pf, p, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + } else { /* queue config */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + error = dn_compat_config_queue(fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + error = do_config(base, (char *)buf - (char *)base); + + if (buf) + free(buf, M_DUMMYNET); + return error; +} + +int +dn_compat_calc_size(void) +{ + int need = 0; + /* XXX use FreeBSD 8 struct size */ + /* NOTE: + * - half scheduler: schk_count/2 + * - all flowset: fsk_count + * - all flowset 
queues: queue_count + * - all pipe queue: si_count + */ + need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; + need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); + need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); + need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); + + return need; +} + +int +dn_c_copy_q (void *_ni, void *arg) +{ + struct copy_args *a = arg; + struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; + struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; + struct dn_flow *ni = (struct dn_flow *)_ni; + int size = 0; + + /* XXX hash slot not set */ + /* No difference between 7.2/8 */ + fq7->len = ni->length; + fq7->len_bytes = ni->len_bytes; + fq7->id = ni->fid; + + if (is7) { + size = sizeof(struct dn_flow_queue7); + fq7->tot_pkts = ni->tot_pkts; + fq7->tot_bytes = ni->tot_bytes; + fq7->drops = ni->drops; + } else { + size = sizeof(struct dn_flow_queue8); + fq8->tot_pkts = ni->tot_pkts; + fq8->tot_bytes = ni->tot_bytes; + fq8->drops = ni->drops; + } + + *a->start += size; + return 0; +} + +int +dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) +{ + struct dn_link *l = &s->link; + struct dn_fsk *f = s->fs; + + struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; + struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; + struct dn_flow_set *fs; + int size = 0; + + if (is7) { + fs = &pipe7->fs; + size = sizeof(struct dn_pipe7); + } else { + fs = &pipe8->fs; + size = sizeof(struct dn_pipe8); + } + + /* These 4 field are the same in pipe7 and pipe8 */ + pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; + pipe7->bandwidth = l->bandwidth; + pipe7->delay = l->delay * 1000 / hz; + pipe7->pipe_nr = l->link_nr - DN_MAX_ID; + + if (!is7) { + if (s->profile) { + struct dn_profile *pf = s->profile; + strncpy(pipe8->name, pf->name, sizeof(pf->name)); + pipe8->loss_level = pf->loss_level; + pipe8->samples_no = pf->samples_no; + } + pipe8->burst = div64(l->burst , 8 * hz); + } + + fs->flow_mask = 
s->sch.sched_mask; + fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; + + fs->parent_nr = l->link_nr - DN_MAX_ID; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->rq_elements = nq; + + fs->flags_fs = convertflags2old(f->fs.flags); + + *a->start += size; + return 0; +} + + +int +dn_compat_copy_pipe(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int pipe_size = sizeof(struct dn_pipe8); + int queue_size = sizeof(struct dn_flow_queue8); + int n_queue = 0; /* number of queues */ + + struct dn_schk *s = (struct dn_schk *)_o; + /* calculate needed space: + * - struct dn_pipe + * - if there are instances, dn_queue * n_instances + */ + n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : + (s->siht ? 1 : 0)); + need = pipe_size + queue_size * n_queue; + if (have < need) { + D("have %d < need %d", have, need); + return 1; + } + /* copy pipe */ + dn_c_copy_pipe(s, a, n_queue); + + /* copy queues */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, dn_c_copy_q, a); + else if (s->siht) + dn_c_copy_q(s->siht, a); + return 0; +} + +int +dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) +{ + struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; + + fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; + fs->fs_nr = f->fs.fs_nr; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->flow_mask = f->fs.flow_mask; + fs->rq_elements = nq; + fs->rq_size = (f->fs.buckets ? 
f->fs.buckets : 1); + fs->parent_nr = f->fs.sched_nr; + fs->weight = f->fs.par[0]; + + fs->flags_fs = convertflags2old(f->fs.flags); + *a->start += sizeof(struct dn_flow_set); + return 0; +} + +int +dn_compat_copy_queue(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int fs_size = sizeof(struct dn_flow_set); + int queue_size = sizeof(struct dn_flow_queue8); + + struct dn_fsk *fs = (struct dn_fsk *)_o; + int n_queue = 0; /* number of queues */ + + n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : + (fs->qht ? 1 : 0)); + + need = fs_size + queue_size * n_queue; + if (have < need) { + D("have < need"); + return 1; + } + + /* copy flowset */ + dn_c_copy_fs(fs, a, n_queue); + + /* copy queues */ + if (fs->fs.flags & DN_HAVE_MASK) + dn_ht_scan(fs->qht, dn_c_copy_q, a); + else if (fs->qht) + dn_c_copy_q(fs->qht, a); + + return 0; +} + +int +copy_data_helper_compat(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + + if (a->type == DN_COMPAT_PIPE) { + struct dn_schk *s = _o; + if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { + return 0; /* not old type */ + } + /* copy pipe parameters, and if instance exists, copy + * other parameters and eventually queues. + */ + if(dn_compat_copy_pipe(a, _o)) + return DNHT_SCAN_END; + } else if (a->type == DN_COMPAT_QUEUE) { + struct dn_fsk *fs = _o; + if (fs->fs.fs_nr >= DN_MAX_ID) + return 0; + if (dn_compat_copy_queue(a, _o)) + return DNHT_SCAN_END; + } + return 0; +} + +/* Main function to manage old requests */ +int +ip_dummynet_compat(struct sockopt *sopt) +{ + int error=0; + void *v = NULL; + struct dn_id oid; + + /* Lenght of data, used to found ipfw version... 
*/ + int len = sopt->sopt_valsize; + + /* len can be 0 if command was dummynet_flush */ + if (len == pipesize7) { + D("setting compatibility with FreeBSD 7.2"); + is7 = 1; + } + else if (len == pipesize8 || len == pipesizemax8) { + D("setting compatibility with FreeBSD 8"); + is7 = 0; + } + + switch (sopt->sopt_name) { + default: + printf("dummynet: -- unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_config(&oid, oid.len); + break; + + case IP_DUMMYNET_DEL: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_del(v); + free(v, M_TEMP); + break; + + case IP_DUMMYNET_CONFIGURE: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_configure(v); + free(v, M_TEMP); + break; + + case IP_DUMMYNET_GET: { + void *buf; + int ret; + int original_size = sopt->sopt_valsize; + int size; + + ret = dummynet_get(sopt, &buf); + if (ret) + return 0;//XXX ? + size = sopt->sopt_valsize; + sopt->sopt_valsize = original_size; + D("size=%d, buf=%p", size, buf); + ret = sooptcopyout(sopt, buf, size); + if (ret) + printf(" %s ERROR sooptcopyout\n", __FUNCTION__); + if (buf) + free(buf, M_DUMMYNET); + } + } + + return error; +} + + diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dn_io.c b/example/ipfw/sys/netpfil/ipfw/ip_dn_io.c new file mode 100644 index 0000000..6211221 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_dn_io.c @@ -0,0 +1,960 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Dummynet portions related to packet handling. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_dn_io.c 272089 2014-09-25 02:26:05Z sbruno $"); + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/sysctl.h> + +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <net/netisr.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/if_ether.h> /* various ether_* routines */ +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ +#include <netinet6/ip6_var.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) + * instead of dn_cfg.curr_time + */ + +struct dn_parms dn_cfg; +//VNET_DEFINE(struct dn_parms, _base_dn_cfg); + +static long tick_last; /* Last tick duration (usec). */ +static long tick_delta; /* Last vs standard tick diff (usec). */ +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ +static long tick_adjustment; /* Tick adjustments done. */ +static long tick_lost; /* Lost(coalesced) ticks number. */ +/* Adjusted vs non-adjusted curr_time difference (ticks). */ +static long tick_diff; + +static unsigned long io_pkt; +static unsigned long io_pkt_fast; +static unsigned long io_pkt_drop; + +/* + * We use a heap to store entities for which we have pending timer events. 
+ * The heap is checked at every tick and all entities with expired events + * are extracted. + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#ifdef SYSCTL_NODE + +/* + * Because of the way the SYSBEGIN/SYSEND macros work on other + * platforms, there should not be functions between them. + * So keep the handlers outside the block. + * + * sysctl_hash_size() exports dn_cfg.hash_size and stores a new value + * only when it lies in [16, 65536]; anything else yields EINVAL. + * sysctl_limits() backs two OIDs: arg2 != 0 selects dn_cfg.slot_limit + * (minimum 1), arg2 == 0 selects dn_cfg.byte_limit (minimum 1500). + * Both work on a local copy, so a rejected value never reaches dn_cfg, + * and both return early when sysctl_handle_*() fails or req->newptr + * is NULL (presumably a read-only request -- TODO confirm). + */ +static int +sysctl_hash_size(SYSCTL_HANDLER_ARGS) +{ + int error, value; + + value = dn_cfg.hash_size; + error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 16 || value > 65536) + return (EINVAL); + dn_cfg.hash_size = value; + return (0); +} + +static int +sysctl_limits(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (arg2 != 0) + value = dn_cfg.slot_limit; + else + value = dn_cfg.byte_limit; + error = sysctl_handle_long(oidp, &value, 0, req); + + if (error != 0 || req->newptr == NULL) + return (error); + if (arg2 != 0) { + if (value < 1) + return (EINVAL); + dn_cfg.slot_limit = value; + } else { + if (value < 1500) + return (EINVAL); + dn_cfg.byte_limit = value; + } + return (0); +} + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); + +/* wrapper to pass dn_cfg fields to SYSCTL_* */ +//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) +#define DC(x) (&(dn_cfg.x)) +/* parameters; the integer right before each handler name is presumably what the handler sees as arg2 -- TODO confirm */ + + +SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, + "I", "Default hash table size"); + + +SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, + CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, + "L", "Upper limit in slots for pipe queue."); +SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, + CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, + "L", "Upper limit in bytes for pipe queue."); 
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, + CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, + CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); + +/* RED parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); + +/* time adjustment */ +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, + CTLFLAG_RD, &tick_diff, 0, + "Adjusted vs non-adjusted curr_time difference (ticks)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, + CTLFLAG_RD, &tick_lost, 0, + "Number of ticks coalesced by dummynet taskqueue."); + +/* Drain parameters */ +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); +SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, + CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); + +/* statistics */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, + CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, + CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, + CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); 
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, + CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, + CTLFLAG_RD, &io_pkt, 0, + "Number of packets passed to dummynet."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, + CTLFLAG_RD, &io_pkt_fast, 0, + "Number of packets bypassed dummynet scheduler."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, + CTLFLAG_RD, &io_pkt_drop, 0, + "Number of packets dropped by dummynet."); +#undef DC +SYSEND + +#endif + +static void dummynet_send(struct mbuf *); + +/* + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. + * Outside dummynet, only the 'rule' field is relevant, and it must + * be at the beginning of the structure. + */ +struct dn_pkt_tag { + struct ipfw_rule_ref rule; /* matching rule */ + + /* second part, dummynet specific */ + int dn_dir; /* action when packet comes out.*/ + /* see ip_fw_private.h */ + uint64_t output_time; /* when the pkt is due for delivery*/ + struct ifnet *ifp; /* interface, for ip_output */ +// struct _ip6dn_args ip6opt; /* XXX ipv6 options, 192 bytes */ +}; + +/* + * Return the mbuf tag holding the dummynet state (it should + * be the first one on the list). 
+ * The dn_pkt_tag data lives right behind the m_tag header, hence the + * (mtag+1) in the return statement. + */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); + KASSERT(mtag != NULL && + mtag->m_tag_cookie == MTAG_ABI_COMPAT && + mtag->m_tag_id == PACKET_TAG_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); + return (struct dn_pkt_tag *)(mtag+1); +} + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ /* Append m at the tail of q, bump q->count and clear m->m_nextpkt. */ +#ifdef USERSPACE + // buffers from netmap need to be copied + // XXX note that the routine is not expected to fail + ND("append %p to %p", m, q); + if (m->m_flags & M_STACK) { + struct mbuf *m_new; + void *p; + int l, ofs; + + ofs = m->m_data - m->__m_extbuf; + // XXX allocate + MGETHDR(m_new, M_NOWAIT, MT_DATA); /* NOTE(review): m_new is dereferenced below without a NULL check -- confirm MGETHDR cannot fail here */ + ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", + m, m->__m_extbuf, m->__m_extlen, ofs, m_new); + p = m_new->__m_extbuf; /* new pointer */ + l = m_new->__m_extlen; /* new len */ + if (l <= m->__m_extlen) { + panic("extlen too large"); + } + + *m_new = *m; // copy + m_new->m_flags &= ~M_STACK; + m_new->__m_extbuf = p; // point to new buffer + _pkt_copy(m->__m_extbuf, p, m->__m_extlen); + m_new->m_data = p + ofs; + m = m_new; + } +#endif /* USERSPACE */ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->count++; + q->tail = m; + m->m_nextpkt = NULL; +} + +/* + * Dispose of a list of packets. Use a function so that, if we need to do + * more work, this is the central point to do it. + */ +void dn_free_pkts(struct mbuf *mnext) +{ + struct mbuf *m; + + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + FREE_PKT(m); + } +} + +static int +red_drops (struct dn_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or packets). 
+ * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + */ + + struct dn_fsk *fs = q->fs; + int64_t p_b = 0; + + /* Queue in bytes or packets? */ + uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* Average queue size estimation. */ + if (q_size != 0) { + /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); + + q->avg += (int)v; + } else { + /* + * Queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + + /* Should i drop? 
*/ + if (q->avg < fs->min_th) { + q->count = -1; + return (0); /* accept packet */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->fs.flags & DN_IS_ECN) + return (1); + if (fs->fs.flags & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than + * max_th the packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th + * c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - + fs->c_4; + } else { + q->count = -1; + return (1); + } + } else if (q->avg > fs->min_th) { + if (fs->fs.flags & DN_IS_ECN) + return (1); + /* + * We compute p_b using the linear dropping function + * p_b = c_1 * avg - c_2 + * where c_1 = max_p / (max_th - min_th) + * c_2 = max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; + } + + if (fs->fs.flags & DN_QSIZE_BYTES) + p_b = div64((p_b * len) , fs->max_pkt_size); + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { + q->count = 0; + /* After a drop we calculate a new random value. */ + q->random = random() & 0xffff; + return (1); /* drop */ + } + } + /* End of RED algorithm. 
*/ + + return (0); /* accept */ + +} + +/* + * ECN/ECT Processing (partially adopted from altq) + */ +static int +ecn_mark(struct mbuf* m) +{ + struct ip *ip; + ip = mtod(m, struct ip *); + + switch (ip->ip_v) { + case IPVERSION: + { + u_int8_t otos; + int sum; + + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); /* not-ECT */ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (1); /* already marked */ + + /* + * ecn-capable but not marked, + * mark CE and update checksum + */ + otos = ip->ip_tos; + ip->ip_tos |= IPTOS_ECN_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + return (1); + } +#ifdef INET6 + case (IPV6_VERSION >> 4): + { + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_NOTECT << 20)) + return (0); /* not-ECT */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_CE << 20)) + return (1); /* already marked */ + /* + * ecn-capable but not marked, mark CE + */ + flowlabel |= (IPTOS_ECN_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } +#endif + } + return (0); +} + +/* + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. 
+ * + * A non-zero 'drop' forces the drop path once the statistics have been + * updated. Otherwise the packet is dropped on random loss (f->plr), on + * a RED verdict that cannot be replaced by an ECN mark, or when the + * backlog exceeds f->qsize bytes or has reached f->qsize packets. + */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + struct dn_fs *f; + struct dn_flow *ni; /* stats for scheduler instance */ + uint64_t len; + + if (q->fs == NULL || q->_si == NULL) { /* this early exit touches no counters */ + printf("%s fs %p si %p, dropping\n", + __FUNCTION__, q->fs, q->_si); + FREE_PKT(m); + return 1; + } + f = &(q->fs->fs); + ni = &q->_si->ni; + len = m->m_pkthdr.len; + /* Update statistics, then check reasons to drop pkt; the totals therefore include dropped packets. */ + q->ni.tot_bytes += len; + q->ni.tot_pkts++; + ni->tot_bytes += len; + ni->tot_pkts++; + if (drop) + goto drop; + if (f->plr && random() < f->plr) + goto drop; + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { + if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) /* no ECN, or marking not possible */ + goto drop; + } + if (f->flags & DN_QSIZE_BYTES) { + if (q->ni.len_bytes > f->qsize) /* backlog before this packet is added */ + goto drop; + } else if (q->ni.length >= f->qsize) { + goto drop; + } + mq_append(&q->mq, m); + q->ni.length++; + q->ni.len_bytes += len; + ni->length++; + ni->len_bytes += len; + return (0); + +drop: + io_pkt_drop++; + q->ni.drops++; + ni->drops++; + FREE_PKT(m); + return (1); +} + +/* + * Fetch packets from the delay line which are due now. If there are + * leftover packets, reinsert the delay line in the heap. + * Runs under scheduler lock. + * Due packets are appended to q; the heap key used for reinsertion is + * the output_time of the first packet that is not yet due. + */ +static void +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) +{ + struct mbuf *m; + struct dn_pkt_tag *pkt = NULL; + + dline->oid.subtype = 0; /* not in heap */ + while ((m = dline->mq.head) != NULL) { + pkt = dn_tag_get(m); + if (!DN_KEY_LEQ(pkt->output_time, now)) + break; + dline->mq.head = m->m_nextpkt; + dline->mq.count--; + mq_append(q, m); + } + if (m != NULL) { /* loop stopped early: pkt is the tag of the first packet not yet due */ + dline->oid.subtype = 1; /* in heap */ + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); + } +} + +/* + * Convert the additional MAC overheads/delays into an equivalent + * number of bits for the given data rate. The samples are + * in milliseconds so we need to divide by 1000. 
+ * + * Returns 0 when the scheduler has no profile or the profile holds no + * samples. A random sample index at or above pf->loss_level also marks + * the packet's tag with DIR_DROP. + */ +static uint64_t +extra_bits(struct mbuf *m, struct dn_schk *s) +{ + int index; + uint64_t bits; + struct dn_profile *pf = s->profile; + + if (!pf || pf->samples_no == 0) + return 0; + index = random() % pf->samples_no; + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); + if (index >= pf->loss_level) { + struct dn_pkt_tag *dt = dn_tag_get(m); + if (dt) + dt->dn_dir = DIR_DROP; + } + return bits; +} + +/* + * Send traffic from a scheduler instance due by 'now'. + * Return a pointer to the head of the queue. + * + * q collects the packets leaving the delay line; when q is NULL a + * function-local queue is used instead and its head is returned. + * The local queue is fully initialized because mq_append() reads and + * increments its count member. + */ +static struct mbuf * +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) +{ + struct mq def_q = { NULL, NULL }; /* remaining members are zeroed as well */ + struct dn_schk *s = si->sched; + struct mbuf *m = NULL; + int delay_line_idle = (si->dline.mq.head == NULL); + int done, bw; + + if (q == NULL) { + q = &def_q; + q->head = NULL; + } + + bw = s->link.bandwidth; + si->kflags &= ~DN_ACTIVE; + + if (bw > 0) + si->credit += (now - si->sched_time) * bw; + else + si->credit = 0; + si->sched_time = now; + done = 0; + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { + uint64_t len_scaled; + + done++; + len_scaled = (bw == 0) ? 0 : hz * + (m->m_pkthdr.len * 8 + extra_bits(m, s)); + si->credit -= len_scaled; + /* Move packet in the delay line */ + dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; + mq_append(&si->dline.mq, m); + } + + /* + * If credit >= 0 the instance is idle, mark time. + * Otherwise put back in the heap, and adjust the output + * time of the last inserted packet, m, which was too early. + */ + if (si->credit >= 0) { + si->idle_time = now; + } else { + uint64_t t; + KASSERT (bw > 0, ("bw=0 and credit<0 ?")); + t = div64(bw - 1 - si->credit, bw); + if (m) + dn_tag_get(m)->output_time += t; + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now + t, si); + } + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return q->head; +} + +/* + * The timer handler for dummynet. 
Time is computed in ticks, but + * but the code is tolerant to the actual rate at which this is called. + * Once complete, the function reschedules itself for the next tick. + */ +void +dummynet_task(void *context, int pending) +{ + struct timeval t; + struct mq q = { NULL, NULL }; /* queue to accumulate results */ + + CURVNET_SET((struct vnet *)context); + + DN_BH_WLOCK(); + + /* Update number of lost(coalesced) ticks. */ + tick_lost += pending - 1; + + getmicrouptime(&t); + /* Last tick duration (usec). */ + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + + (t.tv_usec - dn_cfg.prev_t.tv_usec); + /* Last tick vs standard tick difference (usec). */ + tick_delta = (tick_last * hz - 1000000) / hz; + /* Accumulated tick difference (usec). */ + tick_delta_sum += tick_delta; + + dn_cfg.prev_t = t; + + /* + * Adjust curr_time if the accumulated tick difference is + * greater than the 'standard' tick. Since curr_time should + * be monotonically increasing, we do positive adjustments + * as required, and throttle curr_time in case of negative + * adjustment. 
+ */ + dn_cfg.curr_time++; + if (tick_delta_sum - tick >= 0) { + int diff = tick_delta_sum / tick; + + dn_cfg.curr_time += diff; + tick_diff += diff; + tick_delta_sum %= tick; + tick_adjustment++; + } else if (tick_delta_sum + tick <= 0) { + dn_cfg.curr_time--; + tick_diff--; + tick_delta_sum += tick; + tick_adjustment++; + } + + /* serve pending events, accumulate in q */ + for (;;) { + struct dn_id *p; /* generic parameter to handler */ + + if (dn_cfg.evheap.elements == 0 || + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) + break; + p = HEAP_TOP(&dn_cfg.evheap)->object; + heap_extract(&dn_cfg.evheap, NULL); + + if (p->type == DN_SCH_I) { + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); + } else { /* extracted a delay line */ + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); + } + } + if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { + dn_cfg.expire_cycle = 0; + dn_drain_scheduler(); + dn_drain_queue(); + } + + DN_BH_WUNLOCK(); + dn_reschedule(); + if (q.head != NULL) + dummynet_send(q.head); + CURVNET_RESTORE(); +} + +/* + * forward a chain of packets to the proper destination. + * This runs outside the dummynet lock. + */ +static void +dummynet_send(struct mbuf *m) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ + struct m_tag *tag; + int dst; + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + tag = m_tag_first(m); + if (tag == NULL) { /* should not happen */ + dst = DIR_DROP; + } else { + struct dn_pkt_tag *pkt = dn_tag_get(m); + /* extract the dummynet info, rename the tag + * to carry reinject info. 
+ */ + if (pkt->dn_dir == (DIR_OUT | PROTO_LAYER2) && + pkt->ifp == NULL) { + dst = DIR_DROP; + } else { + dst = pkt->dn_dir; + ifp = pkt->ifp; + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + } + } + + switch (dst) { + case DIR_OUT: + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); + break ; + + case DIR_IN : + netisr_dispatch(NETISR_IP, m); + break; + +#ifdef INET6 + case DIR_IN | PROTO_IPV6: + netisr_dispatch(NETISR_IPV6, m); + break; + + case DIR_OUT | PROTO_IPV6: + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); + break; +#endif + + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ + if (bridge_dn_p != NULL) + ((*bridge_dn_p)(m, ifp)); + else + printf("dummynet: if_bridge not loaded\n"); + + break; + + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ + /* + * The Ethernet code assumes the Ethernet header is + * contiguous in the first mbuf header. + * Insure this is true. + */ + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/ether: pullup failed, " + "dropping packet\n"); + break; + } + ether_demux(m->m_pkthdr.rcvif, m); + break; + + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ + ether_output_frame(ifp, m); + break; + + case DIR_DROP: + /* drop the packet after some time */ + FREE_PKT(m); + break; + + default: + printf("dummynet: bad switch %d!\n", dst); + FREE_PKT(m); + break; + } + } +} + +static inline int +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt_tag *dt; + struct m_tag *mtag; + + mtag = m_tag_get(PACKET_TAG_DUMMYNET, + sizeof(*dt), M_NOWAIT | M_ZERO); + if (mtag == NULL) + return 1; /* Cannot allocate packet header. */ + m_tag_prepend(m, mtag); /* Attach to mbuf chain. 
*/ + dt = (struct dn_pkt_tag *)(mtag + 1); + dt->rule = fwa->rule; + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ + dt->dn_dir = dir; + dt->ifp = fwa->oif; + /* dt->output tame is updated as we move through */ + dt->output_time = dn_cfg.curr_time; + return 0; +} + + +/* + * dummynet hook for packets. + * We use the argument to locate the flowset fs and the sched_set sch + * associated to it. The we apply flow_mask and sched_mask to + * determine the queue and scheduler instances. + * + * dir where shall we send the packet after dummynet. + * *m0 the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + */ +int +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct dn_fsk *fs = NULL; + struct dn_sch_inst *si; + struct dn_queue *q = NULL; /* default */ + + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); + DN_BH_WLOCK(); + io_pkt++; + /* we could actually tag outside the lock, but who cares... */ + if (tag_mbuf(m, dir, fwa)) + goto dropit; + if (dn_cfg.busy) { + /* if the upper half is busy doing something expensive, + * lets queue the packet and move forward + */ + mq_append(&dn_cfg.pending, m); + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + /* XXX locate_flowset could be optimised with a direct ref. */ + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); + if (fs == NULL) + goto dropit; /* This queue/pipe does not exist! */ + if (fs->sched == NULL) /* should not happen */ + goto dropit; + /* find scheduler instance, possibly applying sched_mask */ + si = ipdn_si_find(fs->sched, &(fwa->f_id)); + if (si == NULL) + goto dropit; + /* + * If the scheduler supports multiple queues, find the right one + * (otherwise it will be ignored by enqueue). 
+ */ + if (fs->sched->fp->flags & DN_MULTIQUEUE) { + q = ipdn_q_find(fs, si, &(fwa->f_id)); + if (q == NULL) + goto dropit; + } + if (fs->sched->fp->enqueue(si, q, m)) { + /* packet was dropped by enqueue() */ + m = *m0 = NULL; + goto dropit; + } + + if (si->kflags & DN_ACTIVE) { + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + + /* compute the initial allowance */ + if (si->idle_time < dn_cfg.curr_time) { + /* Do this only on the first packet on an idle pipe */ + struct dn_link *p = &fs->sched->link; + + si->sched_time = dn_cfg.curr_time; + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; + if (p->burst) { + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; + if (burst > p->burst) + burst = p->burst; + si->credit += burst; + } + } + /* pass through scheduler and delay line */ + m = serve_sched(NULL, si, dn_cfg.curr_time); + + /* optimization -- pass it back to ipfw for immediate send */ + /* XXX Don't call dummynet_send() if scheduler return the packet + * just enqueued. This avoid a lock order reversal. + * + */ + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { + /* fast io, rename the tag * to carry reinject info. */ + struct m_tag *tag = m_tag_first(m); + + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + io_pkt_fast++; + if (m->m_nextpkt != NULL) { + printf("dummynet: fast io: pkt chain detected!\n"); + m->m_nextpkt = NULL; + } + m = NULL; + } else { + *m0 = NULL; + } +done: + DN_BH_WUNLOCK(); + if (m) + dummynet_send(m); + return 0; + +dropit: + io_pkt_drop++; + DN_BH_WUNLOCK(); + if (m) + FREE_PKT(m); + *m0 = NULL; + return (fs && (fs->fs.flags & DN_NOERROR)) ? 
0 : ENOBUFS; +} diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dn_private.h b/example/ipfw/sys/netpfil/ipfw/ip_dn_private.h new file mode 100644 index 0000000..fdd7448 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_dn_private.h @@ -0,0 +1,404 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * internal dummynet APIs. + * + * $FreeBSD: head/sys/netpfil/ipfw/ip_dn_private.h 258467 2013-11-22 05:02:37Z luigi $ + */ + +#ifndef _IP_DN_PRIVATE_H +#define _IP_DN_PRIVATE_H + +/* debugging support + * use ND() to remove debugging, D() to print a line, + * DX(level, ...) 
to print above a certain level + * If you redefine D() you are expected to redefine all. + */ +#ifndef D +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) printf("%-10s " fmt "\n", \ + __FUNCTION__, ## __VA_ARGS__) +#define DX(lev, fmt, ...) do { \ + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) +#endif + +MALLOC_DECLARE(M_DUMMYNET); + +#ifndef __linux__ +#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) +#endif + +#define DN_LOCK_INIT() do { \ + mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ + mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ + } while (0) +#define DN_LOCK_DESTROY() do { \ + mtx_destroy(&dn_cfg.uh_mtx); \ + mtx_destroy(&dn_cfg.bh_mtx); \ + } while (0) +#if 0 /* not used yet */ +#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) +#endif + +#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) + +SLIST_HEAD(dn_schk_head, dn_schk); +SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); +SLIST_HEAD(dn_fsk_head, dn_fsk); +SLIST_HEAD(dn_queue_head, dn_queue); +SLIST_HEAD(dn_alg_head, dn_alg); + +struct mq { /* a basic queue of packets*/ + struct mbuf *head, *tail; + int count; +}; + +static inline void +set_oid(struct dn_id *o, int type, int len) +{ + o->type = type; + o->len = len; + o->subtype = 0; +} + +/* + * configuration and global data for a dummynet instance + * + * When a configuration is modified from userland, 'id' is incremented + * so we can use the value to check for stale pointers. 
+ */ +struct dn_parms { + uint32_t id; /* configuration version */ + + /* defaults (sysctl-accessible) */ + int red_lookup_depth; + int red_avg_pkt_size; + int red_max_pkt_size; + int hash_size; + int max_hash_size; + long byte_limit; /* max queue sizes */ + long slot_limit; + + int io_fast; + int debug; + + /* timekeeping */ + struct timeval prev_t; /* last time dummynet_tick ran */ + struct dn_heap evheap; /* scheduled events */ + + /* counters of objects -- used for reporting space */ + int schk_count; + int si_count; + int fsk_count; + int queue_count; + + /* ticks and other stuff */ + uint64_t curr_time; + /* flowsets and schedulers are in hash tables, with 'hash_size' + * buckets. fshash is looked up at every packet arrival + * so better be generous if we expect many entries. + */ + struct dn_ht *fshash; + struct dn_ht *schedhash; + /* list of flowsets without a scheduler -- use sch_chain */ + struct dn_fsk_head fsu; /* list of unlinked flowsets */ + struct dn_alg_head schedlist; /* list of algorithms */ + + /* Store the fs/sch to scan when draining. The value is the + * bucket number of the hash table. Expire can be disabled + * with net.inet.ip.dummynet.expire=0, or it happens every + * expire ticks. + **/ + int drain_fs; + int drain_sch; + uint32_t expire; + uint32_t expire_cycle; /* tick count */ + + int init_done; + + /* if the upper half is busy doing something long, + * can set the busy flag and we will enqueue packets in + * a queue for later processing. + */ + int busy; + struct mq pending; + +#ifdef _KERNEL + /* + * This file is normally used in the kernel, unless we do + * some userland tests, in which case we do not need a mtx. + * uh_mtx arbitrates between system calls and also + * protects fshash, schedhash and fsunlinked. + * These structures are readonly for the lower half. 
+ * bh_mtx protects all other structures which may be + * modified upon packet arrivals + */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_mtx; + spinlock_t bh_mtx; +#else + struct mtx uh_mtx; + struct mtx bh_mtx; +#endif + +#endif /* _KERNEL */ +}; + +/* + * Delay line, contains all packets on output from a link. + * Every scheduler instance has one. + */ +struct delay_line { + struct dn_id oid; + struct dn_sch_inst *si; + struct mq mq; +}; + +/* + * The kernel side of a flowset. It is linked in a hash table + * of flowsets, and in a list of children of their parent scheduler. + * qht is either the queue or (if HAVE_MASK) a hash table queues. + * Note that the mask to use is the (flow_mask|sched_mask), which + * changes as we attach/detach schedulers. So we store it here. + * + * XXX If we want to add scheduler-specific parameters, we need to + * put them in external storage because the scheduler may not be + * available when the fsk is created. + */ +struct dn_fsk { /* kernel side of a flowset */ + struct dn_fs fs; + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ + + struct ipfw_flow_id fsk_mask; + + /* qht is a hash table of queues, or just a single queue + * a bit in fs.flags tells us which one + */ + struct dn_ht *qht; + struct dn_schk *sched; /* Sched we are linked to */ + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ + + /* bucket index used by drain routine to drain queues for this + * flowset + */ + int drain_bucket; + /* Parameter realted to RED / GRED */ + /* original values are in dn_fs*/ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ 
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; + +/* + * A queue is created as a child of a flowset unless it belongs to + * a !MULTIQUEUE scheduler. It is normally in a hash table in the + * flowset. fs always points to the parent flowset. + * si normally points to the sch_inst, unless the flowset has been + * detached from the scheduler -- in this case si == NULL and we + * should not enqueue. + */ +struct dn_queue { + struct dn_flow ni; /* oid, flow_id, stats */ + struct mq mq; /* packets queue */ + struct dn_sch_inst *_si; /* owner scheduler instance */ + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ + struct dn_fsk *fs; /* parent flowset. */ + + /* RED parameters */ + int avg; /* average queue length est. (scaled) */ + int count; /* arrivals since last RED drop */ + int random; /* random value (scaled) */ + uint64_t q_time; /* start of queue idle time */ + +}; + +/* + * The kernel side of a scheduler. Contains the userland config, + * a link, pointer to extra config arguments from command line, + * kernel flags, and a pointer to the scheduler methods. + * It is stored in a hash table, and holds a list of all + * flowsets and scheduler instances. + * XXX sch must be at the beginning, see schk_hash(). 
+ */ +struct dn_schk { + struct dn_sch sch; + struct dn_alg *fp; /* Pointer to scheduler functions */ + struct dn_link link; /* The link, embedded */ + struct dn_profile *profile; /* delay profile, if any */ + struct dn_id *cfg; /* extra config arguments */ + + SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ + + struct dn_fsk_head fsk_list; /* all fsk linked to me */ + struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ + + /* bucket index used by the drain routine to drain the scheduler + * instance for this flowset. + */ + int drain_bucket; + + /* Hash table of all instances (through sch.sched_mask) + * or single instance if no mask. Always valid. + */ + struct dn_ht *siht; +}; + + +/* + * Scheduler instance. + * Contains variables and all queues relative to a this instance. + * This struct is created a runtime. + */ +struct dn_sch_inst { + struct dn_flow ni; /* oid, flowid and stats */ + SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ + struct delay_line dline; + struct dn_schk *sched; /* the template */ + int kflags; /* DN_ACTIVE */ + + int64_t credit; /* bits I can transmit (more or less). */ + uint64_t sched_time; /* time link was scheduled in ready_heap */ + uint64_t idle_time; /* start of scheduler instance idle time */ + + /* q_count is the number of queues that this instance is using. + * The counter is incremented or decremented when + * a reference from the queue is created or deleted. + * It is used to make sure that a scheduler instance can be safely + * deleted by the drain routine. See notes below. + */ + int q_count; + +}; + +/* + * NOTE about object drain. + * The system will automatically (XXX check when) drain queues and + * scheduler instances when they are idle. + * A queue is idle when it has no packets; an instance is idle when + * it is not in the evheap heap, and the corresponding delay line is empty. 
+ * A queue can be safely deleted when it is idle because of the scheduler + * function xxx_free_queue() will remove any references to it. + * An instance can be only deleted when no queues reference it. To be sure + * of that, a counter (q_count) stores the number of queues that are pointing + * to the instance. + * + * XXX + * Order of scan: + * - take all flowset in a bucket for the flowset hash table + * - take all queues in a bucket for the flowset + * - increment the queue bucket + * - scan next flowset bucket + * Nothing is done if a bucket contains no entries. + * + * The same schema is used for sceduler instances + */ + + +/* kernel-side flags. Linux has DN_DELETE in fcntl.h + */ +enum { + /* 1 and 2 are reserved for the SCAN flags */ + DN_DESTROY = 0x0004, /* destroy */ + DN_DELETE_FS = 0x0008, /* destroy flowset */ + DN_DETACH = 0x0010, + DN_ACTIVE = 0x0020, /* object is in evheap */ + DN_F_DLINE = 0x0040, /* object is a delay line */ + DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed + * by scheduler */ + DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ +}; + +extern struct dn_parms dn_cfg; +//VNET_DECLARE(struct dn_parms, _base_dn_cfg); +//#define dn_cfg VNET(_base_dn_cfg) + +int dummynet_io(struct mbuf **, int , struct ip_fw_args *); +void dummynet_task(void *context, int pending); +void dn_reschedule(void); + +struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, + struct ipfw_flow_id *); +struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); + +/* + * copy_range is a template for requests for ranges of pipes/queues/scheds. + * The number of ranges is variable and can be derived by o.len. + * As a default, we use a small number of entries so that the struct + * fits easily on the stack and is sufficient for most common requests. 
+ */ +#define DEFAULT_RANGES 5 +struct copy_range { + struct dn_id o; + uint32_t r[ 2 * DEFAULT_RANGES ]; +}; + +struct copy_args { + char **start; + char *end; + int flags; + int type; + struct copy_range *extra; /* extra filtering */ +}; + +struct sockopt; +int ip_dummynet_compat(struct sockopt *sopt); +int dummynet_get(struct sockopt *sopt, void **compat); +int dn_c_copy_q (void *_ni, void *arg); +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); +int dn_compat_copy_queue(struct copy_args *a, void *_o); +int dn_compat_copy_pipe(struct copy_args *a, void *_o); +int copy_data_helper_compat(void *_o, void *_arg); +int dn_compat_calc_size(void); +int do_config(void *p, int l); + +/* function to drain idle object */ +void dn_drain_scheduler(void); +void dn_drain_queue(void); + +#endif /* _IP_DN_PRIVATE_H */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_dummynet.c b/example/ipfw/sys/netpfil/ipfw/ip_dummynet.c new file mode 100644 index 0000000..a4dcb4f --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_dummynet.c @@ -0,0 +1,2334 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_dummynet.c 272840 2014-10-09 19:32:35Z melifaro $"); + +/* + * Configuration and internal object management for dummynet. + */ + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/time.h> +#include <sys/taskqueue.h> +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/dn_heap.h> +#include <netpfil/ipfw/ip_dn_private.h> +#include <netpfil/ipfw/dn_sched.h> + +/* which objects to copy */ +#define DN_C_LINK 0x01 +#define DN_C_SCH 0x02 +#define DN_C_FLOW 0x04 +#define DN_C_FS 0x08 +#define DN_C_QUEUE 0x10 + +/* we use this argument in case of a schk_new */ +struct schk_new_arg { + struct 
dn_alg *fp; + struct dn_sch *sch; +}; + +/*---- callout hooks. ----*/ +static struct callout dn_timeout; +static struct task dn_task; +static struct taskqueue *dn_tq = NULL; + +static void +dummynet(void *arg) +{ + + (void)arg; /* UNUSED */ + taskqueue_enqueue_fast(dn_tq, &dn_task); +} + +void +dn_reschedule(void) +{ + callout_reset(&dn_timeout, 1, dummynet, NULL); +} +/*----- end of callout hooks -----*/ + +/* Return a scheduler descriptor given the type or name. */ +static struct dn_alg * +find_sched_type(int type, char *name) +{ + struct dn_alg *d; + + SLIST_FOREACH(d, &dn_cfg.schedlist, next) { + if (d->type == type || (name && !strcasecmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} + +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + int oldv = *v; + const char *op = NULL; + if (dflt < lo) + dflt = lo; + if (dflt > hi) + dflt = hi; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } else + return *v; + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; +} + +/*---- flow_id mask, hash and compare functions ---*/ +/* + * The flow_id includes the 5-tuple, the queue/pipe number + * which we store in the extra area in host order, + * and for ipv6 also the flow_id6. 
+ * XXX see if we want the tos byte (can store in 'flags') + */ +static struct ipfw_flow_id * +flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) +{ + int is_v6 = IS_IP6_FLOW_ID(id); + + id->dst_port &= mask->dst_port; + id->src_port &= mask->src_port; + id->proto &= mask->proto; + id->extra &= mask->extra; + if (is_v6) { + APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); + APPLY_MASK(&id->src_ip6, &mask->src_ip6); + id->flow_id6 &= mask->flow_id6; + } else { + id->dst_ip &= mask->dst_ip; + id->src_ip &= mask->src_ip; + } + return id; +} + +/* computes an OR of two masks, result in dst and also returned */ +static struct ipfw_flow_id * +flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) +{ + int is_v6 = IS_IP6_FLOW_ID(dst); + + dst->dst_port |= src->dst_port; + dst->src_port |= src->src_port; + dst->proto |= src->proto; + dst->extra |= src->extra; + if (is_v6) { +#define OR_MASK(_d, _s) \ + (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ + (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ + (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ + (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; + OR_MASK(&dst->dst_ip6, &src->dst_ip6); + OR_MASK(&dst->src_ip6, &src->src_ip6); +#undef OR_MASK + dst->flow_id6 |= src->flow_id6; + } else { + dst->dst_ip |= src->dst_ip; + dst->src_ip |= src->src_ip; + } + return dst; +} + +static int +nonzero_mask(struct ipfw_flow_id *m) +{ + if (m->dst_port || m->src_port || m->proto || m->extra) + return 1; + if (IS_IP6_FLOW_ID(m)) { + return + m->dst_ip6.__u6_addr.__u6_addr32[0] || + m->dst_ip6.__u6_addr.__u6_addr32[1] || + m->dst_ip6.__u6_addr.__u6_addr32[2] || + m->dst_ip6.__u6_addr.__u6_addr32[3] || + m->src_ip6.__u6_addr.__u6_addr32[0] || + m->src_ip6.__u6_addr.__u6_addr32[1] || + m->src_ip6.__u6_addr.__u6_addr32[2] || + m->src_ip6.__u6_addr.__u6_addr32[3] || + m->flow_id6; + } else { + return m->dst_ip || m->src_ip; + } +} + +/* XXX we may want 
a better hash function */ +static uint32_t +flow_id_hash(struct ipfw_flow_id *id) +{ + uint32_t i; + + if (IS_IP6_FLOW_ID(id)) { + uint32_t *d = (uint32_t *)&id->dst_ip6; + uint32_t *s = (uint32_t *)&id->src_ip6; + i = (d[0] ) ^ (d[1]) ^ + (d[2] ) ^ (d[3]) ^ + (d[0] >> 15) ^ (d[1] >> 15) ^ + (d[2] >> 15) ^ (d[3] >> 15) ^ + (s[0] << 1) ^ (s[1] << 1) ^ + (s[2] << 1) ^ (s[3] << 1) ^ + (s[0] << 16) ^ (s[1] << 16) ^ + (s[2] << 16) ^ (s[3] << 16) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->extra) ^ + (id->proto ) ^ (id->flow_id6); + } else { + i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ + (id->src_ip << 1) ^ (id->src_ip >> 16) ^ + (id->extra) ^ + (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); + } + return i; +} + +/* Like bcmp, returns 0 if ids match, 1 otherwise. */ +static int +flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) +{ + int is_v6 = IS_IP6_FLOW_ID(id1); + + if (!is_v6) { + if (IS_IP6_FLOW_ID(id2)) + return 1; /* different address families */ + + return (id1->dst_ip == id2->dst_ip && + id1->src_ip == id2->src_ip && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra) ? 0 : 1; + } + /* the ipv6 case */ + return ( + !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && + !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra && + id1->flow_id6 == id2->flow_id6) ? 0 : 1; +} +/*--------- end of flow-id mask, hash and compare ---------*/ + +/*--- support functions for the qht hashtable ---- + * Entries are hashed by flow-id + */ +static uint32_t +q_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? 
+ &((struct dn_queue *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +q_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_queue *o = (struct dn_queue *)obj; + struct ipfw_flow_id *id2; + + if (flags & DNHT_KEY_IS_OBJ) { + /* compare pointers */ + id2 = &((struct dn_queue *)key)->ni.fid; + } else { + id2 = (struct ipfw_flow_id *)key; + } + return (0 == flow_id_cmp(&o->ni.fid, id2)); +} + +/* + * create a new queue instance for the given 'key'. + */ +static void * +q_new(uintptr_t key, int flags, void *arg) +{ + struct dn_queue *q, *template = arg; + struct dn_fsk *fs = template->fs; + int size = sizeof(*q) + fs->sched->fp->q_datalen; + + q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q == NULL) { + D("no memory for new queue"); + return NULL; + } + + set_oid(&q->ni.oid, DN_QUEUE, size); + if (fs->fs.flags & DN_QHT_HASH) + q->ni.fid = *(struct ipfw_flow_id *)key; + q->fs = fs; + q->_si = template->_si; + q->_si->q_count++; + + if (fs->sched->fp->new_queue) + fs->sched->fp->new_queue(q); + dn_cfg.queue_count++; + return q; +} + +/* + * Notify schedulers that a queue is going away. + * If (flags & DN_DESTROY), also free the packets. + * The version for callbacks is called q_delete_cb(). + */ +static void +dn_delete_queue(struct dn_queue *q, int flags) +{ + struct dn_fsk *fs = q->fs; + + // D("fs %p si %p\n", fs, q->_si); + /* notify the parent scheduler that the queue is going away */ + if (fs && fs->sched->fp->free_queue) + fs->sched->fp->free_queue(q); + q->_si->q_count--; + q->_si = NULL; + if (flags & DN_DESTROY) { + if (q->mq.head) + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); // safety + free(q, M_DUMMYNET); + dn_cfg.queue_count--; + } +} + +static int +q_delete_cb(void *q, void *arg) +{ + int flags = (int)(uintptr_t)arg; + dn_delete_queue(q, flags); + return (flags & DN_DESTROY) ? 
DNHT_SCAN_DEL : 0; +} + +/* + * calls dn_delete_queue/q_delete_cb on all queues, + * which notifies the parent scheduler and possibly drains packets. + * flags & DN_DESTROY: drains queues and destroy qht; + */ +static void +qht_delete(struct dn_fsk *fs, int flags) +{ + ND("fs %d start flags %d qht %p", + fs->fs.fs_nr, flags, fs->qht); + if (!fs->qht) + return; + if (fs->fs.flags & DN_QHT_HASH) { + dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); + if (flags & DN_DESTROY) { + dn_ht_free(fs->qht, 0); + fs->qht = NULL; + } + } else { + dn_delete_queue((struct dn_queue *)(fs->qht), flags); + if (flags & DN_DESTROY) + fs->qht = NULL; + } +} + +/* + * Find and possibly create the queue for a MULTIQUEUE scheduler. + * We never call it for !MULTIQUEUE (the queue is in the sch_inst). + */ +struct dn_queue * +ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, + struct ipfw_flow_id *id) +{ + struct dn_queue template; + + template._si = si; + template.fs = fs; + + if (fs->fs.flags & DN_QHT_HASH) { + struct ipfw_flow_id masked_id; + if (fs->qht == NULL) { + fs->qht = dn_ht_init(NULL, fs->fs.buckets, + offsetof(struct dn_queue, q_next), + q_hash, q_match, q_new); + if (fs->qht == NULL) + return NULL; + } + masked_id = *id; + flow_id_mask(&fs->fsk_mask, &masked_id); + return dn_ht_find(fs->qht, (uintptr_t)&masked_id, + DNHT_INSERT, &template); + } else { + if (fs->qht == NULL) + fs->qht = q_new(0, 0, &template); + return (struct dn_queue *)fs->qht; + } +} +/*--- end of queue hash table ---*/ + +/*--- support functions for the sch_inst hashtable ---- + * + * These are hashed by flow-id + */ +static uint32_t +si_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? 
+ &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +si_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_sch_inst *o = obj; + struct ipfw_flow_id *id2; + + id2 = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + return flow_id_cmp(&o->ni.fid, id2) == 0; +} + +/* + * create a new instance for the given 'key' + * Allocate memory for instance, delay line and scheduler private data. + */ +static void * +si_new(uintptr_t key, int flags, void *arg) +{ + struct dn_schk *s = arg; + struct dn_sch_inst *si; + int l = sizeof(*si) + s->fp->si_datalen; + + si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si == NULL) + goto error; + + /* Set length only for the part passed up to userland. */ + set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); + set_oid(&(si->dline.oid), DN_DELAY_LINE, + sizeof(struct delay_line)); + /* mark si and dline as outside the event queue */ + si->ni.oid.id = si->dline.oid.id = -1; + + si->sched = s; + si->dline.si = si; + + if (s->fp->new_sched && s->fp->new_sched(si)) { + D("new_sched error"); + goto error; + } + if (s->sch.flags & DN_HAVE_MASK) + si->ni.fid = *(struct ipfw_flow_id *)key; + + dn_cfg.si_count++; + return si; + +error: + if (si) { + bzero(si, sizeof(*si)); // safety + free(si, M_DUMMYNET); + } + return NULL; +} + +/* + * Callback from siht to delete all scheduler instances. Remove + * si and delay line from the system heap, destroy all queues. + * We assume that all flowset have been notified and do not + * point to us anymore. 
+ */ +static int +si_destroy(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_schk *s = si->sched; + struct delay_line *dl = &si->dline; + + if (dl->oid.subtype) /* remove delay line from event heap */ + heap_extract(&dn_cfg.evheap, dl); + dn_free_pkts(dl->mq.head); /* drain delay line */ + if (si->kflags & DN_ACTIVE) /* remove si from event heap */ + heap_extract(&dn_cfg.evheap, si); + if (s->fp->free_sched) + s->fp->free_sched(si); + bzero(si, sizeof(*si)); /* safety */ + free(si, M_DUMMYNET); + dn_cfg.si_count--; + return DNHT_SCAN_DEL; +} + +/* + * Find the scheduler instance for this packet. If we need to apply + * a mask, do on a local copy of the flow_id to preserve the original. + * Assume siht is always initialized if we have a mask. + */ +struct dn_sch_inst * +ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) +{ + + if (s->sch.flags & DN_HAVE_MASK) { + struct ipfw_flow_id id_t = *id; + flow_id_mask(&s->sch.sched_mask, &id_t); + return dn_ht_find(s->siht, (uintptr_t)&id_t, + DNHT_INSERT, s); + } + if (!s->siht) + s->siht = si_new(0, 0, s); + return (struct dn_sch_inst *)s->siht; +} + +/* callback to flush credit for the scheduler instance */ +static int +si_reset_credit(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_link *p = &si->sched->link; + + si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); + return 0; +} + +static void +schk_reset_credit(struct dn_schk *s) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_reset_credit, NULL); + else if (s->siht) + si_reset_credit(s->siht, NULL); +} +/*---- end of sch_inst hashtable ---------------------*/ + +/*------------------------------------------------------- + * flowset hash (fshash) support. Entries are hashed by fs_nr. + * New allocations are put in the fsunlinked list, from which + * they are removed when they point to a specific scheduler. 
+ */ +static uint32_t +fsk_hash(uintptr_t key, int flags, void *arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return ( (i>>8)^(i>>4)^i ); +} + +static int +fsk_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs = obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return (fs->fs.fs_nr == i); +} + +static void * +fsk_new(uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs; + + fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs) { + set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); + dn_cfg.fsk_count++; + fs->drain_bucket = 0; + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } + return fs; +} + +/* + * detach flowset from its current scheduler. Flags as follows: + * DN_DETACH removes from the fsk_list + * DN_DESTROY deletes individual queues + * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). + */ +static void +fsk_detach(struct dn_fsk *fs, int flags) +{ + if (flags & DN_DELETE_FS) + flags |= DN_DESTROY; + ND("fs %d from sched %d flags %s %s %s", + fs->fs.fs_nr, fs->fs.sched_nr, + (flags & DN_DELETE_FS) ? "DEL_FS":"", + (flags & DN_DESTROY) ? "DEL":"", + (flags & DN_DETACH) ? "DET":""); + if (flags & DN_DETACH) { /* detach from the list */ + struct dn_fsk_head *h; + h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; + SLIST_REMOVE(h, fs, dn_fsk, sch_chain); + } + /* Free the RED parameters, they will be recomputed on + * subsequent attach if needed. + */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + qht_delete(fs, flags); + if (fs->sched && fs->sched->fp->free_fsk) + fs->sched->fp->free_fsk(fs); + fs->sched = NULL; + if (flags & DN_DELETE_FS) { + bzero(fs, sizeof(*fs)); /* safety */ + free(fs, M_DUMMYNET); + dn_cfg.fsk_count--; + } else { + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } +} + +/* + * Detach or destroy all flowsets in a list. 
+ * flags specifies what to do: + * DN_DESTROY: flush all queues + * DN_DELETE_FS: DN_DESTROY + destroy flowset + * DN_DELETE_FS implies DN_DESTROY + */ +static void +fsk_detach_list(struct dn_fsk_head *h, int flags) +{ + struct dn_fsk *fs; + int n = 0; /* only for stats */ + + ND("head %p flags %x", h, flags); + while ((fs = SLIST_FIRST(h))) { + SLIST_REMOVE_HEAD(h, sch_chain); + n++; + fsk_detach(fs, flags); + } + ND("done %d flowsets", n); +} + +/* + * called on 'queue X delete' -- removes the flowset from fshash, + * deletes all queues for the flowset, and removes the flowset. + */ +static int +delete_fs(int i, int locked) +{ + struct dn_fsk *fs; + int err = 0; + + if (!locked) + DN_BH_WLOCK(); + fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); + ND("fs %d found %p", i, fs); + if (fs) { + fsk_detach(fs, DN_DETACH | DN_DELETE_FS); + err = 0; + } else + err = EINVAL; + if (!locked) + DN_BH_WUNLOCK(); + return err; +} + +/*----- end of flowset hashtable support -------------*/ + +/*------------------------------------------------------------ + * Scheduler hash. When searching by index we pass sched_nr, + * otherwise we pass struct dn_sch * which is the first field in + * struct dn_schk so we can cast between the two. We use this trick + * because in the create phase (but it should be fixed). + */ +static uint32_t +schk_hash(uintptr_t key, int flags, void *_arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return ( (i>>8)^(i>>4)^i ); +} + +static int +schk_match(void *obj, uintptr_t key, int flags, void *_arg) +{ + struct dn_schk *s = (struct dn_schk *)obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return (s->sch.sched_nr == i); +} + +/* + * Create the entry and intialize with the sched hash if needed. + * Leave s->fp unset so we can tell whether a dn_ht_find() returns + * a new object or a previously existing one. 
+ */ +static void * +schk_new(uintptr_t key, int flags, void *arg) +{ + struct schk_new_arg *a = arg; + struct dn_schk *s; + int l = sizeof(*s) +a->fp->schk_datalen; + + s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s == NULL) + return NULL; + set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); + s->sch = *a->sch; // copy initial values + s->link.link_nr = s->sch.sched_nr; + SLIST_INIT(&s->fsk_list); + /* initialize the hash table or create the single instance */ + s->fp = a->fp; /* si_new needs this */ + s->drain_bucket = 0; + if (s->sch.flags & DN_HAVE_MASK) { + s->siht = dn_ht_init(NULL, s->sch.buckets, + offsetof(struct dn_sch_inst, si_next), + si_hash, si_match, si_new); + if (s->siht == NULL) { + free(s, M_DUMMYNET); + return NULL; + } + } + s->fp = NULL; /* mark as a new scheduler */ + dn_cfg.schk_count++; + return s; +} + +/* + * Callback for sched delete. Notify all attached flowsets to + * detach from the scheduler, destroy the internal flowset, and + * all instances. The scheduler goes away too. + * arg is 0 (only detach flowsets and destroy instances) + * DN_DESTROY (detach & delete queues, delete schk) + * or DN_DELETE_FS (delete queues and flowsets, delete schk) + */ +static int +schk_delete_cb(void *obj, void *arg) +{ + struct dn_schk *s = obj; + struct dn_profile **p = &s->profile; + int i, lim = 1 /* how many profiles */; + +#if 0 + int a = (int)arg; + ND("sched %d arg %s%s", + s->sch.sched_nr, + a&DN_DESTROY ? "DEL ":"", + a&DN_DELETE_FS ? "DEL_FS":""); +#endif + fsk_detach_list(&s->fsk_list, arg ? 
DN_DESTROY : 0); + /* no more flowset pointing to us now */ + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan(s->siht, si_destroy, NULL); + dn_ht_free(s->siht, 0); + } else if (s->siht) + si_destroy(s->siht, NULL); + + for (i = 0; i < lim; i++) { + if (p[i]) { + free(p[i], M_DUMMYNET); + p[i] = NULL; + } + } + s->siht = NULL; + if (s->fp->destroy) + s->fp->destroy(s); + bzero(s, sizeof(*s)); // safety + free(obj, M_DUMMYNET); + dn_cfg.schk_count--; + return DNHT_SCAN_DEL; +} + +/* + * called on a 'sched X delete' command. Deletes a single scheduler. + * This is done by removing from the schedhash, unlinking all + * flowsets and deleting their traffic. + */ +static int +delete_schk(int i) +{ + struct dn_schk *s; + + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + ND("%d %p", i, s); + if (!s) + return EINVAL; + delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ + /* then detach flowsets, delete traffic */ + schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); + return 0; +} +/*--- end of schk hashtable support ---*/ + +static int +copy_obj(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + + if (have < o->len || o->len == 0 || o->type == 0) { + D("(WARN) type %d %s %d have %d need %d", + o->type, msg, i, have, o->len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, o->len); + bcopy(_o, *start, o->len); + if (o->type == DN_LINK) { + /* Adjust burst parameter for link */ + struct dn_link *l = (struct dn_link *)*start; + l->burst = div64(l->burst, 8 * hz); + /* convert back to milliseconds */ + l->delay = l->delay * 1000 / hz; + } else if (o->type == DN_SCH) { + /* Set id->id to the number of instances */ + struct dn_schk *s = _o; + struct dn_id *id = (struct dn_id *)(*start); + id->id = (s->sch.flags & DN_HAVE_MASK) ? + dn_ht_entries(s->siht) : (s->siht ? 1 : 0); + } + *start += o->len; + return 0; +} + +/* Specific function to copy a queue. 
+ * Copies only the user-visible part of a queue (which is in + * a struct dn_flow), and sets len accordingly. + */ +static int +copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + int len = sizeof(struct dn_flow); /* see above comment */ + + if (have < len || o->len == 0 || o->type != DN_QUEUE) { + D("ERROR type %d %s %d have %d need %d", + o->type, msg, i, have, len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, len); + bcopy(_o, *start, len); + ((struct dn_id*)(*start))->len = len; + *start += len; + return 0; +} + +static int +copy_q_cb(void *obj, void *arg) +{ + struct dn_queue *q = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ + ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); + return 0; +} + +static int +copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + if (!fs->qht) + return 0; + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, copy_q_cb, a); + else + copy_q_cb(fs->qht, a); + return 0; +} + +/* + * This routine only copies the initial part of a profile ? 
XXX + */ +static int +copy_profile(struct copy_args *a, struct dn_profile *p) +{ + int have = a->end - *a->start; + /* XXX start with base length */ + int profile_len = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); + + if (p == NULL) + return 0; + profile_len += p->samples_no * sizeof(int); /* add actual samples */ + if (have < profile_len) { + D("error have %d need %d", have, profile_len); + return 1; + } + bcopy(p, *a->start, profile_len); + ((struct dn_id *)(*a->start))->len = profile_len; + *a->start += profile_len; + return 0; +} + +static int +copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + struct dn_fs *ufs = (struct dn_fs *)(*a->start); + if (!fs) + return 0; + ND("flowset %d", fs->fs.fs_nr); + if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) + return DNHT_SCAN_END; + ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? + dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); + if (flags) { /* copy queues */ + copy_q(a, fs, 0); + } + return 0; +} + +static int +copy_si_cb(void *obj, void *arg) +{ + struct dn_sch_inst *si = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj(a->start, a->end, &si->ni, "inst", + si->sched->sch.sched_nr)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ + ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); + return 0; +} + +static int +copy_si(struct copy_args *a, struct dn_schk *s, int flags) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, copy_si_cb, a); + else if (s->siht) + copy_si_cb(s->siht, a); + return 0; +} + +/* + * compute a list of children of a scheduler and copy up + */ +static int +copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) +{ + struct dn_fsk *fs; + struct dn_id *o; + uint32_t *p; + + int n = 0, space = sizeof(*o); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs->fs.fs_nr < DN_MAX_ID) + n++; + } + space += n * sizeof(uint32_t); + DX(3, "sched %d has 
%d flowsets", s->sch.sched_nr, n); + if (a->end - *(a->start) < space) + return DNHT_SCAN_END; + o = (struct dn_id *)(*(a->start)); + o->len = space; + *a->start += o->len; + o->type = DN_TEXT; + p = (uint32_t *)(o+1); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) + if (fs->fs.fs_nr < DN_MAX_ID) + *p++ = fs->fs.fs_nr; + return 0; +} + +static int +copy_data_helper(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + uint32_t *r = a->extra->r; /* start of first range */ + uint32_t *lim; /* first invalid pointer */ + int n; + + lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); + + if (a->type == DN_LINK || a->type == DN_SCH) { + /* pipe|sched show, we receive a dn_schk */ + struct dn_schk *s = _o; + + n = s->sch.sched_nr; + if (a->type == DN_SCH && n >= DN_MAX_ID) + return 0; /* not a scheduler */ + if (a->type == DN_LINK && n <= DN_MAX_ID) + return 0; /* not a pipe */ + + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + /* Found a valid entry, copy and we are done */ + if (a->flags & DN_C_LINK) { + if (copy_obj(a->start, a->end, + &s->link, "link", n)) + return DNHT_SCAN_END; + if (copy_profile(a, s->profile)) + return DNHT_SCAN_END | DNHT_COPY_ERR; + if (copy_flowset(a, s->fs, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_SCH) { + if (copy_obj(a->start, a->end, + &s->sch, "sched", n)) + return DNHT_SCAN_END | DNHT_COPY_ERR; + /* list all attached flowsets */ + if (copy_fsk_list(a, s, 0)) + return DNHT_SCAN_END | DNHT_COPY_ERR; + } + if (a->flags & DN_C_FLOW) + copy_si(a, s, 0); + break; + } + } else if (a->type == DN_FS) { + /* queue show, skip internal flowsets */ + struct dn_fsk *fs = _o; + + n = fs->fs.fs_nr; + if (n >= DN_MAX_ID) + return 0; + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + if (copy_flowset(a, fs, 0)) + return DNHT_SCAN_END | DNHT_COPY_ERR; + copy_q(a, fs, 0); + break; /* we are done 
*/ + } + } + return 0; +} + +static inline struct dn_schk * +locate_scheduler(int i) +{ + return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); +} + +/* + * red parameters are in fixed point arithmetic. + */ +static int +config_red(struct dn_fsk *fs) +{ + int64_t s, idle, weight, w0; + int t, i; + + fs->w_q = fs->fs.w_q; + fs->max_p = fs->fs.max_p; + ND("called"); + /* Doing stuff that was in userland */ + i = fs->sched->link.bandwidth; + s = (i <= 0) ? 0 : + hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; + + idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ + fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); + /* fs->lookup_step not scaled, */ + if (!fs->lookup_step) + fs->lookup_step = 1; + w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled + + for (t = fs->lookup_step; t > 1; --t) + weight = SCALE_MUL(weight, w0); + fs->lookup_weight = (int)(weight); // scaled + + /* Now doing stuff that was in kerneland */ + fs->min_th = SCALE(fs->fs.min_th); + fs->max_th = SCALE(fs->fs.max_th); + + if (fs->fs.max_th == fs->fs.min_th) + fs->c_1 = fs->max_p; + else + fs->c_1 = SCALE((int64_t)(fs->max_p)) / (fs->fs.max_th - fs->fs.min_th); + fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); + + if (fs->fs.flags & DN_IS_GENTLE_RED) { + fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; + fs->c_4 = SCALE(1) - 2 * fs->max_p; + } + + /* If the lookup table already exist, free and create it again. 
*/ + if (fs->w_q_lookup) { + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + } + if (dn_cfg.red_lookup_depth == 0) { + printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" + "must be > 0\n"); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; + return (EINVAL); + } + fs->lookup_depth = dn_cfg.red_lookup_depth; + fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), + M_DUMMYNET, M_NOWAIT); + if (fs->w_q_lookup == NULL) { + printf("dummynet: sorry, cannot allocate red lookup table\n"); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; + return(ENOSPC); + } + + /* Fill the lookup table with (1 - w_q)^x */ + fs->w_q_lookup[0] = SCALE(1) - fs->w_q; + + for (i = 1; i < fs->lookup_depth; i++) + fs->w_q_lookup[i] = + SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); + + if (dn_cfg.red_avg_pkt_size < 1) + dn_cfg.red_avg_pkt_size = 512; + fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; + if (dn_cfg.red_max_pkt_size < 1) + dn_cfg.red_max_pkt_size = 1500; + fs->max_pkt_size = dn_cfg.red_max_pkt_size; + ND("exit"); + return 0; +} + +/* Scan all flowset attached to this scheduler and update red */ +static void +update_red(struct dn_schk *s) +{ + struct dn_fsk *fs; + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs && (fs->fs.flags & DN_IS_RED)) + config_red(fs); + } +} + +/* attach flowset to scheduler s, possibly requeue */ +static void +fsk_attach(struct dn_fsk *fs, struct dn_schk *s) +{ + ND("remove fs %d from fsunlinked, link to sched %d", + fs->fs.fs_nr, s->sch.sched_nr); + SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); + fs->sched = s; + SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); + if (s->fp->new_fsk) + s->fp->new_fsk(fs); + /* XXX compute fsk_mask */ + fs->fsk_mask = fs->fs.flow_mask; + if (fs->sched->sch.flags & DN_HAVE_MASK) + flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); + if (fs->qht) { + /* + * we must drain qht according to the old + * type, and reinsert according to the new one. 
+ * The requeue is complex -- in general we need to + * reclassify every single packet. + * For the time being, let's hope qht is never set + * when we reach this point. + */ + D("XXX TODO requeue from fs %d to sch %d", + fs->fs.fs_nr, s->sch.sched_nr); + fs->qht = NULL; + } + /* set the new type for qht */ + if (nonzero_mask(&fs->fsk_mask)) + fs->fs.flags |= DN_QHT_HASH; + else + fs->fs.flags &= ~DN_QHT_HASH; + + /* XXX config_red() can fail... */ + if (fs->fs.flags & DN_IS_RED) + config_red(fs); +} + +/* update all flowsets which may refer to this scheduler */ +static void +update_fs(struct dn_schk *s) +{ + struct dn_fsk *fs, *tmp; + + SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { + if (s->sch.sched_nr != fs->fs.sched_nr) { + D("fs %d for sch %d not %d still unlinked", + fs->fs.fs_nr, fs->fs.sched_nr, + s->sch.sched_nr); + continue; + } + fsk_attach(fs, s); + } +} + +/* + * Configuration -- to preserve backward compatibility we use + * the following scheme (N is 65536) + * NUMBER SCHED LINK FLOWSET + * 1 .. N-1 (1)WFQ (2)WFQ (3)queue + * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 + * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 + * + * "pipe i config" configures #1, #2 and #3 + * "sched i config" configures #1 and possibly #6 + * "queue i config" configures #3 + * #1 is configured with 'pipe i config' or 'sched i config' + * #2 is configured with 'pipe i config', and created if not + * existing with 'sched i config' + * #3 is configured with 'queue i config' + * #4 is automatically configured after #1, can only be FIFO + * #5 is automatically configured after #2 + * #6 is automatically created when #1 is !MULTIQUEUE, + * and can be updated. 
+ * #7 is automatically configured after #2 + */ + +/* + * configure a link (and its FIFO instance) + */ +static int +config_link(struct dn_link *p, struct dn_id *arg) +{ + int i; + + if (p->oid.len != sizeof(*p)) { + D("invalid pipe len %d", p->oid.len); + return EINVAL; + } + i = p->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* + * The config program passes parameters as follows: + * bw = bits/second (0 means no limits), + * delay = ms, must be translated into ticks. + * qsize = slots/bytes + * burst ??? + */ + p->delay = (p->delay * hz) / 1000; + /* Scale burst size: bytes -> bits * hz */ + p->burst *= 8 * hz; + + DN_BH_WLOCK(); + /* do it twice, base link and FIFO link */ + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + struct dn_schk *s = locate_scheduler(i); + if (s == NULL) { + DN_BH_WUNLOCK(); + D("sched %d not found", i); + return EINVAL; + } + /* remove profile if exists */ + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + /* copy all parameters */ + s->link.oid = p->oid; + s->link.link_nr = i; + s->link.delay = p->delay; + if (s->link.bandwidth != p->bandwidth) { + /* XXX bandwidth changes, need to update red params */ + s->link.bandwidth = p->bandwidth; + update_red(s); + } + s->link.burst = p->burst; + schk_reset_credit(s); + } + dn_cfg.id++; + DN_BH_WUNLOCK(); + return 0; +} + +/* + * configure a flowset. 
Can be called from inside with locked=1, + */ +static struct dn_fsk * +config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) +{ + int i; + struct dn_fsk *fs; + + if (nfs->oid.len != sizeof(*nfs)) { + D("invalid flowset len %d", nfs->oid.len); + return NULL; + } + i = nfs->fs_nr; + if (i <= 0 || i >= 3*DN_MAX_ID) + return NULL; + ND("flowset %d", i); + /* XXX other sanity checks */ + if (nfs->flags & DN_QSIZE_BYTES) { + ipdn_bound_var(&nfs->qsize, 16384, + 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); + } else { + ipdn_bound_var(&nfs->qsize, 50, + 1, dn_cfg.slot_limit, NULL); // "queue slot size"); + } + if (nfs->flags & DN_HAVE_MASK) { + /* make sure we have some buckets */ + ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "flowset buckets"); + } else { + nfs->buckets = 1; /* we only need 1 */ + } + if (!locked) + DN_BH_WLOCK(); + do { /* exit with break when done */ + struct dn_schk *s; + int flags = nfs->sched_nr ? DNHT_INSERT : 0; + int j; + int oldc = dn_cfg.fsk_count; + fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); + if (fs == NULL) { + D("missing sched for flowset %d", i); + break; + } + /* grab some defaults from the existing one */ + if (nfs->sched_nr == 0) /* reuse */ + nfs->sched_nr = fs->fs.sched_nr; + for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { + if (nfs->par[j] == -1) /* reuse */ + nfs->par[j] = fs->fs.par[j]; + } + if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { + ND("flowset %d unchanged", i); + break; /* no change, nothing to do */ + } + if (oldc != dn_cfg.fsk_count) /* new item */ + dn_cfg.id++; + s = locate_scheduler(nfs->sched_nr); + /* detach from old scheduler if needed, preserving + * queues if we need to reattach. Then update the + * configuration, and possibly attach to the new sched. + */ + DX(2, "fs %d changed sched %d@%p to %d@%p", + fs->fs.fs_nr, + fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); + if (fs->sched) { + int flags = s ? 
DN_DETACH : (DN_DETACH | DN_DESTROY); + flags |= DN_DESTROY; /* XXX temporary */ + fsk_detach(fs, flags); + } + fs->fs = *nfs; /* copy configuration */ + if (s != NULL) + fsk_attach(fs, s); + } while (0); + if (!locked) + DN_BH_WUNLOCK(); + return fs; +} + +/* + * config/reconfig a scheduler and its FIFO variant. + * For !MULTIQUEUE schedulers, also set up the flowset. + * + * On reconfigurations (detected because s->fp is set), + * detach existing flowsets preserving traffic, preserve link, + * and delete the old scheduler creating a new one. + */ +static int +config_sched(struct dn_sch *_nsch, struct dn_id *arg) +{ + struct dn_schk *s; + struct schk_new_arg a; /* argument for schk_new */ + int i; + struct dn_link p; /* copy of oldlink */ + struct dn_profile *pf = NULL; /* copy of old link profile */ + /* Used to preserv mask parameter */ + struct ipfw_flow_id new_mask; + int new_buckets = 0; + int new_flags = 0; + int pipe_cmd; + int err = ENOMEM; + + a.sch = _nsch; + if (a.sch->oid.len != sizeof(*a.sch)) { + D("bad sched len %d", a.sch->oid.len); + return EINVAL; + } + i = a.sch->sched_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* make sure we have some buckets */ + if (a.sch->flags & DN_HAVE_MASK) + ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "sched buckets"); + /* XXX other sanity checks */ + bzero(&p, sizeof(p)); + + pipe_cmd = a.sch->flags & DN_PIPE_CMD; + a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? + if (pipe_cmd) { + /* Copy mask parameter */ + new_mask = a.sch->sched_mask; + new_buckets = a.sch->buckets; + new_flags = a.sch->flags; + } + DN_BH_WLOCK(); +again: /* run twice, for wfq and fifo */ + /* + * lookup the type. If not supplied, use the previous one + * or default to WF2Q+. Otherwise, return an error. + */ + dn_cfg.id++; + a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); + if (a.fp != NULL) { + /* found. 
Lookup or create entry */ + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); + } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { + /* No type. search existing s* or retry with WF2Q+ */ + s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); + if (s != NULL) { + a.fp = s->fp; + /* Scheduler exists, skip to FIFO scheduler + * if command was pipe config... + */ + if (pipe_cmd) + goto next; + } else { + /* New scheduler, create a wf2q+ with no mask + * if command was pipe config... + */ + if (pipe_cmd) { + /* clear mask parameter */ + bzero(&a.sch->sched_mask, sizeof(new_mask)); + a.sch->buckets = 0; + a.sch->flags &= ~DN_HAVE_MASK; + } + a.sch->oid.subtype = DN_SCHED_WF2QP; + goto again; + } + } else { + D("invalid scheduler type %d %s", + a.sch->oid.subtype, a.sch->name); + err = EINVAL; + goto error; + } + /* normalize name and subtype */ + a.sch->oid.subtype = a.fp->type; + bzero(a.sch->name, sizeof(a.sch->name)); + strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); + if (s == NULL) { + D("cannot allocate scheduler %d", i); + goto error; + } + /* restore existing link if any */ + if (p.link_nr) { + s->link = p; + if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ + s->profile = NULL; /* XXX maybe not needed */ + } else { + s->profile = malloc(sizeof(struct dn_profile), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("cannot allocate profile"); + goto error; //XXX + } + bcopy(pf, s->profile, sizeof(*pf)); + } + } + p.link_nr = 0; + if (s->fp == NULL) { + DX(2, "sched %d new type %s", i, a.fp->name); + } else if (s->fp != a.fp || + bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { + /* already existing. 
*/ + DX(2, "sched %d type changed from %s to %s", + i, s->fp->name, a.fp->name); + DX(4, " type/sub %d/%d -> %d/%d", + s->sch.oid.type, s->sch.oid.subtype, + a.sch->oid.type, a.sch->oid.subtype); + if (s->link.link_nr == 0) + D("XXX WARNING link 0 for sched %d", i); + p = s->link; /* preserve link */ + if (s->profile) {/* preserve profile */ + if (!pf) + pf = malloc(sizeof(*pf), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (pf) /* XXX should issue a warning otherwise */ + bcopy(s->profile, pf, sizeof(*pf)); + } + /* remove from the hash */ + dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + /* Detach flowsets, preserve queues. */ + // schk_delete_cb(s, NULL); + // XXX temporarily, kill queues + schk_delete_cb(s, (void *)DN_DESTROY); + goto again; + } else { + DX(4, "sched %d unchanged type %s", i, a.fp->name); + } + /* complete initialization */ + s->sch = *a.sch; + s->fp = a.fp; + s->cfg = arg; + // XXX schk_reset_credit(s); + /* create the internal flowset if needed, + * trying to reuse existing ones if available + */ + if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { + s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); + if (!s->fs) { + struct dn_fs fs; + bzero(&fs, sizeof(fs)); + set_oid(&fs.oid, DN_FS, sizeof(fs)); + fs.fs_nr = i + DN_MAX_ID; + fs.sched_nr = i; + s->fs = config_fs(&fs, NULL, 1 /* locked */); + } + if (!s->fs) { + schk_delete_cb(s, (void *)DN_DESTROY); + D("error creating internal fs for %d", i); + goto error; + } + } + /* call init function after the flowset is created */ + if (s->fp->config) + s->fp->config(s); + update_fs(s); +next: + if (i < DN_MAX_ID) { /* now configure the FIFO instance */ + i += DN_MAX_ID; + if (pipe_cmd) { + /* Restore mask parameter for FIFO */ + a.sch->sched_mask = new_mask; + a.sch->buckets = new_buckets; + a.sch->flags = new_flags; + } else { + /* sched config shouldn't modify the FIFO scheduler */ + if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { + /* FIFO already exist, don't touch it */ + err = 0; /* and this is 
not an error */ + goto error; + } + } + a.sch->sched_nr = i; + a.sch->oid.subtype = DN_SCHED_FIFO; + bzero(a.sch->name, sizeof(a.sch->name)); + goto again; + } + err = 0; +error: + DN_BH_WUNLOCK(); + if (pf) + free(pf, M_DUMMYNET); + return err; +} + +/* + * attach a profile to a link + */ +static int +config_profile(struct dn_profile *pf, struct dn_id *arg) +{ + struct dn_schk *s; + int i, olen, err = 0; + + if (pf->oid.len < sizeof(*pf)) { + D("short profile len %d", pf->oid.len); + return EINVAL; + } + i = pf->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* XXX other sanity checks */ + DN_BH_WLOCK(); + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + struct dn_profile **pkpf, *kpf; + + s = locate_scheduler(i); + + if (s == NULL) { + err = EINVAL; + break; + } + dn_cfg.id++; + pkpf = &s->profile; /* prepare to handle multiple profiles */ + kpf = *pkpf; + + /* + * If we had a profile and the new one does not fit, + * or it is deleted, then we need to free memory. + */ + if (kpf && (pf->samples_no == 0 || + kpf->oid.len < pf->oid.len)) { + free(kpf, M_DUMMYNET); + *pkpf = NULL; + } + if (pf->samples_no == 0) + continue; + /* + * new profile, possibly allocate memory + * and copy data. 
+ */ + if (kpf == NULL) + *pkpf = kpf = malloc(pf->oid.len, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (kpf == NULL) { + D("no memory for profile %d", i); + err = ENOMEM; + break; + } + /* preserve larger length XXX double check */ + olen = kpf->oid.len; + if (olen < pf->oid.len) + olen = pf->oid.len; + bcopy(pf, kpf, pf->oid.len); + kpf->oid.len = olen; + } + DN_BH_WUNLOCK(); + return err; +} + +/* + * Delete all objects: + */ +static void +dummynet_flush(void) +{ + + /* delete all schedulers and related links/queues/flowsets */ + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, + (void *)(uintptr_t)DN_DELETE_FS); + /* delete all remaining (unlinked) flowsets */ + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); + /* Reinitialize system heap... */ + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); +} + +/* + * Main handler for configuration. We are guaranteed to be called + * with an oid which is at least a dn_id. + * - the first object is the command (config, delete, flush, ...) + * - config_link must be issued after the corresponding config_sched + * - parameters (DN_TXT) for an object must preceed the object + * processed on a config_sched. + */ +int +do_config(void *p, int l) +{ + struct dn_id *next, *o; + int err = 0, err2 = 0; + struct dn_id *arg = NULL; + uintptr_t *a; + + o = p; + if (o->id != DN_API_VERSION) { + D("invalid api version got %d need %d", + o->id, DN_API_VERSION); + return EINVAL; + } + for (; l >= sizeof(*o); o = next) { + struct dn_id *prev = arg; + if (o->len < sizeof(*o) || l < o->len) { + D("bad len o->len %d len %d", o->len, l); + err = EINVAL; + break; + } + l -= o->len; + next = (struct dn_id *)((char *)o + o->len); + err = 0; + switch (o->type) { + default: + D("cmd %d not implemented", o->type); + break; + +#ifdef EMULATE_SYSCTL + /* sysctl emulation. 
+ * if we recognize the command, jump to the correct + * handler and return + */ + case DN_SYSCTL_SET: + err = kesysctl_emu_set(p, l); + return err; +#endif + + case DN_CMD_CONFIG: /* simply a header */ + break; + + case DN_CMD_DELETE: + /* the argument is in the first uintptr_t after o */ + a = (uintptr_t *)(o+1); + if (o->len < sizeof(*o) + sizeof(*a)) { + err = EINVAL; + break; + } + switch (o->subtype) { + case DN_LINK: + /* delete base and derived schedulers */ + DN_BH_WLOCK(); + err = delete_schk(*a); + err2 = delete_schk(*a + DN_MAX_ID); + DN_BH_WUNLOCK(); + if (!err) + err = err2; + break; + + default: + D("invalid delete type %d", + o->subtype); + err = EINVAL; + break; + + case DN_FS: + err = (*a <1 || *a >= DN_MAX_ID) ? + EINVAL : delete_fs(*a, 0) ; + break; + } + break; + + case DN_CMD_FLUSH: + DN_BH_WLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); + break; + case DN_TEXT: /* store argument the next block */ + prev = NULL; + arg = o; + break; + case DN_LINK: + err = config_link((struct dn_link *)o, arg); + break; + case DN_PROFILE: + err = config_profile((struct dn_profile *)o, arg); + break; + case DN_SCH: + err = config_sched((struct dn_sch *)o, arg); + break; + case DN_FS: + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); + break; + } + if (prev) + arg = NULL; + if (err != 0) + break; + } + return err; +} + +static int +compute_space(struct dn_id *cmd, struct copy_args *a) +{ + int x = 0, need = 0; + int profile_size = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); + /* note, this may be short */ + + /* NOTE about compute space: + * NP = dn_cfg.schk_count + * NSI = dn_cfg.si_count + * NF = dn_cfg.fsk_count + * NQ = dn_cfg.queue_count + * - ipfw pipe show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI)*(dn_flow) all scheduler instance (includes + * the queue instance) + * - ipfw sched show + * (NP/2)*(dn_link + 
dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI * dn_flow) all scheduler instances + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler + * (NQ * dn_queue) all queue [XXXfor now not listed] + * - ipfw queue show + * (NF * dn_fs) all flowset + * (NQ * dn_queue) all queues + */ + switch (cmd->subtype) { + default: + return -1; + /* XXX where do LINK and SCH differ ? */ + /* 'ipfw sched show' could list all queues associated to + * a scheduler. This feature for now is disabled + */ + case DN_LINK: /* pipe show */ + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + break; + case DN_SCH: /* sched show */ + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; + break; + case DN_FS: /* queue show */ + x = DN_C_FS | DN_C_QUEUE; + break; + case DN_GET_COMPAT: /* compatibility mode */ + need = dn_compat_calc_size(); + break; + } + a->flags = x; + if (x & DN_C_SCH) { + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; + /* NOT also, each fs might be attached to a sched */ + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; + } + if (x & DN_C_FS) + need += dn_cfg.fsk_count * sizeof(struct dn_fs); + if (x & DN_C_LINK) { + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; + } + /* + * When exporting a queue to userland, only pass up the + * struct dn_flow, which is the only visible part. + */ + + if (x & DN_C_QUEUE) + need += dn_cfg.queue_count * sizeof(struct dn_flow); + if (x & DN_C_FLOW) + need += dn_cfg.si_count * (sizeof(struct dn_flow)); + return need; +} + +/* + * If compat != NULL dummynet_get is called in compatibility mode. 
+ * *compat will be the pointer to the buffer to pass to ipfw + */ +int +dummynet_get(struct sockopt *sopt, void **compat) +{ + int have, i, need, error; + char *start = NULL, *buf; + size_t sopt_valsize; + struct dn_id *cmd; + struct copy_args a; + struct copy_range r; + int l = sizeof(struct dn_id); + + bzero(&a, sizeof(a)); + bzero(&r, sizeof(r)); + + /* save and restore original sopt_valsize around copyin */ + sopt_valsize = sopt->sopt_valsize; + + cmd = &r.o; + + if (!compat) { + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); +#endif + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_WAITOK); + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + } + } else { /* compatibility */ + error = 0; + cmd->type = DN_CMD_GET; + cmd->len = sizeof(struct dn_id); + cmd->subtype = DN_GET_COMPAT; + // cmd->id = sopt_valsize; + D("compatibility mode"); + } + a.extra = (struct copy_range *)cmd; + if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ + uint32_t *rp = (uint32_t *)(cmd + 1); + cmd->len += 2* sizeof(uint32_t); + rp[0] = 1; + rp[1] = DN_MAX_ID - 1; + if (cmd->subtype == DN_LINK) { + rp[0] += DN_MAX_ID; + rp[1] += DN_MAX_ID; + } + } + /* Count space (under lock) and allocate (outside lock). + * Exit with lock held if we manage to get enough buffer. + * Try a few times then give up. 
+ */ + for (have = 0, i = 0; i < 10; i++) { + DN_BH_WLOCK(); + need = compute_space(cmd, &a); + + /* if there is a range, ignore value from compute_space() */ + if (l > sizeof(*cmd)) + need = sopt_valsize - sizeof(*cmd); + + if (need < 0) { + DN_BH_WUNLOCK(); + error = EINVAL; + goto done; + } + need += sizeof(*cmd); + cmd->id = need; + if (have >= need) + break; + + DN_BH_WUNLOCK(); + if (start) + free(start, M_DUMMYNET); + start = NULL; + if (need > sopt_valsize) + break; + + have = need; + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); + } + + if (start == NULL) { + if (compat) { + *compat = NULL; + error = 1; // XXX + } else { + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); + } + goto done; + } + ND("have %d:%lu sched %d, %d:%lu links %d, %d:%lu flowsets %d, " + "%d:%lu si %d, %d:%lu queues %d", + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); + sopt->sopt_valsize = sopt_valsize; + a.type = cmd->subtype; + + if (compat == NULL) { + bcopy(cmd, start, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); + } else + buf = start; + a.start = &buf; + a.end = start + have; + /* start copying other objects */ + /* XXX set error in case of no space */ + if (compat) { + a.type = DN_COMPAT_PIPE; + error = dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); + a.type = DN_COMPAT_QUEUE; + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); + } else if (a.type == DN_FS) { + error = dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); + } else { + error = dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); + } + DN_BH_WUNLOCK(); + if (error < 0) { + error = 0; /* we skip the sooptcopyout so we fail, hopefully */ + goto done; + } else { + error = 0; /* all fine */ + } + + if (compat) { + *compat = start; 
+ sopt->sopt_valsize = buf - start; + /* free() is done by ip_dummynet_compat() */ + start = NULL; //XXX hack + } else { + error = sooptcopyout(sopt, start, buf - start); + } +done: + if (cmd && cmd != &r.o) + free(cmd, M_DUMMYNET); + if (start) + free(start, M_DUMMYNET); + return error; +} + +/* Callback called on scheduler instance to delete it if idle */ +static int +drain_scheduler_cb(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) + return 0; + + if (si->sched->fp->flags & DN_MULTIQUEUE) { + if (si->q_count == 0) + return si_destroy(si, NULL); + else + return 0; + } else { /* !DN_MULTIQUEUE */ + if ((si+1)->ni.length == 0) + return si_destroy(si, NULL); + else + return 0; + } + return 0; /* unreachable */ +} + +/* Callback called on scheduler to check if it has instances */ +static int +drain_scheduler_sch_cb(void *_s, void *arg) +{ + struct dn_schk *s = _s; + + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan_bucket(s->siht, &s->drain_bucket, + drain_scheduler_cb, NULL); + s->drain_bucket++; + } else { + if (s->siht) { + if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) + s->siht = NULL; + } + } + return 0; +} + +/* Called every tick, try to delete a 'bucket' of scheduler */ +void +dn_drain_scheduler(void) +{ + dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, + drain_scheduler_sch_cb, NULL); + dn_cfg.drain_sch++; +} + +/* Callback called on queue to delete if it is idle */ +static int +drain_queue_cb(void *_q, void *arg) +{ + struct dn_queue *q = _q; + + if (q->ni.length == 0) { + dn_delete_queue(q, DN_DESTROY); + return DNHT_SCAN_DEL; /* queue is deleted */ + } + + return 0; /* queue isn't deleted */ +} + +/* Callback called on flowset used to check if it has queues */ +static int +drain_queue_fs_cb(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; + + if (fs->fs.flags & DN_QHT_HASH) { + /* Flowset has a hash table for queues */ + dn_ht_scan_bucket(fs->qht, 
&fs->drain_bucket, + drain_queue_cb, NULL); + fs->drain_bucket++; + } else { + /* No hash table for this flowset, null the pointer + * if the queue is deleted + */ + if (fs->qht) { + if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) + fs->qht = NULL; + } + } + return 0; +} + +/* Called every tick, try to delete a 'bucket' of queue */ +void +dn_drain_queue(void) +{ + /* scan a bucket of flowset */ + dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, + drain_queue_fs_cb, NULL); + dn_cfg.drain_fs++; +} + +/* + * Handler for the various dummynet socket options + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + void *p = NULL; + int error, l; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); + if (error) + return (error); + + /* Disallow sets in really-really secure mode. */ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + switch (sopt->sopt_name) { + default : + D("dummynet: unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ + case IP_DUMMYNET_GET: + D("dummynet: compat option %d", sopt->sopt_name); + error = ip_dummynet_compat(sopt); + break; + + case IP_DUMMYNET3 : + if (sopt->sopt_dir == SOPT_GET) { + error = dummynet_get(sopt, NULL); + break; + } + l = sopt->sopt_valsize; + /* XXX bumped size to 16000 for 3 profiles */ + if (l < sizeof(struct dn_id) || l > 16000) { + D("argument len %d invalid", l); + break; + } + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? + error = sooptcopyin(sopt, p, l, l); + if (error) + break ; + error = do_config(p, l); + break; + } + + if (p != NULL) + free(p, M_TEMP); + + return error ; +} + + +static void +ip_dn_init(void) +{ + if (dn_cfg.init_done) + return; + printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); + dn_cfg.init_done = 1; + /* Set defaults here. 
MSVC does not accept initializers, + * and this is also useful for vimages + */ + /* queue limits */ + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ + dn_cfg.byte_limit = 1024 * 1024; + dn_cfg.expire = 1; + + /* RED parameters */ + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ + + /* hash tables */ + dn_cfg.max_hash_size = 65536; /* max in the hash tables */ + dn_cfg.hash_size = 64; /* default hash size */ + + /* create hash tables for schedulers and flowsets. + * In both we search by key and by pointer. + */ + dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_schk, schk_next), + schk_hash, schk_match, schk_new); + dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_fsk, fsk_next), + fsk_hash, fsk_match, fsk_new); + + /* bucket index to drain object */ + dn_cfg.drain_fs = 0; + dn_cfg.drain_sch = 0; + + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); + SLIST_INIT(&dn_cfg.fsu); + SLIST_INIT(&dn_cfg.schedlist); + + DN_LOCK_INIT(); + + TASK_INIT(&dn_task, 0, dummynet_task, curvnet); + dn_tq = taskqueue_create_fast("dummynet", M_WAITOK, + taskqueue_thread_enqueue, &dn_tq); + taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); + + callout_init(&dn_timeout, CALLOUT_MPSAFE); + dn_reschedule(); + + /* Initialize curr_time adjustment mechanics. 
*/ + getmicrouptime(&dn_cfg.prev_t); +} + +static void +ip_dn_destroy(int last) +{ + callout_drain(&dn_timeout); + + DN_BH_WLOCK(); + if (last) { + ND("removing last instance\n"); + ip_dn_ctl_ptr = NULL; + ip_dn_io_ptr = NULL; + } + + dummynet_flush(); + DN_BH_WUNLOCK(); + taskqueue_drain(dn_tq, &dn_task); + taskqueue_free(dn_tq); + + dn_ht_free(dn_cfg.schedhash, 0); + dn_ht_free(dn_cfg.fshash, 0); + heap_free(&dn_cfg.evheap); + + DN_LOCK_DESTROY(); +} + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + + if (type == MOD_LOAD) { + if (ip_dn_io_ptr) { + printf("DUMMYNET already loaded\n"); + return EEXIST ; + } + ip_dn_init(); + ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; + return 0; + } else if (type == MOD_UNLOAD) { + ip_dn_destroy(1 /* last */); + return 0; + } else + return EOPNOTSUPP; +} + +/* modevent helpers for the modules */ +static int +load_dn_sched(struct dn_alg *d) +{ + struct dn_alg *s; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if scheduler already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { + if (strcmp(s->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* scheduler already exists */ + } + } + if (s == NULL) + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); + DN_BH_WUNLOCK(); + D("dn_sched %s %sloaded", d->name, s ? "not ":""); + return s ? 1 : 0; +} + +static int +unload_dn_sched(struct dn_alg *s) +{ + struct dn_alg *tmp, *r; + int err = EINVAL; + + ND("called for %s", s->name); + + DN_BH_WLOCK(); + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { + if (strcmp(s->name, r->name) != 0) + continue; + ND("ref_count = %d", r->ref_count); + err = (r->ref_count != 0) ? 
EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); + break; + } + DN_BH_WUNLOCK(); + D("dn_sched %s %sunloaded", s->name, err ? "not ":""); + return err; +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_alg *sch = arg; + + if (cmd == MOD_LOAD) + return load_dn_sched(sch); + else if (cmd == MOD_UNLOAD) + return unload_dn_sched(sch); + else + return EINVAL; +} + +static moduledata_t dummynet_mod = { + "dummynet", dummynet_modevent, NULL +}; + +#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN +#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ +DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); +MODULE_DEPEND(dummynet, ipfw, 3, 3, 3); +MODULE_VERSION(dummynet, 3); + +/* + * Starting up. Done in order after dummynet_modevent() has been called. + * VNET_SYSINIT is also called for each existing vnet and each new vnet. + */ +//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); + +/* + * Shutdown handlers up shop. These are done in REVERSE ORDER, but still + * after dummynet_modevent() has been called. Not called on reboot. + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. + * or when the module is unloaded. + */ +//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); + +/* end of file */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw2.c b/example/ipfw/sys/netpfil/ipfw/ip_fw2.c new file mode 100644 index 0000000..7e94502 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw2.c @@ -0,0 +1,2905 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw2.c 272840 2014-10-09 19:32:35Z melifaro $"); + +/* + * The FreeBSD IP packet firewall, main file + */ + +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_inet.h" +#ifndef INET +#error "IPFIREWALL requires INET" +#endif /* INET */ +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/condvar.h> +#include <sys/counter.h> +#include <sys/eventhandler.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/jail.h> +#include <sys/module.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/ucred.h> +#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/if.h> +#include <net/if_var.h> +#include <net/route.h> +#include 
<net/pfil.h> +#include <net/vnet.h> + +#include <netpfil/pf/pf_mtag.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_carp.h> +#include <netinet/pim.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> +#include <netinet/sctp.h> + +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#include <netinet6/scope6_var.h> +#include <netinet6/ip6_var.h> +#endif + +#include <netpfil/ipfw/ip_fw_private.h> + +#include <machine/in_cksum.h> /* XXX for in_cksum */ + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +/* + * static variables followed by global ones. + * All ipfw global variables are here. + */ + +static VNET_DEFINE(int, fw_deny_unknown_exthdrs); +#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) + +static VNET_DEFINE(int, fw_permit_single_frag6) = 1; +#define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) + +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT +static int default_to_accept = 1; +#else +static int default_to_accept; +#endif + +VNET_DEFINE(int, autoinc_step); +VNET_DEFINE(int, fw_one_pass) = 1; + +VNET_DEFINE(unsigned int, fw_tables_max); +VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ +/* Use 128 tables by default */ +static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; + +#ifndef LINEAR_SKIPTO +static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, + int tablearg, int jump_backwards); +#define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) +#else +static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, + int tablearg, int jump_backwards); +#define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) +#endif + +/* + * Each rule belongs to one of 32 different sets (0..31). 
+ * The variable set_disable contains one bit per set. + * If the bit is set, all rules in the corresponding set + * are disabled. Set RESVD_SET(31) is reserved for the default rule + * and rules that are not deleted by the flush command, + * and CANNOT be disabled. + * Rules in set RESVD_SET can only be deleted individually. + */ +VNET_DEFINE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DEFINE(int, fw_verbose); +/* counter for ipfw_log(NULL...) */ +VNET_DEFINE(u_int64_t, norule_counter); +VNET_DEFINE(int, verbose_limit); + +/* layer3_chain contains the list of rules for layer 3 */ +VNET_DEFINE(struct ip_fw_chain, layer3_chain); + +/* ipfw_vnet_ready controls when we are open for business */ +VNET_DEFINE(int, ipfw_vnet_ready) = 0; + +VNET_DEFINE(int, ipfw_nat_ready) = 0; + +ipfw_nat_t *ipfw_nat_ptr = NULL; +struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); +ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_del_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#ifdef SYSCTL_NODE +uint32_t dummy_def = IPFW_DEFAULT_RULE; +static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); +static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); + +SYSBEGIN(f3) + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, + "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, + "Rule number auto-increment step"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, + "Log matches to ipfw rules"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, + "Set upper limit of matches of ipfw rules logged"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, 
CTLFLAG_RD, + &dummy_def, 0, + "The default/max possible rule number."); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, + CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", + "Maximum number of concurrently used tables"); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, + CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", + "Use per-set namespace for tables"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, + &default_to_accept, 0, + "Make the default rule accept all packets."); +TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, + "Number of static rules"); + +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, + "Deny packets with unknown IPv6 Extension Headers"); +SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, + "Permit single packet IPv6 fragments"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + + +/* + * Some macros used in the various matching options. 
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +static __inline int +icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) +{ + int type = icmp->icmp_type; + + return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) ); +} + +#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \ + (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) ) + +static int +is_icmp_query(struct icmphdr *icmp) +{ + int type = icmp->icmp_type; + + return (type <= ICMP_MAXTYPE && (TT & (1<<type)) ); +} +#undef TT + +/* + * The following checks use two arrays of 8 or 16 bits to store the + * bits that we want set or clear, respectively. They are in the + * low and high half of cmd->arg1 or cmd->d[0]. + * + * We scan options and store the bits we find set. We succeed if + * + * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear + * + * The code is sometimes optimized not to store additional variables. 
+ */ + +static int +flags_match(ipfw_insn *cmd, u_int8_t bits) +{ + u_char want_clear; + bits = ~bits; + + if ( ((cmd->arg1 & 0xff) & bits) != 0) + return 0; /* some bits we want set were clear */ + want_clear = (cmd->arg1 >> 8) & 0xff; + if ( (want_clear & bits) != want_clear) + return 0; /* some bits we want clear were set */ + return 1; +} + +static int +ipopts_match(struct ip *ip, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(ip + 1); + int x = (ip->ip_hl << 2) - sizeof (struct ip); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[IPOPT_OPTVAL]; + + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > x) + return 0; /* invalid or truncated */ + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + bits |= IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + bits |= IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + bits |= IP_FW_IPOPT_RR; + break; + + case IPOPT_TS: + bits |= IP_FW_IPOPT_TS; + break; + } + } + return (flags_match(cmd, bits)); +} + +static int +tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(tcp + 1); + int x = (tcp->th_off << 2) - sizeof(struct tcphdr); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + bits |= IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + bits |= IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + bits |= IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + bits |= IP_FW_TCPOPT_TS; + break; + + } + } + return (flags_match(cmd, bits)); +} + +static int +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, + uint32_t *tablearg) +{ + + if (ifp == NULL) /* no iface with this 
packet, match fails */ + return (0); + + /* Check by name or by IP address */ + if (cmd->name[0] != '\0') { /* match by name */ + if (cmd->name[0] == '\1') /* use tablearg to match */ + return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0, + &ifp->if_index, tablearg); + /* Check name */ + if (cmd->p.glob) { + if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) + return(1); + } else { + if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) + return(1); + } + } else { +#if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ + struct ifaddr *ia; + + if_addr_rlock(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (cmd->p.ip.s_addr == ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) { + if_addr_runlock(ifp); + return(1); /* match */ + } + } + if_addr_runlock(ifp); +#endif /* __FreeBSD__ */ + } + return(0); /* no match, fail ... */ +} + +/* + * The verify_path function checks if a route to the src exists and + * if it is reachable via ifp (when provided). + * + * The 'verrevpath' option checks that the interface that an IP packet + * arrives on is the same interface that traffic destined for the + * packet's source address would be routed out of. + * The 'versrcreach' option just checks that the source address is + * reachable via any route (except default) in the routing table. + * These two are a measure to block forged packets. This is also + * commonly known as "anti-spoofing" or Unicast Reverse Path + * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs + * is purposely reminiscent of the Cisco IOS command, + * + * ip verify unicast reverse-path + * ip verify unicast source reachable-via any + * + * which implements the same functionality. But note that the syntax + * is misleading, and the check may be performed on all IP packets + * whether unicast, multicast, or broadcast. 
+ */ +static int +verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) +{ +#if defined(USERSPACE) || !defined(__FreeBSD__) + return 0; +#else + struct route ro; + struct sockaddr_in *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in *)&(ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = src; + in_rtalloc_ign(&ro, 0, fib); + + if (ro.ro_rt == NULL) + return 0; + + /* + * If ifp is provided, check for equality with rtentry. + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * in order to pass packets injected back by if_simloop(): + * routing entry (via lo0) for our own address + * may exist, so we need to handle routing assymetry. + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; +#endif /* __FreeBSD__ */ +} + +#ifdef INET6 +/* + * ipv6 specific rules here... 
+ */ +static __inline int +icmp6type_match (int type, ipfw_insn_u32 *cmd) +{ + return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); +} + +static int +flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) +{ + int i; + for (i=0; i <= cmd->o.arg1; ++i ) + if (curr_flow == cmd->d[i] ) + return 1; + return 0; +} + +/* support for IP6_*_ME opcodes */ +static int +search_ip6_addr_net (struct in6_addr * ip6_addr) +{ + struct ifnet *mdc; + struct ifaddr *mdc2; + struct in6_ifaddr *fdm; + struct in6_addr copia; + + TAILQ_FOREACH(mdc, &V_ifnet, if_link) { + if_addr_rlock(mdc); + TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { + if (mdc2->ifa_addr->sa_family == AF_INET6) { + fdm = (struct in6_ifaddr *)mdc2; + copia = fdm->ia_addr.sin6_addr; + /* need for leaving scope_id in the sock_addr */ + in6_clearscope(&copia); + if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { + if_addr_runlock(mdc); + return 1; + } + } + } + if_addr_runlock(mdc); + } + return 0; +} + +static int +verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) +{ + struct route_in6 ro; + struct sockaddr_in6 *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in6 * )&(ro.ro_dst); + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = *src; + + in6_rtalloc_ign(&ro, 0, fib); + if (ro.ro_rt == NULL) + return 0; + + /* + * if ifp is provided, check for equality with rtentry + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * to support the case of sending packets to an address of our own. 
+ * (where the former interface is the first argument of if_simloop() + * (=ifp), the latter is lo0) + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; + +} + +static int +is_icmp6_query(int icmp6_type) +{ + if ((icmp6_type <= ICMP6_MAXTYPE) && + (icmp6_type == ICMP6_ECHO_REQUEST || + icmp6_type == ICMP6_MEMBERSHIP_QUERY || + icmp6_type == ICMP6_WRUREQUEST || + icmp6_type == ICMP6_FQDN_QUERY || + icmp6_type == ICMP6_NI_QUERY)) + return (1); + + return (0); +} + +static void +send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) +{ + struct mbuf *m; + + m = args->m; + if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *tcp; + tcp = (struct tcphdr *)((char *)ip6 + hlen); + + if ((tcp->th_flags & TH_RST) == 0) { + struct mbuf *m0; + m0 = ipfw_send_pkt(args->m, &(args->f_id), + ntohl(tcp->th_seq), ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + if (m0 != NULL) + ip6_output(m0, NULL, NULL, 0, NULL, NULL, + NULL); + } + FREE_PKT(m); + } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ +#if 0 + /* + * Unlike above, the mbufs need to line up with the ip6 hdr, + * as the contents are read. We need to m_adj() the + * needed amount. + * The mbuf will however be thrown away so we can adjust it. + * Remember we did an m_pullup on it already so we + * can make some assumptions about contiguousness. 
+ */ + if (args->L3offset) + m_adj(m, args->L3offset); +#endif + icmp6_error(m, ICMP6_DST_UNREACH, code, 0); + } else + FREE_PKT(m); + + args->m = NULL; +} + +#endif /* INET6 */ + + +/* + * sends a reject message, consuming the mbuf passed as an argument. + */ +static void +send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) +{ + +#if 0 + /* XXX When ip is not guaranteed to be at mtod() we will + * need to account for this */ + * The mbuf will however be thrown away so we can adjust it. + * Remember we did an m_pullup on it already so we + * can make some assumptions about contiguousness. + */ + if (args->L3offset) + m_adj(m, args->L3offset); +#endif + if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ + icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); + } else if (args->f_id.proto == IPPROTO_TCP) { + struct tcphdr *const tcp = + L3HDR(struct tcphdr, mtod(args->m, struct ip *)); + if ( (tcp->th_flags & TH_RST) == 0) { + struct mbuf *m; + m = ipfw_send_pkt(args->m, &(args->f_id), + ntohl(tcp->th_seq), ntohl(tcp->th_ack), + tcp->th_flags | TH_RST); + if (m != NULL) + ip_output(m, NULL, NULL, 0, NULL, NULL); + } + FREE_PKT(args->m); + } else + FREE_PKT(args->m); + args->m = NULL; +} + +/* + * Support for uid/gid/jail lookup. These tests are expensive + * (because we may need to look into the list of active sockets) + * so we cache the results. ugid_lookupp is 0 if we have not + * yet done a lookup, 1 if we succeeded, and -1 if we tried + * and failed. The function always returns the match value. + * We could actually spare the variable and use *uc, setting + * it to '(void *)check_uidgid if we have no info, NULL if + * we tried and failed, or any other value if successful. 
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
+    struct ucred **uc)
+{
+#if defined(USERSPACE)
+	return 0;	/* not supported in userspace */
+#else
+#ifndef __FreeBSD__
+	/*
+	 * XXX
+	 * NOTE(review): proto, oif, dst_ip, dst_port, src_ip, src_port and
+	 * inp are not declared in this function as shown here -- confirm
+	 * where they come from before enabling this branch.
+	 */
+	return cred_check(insn, proto, oif,
+	    dst_ip, dst_port, src_ip, src_port,
+	    (struct bsd_ucred *)uc, ugid_lookupp,
+	    ((struct mbuf *)inp)->m_skb);
+#else  /* FreeBSD */
+	struct ipfw_flow_id *id = &args->f_id;
+	struct inpcb *inp = args->inp;
+	struct ifnet *oif = args->oif;
+	struct in_addr src_ip, dst_ip;
+	struct inpcbinfo *pi;
+	struct inpcb *pcb;
+	int lookupflags;
+
+	/*
+	 * If the UDP or TCP stack handed us the PCB, take the credential
+	 * from it instead of locking and searching the PCB tables.
+	 */
+	if (inp != NULL && *ugid_lookupp == 0) {
+		INP_LOCK_ASSERT(inp);
+		if (inp->inp_socket == NULL)
+			*ugid_lookupp = -1;
+		else {
+			/*
+			 * NOTE(review): the reference taken by crhold() is
+			 * not dropped in this function -- presumably the
+			 * caller releases it; confirm.
+			 */
+			*uc = crhold(inp->inp_cred);
+			*ugid_lookupp = 1;
+		}
+	}
+
+	/* An earlier attempt already failed for this packet: no match. */
+	if (*ugid_lookupp == -1)
+		return (0);
+
+	/* Only TCP and UDP flows can be matched. */
+	switch (id->proto) {
+	case IPPROTO_TCP:
+		lookupflags = INPLOOKUP_RLOCKPCB;
+		pi = &V_tcbinfo;
+		break;
+	case IPPROTO_UDP:
+		lookupflags = INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB;
+		pi = &V_udbinfo;
+		break;
+	default:
+		return 0;
+	}
+
+	if (*ugid_lookupp == 0) {
+		/*
+		 * No cached result yet: search the PCB table.  When oif is
+		 * set the (dst, src) pair is passed in swapped order
+		 * compared to the oif == NULL case.
+		 */
+		if (id->addr_type == 6) {
+#ifdef INET6
+			if (oif != NULL)
+				pcb = in6_pcblookup_mbuf(pi,
+				    &id->dst_ip6, htons(id->dst_port),
+				    &id->src_ip6, htons(id->src_port),
+				    lookupflags, oif, args->m);
+			else
+				pcb = in6_pcblookup_mbuf(pi,
+				    &id->src_ip6, htons(id->src_port),
+				    &id->dst_ip6, htons(id->dst_port),
+				    lookupflags, oif, args->m);
+#else
+			*ugid_lookupp = -1;
+			return (0);
+#endif
+		} else {
+			src_ip.s_addr = htonl(id->src_ip);
+			dst_ip.s_addr = htonl(id->dst_ip);
+			if (oif != NULL)
+				pcb = in_pcblookup_mbuf(pi,
+				    dst_ip, htons(id->dst_port),
+				    src_ip, htons(id->src_port),
+				    lookupflags, oif, args->m);
+			else
+				pcb = in_pcblookup_mbuf(pi,
+				    src_ip, htons(id->src_port),
+				    dst_ip, htons(id->dst_port),
+				    lookupflags, oif, args->m);
+		}
+		if (pcb == NULL) {
+			/*
+			 * We tried and failed, set the variable to -1
+			 * so we will not try again on this packet.
+			 */
+			*ugid_lookupp = -1;
+			return (0);
+		}
+		INP_RLOCK_ASSERT(pcb);
+		*uc = crhold(pcb->inp_cred);
+		*ugid_lookupp = 1;
+		INP_RUNLOCK(pcb);
+	}
+
+	/* Compare the cached credential with the instruction operand. */
+	switch (insn->o.opcode) {
+	case O_UID:
+		return ((*uc)->cr_uid == (uid_t)insn->d[0]);
+	case O_GID:
+		return (groupmember((gid_t)insn->d[0], *uc));
+	case O_JAIL:
+		return ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+	default:
+		return (0);
+	}
+#endif /* __FreeBSD__ */
+#endif /* not supported in userspace */
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+    struct ip_fw_chain *chain)
+{
+	struct ip_fw *rule = chain->map[slot];
+
+	args->rule.chain_id = chain->id;
+	/* Stored off by one because a slot value of 0 is used as a marker. */
+	args->rule.slot = slot + 1;
+	args->rule.rule_id = rule->id + 1;
+	args->rule.rulenum = rule->rulenum;
+}
+
+#ifndef LINEAR_SKIPTO
+/*
+ * Resolve a skipto target to a position in the rule array, reusing the
+ * position cached in the rule when that cache is still valid.
+ *
+ * The cache lives in f->cached_pos and is stamped with the chain id in
+ * f->cached_id (horrible hacks to avoid changing the ABI).  It is neither
+ * consulted nor refreshed when num is IP_FW_TARG.
+ *
+ * tablearg is not named in this body; presumably it is picked up by the
+ * IP_FW_ARG_TABLEARG macro -- confirm against its definition.
+ */
+static int
+jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
+    int tablearg, int jump_backwards)
+{
+	int target, pos;
+
+	/* Cache hit: the stored position belongs to this chain version. */
+	if (num != IP_FW_TARG && f->cached_id == chain->id)
+		return (f->cached_pos);
+
+	target = IP_FW_ARG_TABLEARG(chain, num, skipto);
+	/* make sure we do not jump backward */
+	if (jump_backwards == 0 && target <= f->rulenum)
+		target = f->rulenum + 1;
+
+	pos = (chain->idxmap != NULL) ?
+	    chain->idxmap[target] : ipfw_find_rule(chain, target, 0);
+
+	/* update the cache */
+	if (num != IP_FW_TARG) {
+		f->cached_id = chain->id;
+		f->cached_pos = pos;
+	}
+
+	return (pos);
+}
+#else
+/*
+ * Resolve a skipto target to a position in the rule array with a direct
+ * chain->idxmap lookup; no per-rule cache is involved.
+ *
+ * tablearg is not named in this body; presumably it is picked up by the
+ * IP_FW_ARG_TABLEARG macro -- confirm against its definition.
+ */
+static int
+jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
+    int tablearg, int jump_backwards)
+{
+	int target = IP_FW_ARG_TABLEARG(chain, num, skipto);
+
+	/* make sure we do not jump backward */
+	if (jump_backwards == 0 && target <= f->rulenum)
+		target = f->rulenum + 1;
+
+	return (chain->idxmap[target]);
+}
+#endif
+
+#define TARG(k, f)	IP_FW_ARG_TABLEARG(chain, k, f)
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
+ *		Starts with the IP header.
+ * args->eh (in) Mac header if present, NULL for layer3 packet. + * args->L3offset Number of bytes bypassed if we came from L2. + * e.g. often sizeof(eh) ** NOTYET ** + * args->oif Outgoing interface, NULL if packet is incoming. + * The incoming interface is in the mbuf. (in) + * args->divert_rule (in/out) + * Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * + * args->rule Pointer to the last matching rule (in/out) + * args->next_hop Socket we are forwarding to (out). + * args->next_hop6 IPv6 next hop we are forwarding to (out). + * args->f_id Addresses grabbed from the packet (out) + * args->rule.info a cookie depending on rule action + * + * Return value: + * + * IP_FW_PASS the packet must be accepted + * IP_FW_DENY the packet must be dropped + * IP_FW_DIVERT divert packet, port in m_tag + * IP_FW_TEE tee packet, port in m_tag + * IP_FW_DUMMYNET to dummynet, pipe in args->cookie + * IP_FW_NETGRAPH into netgraph, cookie args->cookie + * args->rule contains the matching rule, + * args->rule.info has additional information. + * + */ +int +ipfw_chk(struct ip_fw_args *args) +{ + + /* + * Local variables holding state while processing a packet: + * + * IMPORTANT NOTE: to speed up the processing of rules, there + * are some assumption on the values of the variables, which + * are documented here. Should you change them, please check + * the implementation of the various instructions to make sure + * that they still work. + * + * args->eh The MAC header. It is non-null for a layer2 + * packet, it is NULL for a layer-3 packet. + * **notyet** + * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. + * + * m | args->m Pointer to the mbuf, as received from the caller. + * It may change if ipfw_chk() does an m_pullup, or if it + * consumes the packet because it calls send_reject(). + * XXX This has to change, so that ipfw_chk() never modifies + * or consumes the buffer. 
+ * ip is the beginning of the ip(4 or 6) header. + * Calculated by adding the L3offset to the start of data. + * (Until we start using L3offset, the packet is + * supposed to start with the ip header). + */ + struct mbuf *m = args->m; + struct ip *ip = mtod(m, struct ip *); + + /* + * For rules which contain uid/gid or jail constraints, cache + * a copy of the users credentials after the pcb lookup has been + * executed. This will speed up the processing of rules with + * these types of constraints, as well as decrease contention + * on pcb related locks. + */ +#ifndef __FreeBSD__ + struct bsd_ucred ucred_cache; +#else + struct ucred *ucred_cache = NULL; +#endif + int ucred_lookup = 0; + + /* + * oif | args->oif If NULL, ipfw_chk has been called on the + * inbound path (ether_input, ip_input). + * If non-NULL, ipfw_chk has been called on the outbound path + * (ether_output, ip_output). + */ + struct ifnet *oif = args->oif; + + int f_pos = 0; /* index of current rule in the array */ + int retval = 0; + + /* + * hlen The length of the IP header. + */ + u_int hlen = 0; /* hlen >0 means we have an IP pkt */ + + /* + * offset The offset of a fragment. offset != 0 means that + * we have a fragment at this offset of an IPv4 packet. + * offset == 0 means that (if this is an IPv4 packet) + * this is the first or only fragment. + * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header + * or there is a single packet fragement (fragement header added + * without needed). We will treat a single packet fragment as if + * there was no fragment header (or log/block depending on the + * V_fw_permit_single_frag6 sysctl setting). + */ + u_short offset = 0; + u_short ip6f_mf = 0; + + /* + * Local copies of addresses. They are only valid if we have + * an IP packet. + * + * proto The protocol. Set to 0 for non-ip packets, + * or to the protocol read from the packet otherwise. + * proto != 0 means that we have an IPv4 packet. 
+ * + * src_port, dst_port port numbers, in HOST format. Only + * valid for TCP and UDP packets. + * + * src_ip, dst_ip ip addresses, in NETWORK format. + * Only valid for IPv4 packets. + */ + uint8_t proto; + uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ + struct in_addr src_ip, dst_ip; /* NOTE: network format */ + uint16_t iplen=0; + int pktlen; + uint16_t etype = 0; /* Host order stored ether type */ + + /* + * dyn_dir = MATCH_UNKNOWN when rules unchecked, + * MATCH_NONE when checked and not matched (q = NULL), + * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) + */ + int dyn_dir = MATCH_UNKNOWN; + ipfw_dyn_rule *q = NULL; + struct ip_fw_chain *chain = &V_layer3_chain; + + /* + * We store in ulp a pointer to the upper layer protocol header. + * In the ipv4 case this is easy to determine from the header, + * but for ipv6 we might have some additional headers in the middle. + * ulp is NULL if not found. + */ + void *ulp = NULL; /* upper layer protocol pointer. */ + + /* XXX ipv6 variables */ + int is_ipv6 = 0; + uint8_t icmp6_type = 0; + uint16_t ext_hd = 0; /* bits vector for extension header filtering */ + /* end of ipv6 variables */ + + int is_ipv4 = 0; + + int done = 0; /* flag to exit the outer loop */ + IPFW_RLOCK_TRACKER; + + if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) + return (IP_FW_PASS); /* accept */ + + dst_ip.s_addr = 0; /* make sure it is initialized */ + src_ip.s_addr = 0; /* make sure it is initialized */ + pktlen = m->m_pkthdr.len; + args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ + proto = args->f_id.proto = 0; /* mark f_id invalid */ + /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ + +/* + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, + * then it sets p to point at the offset "len" in the mbuf. WARNING: the + * pointer might become stale after other pullups (but we never use it + * this way). 
+ */ +#define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) +#define PULLUP_LEN(_len, p, T) \ +do { \ + int x = (_len) + T; \ + if ((m)->m_len < x) { \ + args->m = m = m_pullup(m, x); \ + if (m == NULL) \ + goto pullup_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ +} while (0) + + /* + * if we have an ether header, + */ + if (args->eh) + etype = ntohs(args->eh->ether_type); + + /* Identify IP packets and fill up variables. */ + if (pktlen >= sizeof(struct ip6_hdr) && + (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + is_ipv6 = 1; + args->f_id.addr_type = 6; + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + + /* Search extension headers to find upper layer protocols */ + while (ulp == NULL && offset == 0) { + switch (proto) { + case IPPROTO_ICMPV6: + PULLUP_TO(hlen, ulp, struct icmp6_hdr); + icmp6_type = ICMP6(ulp)->icmp6_type; + break; + + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_HOPOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_HOPOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_ROUTING: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_rthdr); + switch (((struct ip6_rthdr *)ulp)->ip6r_type) { + case 0: + ext_hd |= EXT_RTHDR0; + break; + case 2: + ext_hd |= EXT_RTHDR2; + break; + default: + if (V_fw_verbose) + printf("IPFW2: IPV6 - Unknown " + "Routing Header type(%d)\n", + ((struct ip6_rthdr *) + 
ulp)->ip6r_type); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + ext_hd |= EXT_ROUTING; + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; + ulp = NULL; + break; + + case IPPROTO_FRAGMENT: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_frag); + ext_hd |= EXT_FRAGMENT; + hlen += sizeof (struct ip6_frag); + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_OFF_MASK; + ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_MORE_FRAG; + if (V_fw_permit_single_frag6 == 0 && + offset == 0 && ip6f_mf == 0) { + if (V_fw_verbose) + printf("IPFW2: IPV6 - Invalid " + "Fragment Header\n"); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + args->f_id.extra = + ntohl(((struct ip6_frag *)ulp)->ip6f_ident); + ulp = NULL; + break; + + case IPPROTO_DSTOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_DSTOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_AH: /* RFC 2402 */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + ext_hd |= EXT_AH; + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; + ulp = NULL; + break; + + case IPPROTO_ESP: /* RFC 2406 */ + PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ + /* Anything past Seq# is variable length and + * data past this ext. header is encrypted. */ + ext_hd |= EXT_ESP; + break; + + case IPPROTO_NONE: /* RFC 2460 */ + /* + * Packet ends here, and IPv6 header has + * already been pulled up. If ip6e_len!=0 + * then octets must be ignored. + */ + ulp = ip; /* non-NULL to get out of loop. */ + break; + + case IPPROTO_OSPFIGP: + /* XXX OSPF header check? */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + + case IPPROTO_PIM: + /* XXX PIM header check? 
*/ + PULLUP_TO(hlen, ulp, struct pim); + break; + + case IPPROTO_CARP: + PULLUP_TO(hlen, ulp, struct carp_header); + if (((struct carp_header *)ulp)->carp_version != + CARP_VERSION) + return (IP_FW_DENY); + if (((struct carp_header *)ulp)->carp_type != + CARP_ADVERTISEMENT) + return (IP_FW_DENY); + break; + + case IPPROTO_IPV6: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip6_hdr); + break; + + case IPPROTO_IPV4: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip); + break; + + default: + if (V_fw_verbose) + printf("IPFW2: IPV6 - Unknown " + "Extension Header(%d), ext_hd=%x\n", + proto, ext_hd); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + } /*switch */ + } + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)ip; + args->f_id.src_ip6 = ip6->ip6_src; + args->f_id.dst_ip6 = ip6->ip6_dst; + args->f_id.src_ip = 0; + args->f_id.dst_ip = 0; + args->f_id.flow_id6 = ntohl(ip6->ip6_flow); + } else if (pktlen >= sizeof(struct ip) && + (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { + is_ipv4 = 1; + hlen = ip->ip_hl << 2; + args->f_id.addr_type = 4; + + /* + * Collect parameters into local variables for faster matching. + */ + proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + offset = ntohs(ip->ip_off) & IP_OFFMASK; + iplen = ntohs(ip->ip_len); + pktlen = iplen < pktlen ? 
iplen : pktlen; + + if (offset == 0) { + switch (proto) { + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen, ulp, struct icmphdr); + //args->f_id.flags = ICMP(ulp)->icmp_type; + break; + + default: + break; + } + } + + ip = mtod(m, struct ip *); + args->f_id.src_ip = ntohl(src_ip.s_addr); + args->f_id.dst_ip = ntohl(dst_ip.s_addr); + } +#undef PULLUP_TO + if (proto) { /* we may have port numbers, store them */ + args->f_id.proto = proto; + args->f_id.src_port = src_port = ntohs(src_port); + args->f_id.dst_port = dst_port = ntohs(dst_port); + } + + IPFW_PF_RLOCK(chain); + if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ + IPFW_PF_RUNLOCK(chain); + return (IP_FW_PASS); /* accept */ + } + if (args->rule.slot) { + /* + * Packet has already been tagged as a result of a previous + * match on rule args->rule aka args->rule_id (PIPE, QUEUE, + * REASS, NETGRAPH, DIVERT/TEE...) + * Validate the slot and continue from the next one + * if still present, otherwise do a lookup. + */ + f_pos = (args->rule.chain_id == chain->id) ? + args->rule.slot : + ipfw_find_rule(chain, args->rule.rulenum, + args->rule.rule_id); + } else { + f_pos = 0; + } + + /* + * Now scan the rules, and parse microinstructions for each rule. + * We have two nested loops and an inner switch. Sometimes we + * need to break out of one or both loops, or re-enter one of + * the loops with updated variables. Loop variables are: + * + * f_pos (outer loop) points to the current rule. + * On output it points to the matching rule. 
+ * done (outer loop) is used as a flag to break the loop. + * l (inner loop) residual length of current rule. + * cmd points to the current microinstruction. + * + * We break the inner loop by setting l=0 and possibly + * cmdlen=0 if we don't want to advance cmd. + * We break the outer loop by setting done=1 + * We can restart the inner loop by setting l>0 and f_pos, f, cmd + * as needed. + */ + for (; f_pos < chain->n_rules; f_pos++) { + ipfw_insn *cmd; + uint32_t tablearg = 0; + int l, cmdlen, skip_or; /* skip rest of OR block */ + struct ip_fw *f; + + f = chain->map[f_pos]; + if (V_set_disable & (1 << f->set) ) + continue; + + skip_or = 0; + for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; + l -= cmdlen, cmd += cmdlen) { + int match; + + /* + * check_body is a jump target used when we find a + * CHECK_STATE, and need to jump to the body of + * the target rule. + */ + +/* check_body: */ + cmdlen = F_LEN(cmd); + /* + * An OR block (insn_1 || .. || insn_n) has the + * F_OR bit set in all but the last instruction. + * The first match will set "skip_or", and cause + * the following instructions to be skipped until + * past the one with the F_OR bit clear. + */ + if (skip_or) { /* skip this instruction */ + if ((cmd->len & F_OR) == 0) + skip_or = 0; /* next one is good */ + continue; + } + match = 0; /* set to 1 if we succeed */ + + switch (cmd->opcode) { + /* + * The first set of opcodes compares the packet's + * fields with some pattern, setting 'match' if a + * match is found. At the end of the loop there is + * logic to deal with F_NOT and F_OR flags associated + * with the opcode. + */ + case O_NOP: + match = 1; + break; + + case O_FORWARD_MAC: + printf("ipfw: opcode %d unimplemented\n", + cmd->opcode); + break; + + case O_GID: + case O_UID: + case O_JAIL: + /* + * We only check offset == 0 && proto != 0, + * as this ensures that we have a + * packet with the ports info. 
+ */ + if (offset != 0) + break; + if (proto == IPPROTO_TCP || + proto == IPPROTO_UDP) + match = check_uidgid( + (ipfw_insn_u32 *)cmd, + args, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache); +#else + (void *)&ucred_cache); +#endif + break; + + case O_RECV: + match = iface_match(m->m_pkthdr.rcvif, + (ipfw_insn_if *)cmd, chain, &tablearg); + break; + + case O_XMIT: + match = iface_match(oif, (ipfw_insn_if *)cmd, + chain, &tablearg); + break; + + case O_VIA: + match = iface_match(oif ? oif : + m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, + chain, &tablearg); + break; + + case O_MACADDR2: + if (args->eh != NULL) { /* have MAC header */ + u_int32_t *want = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->addr; + u_int32_t *mask = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->mask; + u_int32_t *hdr = (u_int32_t *)args->eh; + + match = + ( want[0] == (hdr[0] & mask[0]) && + want[1] == (hdr[1] & mask[1]) && + want[2] == (hdr[2] & mask[2]) ); + } + break; + + case O_MAC_TYPE: + if (args->eh != NULL) { + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (etype >= p[0] && + etype <= p[1]); + } + break; + + case O_FRAG: + match = (offset != 0); + break; + + case O_IN: /* "out" is "not in" */ + match = (oif == NULL); + break; + + case O_LAYER2: + match = (args->eh != NULL); + break; + + case O_DIVERTED: + { + /* For diverted packets, args->rule.info + * contains the divert port (in host format) + * reason and direction. + */ + uint32_t i = args->rule.info; + match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && + cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); + } + break; + + case O_PROTO: + /* + * We do not allow an arg of 0 so the + * check of "proto" only suffices. 
+ */ + match = (proto == cmd->arg1); + break; + + case O_IP_SRC: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + src_ip.s_addr); + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (is_ipv4) { + uint32_t key = + (cmd->opcode == O_IP_DST_LOOKUP) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t v = 0; + + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { + /* generic lookup. The key must be + * in 32bit big-endian format. + */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + if (v == 0) + key = dst_ip.s_addr; + else if (v == 1) + key = src_ip.s_addr; + else if (v == 6) /* dscp */ + key = (ip->ip_tos >> 2) & 0x3f; + else if (offset != 0) + break; + else if (proto != IPPROTO_TCP && + proto != IPPROTO_UDP) + break; + else if (v == 2) + key = dst_port; + else if (v == 3) + key = src_port; +#ifndef USERSPACE + else if (v == 4 || v == 5) { + check_uidgid( + (ipfw_insn_u32 *)cmd, + args, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache); + if (v == 4 /* O_UID */) + key = ucred_cache->cr_uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache); + if (v ==4 /* O_UID */) + key = ucred_cache.uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ + } +#endif /* !USERSPACE */ + else + break; + } + match = ipfw_lookup_table(chain, + cmd->arg1, key, &v); + if (!match) + break; + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = + ((ipfw_insn_u32 *)cmd)->d[0] == v; + else + tablearg = v; + } else if (is_ipv6) { + uint32_t v = 0; + void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? 
+ &args->f_id.dst_ip6: &args->f_id.src_ip6; + match = ipfw_lookup_table_extended(chain, + cmd->arg1, + sizeof(struct in6_addr), + pkey, &v); + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = ((ipfw_insn_u32 *)cmd)->d[0] == v; + if (match) + tablearg = v; + } + break; + + case O_IP_FLOW_LOOKUP: + { + uint32_t v = 0; + match = ipfw_lookup_table_extended(chain, + cmd->arg1, 0, &args->f_id, &v); + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = ((ipfw_insn_u32 *)cmd)->d[0] == v; + if (match) + tablearg = v; + } + break; + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + if (is_ipv4) { + uint32_t a = + (cmd->opcode == O_IP_DST_MASK) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; + int i = cmdlen-1; + + for (; !match && i>0; i-= 2, p+= 2) + match = (p[0] == (a & p[1])); + } + break; + + case O_IP_SRC_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(src_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_SRC_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); +#endif + break; + + case O_IP_DST_SET: + case O_IP_SRC_SET: + if (is_ipv4) { + u_int32_t *d = (u_int32_t *)(cmd+1); + u_int32_t addr = + cmd->opcode == O_IP_DST_SET ? + args->f_id.dst_ip : + args->f_id.src_ip; + + if (addr < d[0]) + break; + addr -= d[0]; /* subtract base */ + match = (addr < cmd->arg1) && + ( d[ 1 + (addr>>5)] & + (1<<(addr & 0x1f)) ); + } + break; + + case O_IP_DST: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + dst_ip.s_addr); + break; + + case O_IP_DST_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(dst_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_DST_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); +#endif + break; + + + case O_IP_SRCPORT: + case O_IP_DSTPORT: + /* + * offset == 0 && proto != 0 is enough + * to guarantee that we have a + * packet with port info. 
+ */ + if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) + && offset == 0) { + u_int16_t x = + (cmd->opcode == O_IP_SRCPORT) ? + src_port : dst_port ; + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (x>=p[0] && x<=p[1]); + } + break; + + case O_ICMPTYPE: + match = (offset == 0 && proto==IPPROTO_ICMP && + icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); + break; + +#ifdef INET6 + case O_ICMP6TYPE: + match = is_ipv6 && offset == 0 && + proto==IPPROTO_ICMPV6 && + icmp6type_match( + ICMP6(ulp)->icmp6_type, + (ipfw_insn_u32 *)cmd); + break; +#endif /* INET6 */ + + case O_IPOPT: + match = (is_ipv4 && + ipopts_match(ip, cmd) ); + break; + + case O_IPVER: + match = (is_ipv4 && + cmd->arg1 == ip->ip_v); + break; + + case O_IPID: + case O_IPLEN: + case O_IPTTL: + if (is_ipv4) { /* only for IP packets */ + uint16_t x; + uint16_t *p; + int i; + + if (cmd->opcode == O_IPLEN) + x = iplen; + else if (cmd->opcode == O_IPTTL) + x = ip->ip_ttl; + else /* must be IPID */ + x = ntohs(ip->ip_id); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_IPPRECEDENCE: + match = (is_ipv4 && + (cmd->arg1 == (ip->ip_tos & 0xe0)) ); + break; + + case O_IPTOS: + match = (is_ipv4 && + flags_match(cmd, ip->ip_tos)); + break; + + case O_DSCP: + { + uint32_t *p; + uint16_t x; + + p = ((ipfw_insn_u32 *)cmd)->d; + + if (is_ipv4) + x = ip->ip_tos >> 2; + else if (is_ipv6) { + uint8_t *v; + v = &((struct ip6_hdr *)ip)->ip6_vfc; + x = (*v & 0x0F) << 2; + v++; + x |= *v >> 6; + } else + break; + + /* DSCP bitmask is stored as low_u32 high_u32 */ + if (x > 32) + match = *(p + 1) & (1 << (x - 32)); + else + match = *p & (1 << x); + } + break; + + case O_TCPDATALEN: + if (proto == IPPROTO_TCP && offset == 0) { + struct tcphdr *tcp; + uint16_t x; 
+ uint16_t *p; + int i; + + tcp = TCP(ulp); + x = iplen - + ((ip->ip_hl + tcp->th_off) << 2); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_TCPFLAGS: + match = (proto == IPPROTO_TCP && offset == 0 && + flags_match(cmd, TCP(ulp)->th_flags)); + break; + + case O_TCPOPTS: + if (proto == IPPROTO_TCP && offset == 0 && ulp){ + PULLUP_LEN(hlen, ulp, + (TCP(ulp)->th_off << 2)); + match = tcpopts_match(TCP(ulp), cmd); + } + break; + + case O_TCPSEQ: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_seq); + break; + + case O_TCPACK: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_ack); + break; + + case O_TCPWIN: + if (proto == IPPROTO_TCP && offset == 0) { + uint16_t x; + uint16_t *p; + int i; + + x = ntohs(TCP(ulp)->th_win); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* Otherwise we have ranges. */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i > 0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_ESTAB: + /* reject packets which have SYN only */ + /* XXX should i also check for TH_ACK ? */ + match = (proto == IPPROTO_TCP && offset == 0 && + (TCP(ulp)->th_flags & + (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); + break; + + case O_ALTQ: { + struct pf_mtag *at; + struct m_tag *mtag; + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + /* + * ALTQ uses mbuf tags from another + * packet filtering system - pf(4). + * We allocate a tag in its format + * and fill it in, pretending to be pf(4). 
+ */ + match = 1; + at = pf_find_mtag(m); + if (at != NULL && at->qid != 0) + break; + mtag = m_tag_get(PACKET_TAG_PF, + sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); + if (mtag == NULL) { + /* + * Let the packet fall back to the + * default ALTQ. + */ + break; + } + m_tag_prepend(m, mtag); + at = (struct pf_mtag *)(mtag + 1); + at->qid = altq->qid; + at->hdr = ip; + break; + } + + case O_LOG: + ipfw_log(chain, f, hlen, args, m, + oif, offset | ip6f_mf, tablearg, ip); + match = 1; + break; + + case O_PROB: + match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); + break; + + case O_VERREVPATH: + /* Outgoing packets automatically pass/match */ + match = ((oif != NULL) || + (m->m_pkthdr.rcvif == NULL) || + ( +#ifdef INET6 + is_ipv6 ? + verify_path6(&(args->f_id.src_ip6), + m->m_pkthdr.rcvif, args->f_id.fib) : +#endif + verify_path(src_ip, m->m_pkthdr.rcvif, + args->f_id.fib))); + break; + + case O_VERSRCREACH: + /* Outgoing packets automatically pass/match */ + match = (hlen > 0 && ((oif != NULL) || +#ifdef INET6 + is_ipv6 ? + verify_path6(&(args->f_id.src_ip6), + NULL, args->f_id.fib) : +#endif + verify_path(src_ip, NULL, args->f_id.fib))); + break; + + case O_ANTISPOOF: + /* Outgoing packets automatically pass/match */ + if (oif == NULL && hlen > 0 && + ( (is_ipv4 && in_localaddr(src_ip)) +#ifdef INET6 + || (is_ipv6 && + in6_localaddr(&(args->f_id.src_ip6))) +#endif + )) + match = +#ifdef INET6 + is_ipv6 ? 
verify_path6( + &(args->f_id.src_ip6), + m->m_pkthdr.rcvif, + args->f_id.fib) : +#endif + verify_path(src_ip, + m->m_pkthdr.rcvif, + args->f_id.fib); + else + match = 1; + break; + + case O_IPSEC: +#ifdef IPSEC + match = (m_tag_find(m, + PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); +#endif + /* otherwise no match */ + break; + +#ifdef INET6 + case O_IP6_SRC: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + + case O_IP6_DST: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if (is_ipv6) { + int i = cmdlen - 1; + struct in6_addr p; + struct in6_addr *d = + &((ipfw_insn_ip6 *)cmd)->addr6; + + for (; !match && i > 0; d += 2, + i -= F_INSN_SIZE(struct in6_addr) + * 2) { + p = (cmd->opcode == + O_IP6_SRC_MASK) ? + args->f_id.src_ip6: + args->f_id.dst_ip6; + APPLY_MASK(&p, &d[1]); + match = + IN6_ARE_ADDR_EQUAL(&d[0], + &p); + } + } + break; + + case O_FLOW6ID: + match = is_ipv6 && + flow6id_match(args->f_id.flow_id6, + (ipfw_insn_u32 *) cmd); + break; + + case O_EXT_HDR: + match = is_ipv6 && + (ext_hd & ((ipfw_insn *) cmd)->arg1); + break; + + case O_IP6: + match = is_ipv6; + break; +#endif + + case O_IP4: + match = is_ipv4; + break; + + case O_TAG: { + struct m_tag *mtag; + uint32_t tag = TARG(cmd->arg1, tag); + + /* Packet is already tagged with this tag? */ + mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); + + /* We have `untag' action when F_NOT flag is + * present. And we must remove this mtag from + * mbuf and reset `match' to zero (`match' will + * be inversed later). + * Otherwise we should allocate new mtag and + * push it into mbuf. 
+ */ + if (cmd->len & F_NOT) { /* `untag' action */ + if (mtag != NULL) + m_tag_delete(m, mtag); + match = 0; + } else { + if (mtag == NULL) { + mtag = m_tag_alloc( MTAG_IPFW, + tag, 0, M_NOWAIT); + if (mtag != NULL) + m_tag_prepend(m, mtag); + } + match = 1; + } + break; + } + + case O_FIB: /* try match the specified fib */ + if (args->f_id.fib == cmd->arg1) + match = 1; + break; + + case O_SOCKARG: { +#ifndef USERSPACE /* not supported in userspace */ + struct inpcb *inp = args->inp; + struct inpcbinfo *pi; + + if (is_ipv6) /* XXX can we remove this ? */ + break; + + if (proto == IPPROTO_TCP) + pi = &V_tcbinfo; + else if (proto == IPPROTO_UDP) + pi = &V_udbinfo; + else + break; + + /* + * XXXRW: so_user_cookie should almost + * certainly be inp_user_cookie? + */ + + /* For incomming packet, lookup up the + inpcb using the src/dest ip/port tuple */ + if (inp == NULL) { + inp = in_pcblookup(pi, + src_ip, htons(src_port), + dst_ip, htons(dst_port), + INPLOOKUP_RLOCKPCB, NULL); + if (inp != NULL) { + tablearg = + inp->inp_socket->so_user_cookie; + if (tablearg) + match = 1; + INP_RUNLOCK(inp); + } + } else { + if (inp->inp_socket) { + tablearg = + inp->inp_socket->so_user_cookie; + if (tablearg) + match = 1; + } + } +#endif /* !USERSPACE */ + break; + } + + case O_TAGGED: { + struct m_tag *mtag; + uint32_t tag = TARG(cmd->arg1, tag); + + if (cmdlen == 1) { + match = m_tag_locate(m, MTAG_IPFW, + tag, NULL) != NULL; + break; + } + + /* we have ranges */ + for (mtag = m_tag_first(m); + mtag != NULL && !match; + mtag = m_tag_next(m, mtag)) { + uint16_t *p; + int i; + + if (mtag->m_tag_cookie != MTAG_IPFW) + continue; + + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for(; !match && i > 0; i--, p += 2) + match = + mtag->m_tag_id >= p[0] && + mtag->m_tag_id <= p[1]; + } + break; + } + + /* + * The second set of opcodes represents 'actions', + * i.e. the terminal part of a rule once the packet + * matches all previous patterns. 
+ * Typically there is only one action for each rule, + * and the opcode is stored at the end of the rule + * (but there are exceptions -- see below). + * + * In general, here we set retval and terminate the + * outer loop (would be a 'break 3' in some language, + * but we need to set l=0, done=1) + * + * Exceptions: + * O_COUNT and O_SKIPTO actions: + * instead of terminating, we jump to the next rule + * (setting l=0), or to the SKIPTO target (setting + * f/f_len, cmd and l as needed), respectively. + * + * O_TAG, O_LOG and O_ALTQ action parameters: + * perform some action and set match = 1; + * + * O_LIMIT and O_KEEP_STATE: these opcodes are + * not real 'actions', and are stored right + * before the 'action' part of the rule. + * These opcodes try to install an entry in the + * state tables; if successful, we continue with + * the next opcode (match=1; break;), otherwise + * the packet must be dropped (set retval, + * break loops with l=0, done=1) + * + * O_PROBE_STATE and O_CHECK_STATE: these opcodes + * cause a lookup of the state table, and a jump + * to the 'action' part of the parent rule + * if an entry is found, or + * (CHECK_STATE only) a jump to the next rule if + * the entry is not found. + * The result of the lookup is cached so that + * further instances of these opcodes become NOPs. + * The jump to the next rule is done by setting + * l=0, cmdlen=0. + */ + case O_LIMIT: + case O_KEEP_STATE: + if (ipfw_install_state(chain, f, + (ipfw_insn_limit *)cmd, args, tablearg)) { + /* error or limit violation */ + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + } + match = 1; + break; + + case O_PROBE_STATE: + case O_CHECK_STATE: + /* + * dynamic rules are checked at the first + * keep-state or check-state occurrence, + * with the result being stored in dyn_dir. + * The compiler introduces a PROBE_STATE + * instruction for us when we have a + * KEEP_STATE (because PROBE_STATE needs + * to be run first). 
+ */ + if (dyn_dir == MATCH_UNKNOWN && + (q = ipfw_lookup_dyn_rule(&args->f_id, + &dyn_dir, proto == IPPROTO_TCP ? + TCP(ulp) : NULL)) + != NULL) { + /* + * Found dynamic entry, update stats + * and jump to the 'action' part of + * the parent rule by setting + * f, cmd, l and clearing cmdlen. + */ + IPFW_INC_DYN_COUNTER(q, pktlen); + /* XXX we would like to have f_pos + * readily accessible in the dynamic + * rule, instead of having to + * lookup q->rule. + */ + f = q->rule; + f_pos = ipfw_find_rule(chain, + f->rulenum, f->id); + cmd = ACTION_PTR(f); + l = f->cmd_len - f->act_ofs; + ipfw_dyn_unlock(q); + cmdlen = 0; + match = 1; + break; + } + /* + * Dynamic entry not found. If CHECK_STATE, + * skip to next rule, if PROBE_STATE just + * ignore and continue with next opcode. + */ + if (cmd->opcode == O_CHECK_STATE) + l = 0; /* exit inner loop */ + match = 1; + break; + + case O_ACCEPT: + retval = 0; /* accept */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_PIPE: + case O_QUEUE: + set_match(args, f_pos, chain); + args->rule.info = TARG(cmd->arg1, pipe); + if (cmd->opcode == O_PIPE) + args->rule.info |= IPFW_IS_PIPE; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = IP_FW_DUMMYNET; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_DIVERT: + case O_TEE: + if (args->eh) /* not on layer 2 */ + break; + /* otherwise this is terminal */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + retval = (cmd->opcode == O_DIVERT) ? + IP_FW_DIVERT : IP_FW_TEE; + set_match(args, f_pos, chain); + args->rule.info = TARG(cmd->arg1, divert); + break; + + case O_COUNT: + IPFW_INC_RULE_COUNTER(f, pktlen); + l = 0; /* exit inner loop */ + break; + + case O_SKIPTO: + IPFW_INC_RULE_COUNTER(f, pktlen); + f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); + /* + * Skip disabled rules, and re-enter + * the inner loop with the correct + * f_pos, f, l and cmd. 
+ * Also clear cmdlen and skip_or + */ + for (; f_pos < chain->n_rules - 1 && + (V_set_disable & + (1 << chain->map[f_pos]->set)); + f_pos++) + ; + /* Re-enter the inner loop at the skipto rule. */ + f = chain->map[f_pos]; + l = f->cmd_len; + cmd = f->cmd; + match = 1; + cmdlen = 0; + skip_or = 0; + continue; + break; /* not reached */ + + case O_CALLRETURN: { + /* + * Implementation of `subroutine' call/return, + * in the stack carried in an mbuf tag. This + * is different from `skipto' in that any call + * address is possible (`skipto' must prevent + * backward jumps to avoid endless loops). + * We have `return' action when F_NOT flag is + * present. The `m_tag_id' field is used as + * stack pointer. + */ + struct m_tag *mtag; + uint16_t jmpto, *stack; + +#define IS_CALL ((cmd->len & F_NOT) == 0) +#define IS_RETURN ((cmd->len & F_NOT) != 0) + /* + * Hand-rolled version of m_tag_locate() with + * wildcard `type'. + * If not already tagged, allocate new tag. + */ + mtag = m_tag_first(m); + while (mtag != NULL) { + if (mtag->m_tag_cookie == + MTAG_IPFW_CALL) + break; + mtag = m_tag_next(m, mtag); + } + if (mtag == NULL && IS_CALL) { + mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, + IPFW_CALLSTACK_SIZE * + sizeof(uint16_t), M_NOWAIT); + if (mtag != NULL) + m_tag_prepend(m, mtag); + } + + /* + * On error both `call' and `return' just + * continue with next rule. + */ + if (IS_RETURN && (mtag == NULL || + mtag->m_tag_id == 0)) { + l = 0; /* exit inner loop */ + break; + } + if (IS_CALL && (mtag == NULL || + mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { + printf("ipfw: call stack error, " + "go to next rule\n"); + l = 0; /* exit inner loop */ + break; + } + + IPFW_INC_RULE_COUNTER(f, pktlen); + stack = (uint16_t *)(mtag + 1); + + /* + * The `call' action may use cached f_pos + * (in f->next_rule), whose version is written + * in f->next_rule. + * The `return' action, however, doesn't have + * fixed jump address in cmd->arg1 and can't use + * cache. 
+ */ + if (IS_CALL) { + stack[mtag->m_tag_id] = f->rulenum; + mtag->m_tag_id++; + f_pos = JUMP(chain, f, cmd->arg1, + tablearg, 1); + } else { /* `return' action */ + mtag->m_tag_id--; + jmpto = stack[mtag->m_tag_id] + 1; + f_pos = ipfw_find_rule(chain, jmpto, 0); + } + + /* + * Skip disabled rules, and re-enter + * the inner loop with the correct + * f_pos, f, l and cmd. + * Also clear cmdlen and skip_or + */ + for (; f_pos < chain->n_rules - 1 && + (V_set_disable & + (1 << chain->map[f_pos]->set)); f_pos++) + ; + /* Re-enter the inner loop at the dest rule. */ + f = chain->map[f_pos]; + l = f->cmd_len; + cmd = f->cmd; + cmdlen = 0; + skip_or = 0; + continue; + break; /* NOTREACHED */ + } +#undef IS_CALL +#undef IS_RETURN + + case O_REJECT: + /* + * Drop the packet and send a reject notice + * if the packet is not ICMP (or is an ICMP + * query), and it is not multicast/broadcast. + */ + if (hlen > 0 && is_ipv4 && offset == 0 && + (proto != IPPROTO_ICMP || + is_icmp_query(ICMP(ulp))) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN_MULTICAST(ntohl(dst_ip.s_addr))) { + send_reject(args, cmd->arg1, iplen, ip); + m = args->m; + } + /* FALLTHROUGH */ +#ifdef INET6 + case O_UNREACH6: + if (hlen > 0 && is_ipv6 && + ((offset & IP6F_OFF_MASK) == 0) && + (proto != IPPROTO_ICMPV6 || + (is_icmp6_query(icmp6_type) == 1)) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { + send_reject6( + args, cmd->arg1, hlen, + (struct ip6_hdr *)ip); + m = args->m; + } + /* FALLTHROUGH */ +#endif + case O_DENY: + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_FORWARD_IP: +#ifndef USERSPACE /* allow forward in userspace */ + if (args->eh) /* not valid on layer2 pkts */ + break; +#endif /* !USERSPACE */ + if (q == NULL || q->rule != f || + dyn_dir == MATCH_FORWARD) { + struct sockaddr_in *sa; + sa = &(((ipfw_insn_sa *)cmd)->sa); + if (sa->sin_addr.s_addr == INADDR_ANY) { + bcopy(sa, 
&args->hopstore, + sizeof(*sa)); + args->hopstore.sin_addr.s_addr = + htonl(tablearg); + args->next_hop = &args->hopstore; + } else { + args->next_hop = sa; + } + } + retval = IP_FW_PASS; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + +#ifdef INET6 + case O_FORWARD_IP6: + if (args->eh) /* not valid on layer2 pkts */ + break; + if (q == NULL || q->rule != f || + dyn_dir == MATCH_FORWARD) { + struct sockaddr_in6 *sin6; + + sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); + args->next_hop6 = sin6; + } + retval = IP_FW_PASS; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; +#endif + + case O_NETGRAPH: + case O_NGTEE: + set_match(args, f_pos, chain); + args->rule.info = TARG(cmd->arg1, netgraph); + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = (cmd->opcode == O_NETGRAPH) ? + IP_FW_NETGRAPH : IP_FW_NGTEE; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_SETFIB: { + uint32_t fib; + + IPFW_INC_RULE_COUNTER(f, pktlen); + fib = TARG(cmd->arg1, fib) & 0x7FFFF; + if (fib >= rt_numfibs) + fib = 0; + M_SETFIB(m, fib); + args->f_id.fib = fib; + l = 0; /* exit inner loop */ + break; + } + + case O_SETDSCP: { + uint16_t code; + + code = TARG(cmd->arg1, dscp) & 0x3F; + l = 0; /* exit inner loop */ + if (is_ipv4) { + uint16_t a; + + a = ip->ip_tos; + ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); + a += ntohs(ip->ip_sum) - ip->ip_tos; + ip->ip_sum = htons(a); + } else if (is_ipv6) { + uint8_t *v; + + v = &((struct ip6_hdr *)ip)->ip6_vfc; + *v = (*v & 0xF0) | (code >> 2); + v++; + *v = (*v & 0x3F) | ((code & 0x03) << 6); + } else + break; + + IPFW_INC_RULE_COUNTER(f, pktlen); + break; + } + + case O_NAT: + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + if (!IPFW_NAT_LOADED) { + retval = IP_FW_DENY; + break; + } + + struct cfg_nat *t; + int nat_id; + + set_match(args, f_pos, chain); + /* Check if this is 'global' nat rule */ + if (cmd->arg1 == 0) { + retval = 
ipfw_nat_ptr(args, NULL, m); + break; + } + t = ((ipfw_insn_nat *)cmd)->nat; + if (t == NULL) { + nat_id = TARG(cmd->arg1, nat); + t = (*lookup_nat_ptr)(&chain->nat, nat_id); + + if (t == NULL) { + retval = IP_FW_DENY; + break; + } + if (cmd->arg1 != IP_FW_TARG) + ((ipfw_insn_nat *)cmd)->nat = t; + } + retval = ipfw_nat_ptr(args, t, m); + break; + + case O_REASS: { + int ip_off; + + IPFW_INC_RULE_COUNTER(f, pktlen); + l = 0; /* in any case exit inner loop */ + ip_off = ntohs(ip->ip_off); + + /* if not fragmented, go to next rule */ + if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) + break; + + args->m = m = ip_reass(m); + + /* + * do IP header checksum fixup. + */ + if (m == NULL) { /* fragment got swallowed */ + retval = IP_FW_DENY; + } else { /* good, packet complete */ + int hlen; + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(m, hlen); + retval = IP_FW_REASS; + set_match(args, f_pos, chain); + } + done = 1; /* exit outer loop */ + break; + } + + default: + panic("-- unknown opcode %d\n", cmd->opcode); + } /* end of switch() on opcodes */ + /* + * if we get here with l=0, then match is irrelevant. 
+ */ + + if (cmd->len & F_NOT) + match = !match; + + if (match) { + if (cmd->len & F_OR) + skip_or = 1; + } else { + if (!(cmd->len & F_OR)) /* not an OR block, */ + break; /* try next rule */ + } + + } /* end of inner loop, scan opcodes */ +#undef PULLUP_LEN + + if (done) + break; + +/* next_rule:; */ /* try next rule */ + + } /* end of outer for, scan rules */ + + if (done) { + struct ip_fw *rule = chain->map[f_pos]; + /* Update statistics */ + IPFW_INC_RULE_COUNTER(rule, pktlen); + } else { + retval = IP_FW_DENY; + printf("ipfw: ouch!, skip past end of rules, denying packet\n"); + } + IPFW_PF_RUNLOCK(chain); +#ifdef __FreeBSD__ + if (ucred_cache != NULL) + crfree(ucred_cache); +#endif + return (retval); + +pullup_failed: + if (V_fw_verbose) + printf("ipfw: pullup failed\n"); + return (IP_FW_DENY); +} + +/* + * Sysctl handler for the maximum number of tables that can be used in a + * given VNET ipfw instance. + * + * A local copy of V_fw_tables_max is passed to sysctl_handle_int(). + * If that call fails, or the request carries no new value + * (req->newptr == NULL), its result is returned as is. Otherwise the + * new value is handed to ipfw_resize_tables() for V_layer3_chain and + * the result of that call is returned. + */ +#ifdef SYSCTL_NODE +static int +sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int ntables; + + ntables = V_fw_tables_max; + + error = sysctl_handle_int(oidp, &ntables, 0, req); + /* Nothing to apply: read-only request or handler error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + return (ipfw_resize_tables(&V_layer3_chain, ntables)); +} + +/* + * Sysctl handler that switches the table namespace between global and + * per-set. + * + * Same shape as sysctl_ipfw_table_num(): a local copy of + * V_fw_tables_sets goes through sysctl_handle_int(); on a write the new + * value is handed to ipfw_switch_tables_namespace() for V_layer3_chain. + */ +static int +sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int sets; + + sets = V_fw_tables_sets; + + error = sysctl_handle_int(oidp, &sets, 0, req); + /* Nothing to apply: read-only request or handler error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); +} +#endif + +/* + * Module and VNET glue + */ + +/* + * Stuff that must be initialised only on boot or module load + */ +static int +ipfw_init(void) +{ + int error = 0; + + /* + * Only print out this stuff the first time around, + * when called from the sysinit code. 
+ */ + printf("ipfw2 " +#ifdef INET6 + "(+ipv6) " +#endif + "initialized, divert %s, nat %s, " + "default to %s, logging ", +#ifdef IPDIVERT + "enabled", +#else + "loadable", +#endif +#ifdef IPFIREWALL_NAT + "enabled", +#else + "loadable", +#endif + default_to_accept ? "accept" : "deny"); + + /* + * Note: V_xxx variables can be accessed here but the vnet specific + * initializer may not have been called yet for the VIMAGE case. + * Tunables will have been processed. We will print out values for + * the default vnet. + * XXX This should all be rationalized AFTER 8.0 + */ + if (V_fw_verbose == 0) + printf("disabled\n"); + else if (V_verbose_limit == 0) + printf("unlimited\n"); + else + printf("limited to %d packets/entry by default\n", + V_verbose_limit); + + /* Clamp the user-supplied table count to IPFW_TABLES_MAX */ + if (default_fw_tables > IPFW_TABLES_MAX) + default_fw_tables = IPFW_TABLES_MAX; + + ipfw_init_sopt_handler(); + ipfw_log_bpf(1); /* init */ + ipfw_iface_init(); + return (error); +} + +/* + * Called for the removal of the last instance only on module unload. + * Undoes the last three calls of ipfw_init() in reverse order: + * ipfw_iface_destroy(), ipfw_log_bpf(0), ipfw_destroy_sopt_handler(). + */ +static void +ipfw_destroy(void) +{ + + ipfw_iface_destroy(); + ipfw_log_bpf(0); /* uninit */ + ipfw_destroy_sopt_handler(); + printf("IP firewall unloaded\n"); +} + +/* + * Stuff that must be initialized for every instance + * (including the first of course). + * + * Returns ENOSPC if ipfw_init_tables() fails, otherwise the result of + * ipfw_attach_hooks(1). The argument is unused. + */ +static int +vnet_ipfw_init(const void *unused) +{ + int error, first; + struct ip_fw *rule = NULL; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + + first = IS_DEFAULT_VNET(curvnet) ? 
1 : 0; + + /* First set up some values that are compile time options */ + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_fw_deny_unknown_exthdrs = 1; +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif +#ifdef IPFIREWALL_NAT + LIST_INIT(&chain->nat); +#endif + + ipfw_init_counters(); + /* Create the initial one-entry rule map and allocate the default rule */ + chain->n_rules = 1; + chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); + rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); + + /* + * Set initial number of tables. If table set-up fails, the map + * and the default rule are freed and ENOSPC is returned whatever + * error ipfw_init_tables() reported. + * NOTE(review): ipfw_init_counters() is not undone on this path -- + * confirm whether that is intended. + */ + V_fw_tables_max = default_fw_tables; + error = ipfw_init_tables(chain, first); + if (error) { + printf("ipfw2: setting up tables failed\n"); + free(chain->map, M_IPFW); + free(rule, M_IPFW); + return (ENOSPC); + } + + /* Fill in the default rule (a single accept or deny opcode) and insert it */ + rule->act_ofs = 0; + rule->rulenum = IPFW_DEFAULT_RULE; + rule->cmd_len = 1; + rule->set = RESVD_SET; + rule->cmd[0].len = 1; + rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; + chain->default_rule = chain->map[0] = rule; + chain->id = rule->id = 1; + /* Pre-calculate rules length for legacy dump format */ + chain->static_len = sizeof(struct ip_fw_rule0); + + IPFW_LOCK_INIT(chain); + ipfw_dyn_init(chain); +#ifdef LINEAR_SKIPTO + ipfw_init_skipto_cache(chain); +#endif + + /* Set-up above is complete: mark this instance as ready */ + V_ipfw_vnet_ready = 1; /* Open for business */ + + /* + * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. + * Even if the latter two fail we still keep the module alive + * because the sockopt and layer2 paths are still useful. + * ipfw[6]_hook return 0 on success, ENOENT on failure, + * so we can ignore the exact return value and just set a flag. + * + * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so + * changes in the underlying (per-vnet) variables trigger + * immediate hook()/unhook() calls. 
+ * In layer2 we have the same behaviour, except that V_ether_ipfw + * is checked on each packet because there are no pfil hooks. + */ + V_ip_fw_ctl_ptr = ipfw_ctl3; + error = ipfw_attach_hooks(1); + return (error); +} + +/* + * Called for the removal of each instance. + * Always returns 0; the argument is unused. + */ +static int +vnet_ipfw_uninit(const void *unused) +{ + struct ip_fw *reap; + struct ip_fw_chain *chain = &V_layer3_chain; + int i, last; + + V_ipfw_vnet_ready = 0; /* tell new callers to go away */ + /* + * Disconnect from ipv4, ipv6, layer2 and sockopt. + * Then grab, release and grab again the UH write lock + * (IPFW_UH_WLOCK) so we make sure the update is propagated + * and nobody is still inside. + */ + (void)ipfw_attach_hooks(0 /* detach */); + V_ip_fw_ctl_ptr = NULL; + + last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; + + IPFW_UH_WLOCK(chain); + IPFW_UH_WUNLOCK(chain); + IPFW_UH_WLOCK(chain); + + IPFW_WLOCK(chain); + ipfw_dyn_uninit(0); /* run the callout_drain */ + IPFW_WUNLOCK(chain); + + reap = NULL; + IPFW_WLOCK(chain); + for (i = 0; i < chain->n_rules; i++) + ipfw_reap_add(chain, &reap, chain->map[i]); + free(chain->map, M_IPFW); +#ifdef LINEAR_SKIPTO + ipfw_destroy_skipto_cache(chain); +#endif + IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); + ipfw_destroy_tables(chain, last); + if (reap != NULL) + ipfw_reap_rules(reap); + vnet_ipfw_iface_destroy(chain); + IPFW_LOCK_DESTROY(chain); + ipfw_dyn_uninit(1); /* free the remaining parts */ + ipfw_destroy_counters(); + return (0); +} + +/* + * Module event handler. + * In general we have the choice of handling most of these events by the + * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to + * use the SYSINIT handlers as they are more capable of expressing the + * flow of control during module and vnet operations, so this is just + * a skeleton. Note there is no SYSINIT equivalent of the module + * SHUTDOWN handler, but we don't have anything to do in that case anyhow. 
+ */ +static int +ipfw_modevent(module_t mod, int type, void *unused) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + /* Called once at module load or + * system boot if compiled in. Nothing to do here. */ + break; + case MOD_QUIESCE: + /* Called before unload. May veto unloading; we never do. */ + break; + case MOD_UNLOAD: + /* Called during unload. Nothing to do here. */ + break; + case MOD_SHUTDOWN: + /* Called during system shutdown. Nothing to do here. */ + break; + default: + err = EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipfwmod = { + "ipfw", + ipfw_modevent, + 0 +}; + +/* + * Define startup order: all three stages share one subsystem and are + * sequenced by consecutive order values. + */ +#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* Used by DECLARE_MODULE (ipfw_modevent). */ +#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later: SYSINIT of ipfw_init. */ +#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still: VNET_SYSINIT of vnet_ipfw_init. */ + +DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); +FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); +MODULE_VERSION(ipfw, 3); +/* should declare some dependencies here */ + +/* + * Starting up. Done in order after the module event handler + * (ipfw_modevent(), registered through ipfwmod) has been called. + * VNET_SYSINIT is also called for each existing vnet and each new vnet. + */ +SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_init, NULL); +VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_init, NULL); + +/* + * Closing up shop. These are done in REVERSE ORDER, but still + * after ipfwmod() has been called. Not called on reboot. + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. + * or when the module is unloaded. 
+ */ +SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_destroy, NULL); +VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_uninit, NULL); +/* end of file */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c new file mode 100644 index 0000000..ba6f579 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_dynamic.c @@ -0,0 +1,1604 @@ +/*- + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_dynamic.c 272840 2014-10-09 19:32:35Z melifaro $"); + +#define DEB(x) +#define DDB(x) x + +/* + * Dynamic rule support for ipfw + */ + +#include "opt_ipfw.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> /* ip_defttl */ +#include <netinet/ip_fw.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> + +#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */ +#ifdef INET6 +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> +#endif + +#include <netpfil/ipfw/ip_fw_private.h> + +#include <machine/in_cksum.h> /* XXX for in_cksum */ + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +/* + * Description of dynamic rules. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, its address fields are first masked + * with the mask defined for the rule, then hashed, then matched + * against the entries in the corresponding list. 
+ * Dynamic rules can be used for different purposes: + * + stateful rules; + * + enforcing limits on the number of sessions; + * + in-kernel NAT (not implemented yet). + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is equal to the UMA zone item count. + * The maximum number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create any more. This is + * done to avoid consuming too much memory, but also too much + * time when searching on each packet (ideally, we should try instead + * to put a limit on the length of the list on each bucket...). + * + * Each dynamic rule holds a pointer to the parent ipfw rule so + * we know what action to perform. Dynamic rules are removed when + * the parent rule is deleted. This can be changed by the dyn_keep_states + * sysctl. + * + * There are some limitations with dynamic rules -- we do not + * obey the 'randomized match', and we do not do multiple + * passes through the firewall. XXX check the latter!!! 
+ */ + +struct ipfw_dyn_bucket { + struct mtx mtx; /* Lock protecting this bucket */ + ipfw_dyn_rule *head; /* Pointer to first rule */ +}; + +/* + * Static variables followed by global ones. + * Everything declared with VNET_DEFINE below is reached through the + * matching V_* macro. + */ +static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v); +static VNET_DEFINE(u_int32_t, dyn_buckets_max); +static VNET_DEFINE(u_int32_t, curr_dyn_buckets); +static VNET_DEFINE(struct callout, ipfw_timeout); +#define V_ipfw_dyn_v VNET(ipfw_dyn_v) +#define V_dyn_buckets_max VNET(dyn_buckets_max) +#define V_curr_dyn_buckets VNET(curr_dyn_buckets) +#define V_ipfw_timeout VNET(ipfw_timeout) + +static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone); +#define V_ipfw_dyn_rule_zone VNET(ipfw_dyn_rule_zone) + +#define IPFW_BUCK_LOCK_INIT(b) \ + mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF) +#define IPFW_BUCK_LOCK_DESTROY(b) \ + mtx_destroy(&(b)->mtx) +#define IPFW_BUCK_LOCK(i) mtx_lock(&V_ipfw_dyn_v[(i)].mtx) +#define IPFW_BUCK_UNLOCK(i) mtx_unlock(&V_ipfw_dyn_v[(i)].mtx) +#define IPFW_BUCK_ASSERT(i) mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED) + + +static VNET_DEFINE(int, dyn_keep_states); +#define V_dyn_keep_states VNET(dyn_keep_states) + +/* + * Timeouts (in seconds) for various events in handling dynamic rules. + */ +static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); +static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); +static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); +static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); +static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); +static VNET_DEFINE(u_int32_t, dyn_short_lifetime); + +#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) +#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) +#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) +#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) +#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) +#define V_dyn_short_lifetime VNET(dyn_short_lifetime) + +/* + * Keepalives are sent if dyn_keepalive is set. 
They are sent every + * dyn_keepalive_period seconds, in the last dyn_keepalive_interval + * seconds of lifetime of a rule. + * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower + * than dyn_keepalive_period. + */ + +static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); +static VNET_DEFINE(u_int32_t, dyn_keepalive_period); +static VNET_DEFINE(u_int32_t, dyn_keepalive); +static VNET_DEFINE(time_t, dyn_keepalive_last); + +#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) +#define V_dyn_keepalive_period VNET(dyn_keepalive_period) +#define V_dyn_keepalive VNET(dyn_keepalive) +#define V_dyn_keepalive_last VNET(dyn_keepalive_last) + +static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ + +#define DYN_COUNT uma_zone_get_cur(V_ipfw_dyn_rule_zone) +#define V_dyn_max VNET(dyn_max) + +/* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */ +static int ipfw_dyn_count; /* number of objects */ + +#ifdef USERSPACE /* emulation of UMA object counters for userspace */ +#define uma_zone_get_cur(x) ipfw_dyn_count +#endif /* USERSPACE */ + +static int last_log; /* Log ratelimiting */ + +static void ipfw_dyn_tick(void *vnetx); +static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int); +#ifdef SYSCTL_NODE + +static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS); +static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS); + +SYSBEGIN(f2) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0, + "Max number of dyn. buckets"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, + "Current Number of dyn. buckets"); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, + CTLTYPE_UINT|CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU", + "Number of dyn. 
rules"); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, + CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU", + "Max number of dyn. rules"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, + "Lifetime of dyn. rules for acks"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, + "Lifetime of dyn. rules for syn"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, + "Lifetime of dyn. rules for fin"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, + "Lifetime of dyn. rules for rst"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, + "Lifetime of dyn. rules for UDP"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, + "Lifetime of dyn. rules for other situations"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, + CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, + "Enable keepalives for dyn. rules"); +SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, + CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, + "Do not flush dynamic states on rule deletion"); + +SYSEND + +#endif /* SYSCTL_NODE */ + + +#ifdef INET6 +static __inline int +hash_packet6(struct ipfw_flow_id *id) +{ + u_int32_t i; + i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->dst_port) ^ (id->src_port); + return i; +} +#endif + +/* + * IMPORTANT: the hash function for dynamic rules must be commutative + * in source and destination (ip,port), because rules are bidirectional + * and we want to find both in the same bucket. 
+ */ +static __inline int +hash_packet(struct ipfw_flow_id *id, int buckets) +{ + u_int32_t i; + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) + i = hash_packet6(id); + else +#endif /* INET6 */ + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); + i &= (buckets - 1); /* NOTE(review): a mask, so this presumably expects buckets to be a power of two -- confirm against callers */ + return i; +} + +/** + * Print customizable flow id description via log(9) facility. + * + * id: flow whose source/destination addresses and ports are printed; + * IPv4 addresses are passed through htonl() before formatting. + * dyn_type: printed as the "type" number. + * log_flags: passed as the first argument to log(). + * prefix, postfix: strings printed at the start and end of the message; + * the current DYN_COUNT is printed just before postfix. + */ +static void +print_dyn_rule_flags(struct ipfw_flow_id *id, int dyn_type, int log_flags, + char *prefix, char *postfix) +{ + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) { + ip6_sprintf(src, &id->src_ip6); + ip6_sprintf(dst, &id->dst_ip6); + } else +#endif + { + da.s_addr = htonl(id->src_ip); + inet_ntop(AF_INET, &da, src, sizeof(src)); + da.s_addr = htonl(id->dst_ip); + inet_ntop(AF_INET, &da, dst, sizeof(dst)); + } + log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n", + prefix, dyn_type, src, id->src_port, dst, + id->dst_port, DYN_COUNT, postfix); +} + +#define print_dyn_rule(id, dtype, prefix, postfix) \ + print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix) + +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define TIME_LE(a,b) ((int)((a)-(b)) < 0) + +/* + * Lookup a dynamic rule, locked version. + */ +static ipfw_dyn_rule * +lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction, + struct tcphdr *tcp) +{ + /* + * Stateful ipfw extensions. + * Lookup into dynamic session queue. 
+ */ +#define MATCH_REVERSE 0 +#define MATCH_FORWARD 1 +#define MATCH_NONE 2 +#define MATCH_UNKNOWN 3 + int dir = MATCH_NONE; + ipfw_dyn_rule *prev, *q = NULL; + + IPFW_BUCK_ASSERT(i); + + for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) { + if (q->dyn_type == O_LIMIT_PARENT && q->count) + continue; + + if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT) + continue; + + if (IS_IP6_FLOW_ID(pkt)) { + if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) && + IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port) { + dir = MATCH_FORWARD; + break; + } + if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) && + IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port) { + dir = MATCH_REVERSE; + break; + } + } else { + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port) { + dir = MATCH_FORWARD; + break; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port) { + dir = MATCH_REVERSE; + break; + } + } + } + if (q == NULL) + goto done; /* q = NULL, not found */ + + if (prev != NULL) { /* found and not in front */ + prev->next = q->next; + q->next = V_ipfw_dyn_v[i].head; + V_ipfw_dyn_v[i].head = q; + } + if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ + uint32_t ack; + u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST); + +#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) +#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) +#define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8)) +#define ACK_FWD 0x10000 /* fwd ack seen */ +#define ACK_REV 0x20000 /* rev ack seen */ + + q->state |= (dir == MATCH_FORWARD) ? 
flags : (flags << 8); + switch (q->state & TCP_FLAGS) { + case TH_SYN: /* opening */ + q->expire = time_uptime + V_dyn_syn_lifetime; + break; + + case BOTH_SYN: /* move to established */ + case BOTH_SYN | TH_FIN: /* one side tries to close */ + case BOTH_SYN | (TH_FIN << 8): +#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) + if (tcp == NULL) + break; + + ack = ntohl(tcp->th_ack); + if (dir == MATCH_FORWARD) { + if (q->ack_fwd == 0 || + _SEQ_GE(ack, q->ack_fwd)) { + q->ack_fwd = ack; + q->state |= ACK_FWD; + } + } else { + if (q->ack_rev == 0 || + _SEQ_GE(ack, q->ack_rev)) { + q->ack_rev = ack; + q->state |= ACK_REV; + } + } + if ((q->state & (ACK_FWD | ACK_REV)) == + (ACK_FWD | ACK_REV)) { + q->expire = time_uptime + V_dyn_ack_lifetime; + q->state &= ~(ACK_FWD | ACK_REV); + } + break; + + case BOTH_SYN | BOTH_FIN: /* both sides closed */ + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) + V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_fin_lifetime; + break; + + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. 
+			 */
+			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+				printf("invalid state: 0x%x\n", q->state);
+#endif
+			if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+				V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_rst_lifetime;
+			break;
+		}
+	} else if (pkt->proto == IPPROTO_UDP) {
+		q->expire = time_uptime + V_dyn_udp_lifetime;
+	} else {
+		/* other protocols */
+		q->expire = time_uptime + V_dyn_short_lifetime;
+	}
+done:
+	if (match_direction != NULL)
+		*match_direction = dir;
+	return (q);
+}
+
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	ipfw_dyn_rule *q;
+	int i;
+
+	i = hash_packet(pkt, V_curr_dyn_buckets);
+
+	IPFW_BUCK_LOCK(i);
+	q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp);
+	if (q == NULL)
+		IPFW_BUCK_UNLOCK(i);
+	/* NB: return table locked when q is not NULL */
+	return q;
+}
+
+/*
+ * Unlock bucket mtx
+ * @q - pointer to dynamic rule
+ */
+void
+ipfw_dyn_unlock(ipfw_dyn_rule *q)
+{
+
+	IPFW_BUCK_UNLOCK(q->bucket);
+}
+
+/*
+ * Replace the dynamic-state hash with a freshly allocated one of
+ * @nbuckets buckets and re-link every existing state into it.
+ *
+ * @chain    - rules chain whose UH and main write locks are taken
+ *             around the switch-over
+ * @nbuckets - requested bucket count; must be a positive power of 2
+ *             not greater than 65536
+ *
+ * Returns 0 on success, 1 if @nbuckets is rejected.
+ * On the very first call (no previous table) the periodic
+ * ipfw_dyn_tick callout is armed instead of freeing an old table.
+ */
+static int
+resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets)
+{
+	int i, k, nbuckets_old;
+	ipfw_dyn_rule *q;
+	struct ipfw_dyn_bucket *dyn_v, *dyn_v_old;
+
+	/*
+	 * Check if given number is a positive power of 2 and not
+	 * greater than 64k.  Zero would pass powerof2() but yields an
+	 * empty table and a (buckets - 1) mask of -1 in hash_packet().
+	 */
+	if ((nbuckets <= 0) || (nbuckets > 65536) || (!powerof2(nbuckets)))
+		return 1;
+
+	CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__,
+	    V_curr_dyn_buckets, nbuckets);
+
+	/*
+	 * Allocate and initialize new hash.
+	 * The array holds buckets, not rules, so size it by its own
+	 * element type.
+	 */
+	dyn_v = malloc(nbuckets * sizeof(*dyn_v), M_IPFW,
+	    M_WAITOK | M_ZERO);
+
+	for (i = 0 ; i < nbuckets; i++)
+		IPFW_BUCK_LOCK_INIT(&dyn_v[i]);
+
+	/*
+	 * Call upper half lock, as get_map() do to ease
+	 * read-only access to dynamic rules hash from sysctl
+	 */
+	IPFW_UH_WLOCK(chain);
+
+	/*
+	 * Acquire chain write lock to permit hash access
+	 * for main traffic path without additional locks
+	 */
+	IPFW_WLOCK(chain);
+
+	/* Save old values */
+	nbuckets_old = V_curr_dyn_buckets;
+	dyn_v_old = V_ipfw_dyn_v;
+
+	/* Skip relinking if array is not set up */
+	if (V_ipfw_dyn_v == NULL)
+		V_curr_dyn_buckets = 0;
+
+	/* Re-link all dynamic states */
+	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+		while (V_ipfw_dyn_v[i].head != NULL) {
+			/* Remove from current chain */
+			q = V_ipfw_dyn_v[i].head;
+			V_ipfw_dyn_v[i].head = q->next;
+
+			/* Get new hash value */
+			k = hash_packet(&q->id, nbuckets);
+			q->bucket = k;
+			/* Add to the new head */
+			q->next = dyn_v[k].head;
+			dyn_v[k].head = q;
+		}
+	}
+
+	/* Update current pointers/buckets values */
+	V_curr_dyn_buckets = nbuckets;
+	V_ipfw_dyn_v = dyn_v;
+
+	IPFW_WUNLOCK(chain);
+
+	IPFW_UH_WUNLOCK(chain);
+
+	/* Start periodic callout on initial creation */
+	if (dyn_v_old == NULL) {
+		callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0);
+		return (0);
+	}
+
+	/* Destroy all mutexes */
+	for (i = 0 ; i < nbuckets_old ; i++)
+		IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]);
+
+	/* Free old hash */
+	free(dyn_v_old, M_IPFW);
+
+	return 0;
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains two type of rules:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with limited number of sess per user
+ *   (O_LIMIT). When they are created, the parent is
+ *   increased by 1, and decreased on delete. In this case,
+ *   the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, int i, u_int8_t dyn_type, struct ip_fw *rule)
+{
+	ipfw_dyn_rule *r;
+
+	IPFW_BUCK_ASSERT(i);
+
+	r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+	if (r == NULL) {
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			log(LOG_DEBUG, "ipfw: %s: Cannot allocate rule\n",
+			    __func__);
+		}
+		return NULL;
+	}
+	ipfw_dyn_count++;
+
+	/*
+	 * refcount on parent is already incremented, so
+	 * it is safe to use parent unlocked.
+ */ + if (dyn_type == O_LIMIT) { + ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; + if ( parent->dyn_type != O_LIMIT_PARENT) + panic("invalid parent"); + r->parent = parent; + rule = parent->rule; + } + + r->id = *id; + r->expire = time_uptime + V_dyn_syn_lifetime; + r->rule = rule; + r->dyn_type = dyn_type; + IPFW_ZERO_DYN_COUNTER(r); + r->count = 0; + + r->bucket = i; + r->next = V_ipfw_dyn_v[i].head; + V_ipfw_dyn_v[i].head = r; + DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");) + return r; +} + +/** + * lookup dynamic parent rule using pkt and rule as search keys. + * If the lookup fails, then install one. + */ +static ipfw_dyn_rule * +lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule) +{ + ipfw_dyn_rule *q; + int i, is_v6; + + is_v6 = IS_IP6_FLOW_ID(pkt); + i = hash_packet( pkt, V_curr_dyn_buckets ); + *pindex = i; + IPFW_BUCK_LOCK(i); + for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next) + if (q->dyn_type == O_LIMIT_PARENT && + rule== q->rule && + pkt->proto == q->id.proto && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port && + ( + (is_v6 && + IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.src_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.dst_ip6))) || + (!is_v6 && + pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip) + ) + ) { + q->expire = time_uptime + V_dyn_short_lifetime; + DEB(print_dyn_rule(pkt, q->dyn_type, + "lookup_dyn_parent found", "");) + return q; + } + + /* Add virtual limiting rule */ + return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule); +} + +/** + * Install dynamic state for rule type cmd->o.opcode + * + * Returns 1 (failure) if state is not installed because of errors or because + * session limitations are enforced. 
+ */
+int
+ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
+    ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg)
+{
+	ipfw_dyn_rule *q;
+	int i;
+
+	DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state", "");)
+
+	i = hash_packet(&args->f_id, V_curr_dyn_buckets);
+
+	IPFW_BUCK_LOCK(i);
+
+	q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL);
+
+	if (q != NULL) {	/* should never occur */
+		DEB(
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: entry already present, done\n",
+			    __func__);
+		})
+		IPFW_BUCK_UNLOCK(i);
+		return (0);
+	}
+
+	/*
+	 * State limiting is done via uma(9) zone limiting.
+	 * Save pointer to newly-installed rule and reject
+	 * packet if add_dyn_rule() returned NULL.
+	 * Note q is currently set to NULL.
+	 */
+
+	switch (cmd->o.opcode) {
+	case O_KEEP_STATE:	/* bidir rule */
+		q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule);
+		break;
+
+	case O_LIMIT: {		/* limit number of sessions */
+		struct ipfw_flow_id id;
+		ipfw_dyn_rule *parent;
+		uint32_t conn_limit;
+		uint16_t limit_mask = cmd->limit_mask;
+		int pindex;
+		char sbuf[24];
+
+		conn_limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
+
+		DEB(
+		if (cmd->conn_limit == IP_FW_TARG)
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+			    "(tablearg)\n", __func__, conn_limit);
+		else
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+			    __func__, conn_limit);
+		)
+
+		/*
+		 * Build the key used to find the parent state.
+		 * Clear the whole structure first: only the fields
+		 * selected by limit_mask are copied below, and
+		 * lookup_dyn_parent() hashes and compares the key, so
+		 * every field that is not copied (the IPv6 addresses
+		 * included) has to be zero rather than whatever was
+		 * on the stack.
+		 */
+		bzero(&id, sizeof(id));
+		id.proto = args->f_id.proto;
+		id.addr_type = args->f_id.addr_type;
+		id.fib = M_GETFIB(args->m);
+
+		if (IS_IP6_FLOW_ID (&(args->f_id))) {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip6 = args->f_id.src_ip6;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip6 = args->f_id.dst_ip6;
+		} else {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip = args->f_id.src_ip;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip = args->f_id.dst_ip;
+		}
+		if (limit_mask & DYN_SRC_PORT)
+			id.src_port = args->f_id.src_port;
+		if (limit_mask & DYN_DST_PORT)
+			id.dst_port = args->f_id.dst_port;
+
+		/*
+		 * We have to release lock for previous bucket to
+		 * avoid possible deadlock
+		 */
+		IPFW_BUCK_UNLOCK(i);
+
+		/*
+		 * lookup_dyn_parent() locks the parent's bucket and
+		 * returns with it held (index stored in pindex), even
+		 * when it fails to create the parent.
+		 */
+		if ((parent = lookup_dyn_parent(&id, &pindex, rule)) == NULL) {
+			printf("ipfw: %s: add parent failed\n", __func__);
+			IPFW_BUCK_UNLOCK(pindex);
+			return (1);
+		}
+
+		if (parent->count >= conn_limit) {
+			/* Log at most once per time_uptime tick */
+			if (V_fw_verbose && last_log != time_uptime) {
+				last_log = time_uptime;
+				snprintf(sbuf, sizeof(sbuf),
+				    "%d drop session",
+				    parent->rule->rulenum);
+				print_dyn_rule_flags(&args->f_id,
+				    cmd->o.opcode,
+				    LOG_SECURITY | LOG_DEBUG,
+				    sbuf, "too many entries");
+			}
+			IPFW_BUCK_UNLOCK(pindex);
+			return (1);
+		}
+		/* Increment counter on parent */
+		parent->count++;
+		IPFW_BUCK_UNLOCK(pindex);
+
+		IPFW_BUCK_LOCK(i);
+		q = add_dyn_rule(&args->f_id, i, O_LIMIT, (struct ip_fw *)parent);
+		if (q == NULL) {
+			/* Undo the parent counter bump and notify caller */
+			IPFW_BUCK_UNLOCK(i);
+			IPFW_BUCK_LOCK(pindex);
+			parent->count--;
+			IPFW_BUCK_UNLOCK(pindex);
+			return (1);
+		}
+		break;
+	}
+	default:
+		/* q stays NULL, so the failure path below is taken */
+		printf("ipfw: %s: unknown dynamic rule type %u\n",
+		    __func__, cmd->o.opcode);
+		break;
+	}
+
+	if (q == NULL) {
+		IPFW_BUCK_UNLOCK(i);
+		return (1);	/* Notify caller about failure */
+	}
+
+	/* XXX just set lifetime */
+	lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL);
+
+	IPFW_BUCK_UNLOCK(i);
+	return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, and flags & TH_
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ */ +struct mbuf * +ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, + u_int32_t ack, int flags) +{ + struct mbuf *m = NULL; /* stupid compiler */ + int len, dir; + struct ip *h = NULL; /* stupid compiler */ +#ifdef INET6 + struct ip6_hdr *h6 = NULL; +#endif + struct tcphdr *th = NULL; + + MGETHDR(m, M_NOWAIT, MT_DATA); + if (m == NULL) + return (NULL); + + M_SETFIB(m, id->fib); +#ifdef MAC + if (replyto != NULL) + mac_netinet_firewall_reply(replyto, m); + else + mac_netinet_firewall_send(m); +#else + (void)replyto; /* don't warn about unused arg */ +#endif + + switch (id->addr_type) { + case 4: + len = sizeof(struct ip) + sizeof(struct tcphdr); + break; +#ifdef INET6 + case 6: + len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + break; +#endif + default: + /* XXX: log me?!? */ + FREE_PKT(m); + return (NULL); + } + dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); + + m->m_data += max_linkhdr; + m->m_flags |= M_SKIP_FIREWALL; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + + switch (id->addr_type) { + case 4: + h = mtod(m, struct ip *); + + /* prepare for checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(sizeof(struct tcphdr)); + if (dir) { + h->ip_src.s_addr = htonl(id->src_ip); + h->ip_dst.s_addr = htonl(id->dst_ip); + } else { + h->ip_src.s_addr = htonl(id->dst_ip); + h->ip_dst.s_addr = htonl(id->src_ip); + } + + th = (struct tcphdr *)(h + 1); + break; +#ifdef INET6 + case 6: + h6 = mtod(m, struct ip6_hdr *); + + /* prepare for checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(sizeof(struct tcphdr)); + if (dir) { + h6->ip6_src = id->src_ip6; + h6->ip6_dst = id->dst_ip6; + } else { + h6->ip6_src = id->dst_ip6; + h6->ip6_dst = id->src_ip6; + } + + th = (struct tcphdr *)(h6 + 1); + break; +#endif + } + + if (dir) { + th->th_sport = htons(id->src_port); + th->th_dport = htons(id->dst_port); + } else { + th->th_sport = htons(id->dst_port); + th->th_dport = 
htons(id->src_port); + } + th->th_off = sizeof(struct tcphdr) >> 2; + + if (flags & TH_RST) { + if (flags & TH_ACK) { + th->th_seq = htonl(ack); + th->th_flags = TH_RST; + } else { + if (flags & TH_SYN) + seq++; + th->th_ack = htonl(seq); + th->th_flags = TH_RST | TH_ACK; + } + } else { + /* + * Keepalive - use caller provided sequence numbers + */ + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_flags = TH_ACK; + } + + switch (id->addr_type) { + case 4: + th->th_sum = in_cksum(m, len); + + /* finish the ip header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; + h->ip_off = htons(0); + h->ip_len = htons(len); + h->ip_ttl = V_ip_defttl; + h->ip_sum = 0; + break; +#ifdef INET6 + case 6: + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), + sizeof(struct tcphdr)); + + /* finish the ip6 header */ + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + break; +#endif + } + + return (m); +} + +/* + * Queue keepalive packets for given dynamic rule + */ +static struct mbuf ** +ipfw_dyn_send_ka(struct mbuf **mtailp, ipfw_dyn_rule *q) +{ + struct mbuf *m_rev, *m_fwd; + + m_rev = (q->state & ACK_REV) ? NULL : + ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); + m_fwd = (q->state & ACK_FWD) ? NULL : + ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); + + if (m_rev != NULL) { + *mtailp = m_rev; + mtailp = &(*mtailp)->m_nextpkt; + } + if (m_fwd != NULL) { + *mtailp = m_fwd; + mtailp = &(*mtailp)->m_nextpkt; + } + + return (mtailp); +} + +/* + * This procedure is used to perform various maintance + * on dynamic hash list. Currently it is called every second. 
+ */ +static void +ipfw_dyn_tick(void * vnetx) +{ + struct ip_fw_chain *chain; + int check_ka = 0; +#ifdef VIMAGE + struct vnet *vp = vnetx; +#endif + + CURVNET_SET(vp); + + chain = &V_layer3_chain; + + /* Run keepalive checks every keepalive_period iff ka is enabled */ + if ((V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) && + (V_dyn_keepalive != 0)) { + V_dyn_keepalive_last = time_uptime; + check_ka = 1; + } + + check_dyn_rules(chain, NULL, check_ka, 1); + + callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0); + + CURVNET_RESTORE(); +} + + +/* + * Walk thru all dynamic states doing generic maintance: + * 1) free expired states + * 2) free all states based on deleted rule / set + * 3) send keepalives for states if needed + * + * @chain - pointer to current ipfw rules chain + * @rule - delete all states originated by given rule if != NULL + * @set - delete all states originated by any rule in set @set if != RESVD_SET + * @check_ka - perform checking/sending keepalives + * @timer - indicate call from timer routine. + * + * Timer routine must call this function unlocked to permit + * sending keepalives/resizing table. + * + * Others has to call function with IPFW_UH_WLOCK held. + * Additionally, function assume that dynamic rule/set is + * ALREADY deleted so no new states can be generated by + * 'deleted' rules. 
+ * + * Write lock is needed to ensure that unused parent rules + * are not freed by other instance (see stage 2, 3) + */ +static void +check_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt, + int check_ka, int timer) +{ + struct mbuf *m0, *m, *mnext, **mtailp; + struct ip *h; + int i, dyn_count, new_buckets = 0, max_buckets; + int expired = 0, expired_limits = 0, parents = 0, total = 0; + ipfw_dyn_rule *q, *q_prev, *q_next; + ipfw_dyn_rule *exp_head, **exptailp; + ipfw_dyn_rule *exp_lhead, **expltailp; + + KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated", + __func__)); + + /* Avoid possible LOR */ + KASSERT(!check_ka || timer, ("%s: keepalive check with lock held", + __func__)); + + /* + * Do not perform any checks if we currently have no dynamic states + */ + if (DYN_COUNT == 0) + return; + + /* Expired states */ + exp_head = NULL; + exptailp = &exp_head; + + /* Expired limit states */ + exp_lhead = NULL; + expltailp = &exp_lhead; + + /* + * We make a chain of packets to go out here -- not deferring + * until after we drop the IPFW dynamic rule lock would result + * in a lock order reversal with the normal packet input -> ipfw + * call stack. 
+ */ + m0 = NULL; + mtailp = &m0; + + /* Protect from hash resizing */ + if (timer != 0) + IPFW_UH_WLOCK(chain); + else + IPFW_UH_WLOCK_ASSERT(chain); + +#define NEXT_RULE() { q_prev = q; q = q->next ; continue; } + + /* Stage 1: perform requested deletion */ + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + IPFW_BUCK_LOCK(i); + for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) { + /* account every rule */ + total++; + + /* Skip parent rules at all */ + if (q->dyn_type == O_LIMIT_PARENT) { + parents++; + NEXT_RULE(); + } + + /* + * Remove rules which are: + * 1) expired + * 2) matches deletion range + */ + if ((TIME_LEQ(q->expire, time_uptime)) || + (rt != NULL && ipfw_match_range(q->rule, rt))) { + if (TIME_LE(time_uptime, q->expire) && + q->dyn_type == O_KEEP_STATE && + V_dyn_keep_states != 0) { + /* + * Do not delete state if + * it is not expired and + * dyn_keep_states is ON. + * However we need to re-link it + * to any other stable rule + */ + q->rule = chain->default_rule; + NEXT_RULE(); + } + + /* Unlink q from current list */ + q_next = q->next; + if (q == V_ipfw_dyn_v[i].head) + V_ipfw_dyn_v[i].head = q_next; + else + q_prev->next = q_next; + + q->next = NULL; + + /* queue q to expire list */ + if (q->dyn_type != O_LIMIT) { + *exptailp = q; + exptailp = &(*exptailp)->next; + DEB(print_dyn_rule(&q->id, q->dyn_type, + "unlink entry", "left"); + ) + } else { + /* Separate list for limit rules */ + *expltailp = q; + expltailp = &(*expltailp)->next; + expired_limits++; + DEB(print_dyn_rule(&q->id, q->dyn_type, + "unlink limit entry", "left"); + ) + } + + q = q_next; + expired++; + continue; + } + + /* + * Check if we need to send keepalive: + * we need to ensure if is time to do KA, + * this is established TCP session, and + * expire time is within keepalive interval + */ + if ((check_ka != 0) && (q->id.proto == IPPROTO_TCP) && + ((q->state & BOTH_SYN) == BOTH_SYN) && + (TIME_LEQ(q->expire, time_uptime + + V_dyn_keepalive_interval))) + mtailp = 
ipfw_dyn_send_ka(mtailp, q); + + NEXT_RULE(); + } + IPFW_BUCK_UNLOCK(i); + } + + /* Stage 2: decrement counters from O_LIMIT parents */ + if (expired_limits != 0) { + /* + * XXX: Note that deleting set with more than one + * heavily-used LIMIT rules can result in overwhelming + * locking due to lack of per-hash value sorting + * + * We should probably think about: + * 1) pre-allocating hash of size, say, + * MAX(16, V_curr_dyn_buckets / 1024) + * 2) checking if expired_limits is large enough + * 3) If yes, init hash (or its part), re-link + * current list and start decrementing procedure in + * each bucket separately + */ + + /* + * Small optimization: do not unlock bucket until + * we see the next item resides in different bucket + */ + if (exp_lhead != NULL) { + i = exp_lhead->parent->bucket; + IPFW_BUCK_LOCK(i); + } + for (q = exp_lhead; q != NULL; q = q->next) { + if (i != q->parent->bucket) { + IPFW_BUCK_UNLOCK(i); + i = q->parent->bucket; + IPFW_BUCK_LOCK(i); + } + + /* Decrease parent refcount */ + q->parent->count--; + } + if (exp_lhead != NULL) + IPFW_BUCK_UNLOCK(i); + } + + /* + * We protectet ourselves from unused parent deletion + * (from the timer function) by holding UH write lock. 
+ */ + + /* Stage 3: remove unused parent rules */ + if ((parents != 0) && (expired != 0)) { + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + IPFW_BUCK_LOCK(i); + for (q = V_ipfw_dyn_v[i].head, q_prev = q ; q ; ) { + if (q->dyn_type != O_LIMIT_PARENT) + NEXT_RULE(); + + if (q->count != 0) + NEXT_RULE(); + + /* Parent rule without consumers */ + + /* Unlink q from current list */ + q_next = q->next; + if (q == V_ipfw_dyn_v[i].head) + V_ipfw_dyn_v[i].head = q_next; + else + q_prev->next = q_next; + + q->next = NULL; + + /* Add to expired list */ + *exptailp = q; + exptailp = &(*exptailp)->next; + + DEB(print_dyn_rule(&q->id, q->dyn_type, + "unlink parent entry", "left"); + ) + + expired++; + + q = q_next; + } + IPFW_BUCK_UNLOCK(i); + } + } + +#undef NEXT_RULE + + if (timer != 0) { + /* + * Check if we need to resize hash: + * if current number of states exceeds number of buckes in hash, + * grow hash size to the minimum power of 2 which is bigger than + * current states count. Limit hash size by 64k. + */ + max_buckets = (V_dyn_buckets_max > 65536) ? 
+ 65536 : V_dyn_buckets_max; + + dyn_count = DYN_COUNT; + + if ((dyn_count > V_curr_dyn_buckets * 2) && + (dyn_count < max_buckets)) { + new_buckets = V_curr_dyn_buckets; + while (new_buckets < dyn_count) { + new_buckets *= 2; + + if (new_buckets >= max_buckets) + break; + } + } + + IPFW_UH_WUNLOCK(chain); + } + + /* Finally delete old states ad limits if any */ + for (q = exp_head; q != NULL; q = q_next) { + q_next = q->next; + uma_zfree(V_ipfw_dyn_rule_zone, q); + ipfw_dyn_count--; + } + + for (q = exp_lhead; q != NULL; q = q_next) { + q_next = q->next; + uma_zfree(V_ipfw_dyn_rule_zone, q); + ipfw_dyn_count--; + } + + /* + * The rest code MUST be called from timer routine only + * without holding any locks + */ + if (timer == 0) + return; + + /* Send keepalive packets if any */ + for (m = m0; m != NULL; m = mnext) { + mnext = m->m_nextpkt; + m->m_nextpkt = NULL; + h = mtod(m, struct ip *); + if (h->ip_v == 4) + ip_output(m, NULL, NULL, 0, NULL, NULL); +#ifdef INET6 + else + ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); +#endif + } + + /* Run table resize without holding any locks */ + if (new_buckets != 0) + resize_dynamic_table(chain, new_buckets); +} + +/* + * Deletes all dynamic rules originated by given rule or all rules in + * given set. Specify RESVD_SET to indicate set should not be used. + * @chain - pointer to current ipfw rules chain + * @rr - delete all states originated by rules in matched range. + * + * Function has to be called with IPFW_UH_WLOCK held. + * Additionally, function assume that dynamic rule/set is + * ALREADY deleted so no new states can be generated by + * 'deleted' rules. + */ +void +ipfw_expire_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt) +{ + + check_dyn_rules(chain, rt, 0, 0); +} + +/* + * Check if rule contains at least one dynamic opcode. + * + * Returns 1 if such opcode is found, 0 otherwise. 
+ */ +int +ipfw_is_dyn_rule(struct ip_fw *rule) +{ + int cmdlen, l; + ipfw_insn *cmd; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + switch (cmd->opcode) { + case O_LIMIT: + case O_KEEP_STATE: + case O_PROBE_STATE: + case O_CHECK_STATE: + return (1); + } + } + + return (0); +} + +void +ipfw_dyn_init(struct ip_fw_chain *chain) +{ + + V_ipfw_dyn_v = NULL; + V_dyn_buckets_max = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* must be power of 2 */ + + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + V_dyn_keepalive_last = time_uptime; + + V_dyn_max = 4096; /* max # of dynamic rules */ + + V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + + /* Enforce limit on dynamic rules */ + uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); + + callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); + + /* + * This can potentially be done on first dynamic rule + * being added to chain. + */ + resize_dynamic_table(chain, V_curr_dyn_buckets); +} + +void +ipfw_dyn_uninit(int pass) +{ + int i; + + if (pass == 0) { + callout_drain(&V_ipfw_timeout); + return; + } + + if (V_ipfw_dyn_v != NULL) { + /* + * Skip deleting all dynamic states - + * uma_zdestroy() does this more efficiently; + */ + + /* Destroy all mutexes */ + for (i = 0 ; i < V_curr_dyn_buckets ; i++) + IPFW_BUCK_LOCK_DESTROY(&V_ipfw_dyn_v[i]); + free(V_ipfw_dyn_v, M_IPFW); + V_ipfw_dyn_v = NULL; + } + + uma_zdestroy(V_ipfw_dyn_rule_zone); +} + +#ifdef SYSCTL_NODE +/* + * Get/set maximum number of dynamic states in given VNET instance. 
+ */ +static int +sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int nstates; + + nstates = V_dyn_max; + + error = sysctl_handle_int(oidp, &nstates, 0, req); + /* Read operation or some error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + V_dyn_max = nstates; + uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); + + return (0); +} + +/* + * Get current number of dynamic states in given VNET instance. + */ +static int +sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS) +{ + int error; + unsigned int nstates; + + nstates = DYN_COUNT; + + error = sysctl_handle_int(oidp, &nstates, 0, req); + + return (error); +} +#endif + +/* + * Returns size of dynamic states in legacy format + */ +int +ipfw_dyn_len(void) +{ + + return (V_ipfw_dyn_v == NULL) ? 0 : + (DYN_COUNT * sizeof(ipfw_dyn_rule)); +} + +/* + * Returns number of dynamic states. + * Used by dump format v1 (current). + */ +int +ipfw_dyn_get_count(void) +{ + + return (V_ipfw_dyn_v == NULL) ? 0 : DYN_COUNT; +} + +static void +export_dyn_rule(ipfw_dyn_rule *src, ipfw_dyn_rule *dst) +{ + + memcpy(dst, src, sizeof(*src)); + memcpy(&(dst->rule), &(src->rule->rulenum), sizeof(src->rule->rulenum)); + /* + * store set number into high word of + * dst->rule pointer. + */ + memcpy((char *)&dst->rule + sizeof(src->rule->rulenum), + &(src->rule->set), sizeof(src->rule->set)); + /* + * store a non-null value in "next". + * The userland code will interpret a + * NULL here as a marker + * for the last dynamic rule. + */ + memcpy(&dst->next, &dst, sizeof(dst)); + dst->expire = + TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime; +} + +/* + * Fills int buffer given by @sd with dynamic states. + * Used by dump format v1 (current). + * + * Returns 0 on success. 
+ */ +int +ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd) +{ + ipfw_dyn_rule *p; + ipfw_obj_dyntlv *dst, *last; + ipfw_obj_ctlv *ctlv; + int i; + size_t sz; + + if (V_ipfw_dyn_v == NULL) + return (0); + + IPFW_UH_RLOCK_ASSERT(chain); + + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); + if (ctlv == NULL) + return (ENOMEM); + sz = sizeof(ipfw_obj_dyntlv); + ctlv->head.type = IPFW_TLV_DYNSTATE_LIST; + ctlv->objsize = sz; + last = NULL; + + for (i = 0 ; i < V_curr_dyn_buckets; i++) { + IPFW_BUCK_LOCK(i); + for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) { + dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, sz); + if (dst == NULL) { + IPFW_BUCK_UNLOCK(i); + return (ENOMEM); + } + + export_dyn_rule(p, &dst->state); + dst->head.length = sz; + dst->head.type = IPFW_TLV_DYN_ENT; + last = dst; + } + IPFW_BUCK_UNLOCK(i); + } + + if (last != NULL) /* mark last dynamic rule */ + last->head.flags = IPFW_DF_LAST; + + return (0); +} + +/* + * Fill given buffer with dynamic states (legacy format). + * IPFW_UH_RLOCK has to be held while calling. 
+ */ +void +ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep) +{ + ipfw_dyn_rule *p, *last = NULL; + char *bp; + int i; + + if (V_ipfw_dyn_v == NULL) + return; + bp = *pbp; + + IPFW_UH_RLOCK_ASSERT(chain); + + for (i = 0 ; i < V_curr_dyn_buckets; i++) { + IPFW_BUCK_LOCK(i); + for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) { + if (bp + sizeof *p <= ep) { + ipfw_dyn_rule *dst = + (ipfw_dyn_rule *)bp; + + export_dyn_rule(p, dst); + last = dst; + bp += sizeof(ipfw_dyn_rule); + } + } + IPFW_BUCK_UNLOCK(i); + } + + if (last != NULL) /* mark last dynamic rule */ + bzero(&last->next, sizeof(last)); + *pbp = bp; +} +/* end of file */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c new file mode 100644 index 0000000..7e9c992 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_iface.c @@ -0,0 +1,537 @@ +/*- + * Copyright (c) 2014 Yandex LLC. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: projects/ipfw/sys/netpfil/ipfw/ip_fw_iface.c 267384 2014-06-12 09:59:11Z melifaro $"); + +/* + * Kernel interface tracking API. + * + */ + +#include "opt_ipfw.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/eventhandler.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> + +#define CHAIN_TO_II(ch) ((struct namedobj_instance *)ch->ifcfg) + +#define DEFAULT_IFACES 128 + +static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex); +static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex); +static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_XIFLIST, 0, HDIR_GET, list_ifaces }, +}; + +/* + * FreeBSD Kernel interface. 
+ */ +static void ipfw_kifhandler(void *arg, struct ifnet *ifp); +static int ipfw_kiflookup(char *name); +static void iface_khandler_register(void); +static void iface_khandler_deregister(void); + +static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event; +static int num_vnets = 0; +static struct mtx vnet_mtx; + +/* + * Checks if kernel interface is contained in our tracked + * interface list and calls attach/detach handler. + */ +static void +ipfw_kifhandler(void *arg, struct ifnet *ifp) +{ + struct ip_fw_chain *ch; + struct ipfw_iface *iif; + struct namedobj_instance *ii; + uintptr_t htype; + + if (V_ipfw_vnet_ready == 0) + return; + + ch = &V_layer3_chain; + htype = (uintptr_t)arg; + + IPFW_UH_WLOCK(ch); + ii = CHAIN_TO_II(ch); + if (ii == NULL) { + IPFW_UH_WUNLOCK(ch); + return; + } + iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0, + if_name(ifp)); + if (iif != NULL) { + if (htype == 1) + handle_ifattach(ch, iif, ifp->if_index); + else + handle_ifdetach(ch, iif, ifp->if_index); + } + IPFW_UH_WUNLOCK(ch); +} + +/* + * Reference current VNET as iface tracking API user. + * Registers interface tracking handlers for first VNET. + */ +static void +iface_khandler_register() +{ + int create; + + create = 0; + + mtx_lock(&vnet_mtx); + if (num_vnets == 0) + create = 1; + num_vnets++; + mtx_unlock(&vnet_mtx); + + if (create == 0) + return; + + printf("IPFW: starting up interface tracker\n"); + + ipfw_ifdetach_event = EVENTHANDLER_REGISTER( + ifnet_departure_event, ipfw_kifhandler, NULL, + EVENTHANDLER_PRI_ANY); + ipfw_ifattach_event = EVENTHANDLER_REGISTER( + ifnet_arrival_event, ipfw_kifhandler, (void*)((uintptr_t)1), + EVENTHANDLER_PRI_ANY); +} + +/* + * + * Detach interface event handlers on last VNET instance + * detach. 
+ */ +static void +iface_khandler_deregister() +{ + int destroy; + + destroy = 0; + mtx_lock(&vnet_mtx); + if (num_vnets == 1) + destroy = 1; + num_vnets--; + mtx_unlock(&vnet_mtx); + + if (destroy == 0) + return; + + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, + ipfw_ifattach_event); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + ipfw_ifdetach_event); +} + +/* + * Retrieves ifindex for given @name. + * + * Returns ifindex or 0. + */ +static int +ipfw_kiflookup(char *name) +{ + struct ifnet *ifp; + int ifindex; + + ifindex = 0; + + if ((ifp = ifunit_ref(name)) != NULL) { + ifindex = ifp->if_index; + if_rele(ifp); + } + + return (ifindex); +} + +/* + * Global ipfw startup hook. + * Since we perform lazy initialization, do nothing except + * mutex init. + */ +int +ipfw_iface_init() +{ + + mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF); + IPFW_ADD_SOPT_HANDLER(1, scodes); + return (0); +} + +/* + * Global ipfw destroy hook. + * Unregister khandlers iff init has been done. + */ +void +ipfw_iface_destroy() +{ + + IPFW_DEL_SOPT_HANDLER(1, scodes); + mtx_destroy(&vnet_mtx); +} + +/* + * Perform actual init on internal request. + * Inits both namehash and global khandler. + */ +static void +vnet_ipfw_iface_init(struct ip_fw_chain *ch) +{ + struct namedobj_instance *ii; + + ii = ipfw_objhash_create(DEFAULT_IFACES); + IPFW_UH_WLOCK(ch); + if (ch->ifcfg == NULL) { + ch->ifcfg = ii; + ii = NULL; + } + IPFW_UH_WUNLOCK(ch); + + if (ii != NULL) { + /* Already initialized. Free namehash. */ + ipfw_objhash_destroy(ii); + } else { + /* We're the first ones. Init kernel hooks. */ + iface_khandler_register(); + } +} + +static void +destroy_iface(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + + /* Assume all consumers have been already detached */ + free(no, M_IPFW); +} + +/* + * Per-VNET ipfw detach hook. 
+ * + */ +void +vnet_ipfw_iface_destroy(struct ip_fw_chain *ch) +{ + struct namedobj_instance *ii; + + IPFW_UH_WLOCK(ch); + ii = CHAIN_TO_II(ch); + ch->ifcfg = NULL; + IPFW_UH_WUNLOCK(ch); + + if (ii != NULL) { + ipfw_objhash_foreach(ii, destroy_iface, ch); + ipfw_objhash_destroy(ii); + iface_khandler_deregister(); + } +} + +/* + * Notify the subsystem that we are interested in tracking + * interface @name. This function has to be called without + * holding any locks to permit allocating the necessary states + * for proper interface tracking. + * + * Returns 0 on success. + */ +int +ipfw_iface_ref(struct ip_fw_chain *ch, char *name, + struct ipfw_ifc *ic) +{ + struct namedobj_instance *ii; + struct ipfw_iface *iif, *tmp; + + if (strlen(name) >= sizeof(iif->ifname)) + return (EINVAL); + + IPFW_UH_WLOCK(ch); + + ii = CHAIN_TO_II(ch); + if (ii == NULL) { + + /* + * First request to subsystem. + * Let's perform init. + */ + IPFW_UH_WUNLOCK(ch); + vnet_ipfw_iface_init(ch); + IPFW_UH_WLOCK(ch); + ii = CHAIN_TO_II(ch); + } + + iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); + + if (iif != NULL) { + iif->no.refcnt++; + ic->iface = iif; + IPFW_UH_WUNLOCK(ch); + return (0); + } + + IPFW_UH_WUNLOCK(ch); + + /* Not found. Let's create one */ + iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO); + TAILQ_INIT(&iif->consumers); + iif->no.name = iif->ifname; + strlcpy(iif->ifname, name, sizeof(iif->ifname)); + + /* + * Ref & link to the list. + * + * We assume ifnet_arrival_event / ifnet_departure_event + * are not holding any locks. + */ + iif->no.refcnt = 1; + IPFW_UH_WLOCK(ch); + + tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); + if (tmp != NULL) { + /* Interface has been created since unlock. 
Ref and return */ + tmp->no.refcnt++; + ic->iface = tmp; + IPFW_UH_WUNLOCK(ch); + free(iif, M_IPFW); + return (0); + } + + iif->ifindex = ipfw_kiflookup(name); + if (iif->ifindex != 0) + iif->resolved = 1; + + ipfw_objhash_add(ii, &iif->no); + ic->iface = iif; + + IPFW_UH_WUNLOCK(ch); + + return (0); +} + +/* + * Adds @ic to the list of iif interface consumers. + * Must be called with holding both UH+WLOCK. + * Callback may be immediately called (if interface exists). + */ +void +ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) +{ + struct ipfw_iface *iif; + + IPFW_UH_WLOCK_ASSERT(ch); + IPFW_WLOCK_ASSERT(ch); + + iif = ic->iface; + + TAILQ_INSERT_TAIL(&iif->consumers, ic, next); + if (iif->resolved != 0) + ic->cb(ch, ic->cbdata, iif->ifindex); +} + +/* + * Unlinks interface tracker object @ic from interface. + * Must be called while holding UH lock. + */ +void +ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) +{ + struct ipfw_iface *iif; + + IPFW_UH_WLOCK_ASSERT(ch); + + iif = ic->iface; + TAILQ_REMOVE(&iif->consumers, ic, next); +} + +/* + * Unreference interface specified by @ic. + * Must be called without holding any locks. + */ +void +ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic) +{ + struct ipfw_iface *iif; + + iif = ic->iface; + ic->iface = NULL; + + IPFW_UH_WLOCK(ch); + iif->no.refcnt--; + /* TODO: check for references & delete */ + IPFW_UH_WUNLOCK(ch); +} + +/* + * Interface arrival handler. + */ +static void +handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex) +{ + struct ipfw_ifc *ic; + + IPFW_UH_WLOCK_ASSERT(ch); + + iif->gencnt++; + iif->resolved = 1; + iif->ifindex = ifindex; + + IPFW_WLOCK(ch); + TAILQ_FOREACH(ic, &iif->consumers, next) + ic->cb(ch, ic->cbdata, iif->ifindex); + IPFW_WUNLOCK(ch); +} + +/* + * Interface departure handler. 
+ */ +static void +handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, + uint16_t ifindex) +{ + struct ipfw_ifc *ic; + + IPFW_UH_WLOCK_ASSERT(ch); + + IPFW_WLOCK(ch); + TAILQ_FOREACH(ic, &iif->consumers, next) + ic->cb(ch, ic->cbdata, 0); + IPFW_WUNLOCK(ch); + + iif->gencnt++; + iif->resolved = 0; + iif->ifindex = 0; +} + +struct dump_iface_args { + struct ip_fw_chain *ch; + struct sockopt_data *sd; +}; + +static void +export_iface_internal(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + ipfw_iface_info *i; + struct dump_iface_args *da; + struct ipfw_iface *iif; + + da = (struct dump_iface_args *)arg; + + i = (ipfw_iface_info *)ipfw_get_sopt_space(da->sd, sizeof(*i)); + KASSERT(i != 0, ("previously checked buffer is not enough")); + + iif = (struct ipfw_iface *)no; + + strlcpy(i->ifname, iif->ifname, sizeof(i->ifname)); + if (iif->resolved) + i->flags |= IPFW_IFFLAG_RESOLVED; + i->ifindex = iif->ifindex; + i->refcnt = iif->no.refcnt; + i->gencnt = iif->gencnt; +} + +/* + * Lists all interface currently tracked by ipfw. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ] + * + * Returns 0 on success + */ +static int +list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct namedobj_instance *ii; + struct _ipfw_obj_lheader *olh; + struct dump_iface_args da; + uint32_t count, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); /* could not obtain a list header from sd */ + if (sd->valsize < olh->size) + return (EINVAL); /* header claims more space than sd->valsize */ + + IPFW_UH_RLOCK(ch); + ii = CHAIN_TO_II(ch); + if (ii != NULL) + count = ipfw_objhash_count(ii); + else + count = 0; /* no tracker instance on this chain: empty list */ + size = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader); + + /* Fill in header regardless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_iface_info); + + if (size > olh->size) { + olh->size = size; /* report the size needed; presumably lets the caller retry */ + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + olh->size = size; + + da.ch = ch; + da.sd = sd; + + if (ii != NULL) + ipfw_objhash_foreach(ii, export_iface_internal, &da); /* one ipfw_iface_info per tracked iface */ + IPFW_UH_RUNLOCK(ch); + + return (0); +} + diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_log.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_log.c new file mode 100644 index 0000000..cbbd875 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_log.c @@ -0,0 +1,567 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_log.c 272840 2014-10-09 19:32:35Z melifaro $"); + +/* + * Logging support for ipfw + */ + +#include "opt_ipfw.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. 
+#endif /* INET */ +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/vnet.h> +#include <net/if_types.h> /* for IFT_PFLOG */ +#include <net/bpf.h> /* for BPF */ + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> + +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#ifdef INET6 +#include <netinet6/in6_var.h> /* ip6_sprintf() */ +#endif + +#include <netpfil/ipfw/ip_fw_private.h> + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +/* + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +#ifdef __APPLE__ +#undef snprintf +#define snprintf sprintf +#define SNPARGS(buf, len) buf + len +#define SNP(buf) buf +#else /* !__APPLE__ */ +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 +#define SNP(buf) buf, sizeof(buf) +#endif /* !__APPLE__ */ + +#ifdef WITHOUT_BPF +void +ipfw_log_bpf(int onoff) +{ +} +#else /* !WITHOUT_BPF */ +static struct ifnet *log_if; /* hook to attach to bpf */ +static struct rwlock log_if_lock; +#define LOGIF_LOCK_INIT(x) rw_init(&log_if_lock, "ipfw log_if lock") +#define LOGIF_LOCK_DESTROY(x) rw_destroy(&log_if_lock) +#define LOGIF_RLOCK(x) rw_rlock(&log_if_lock) +#define LOGIF_RUNLOCK(x) rw_runlock(&log_if_lock) +#define LOGIF_WLOCK(x) rw_wlock(&log_if_lock) +#define LOGIF_WUNLOCK(x) rw_wunlock(&log_if_lock) + +static const char ipfwname[] = "ipfw"; + +/* we use this dummy function for all ifnet callbacks */ +static int +log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + return EINVAL; +} + +static int +ipfw_log_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro) +{ + if (m != NULL) + FREE_PKT(m); + return EINVAL; +} + +static void +ipfw_log_start(struct ifnet* ifp) +{ + panic("ipfw_log_start() must not be called"); +} + +static const u_char ipfwbroadcastaddr[6] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +static int +ipfw_log_clone_match(struct if_clone *ifc, const char *name) +{ + + return (strncmp(name, ipfwname, sizeof(ipfwname) - 1) == 0); +} + +static int +ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len, + caddr_t params) +{ + int error; + int unit; + struct ifnet *ifp; + + error = ifc_name2unit(name, &unit); + if (error) + return (error); + + error = ifc_alloc_unit(ifc, &unit); + if (error) + return (error); + + ifp = if_alloc(IFT_PFLOG); + if (ifp == NULL) { + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + ifp->if_dname = ipfwname; + ifp->if_dunit = unit; + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", ipfwname, unit); + strlcpy(name, ifp->if_xname, len); + ifp->if_mtu = 65536; + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_init = (void *)log_dummy; + ifp->if_ioctl = log_dummy; + ifp->if_start = 
ipfw_log_start; + ifp->if_output = ipfw_log_output; + ifp->if_addrlen = 6; + ifp->if_hdrlen = 14; + ifp->if_broadcastaddr = ipfwbroadcastaddr; + ifp->if_baudrate = IF_Mbps(10); + + LOGIF_WLOCK(); + if (log_if == NULL) + log_if = ifp; + else { + LOGIF_WUNLOCK(); + if_free(ifp); + ifc_free_unit(ifc, unit); + return (EEXIST); + } + LOGIF_WUNLOCK(); + if_attach(ifp); + bpfattach(ifp, DLT_EN10MB, 14); + + return (0); +} + +static int +ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +{ + int unit; + + if (ifp == NULL) + return (0); + + LOGIF_WLOCK(); + if (log_if != NULL && ifp == log_if) + log_if = NULL; + else { + LOGIF_WUNLOCK(); + return (EINVAL); + } + LOGIF_WUNLOCK(); + + unit = ifp->if_dunit; + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + ifc_free_unit(ifc, unit); + + return (0); +} + +static struct if_clone *ipfw_log_cloner; + +void +ipfw_log_bpf(int onoff) +{ + + if (onoff) { + LOGIF_LOCK_INIT(); + ipfw_log_cloner = if_clone_advanced(ipfwname, 0, + ipfw_log_clone_match, ipfw_log_clone_create, + ipfw_log_clone_destroy); + } else { + if_clone_detach(ipfw_log_cloner); + LOGIF_LOCK_DESTROY(); + } +} +#endif /* !WITHOUT_BPF */ + +#define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) +/* + * We enter here when we have a rule with O_LOG. + * XXX this function alone takes about 2Kbytes of code! + */ +void +ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, + struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, + u_short offset, uint32_t tablearg, struct ip *ip) +{ + char *action; + int limit_reached = 0; + char action2[92], proto[128], fragment[32]; + + if (V_fw_verbose == 0) { +#ifndef WITHOUT_BPF + LOGIF_RLOCK(); + if (log_if == NULL || log_if->if_bpf == NULL) { + LOGIF_RUNLOCK(); + return; + } + + if (args->eh) /* layer2, use orig hdr */ + BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); + else { + /* Add fake header. Later we will store + * more info in the header. 
+ */ + if (ip->ip_v == 4) + BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); + else if (ip->ip_v == 6) + BPF_MTAP2(log_if, "DDDDDDSSSSSS\x86\xdd", ETHER_HDR_LEN, m); + else + /* Obviously bogus EtherType. */ + BPF_MTAP2(log_if, "DDDDDDSSSSSS\xff\xff", ETHER_HDR_LEN, m); + } + LOGIF_RUNLOCK(); +#endif /* !WITHOUT_BPF */ + return; + } + /* the old 'log' function */ + fragment[0] = '\0'; + proto[0] = '\0'; + + if (f == NULL) { /* bogus pkt */ + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) + return; + V_norule_counter++; + if (V_norule_counter == V_verbose_limit) + limit_reached = V_verbose_limit; + action = "Refuse"; + } else { /* O_LOG is the first action, find the real one */ + ipfw_insn *cmd = ACTION_PTR(f); + ipfw_insn_log *l = (ipfw_insn_log *)cmd; + + if (l->max_log != 0 && l->log_left == 0) + return; + l->log_left--; + if (l->log_left == 0) + limit_reached = l->max_log; + cmd += F_LEN(cmd); /* point to first action */ + if (cmd->opcode == O_ALTQ) { + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + snprintf(SNPARGS(action2, 0), "Altq %d", + altq->qid); + cmd += F_LEN(cmd); + } + if (cmd->opcode == O_PROB || cmd->opcode == O_TAG || + cmd->opcode == O_SETDSCP) + cmd += F_LEN(cmd); + + action = action2; + switch (cmd->opcode) { + case O_DENY: + action = "Deny"; + break; + + case O_REJECT: + if (cmd->arg1==ICMP_REJECT_RST) + action = "Reset"; + else if (cmd->arg1==ICMP_UNREACH_HOST) + action = "Reject"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1==ICMP6_UNREACH_RST) + action = "Reset"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_ACCEPT: + action = "Accept"; + break; + case O_COUNT: + action = "Count"; + break; + case O_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + TARG(cmd->arg1, divert)); + break; + case O_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + TARG(cmd->arg1, divert)); + break; + case O_SETFIB: + 
snprintf(SNPARGS(action2, 0), "SetFib %d", + TARG(cmd->arg1, fib)); + break; + case O_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + TARG(cmd->arg1, skipto)); + break; + case O_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + TARG(cmd->arg1, pipe)); + break; + case O_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + TARG(cmd->arg1, pipe)); + break; + case O_FORWARD_IP: { + ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; + int len; + struct in_addr dummyaddr; + if (sa->sa.sin_addr.s_addr == INADDR_ANY) + dummyaddr.s_addr = htonl(tablearg); + else + dummyaddr.s_addr = sa->sa.sin_addr.s_addr; + + len = snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(dummyaddr)); + + if (sa->sa.sin_port) + snprintf(SNPARGS(action2, len), ":%d", + sa->sa.sin_port); + } + break; +#ifdef INET6 + case O_FORWARD_IP6: { + char buf[INET6_ADDRSTRLEN]; + ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; + int len; + + len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", + ip6_sprintf(buf, &sa->sa.sin6_addr)); + + if (sa->sa.sin6_port) + snprintf(SNPARGS(action2, len), ":%u", + sa->sa.sin6_port); + } + break; +#endif + case O_NETGRAPH: + snprintf(SNPARGS(action2, 0), "Netgraph %d", + cmd->arg1); + break; + case O_NGTEE: + snprintf(SNPARGS(action2, 0), "Ngtee %d", + cmd->arg1); + break; + case O_NAT: + action = "Nat"; + break; + case O_REASS: + action = "Reass"; + break; + case O_CALLRETURN: + if (cmd->len & F_NOT) + action = "Return"; + else + snprintf(SNPARGS(action2, 0), "Call %d", + cmd->arg1); + break; + default: + action = "UNKNOWN"; + break; + } + } + + if (hlen == 0) { /* non-ip */ + snprintf(SNPARGS(proto, 0), "MAC"); + + } else { + int len; +#ifdef INET6 + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + struct icmphdr *icmp; + struct tcphdr *tcp; + struct udphdr *udp; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + struct icmp6_hdr *icmp6; + u_short ip6f_mf; +#endif + src[0] = '\0'; + dst[0] = '\0'; 
+#ifdef INET6 + ip6f_mf = offset & IP6F_MORE_FRAG; + offset &= IP6F_OFF_MASK; + + if (IS_IP6_FLOW_ID(&(args->f_id))) { + char ip6buf[INET6_ADDRSTRLEN]; + snprintf(src, sizeof(src), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.src_ip6)); + snprintf(dst, sizeof(dst), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); + + ip6 = (struct ip6_hdr *)ip; + tcp = (struct tcphdr *)(((char *)ip) + hlen); + udp = (struct udphdr *)(((char *)ip) + hlen); + } else +#endif + { + tcp = L3HDR(struct tcphdr, ip); + udp = L3HDR(struct udphdr, ip); + + inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); + inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); + } + + switch (args->f_id.proto) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(tcp->th_sport), + dst, + ntohs(tcp->th_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(udp->uh_sport), + dst, + ntohs(udp->uh_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_ICMP: + icmp = L3HDR(struct icmphdr, ip); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#ifdef INET6 + case IPPROTO_ICMPV6: + icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMPv6:%u.%u ", + icmp6->icmp6_type, icmp6->icmp6_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#endif + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", + args->f_id.proto, src); + snprintf(SNPARGS(proto, len), 
" %s", dst); + break; + } + +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + /* + * offset was already masked with IP6F_OFF_MASK above, so the + * more-fragments bit must be tested through the saved ip6f_mf; + * otherwise a first fragment (offset 0, MF set) is not reported. + */ + if (offset || ip6f_mf) + snprintf(SNPARGS(fragment, 0), + " (frag %08x:%d@%d%s)", + args->f_id.extra, + ntohs(ip6->ip6_plen) - hlen, + ntohs(offset) << 3, ip6f_mf ? "+" : ""); + } else +#endif + { + int ipoff, iplen; + ipoff = ntohs(ip->ip_off); + iplen = ntohs(ip->ip_len); + if (ipoff & (IP_MF | IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), + " (frag %d:%d@%d%s)", + ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), + offset << 3, + (ipoff & IP_MF) ? "+" : ""); + } + } +#ifdef __FreeBSD__ + if (oif || m->m_pkthdr.rcvif) + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s %s via %s%s\n", + f ? f->rulenum : -1, + action, proto, oif ? "out" : "in", + oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, + fragment); + else +#endif + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s [no if info]%s\n", + f ? f->rulenum : -1, + action, proto, fragment); + if (limit_reached) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + limit_reached, f ? f->rulenum : -1); +} +/* end of file */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c new file mode 100644 index 0000000..f41b607 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_pfil.c @@ -0,0 +1,587 @@ +/*- + * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_pfil.c 264540 2014-04-16 14:37:11Z ae $"); + +#include "opt_ipfw.h" +#include "opt_inet.h" +#include "opt_inet6.h" +#ifndef INET +#error IPFIREWALL requires INET. 
+#endif /* INET */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/ethernet.h> +#include <net/pfil.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif + +#include <netgraph/ng_ipfw.h> + +#include <netpfil/ipfw/ip_fw_private.h> + +#include <machine/in_cksum.h> + +static VNET_DEFINE(int, fw_enable) = 1; +#define V_fw_enable VNET(fw_enable) + +#ifdef INET6 +static VNET_DEFINE(int, fw6_enable) = 1; +#define V_fw6_enable VNET(fw6_enable) +#endif + +static VNET_DEFINE(int, fwlink_enable) = 0; +#define V_fwlink_enable VNET(fwlink_enable) + +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); + +/* Forward declarations. 
*/ +static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); +int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); +int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f1) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw"); +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6_fw); +SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +#endif /* INET6 */ + +SYSCTL_DECL(_net_link_ether); +SYSCTL_VNET_PROC(_net_link_ether, OID_AUTO, ipfw, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0, + ipfw_chg_hook, "I", "Pass ether pkts through firewall"); + +SYSEND + +#endif /* SYSCTL_NODE */ + +/* + * The pfilter hook to pass packets to ipfw_chk and then to + * dummynet, divert, netgraph or other modules. + * The packet may be consumed. + */ +int +ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + struct ip_fw_args args; + struct m_tag *tag; + int ipfw; + int ret; + + /* convert dir to IPFW values */ + dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; + bzero(&args, sizeof(args)); + +again: + /* + * extract and remove the tag if present. If we are left + * with onepass, optimize the outgoing path. + */ + tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (tag != NULL) { + args.rule = *((struct ipfw_rule_ref *)(tag+1)); + m_tag_delete(*m0, tag); + if (args.rule.info & IPFW_ONEPASS) + return (0); + } + + args.m = *m0; + args.oif = dir == DIR_OUT ? 
ifp : NULL; + args.inp = inp; + + ipfw = ipfw_chk(&args); + *m0 = args.m; + + KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", + __func__)); + + /* breaking out of the switch means drop */ + ret = 0; /* default return value for pass */ + switch (ipfw) { + case IP_FW_PASS: + /* next_hop may be set by ipfw_chk */ + if (args.next_hop == NULL && args.next_hop6 == NULL) + break; /* pass */ +#if (!defined(INET6) && !defined(INET)) + ret = EACCES; +#else + { + struct m_tag *fwd_tag; + size_t len; + + KASSERT(args.next_hop == NULL || args.next_hop6 == NULL, + ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__, + args.next_hop, args.next_hop6)); +#ifdef INET6 + if (args.next_hop6 != NULL) + len = sizeof(struct sockaddr_in6); +#endif +#ifdef INET + if (args.next_hop != NULL) + len = sizeof(struct sockaddr_in); +#endif + + /* Incoming packets should not be tagged so we do not + * m_tag_find. Outgoing packets may be tagged, so we + * reuse the tag if present. + */ + fwd_tag = (dir == DIR_IN) ? NULL : + m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag != NULL) { + m_tag_unlink(*m0, fwd_tag); + } else { + fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len, + M_NOWAIT); + if (fwd_tag == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + } +#ifdef INET6 + if (args.next_hop6 != NULL) { + bcopy(args.next_hop6, (fwd_tag+1), len); + if (in6_localip(&args.next_hop6->sin6_addr)) + (*m0)->m_flags |= M_FASTFWD_OURS; + (*m0)->m_flags |= M_IP6_NEXTHOP; + } +#endif +#ifdef INET + if (args.next_hop != NULL) { + bcopy(args.next_hop, (fwd_tag+1), len); + if (in_localip(args.next_hop->sin_addr)) + (*m0)->m_flags |= M_FASTFWD_OURS; + (*m0)->m_flags |= M_IP_NEXTHOP; + } +#endif + m_tag_prepend(*m0, fwd_tag); + } +#endif /* INET || INET6 */ + break; + + case IP_FW_DENY: + ret = EACCES; + break; /* i.e. drop */ + + case IP_FW_DUMMYNET: + ret = EACCES; + if (ip_dn_io_ptr == NULL) + break; /* i.e. 
drop */ + if (mtod(*m0, struct ip *)->ip_v == 4) + ret = ip_dn_io_ptr(m0, dir, &args); + else if (mtod(*m0, struct ip *)->ip_v == 6) + ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); + else + break; /* drop it */ + /* + * XXX should read the return value. + * dummynet normally eats the packet and sets *m0=NULL + * unless the packet can be sent immediately. In this + * case args is updated and we should re-run the + * check without clearing args. + */ + if (*m0 != NULL) + goto again; + break; + + case IP_FW_TEE: + case IP_FW_DIVERT: + if (ip_divert_ptr == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ipfw_divert(m0, dir, &args.rule, + (ipfw == IP_FW_TEE) ? 1 : 0); + /* continue processing for the original packet (tee). */ + if (*m0) + goto again; + break; + + case IP_FW_NGTEE: + case IP_FW_NETGRAPH: + if (ng_ipfw_input_p == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ng_ipfw_input_p(m0, dir, &args, + (ipfw == IP_FW_NGTEE) ? 1 : 0); + if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ + goto again; /* continue with packet */ + break; + + case IP_FW_NAT: + /* honor one-pass in case of successful nat */ + if (V_fw_one_pass) + break; /* ret is already 0 */ + goto again; + + case IP_FW_REASS: + goto again; /* continue with packet */ + + default: + KASSERT(0, ("%s: unknown retval", __func__)); + } + + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + + return ret; +} + +/* + * ipfw processing for ethernet packets (in and out). + * Interface is NULL from ether_demux, and ifp from + * ether_output_frame. 
+ */ +int +ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir, + struct inpcb *inp) +{ + struct ether_header *eh; + struct ether_header save_eh; + struct mbuf *m; + int i, ret; + struct ip_fw_args args; + struct m_tag *mtag; + + /* fetch start point from rule, if any */ + mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + args.rule.slot = 0; + } else { + /* dummynet packet, already partially processed */ + struct ipfw_rule_ref *r; + + /* XXX can we free it after use ? */ + mtag->m_tag_id = PACKET_TAG_NONE; + r = (struct ipfw_rule_ref *)(mtag + 1); + if (r->info & IPFW_ONEPASS) + return (0); + args.rule = *r; + } + + /* I need some amt of data to be contiguous */ + m = *m0; + i = min(m->m_pkthdr.len, max_protohdr); + if (m->m_len < i) { + m = m_pullup(m, i); + if (m == NULL) { + *m0 = m; + return (0); + } + } + eh = mtod(m, struct ether_header *); +#if defined(USERSPACE) + args.eh = eh; +#else + save_eh = *eh; /* save copy for restore below */ + args.eh = &save_eh; /* MAC header for bridged/MAC packets */ +#endif + m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ + + args.m = m; /* the packet we are looking at */ + args.oif = dir == PFIL_OUT ? dst: NULL; /* destination, if any */ + args.next_hop = NULL; /* we do not support forward yet */ + args.next_hop6 = NULL; /* we do not support forward yet */ + args.inp = NULL; /* used by ipfw uid/gid/jail rules */ + i = ipfw_chk(&args); + m = args.m; + if (m != NULL) { + /* + * Restore Ethernet header, as needed, in case the + * mbuf chain was replaced by ipfw. + */ + M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); + if (m == NULL) { + *m0 = NULL; + return (0); + } + if (eh != mtod(m, struct ether_header *)) + bcopy(&save_eh, mtod(m, struct ether_header *), + ETHER_HDR_LEN); + } + *m0 = m; + + ret = 0; + /* Check result of ipfw_chk() */ +#if defined(USERSPACE) + /* fwd 1.1.1.1 causes the packet to be bounced back. 
+ * This is signalled by setting the low bit of the peer + */ + if (args.next_hop) { + uintptr_t *p = (void *)&(m->__m_peer); + *p |= 1; + } +#endif + switch (i) { + case IP_FW_PASS: + break; + + case IP_FW_DENY: + ret = EACCES; + break; /* i.e. drop */ + + case IP_FW_DUMMYNET: + ret = EACCES; + int dir; + + if (ip_dn_io_ptr == NULL) + break; /* i.e. drop */ + + *m0 = NULL; + dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); + ip_dn_io_ptr(&m, dir, &args); + return 0; + + default: + KASSERT(0, ("%s: unknown retval", __func__)); + } + + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + + return ret; +} + +/* do the divert, return 1 on error 0 on success */ +static int +ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, + int tee) +{ + /* + * ipfw_chk() has already tagged the packet with the divert tag. + * If tee is set, copy packet and return original. + * If not tee, consume packet and send it to divert socket. + */ + struct mbuf *clone; + struct ip *ip = mtod(*m0, struct ip *); + struct m_tag *tag; + + /* Cloning needed for tee? */ + if (tee == 0) { + clone = *m0; /* use the original mbuf */ + *m0 = NULL; + } else { + clone = m_dup(*m0, M_NOWAIT); + /* If we cannot duplicate the mbuf, we sacrifice the divert + * chain and continue with the tee-ed packet. + */ + if (clone == NULL) + return 1; + } + + /* + * Divert listeners can normally handle non-fragmented packets, + * but we can only reass in the non-tee case. + * This means that listeners on a tee rule may get fragments, + * and have to live with that. + * Note that we now have the 'reass' ipfw option so if we care + * we can do it before a 'tee'. + */ + if (!tee) switch (ip->ip_v) { + case IPVERSION: + if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { + int hlen; + struct mbuf *reass; + + reass = ip_reass(clone); /* Reassemble packet. 
*/ + if (reass == NULL) + return 0; /* not an error */ + /* if reass = NULL then it was consumed by ip_reass */ + /* + * IP header checksum fixup after reassembly and leave header + * in network byte order. + */ + ip = mtod(reass, struct ip *); + hlen = ip->ip_hl << 2; + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(reass, hlen); + clone = reass; + } + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + { + struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *); + + if (ip6->ip6_nxt == IPPROTO_FRAGMENT) { + int nxt, off; + + off = sizeof(struct ip6_hdr); + nxt = frag6_input(&clone, &off, 0); + if (nxt == IPPROTO_DONE) + return (0); + } + break; + } +#endif + } + + /* attach a tag to the packet with the reinject info */ + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT); + if (tag == NULL) { + FREE_PKT(clone); + return 1; + } + *((struct ipfw_rule_ref *)(tag+1)) = *rule; + m_tag_prepend(clone, tag); + + /* Do the dirty job... */ + ip_divert_ptr(clone, incoming); + return 0; +} + +/* + * attach or detach hooks for a given protocol family + */ +static int +ipfw_hook(int onoff, int pf) +{ + struct pfil_head *pfh; + pfil_func_t hook_func; + + pfh = pfil_head_get(PFIL_TYPE_AF, pf); + if (pfh == NULL) + return ENOENT; + + hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet; + + (void) (onoff ? 
pfil_add_hook : pfil_remove_hook) + (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); + + return 0; +} + +int +ipfw_attach_hooks(int arg) +{ + int error = 0; + + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET); + else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { + error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ + printf("ipfw_hook() error\n"); + } +#ifdef INET6 + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET6); + else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { + error = ENOENT; + printf("ipfw6_hook() error\n"); + } +#endif + if (arg == 0) /* detach */ + ipfw_hook(0, AF_LINK); + else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) { + error = ENOENT; + printf("ipfw_link_hook() error\n"); + } + return error; +} + +int +ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +{ + int newval; + int error; + int af; + + if (arg1 == &V_fw_enable) + af = AF_INET; +#ifdef INET6 + else if (arg1 == &V_fw6_enable) + af = AF_INET6; +#endif + else if (arg1 == &V_fwlink_enable) + af = AF_LINK; + else + return (EINVAL); + + newval = *(int *)arg1; + /* Handle sysctl change */ + error = sysctl_handle_int(oidp, &newval, 0, req); + + if (error) + return (error); + + /* Formalize new value */ + newval = (newval) ? 1 : 0; + + if (*(int *)arg1 == newval) + return (0); + + error = ipfw_hook(newval, af); + if (error) + return (error); + *(int *)arg1 = newval; + + return (0); +} +/* end of file */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_private.h b/example/ipfw/sys/netpfil/ipfw/ip_fw_private.h new file mode 100644 index 0000000..e7ad538 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_private.h @@ -0,0 +1,625 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netpfil/ipfw/ip_fw_private.h 272840 2014-10-09 19:32:35Z melifaro $ + */ + +#ifndef _IPFW2_PRIVATE_H +#define _IPFW2_PRIVATE_H + +/* + * Internal constants and data structures used by ipfw components + * and not meant to be exported outside the kernel. + */ + +#ifdef _KERNEL + +/* + * For platforms that do not have SYSCTL support, we wrap the + * SYSCTL_* into a function (one per file) to collect the values + * into an array at module initialization. The wrapping macros, + * SYSBEGIN() and SYSEND, are empty in the default case. 
+ */ +#ifndef SYSBEGIN +#define SYSBEGIN(x) +#endif +#ifndef SYSEND +#define SYSEND +#endif + +/* Return values from ipfw_chk() */ +enum { + IP_FW_PASS = 0, + IP_FW_DENY, + IP_FW_DIVERT, + IP_FW_TEE, + IP_FW_DUMMYNET, + IP_FW_NETGRAPH, + IP_FW_NGTEE, + IP_FW_NAT, + IP_FW_REASS, +}; + +/* + * Structure for collecting parameters to dummynet for ip6_output forwarding + */ +struct _ip6dn_args { + struct ip6_pktopts *opt_or; + struct route_in6 ro_or; + int flags_or; + struct ip6_moptions *im6o_or; + struct ifnet *origifp_or; + struct ifnet *ifp_or; + struct sockaddr_in6 dst_or; + u_long mtu_or; + struct route_in6 ro_pmtu_or; +}; + + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. + */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + struct sockaddr_in6 *next_hop6; /* ipv6 forward address */ + + /* + * On return, it points to the matching rule. + * On entry, rule.slot > 0 means the info is valid and + * contains the starting rule for an ipfw search. + * If chain_id == chain->id && slot >0 then jump to that slot. + * Otherwise, we locate the first rule >= rulenum:rule_id + */ + struct ipfw_rule_ref rule; /* match/restart info */ + + struct ether_header *eh; /* for bridged packets */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + //uint32_t cookie; /* a cookie depending on rule action */ + struct inpcb *inp; + + struct _ip6dn_args dummypar; /* dummynet->ip6_output */ + struct sockaddr_in hopstore; /* store here if cannot use a pointer */ +}; + +MALLOC_DECLARE(M_IPFW); + +/* + * Hooks sometime need to know the direction of the packet + * (divert, dummynet, netgraph, ...) 
+ * We use a generic definition here, with bit0-1 indicating the + * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the + * specific protocol + * (if necessary) + */ +enum { + DIR_MASK = 0x3, + DIR_OUT = 0, + DIR_IN = 1, + DIR_FWD = 2, + DIR_DROP = 3, + PROTO_LAYER2 = 0x4, /* set for layer 2 */ + /* PROTO_DEFAULT = 0, */ + PROTO_IPV4 = 0x08, + PROTO_IPV6 = 0x10, + PROTO_IFB = 0x0c, /* layer2 + ifbridge */ + /* PROTO_OLDBDG = 0x14, unused, old bridge */ +}; + +/* wrapper for freeing a packet, in case we need to do more work */ +#ifndef FREE_PKT +#if defined(__linux__) || defined(_WIN32) +#define FREE_PKT(m) netisr_dispatch(-1, m) +#else +#define FREE_PKT(m) m_freem(m) +#endif +#endif /* !FREE_PKT */ + +/* + * Function definitions. + */ + +/* attach (arg = 1) or detach (arg = 0) hooks */ +int ipfw_attach_hooks(int); +#ifdef NOTYET +void ipfw_nat_destroy(void); +#endif + +/* In ip_fw_log.c */ +struct ip; +struct ip_fw_chain; +void ipfw_log_bpf(int); +void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, + struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, + u_short offset, uint32_t tablearg, struct ip *ip); +VNET_DECLARE(u_int64_t, norule_counter); +#define V_norule_counter VNET(norule_counter) +VNET_DECLARE(int, verbose_limit); +#define V_verbose_limit VNET(verbose_limit) + +/* In ip_fw_dynamic.c */ + +enum { /* result for matching dynamic rules */ + MATCH_REVERSE = 0, + MATCH_FORWARD, + MATCH_NONE, + MATCH_UNKNOWN, +}; + +/* + * The lock for dynamic rules is only used once outside the file, + * and only to release the result of lookup_dyn_rule(). + * Eventually we may implement it with a callback on the function. 
+ */ +struct ip_fw_chain; +struct sockopt_data; +int ipfw_is_dyn_rule(struct ip_fw *rule); +void ipfw_expire_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *); +void ipfw_dyn_unlock(ipfw_dyn_rule *q); + +struct tcphdr; +struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, + u_int32_t, u_int32_t, int); +int ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, + ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg); +ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, + int *match_direction, struct tcphdr *tcp); +void ipfw_remove_dyn_children(struct ip_fw *rule); +void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep); +int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd); + +void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */ +void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ +int ipfw_dyn_len(void); +int ipfw_dyn_get_count(void); + +/* common variables */ +VNET_DECLARE(int, fw_one_pass); +#define V_fw_one_pass VNET(fw_one_pass) + +VNET_DECLARE(int, fw_verbose); +#define V_fw_verbose VNET(fw_verbose) + +VNET_DECLARE(struct ip_fw_chain, layer3_chain); +#define V_layer3_chain VNET(layer3_chain) + +VNET_DECLARE(int, ipfw_vnet_ready); +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) + +VNET_DECLARE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DECLARE(int, autoinc_step); +#define V_autoinc_step VNET(autoinc_step) + +VNET_DECLARE(unsigned int, fw_tables_max); +#define V_fw_tables_max VNET(fw_tables_max) + +VNET_DECLARE(unsigned int, fw_tables_sets); +#define V_fw_tables_sets VNET(fw_tables_sets) + +struct tables_config; + +#ifdef _KERNEL +/* + * Here we have the structure representing an ipfw rule. + * + * It starts with a general area + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. 
+ * + * Given a rule pointer r: + * + * r->cmd is the start of the first instruction. + * ACTION_PTR(r) is the start of the first action (things to do + * once a rule matched). + */ + +struct ip_fw { + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + uint8_t flags; /* currently unused */ + counter_u64_t cntr; /* Pointer to rule counters */ + uint32_t timestamp; /* tv_sec of last match */ + uint32_t id; /* rule id */ + uint32_t cached_id; /* used by jump_fast */ + uint32_t cached_pos; /* used by jump_fast */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +#define IPFW_RULE_CNTR_SIZE (2 * sizeof(counter_u64_t)) + +#endif + +struct ip_fw_chain { + struct ip_fw **map; /* array of rule ptrs to ease lookup */ + uint32_t id; /* ruleset id */ + int n_rules; /* number of static rules */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ + void *tablestate; /* runtime table info */ + void *valuestate; /* runtime table value info */ + int *idxmap; /* skipto array of rules */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rwmtx; +#else + struct rmlock rwmtx; +#endif + int static_len; /* total len of static rules (v0) */ + uint32_t gencnt; /* NAT generation count */ + struct ip_fw *default_rule; + struct tables_config *tblcfg; /* tables module data */ + void *ifcfg; /* interface module data */ + int *idxmap_back; /* standby skipto array of rules */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_lock; +#else + struct rwlock uh_lock; /* lock for upper half */ +#endif +}; + +/* 64-byte structure representing multi-field table value */ +struct table_value { + uint32_t tag; /* O_TAG/O_TAGGED */ + uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t divert; /* O_DIVERT/O_TEE */ + uint16_t skipto; /* skipto, CALLRET */ + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ + uint32_t fib; /* O_SETFIB */ + uint32_t 
nat; /* O_NAT */ + uint32_t nh4; + uint8_t dscp; + uint8_t spare0[3]; + /* -- 32 bytes -- */ + struct in6_addr nh6; + uint32_t limit; /* O_LIMIT */ + uint32_t spare1; + uint64_t refcnt; /* Number of references */ +}; + +struct namedobj_instance; + +struct named_object { + TAILQ_ENTRY(named_object) nn_next; /* namehash */ + TAILQ_ENTRY(named_object) nv_next; /* valuehash */ + char *name; /* object name */ + uint8_t type; /* object type */ + uint8_t compat; /* Object name is number */ + uint16_t kidx; /* object kernel index */ + uint16_t uidx; /* userland idx for compat records */ + uint32_t set; /* set object belongs to */ + uint32_t refcnt; /* number of references */ +}; +TAILQ_HEAD(namedobjects_head, named_object); + +struct sockopt; /* used by tcp_var.h */ +struct sockopt_data { + caddr_t kbuf; /* allocated buffer */ + size_t ksize; /* given buffer size */ + size_t koff; /* data already used */ + size_t kavail; /* number of bytes available */ + size_t ktotal; /* total bytes pushed */ + struct sockopt *sopt; /* socket data */ + caddr_t sopt_val; /* sopt user buffer */ + size_t valsize; /* original data size */ +}; + +struct ipfw_ifc; + +typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata, + uint16_t ifindex); + +struct ipfw_iface { + struct named_object no; + char ifname[64]; + int resolved; + uint16_t ifindex; + uint16_t spare; + uint64_t gencnt; + TAILQ_HEAD(, ipfw_ifc) consumers; +}; + +struct ipfw_ifc { + TAILQ_ENTRY(ipfw_ifc) next; + struct ipfw_iface *iface; + ipfw_ifc_cb *cb; + void *cbdata; +}; + +/* Macro for working with various counters */ +#define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \ + counter_u64_add((_cntr)->cntr, 1); \ + counter_u64_add((_cntr)->cntr + 1, _bytes); \ + if ((_cntr)->timestamp != time_uptime) \ + (_cntr)->timestamp = time_uptime; \ + } while (0) + +#define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \ + (_cntr)->pcnt++; \ + (_cntr)->bcnt += _bytes; \ + } while (0) + +#define IPFW_ZERO_RULE_COUNTER(_cntr) do { \ + 
counter_u64_zero((_cntr)->cntr); \ + counter_u64_zero((_cntr)->cntr + 1); \ + (_cntr)->timestamp = 0; \ + } while (0) + +#define IPFW_ZERO_DYN_COUNTER(_cntr) do { \ + (_cntr)->pcnt = 0; \ + (_cntr)->bcnt = 0; \ + } while (0) + +#define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f +#define IP_FW_ARG_TABLEARG(ch, a, f) \ + (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a)) +/* + * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c + * so the variable and the macros must be here. + */ + +#if defined( __linux__ ) || defined( _WIN32 ) +#define IPFW_LOCK_INIT(_chain) do { \ + rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rw_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED) +#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK_TRACKER +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) +#else /* FreeBSD */ +#define IPFW_LOCK_INIT(_chain) do { \ + rm_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rm_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) +#define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker +#define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) +#define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) +#define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) 
+#define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) +#endif + +#define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) +#define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) + +#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) +#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) +#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) +#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) + +struct obj_idx { + uint16_t uidx; /* internal index supplied by userland */ + uint16_t kidx; /* kernel object index */ + uint16_t off; /* tlv offset from rule end in 4-byte words */ + uint8_t spare; + uint8_t type; /* object type within its category */ +}; + +struct rule_check_info { + uint16_t flags; /* rule-specific check flags */ + uint16_t table_opcodes; /* count of opcodes referencing table */ + uint16_t urule_numoff; /* offset of rulenum in bytes */ + uint8_t version; /* rule version */ + uint8_t spare; + ipfw_obj_ctlv *ctlv; /* name TLV container */ + struct ip_fw *krule; /* resulting rule pointer */ + caddr_t urule; /* original rule pointer */ + struct obj_idx obuf[8]; /* table references storage */ +}; + +/* Legacy interface support */ +/* + * FreeBSD 8 export rule format + */ +struct ip_fw_rule0 { + struct ip_fw *x_next; /* linked list of rules */ + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + uint8_t _pad; /* padding */ + uint32_t id; /* rule id */ + + /* These fields are present in all rules. 
*/ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +struct ip_fw_bcounter0 { + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ +}; + +/* Kernel rule length */ +/* + * RULE _K_ SIZE _V_ -> + * get kernel size from userland rule version _V_. + * RULE _U_ SIZE _V_ -> + * get user size version _V_ from kernel rule + * RULESIZE _V_ -> + * get user size rule length + */ +/* FreeBSD8 <> current kernel format */ +#define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) +#define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) +/* FreeBSD11 <> current kernel format */ +#define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ + (r)->cmd_len * 4 - 4, 8)) +#define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) + + +/* In ip_fw_iface.c */ +int ipfw_iface_init(void); +void ipfw_iface_destroy(void); +void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); +int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, + struct ipfw_ifc *ic); +void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); +void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); +void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); + +/* In ip_fw_sockopt.c */ +void ipfw_init_skipto_cache(struct ip_fw_chain *chain); +void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); +int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); +int ipfw_ctl3(struct sockopt *sopt); +int ipfw_chk(struct ip_fw_args *args); +void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, + struct ip_fw *rule); +void ipfw_reap_rules(struct ip_fw *head); +void ipfw_init_counters(void); +void ipfw_destroy_counters(void); +struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); +int 
ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); + +typedef int (sopt_handler_f)(struct ip_fw_chain *ch, + ip_fw3_opheader *op3, struct sockopt_data *sd); +struct ipfw_sopt_handler { + uint16_t opcode; + uint8_t version; + uint8_t dir; + sopt_handler_f *handler; + uint64_t refcnt; +}; +#define HDIR_SET 0x01 /* Handler is used to set some data */ +#define HDIR_GET 0x02 /* Handler is used to retrieve data */ +#define HDIR_BOTH HDIR_GET|HDIR_SET + +void ipfw_init_sopt_handler(void); +void ipfw_destroy_sopt_handler(void); +void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); +int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); +caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); +caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); +#define IPFW_ADD_SOPT_HANDLER(f, c) do { \ + if ((f) != 0) \ + ipfw_add_sopt_handler(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) +#define IPFW_DEL_SOPT_HANDLER(l, c) do { \ + if ((l) != 0) \ + ipfw_del_sopt_handler(c, \ + sizeof(c) / sizeof(c[0])); \ + } while(0) + +typedef void (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, + void *arg); +typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, void *key, + uint32_t kopt); +typedef int (objhash_cmp_f)(struct named_object *no, void *key, uint32_t kopt); +struct namedobj_instance *ipfw_objhash_create(uint32_t items); +void ipfw_objhash_destroy(struct namedobj_instance *); +void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); +void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, + void **idx, int *blocks); +void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, + void **idx, int *blocks); +void ipfw_objhash_bitmap_free(void *idx, int blocks); +void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); +struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, + uint32_t set, char *name); +struct named_object 
*ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, + uint16_t idx); +int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, + struct named_object *b); +void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); +void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); +uint32_t ipfw_objhash_count(struct namedobj_instance *ni); +void ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, + void *arg); +int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); +int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); +void ipfw_objhash_set_funcs(struct namedobj_instance *ni, + objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); + +/* In ip_fw_table.c */ +struct table_info; + +typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); + +int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val); +int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, + void *paddr, uint32_t *val); +int ipfw_init_tables(struct ip_fw_chain *ch, int first); +int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); +int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); +void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); + +/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ + +extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); + +typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); +typedef int ipfw_nat_cfg_t(struct sockopt *); + +VNET_DECLARE(int, ipfw_nat_ready); +#define V_ipfw_nat_ready VNET(ipfw_nat_ready) +#define IPFW_NAT_LOADED (V_ipfw_nat_ready) + +extern ipfw_nat_t *ipfw_nat_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#endif /* _KERNEL */ +#endif /* _IPFW2_PRIVATE_H */ 
diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c new file mode 100644 index 0000000..bdf2692 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_sockopt.c @@ -0,0 +1,3469 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Supported by: Valeria Paoli + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_sockopt.c 273035 2014-10-13 13:49:28Z melifaro $"); + +/* + * Control socket and rule management routines for ipfw. + * Control is currently implemented via IP_FW3 setsockopt() code. 
+ */ + +#include "opt_ipfw.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> /* struct m_tag used by nested headers */ +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/fnv_hash.h> +#include <net/if.h> +#include <net/route.h> +#include <net/vnet.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* hooks */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +static int ipfw_ctl(struct sockopt *sopt); +static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, + struct rule_check_info *ci); +static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, + struct rule_check_info *ci); +static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, + struct rule_check_info *ci); + +#define NAMEDOBJ_HASH_SIZE 32 + +struct namedobj_instance { + struct namedobjects_head *names; + struct namedobjects_head *values; + uint32_t nn_size; /* names hash size */ + uint32_t nv_size; /* number hash size */ + u_long *idx_mask; /* used items bitmask */ + uint32_t max_blocks; /* number of "long" blocks in bitmask */ + uint32_t count; /* number of items */ + uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */ + objhash_hash_f *hash_f; + objhash_cmp_f *cmp_f; +}; +#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ + +static uint32_t objhash_hash_name(struct namedobj_instance *ni, void *key, + uint32_t kopt); +static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val); +static int 
objhash_cmp_name(struct named_object *no, void *name, uint32_t set); + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); +static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +/* ctl3 handler data */ +struct mtx ctl3_lock; +#define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF) +#define CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock) +#define CTL3_LOCK() mtx_lock(&ctl3_lock) +#define CTL3_UNLOCK() mtx_unlock(&ctl3_lock) + +static struct ipfw_sopt_handler *ctl3_handlers; +static size_t ctl3_hsize; +static uint64_t ctl3_refct, ctl3_gencnt; +#define CTL3_SMALLBUF 4096 /* small page-size write buffer */ +#define CTL3_LARGEBUF 16 * 1024 * 1024 /* handle large rulesets */ + +static int ipfw_flush_sopt_data(struct sockopt_data *sd); + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_XGET, 0, HDIR_GET, dump_config }, + { IP_FW_XADD, 0, HDIR_BOTH, add_rules }, + { IP_FW_XDEL, 0, HDIR_BOTH, del_rules }, + { IP_FW_XZERO, 0, HDIR_SET, clear_rules }, + { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules }, + { IP_FW_XMOVE, 0, HDIR_SET, move_rules }, + { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets }, + { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets }, + { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets }, + { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, +}; + +/* + * static variables followed by global ones + */ + 
+static VNET_DEFINE(uma_zone_t, ipfw_cntr_zone); +#define V_ipfw_cntr_zone VNET(ipfw_cntr_zone) + +void +ipfw_init_counters() +{ + + V_ipfw_cntr_zone = uma_zcreate("IPFW counters", + IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_PCPU); +} + +void +ipfw_destroy_counters() +{ + + uma_zdestroy(V_ipfw_cntr_zone); +} + +struct ip_fw * +ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize) +{ + struct ip_fw *rule; + + rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO); + rule->cntr = uma_zalloc(V_ipfw_cntr_zone, M_WAITOK | M_ZERO); + + return (rule); +} + +static void +free_rule(struct ip_fw *rule) +{ + + uma_zfree(V_ipfw_cntr_zone, rule->cntr); + free(rule, M_IPFW); +} + + +/* + * Find the smallest rule >= key, id. + * We could use bsearch but it is so simple that we code it directly + */ +int +ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) +{ + int i, lo, hi; + struct ip_fw *r; + + for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { + i = (lo + hi) / 2; + r = chain->map[i]; + if (r->rulenum < key) + lo = i + 1; /* continue from the next one */ + else if (r->rulenum > key) + hi = i; /* this might be good */ + else if (r->id < id) + lo = i + 1; /* continue from the next one */ + else /* r->id >= id */ + hi = i; /* this might be good */ + }; + return hi; +} + +/* + * Builds skipto cache on rule set @map. + */ +static void +update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map) +{ + int *smap, rulenum; + int i, mi; + + IPFW_UH_WLOCK_ASSERT(chain); + + mi = 0; + rulenum = map[mi]->rulenum; + smap = chain->idxmap_back; + + if (smap == NULL) + return; + + for (i = 0; i < 65536; i++) { + smap[i] = mi; + /* Use the same rule index until i < rulenum */ + if (i != rulenum || i == 65535) + continue; + /* Find next rule with num > i */ + rulenum = map[++mi]->rulenum; + while (rulenum == i) + rulenum = map[++mi]->rulenum; + } +} + +/* + * Swaps prepared (backup) index with current one. 
+ */ +static void +swap_skipto_cache(struct ip_fw_chain *chain) +{ + int *map; + + IPFW_UH_WLOCK_ASSERT(chain); + IPFW_WLOCK_ASSERT(chain); + + map = chain->idxmap; + chain->idxmap = chain->idxmap_back; + chain->idxmap_back = map; +} + +/* + * Allocate and initialize skipto cache. + */ +void +ipfw_init_skipto_cache(struct ip_fw_chain *chain) +{ + int *idxmap, *idxmap_back; + + idxmap = malloc(65536 * sizeof(uint32_t *), M_IPFW, + M_WAITOK | M_ZERO); + idxmap_back = malloc(65536 * sizeof(uint32_t *), M_IPFW, + M_WAITOK | M_ZERO); + + /* + * Note we may be called at any time after initialization, + * for example, on first skipto rule, so we need to + * provide valid chain->idxmap on return + */ + + IPFW_UH_WLOCK(chain); + if (chain->idxmap != NULL) { + IPFW_UH_WUNLOCK(chain); + free(idxmap, M_IPFW); + free(idxmap_back, M_IPFW); + return; + } + + /* Set backup pointer first to permit building cache */ + chain->idxmap_back = idxmap_back; + update_skipto_cache(chain, chain->map); + IPFW_WLOCK(chain); + /* It is now safe to set chain->idxmap ptr */ + chain->idxmap = idxmap; + swap_skipto_cache(chain); + IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); +} + +/* + * Destroys skipto cache. + */ +void +ipfw_destroy_skipto_cache(struct ip_fw_chain *chain) +{ + + if (chain->idxmap != NULL) + free(chain->idxmap, M_IPFW); + if (chain->idxmap != NULL) + free(chain->idxmap_back, M_IPFW); +} + + +/* + * allocate a new map, returns the chain locked. extra is the number + * of entries to add or delete. + */ +static struct ip_fw ** +get_map(struct ip_fw_chain *chain, int extra, int locked) +{ + + for (;;) { + struct ip_fw **map; + int i, mflags; + + mflags = M_ZERO | ((locked != 0) ? 
M_NOWAIT : M_WAITOK); + + i = chain->n_rules + extra; + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags); + if (map == NULL) { + printf("%s: cannot allocate map\n", __FUNCTION__); + return NULL; + } + if (!locked) + IPFW_UH_WLOCK(chain); + if (i >= chain->n_rules + extra) /* good */ + return map; + /* otherwise we lost the race, free and retry */ + if (!locked) + IPFW_UH_WUNLOCK(chain); + free(map, M_IPFW); + } +} + +/* + * swap the maps. It is supposed to be called with IPFW_UH_WLOCK + */ +static struct ip_fw ** +swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) +{ + struct ip_fw **old_map; + + IPFW_WLOCK(chain); + chain->id++; + chain->n_rules = new_len; + old_map = chain->map; + chain->map = new_map; + swap_skipto_cache(chain); + IPFW_WUNLOCK(chain); + return old_map; +} + + +static void +export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr) +{ + + cntr->size = sizeof(*cntr); + + if (krule->cntr != NULL) { + cntr->pcnt = counter_u64_fetch(krule->cntr); + cntr->bcnt = counter_u64_fetch(krule->cntr + 1); + cntr->timestamp = krule->timestamp; + } + if (cntr->timestamp > 0) + cntr->timestamp += boottime.tv_sec; +} + +static void +export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr) +{ + + if (krule->cntr != NULL) { + cntr->pcnt = counter_u64_fetch(krule->cntr); + cntr->bcnt = counter_u64_fetch(krule->cntr + 1); + cntr->timestamp = krule->timestamp; + } + if (cntr->timestamp > 0) + cntr->timestamp += boottime.tv_sec; +} + +/* + * Copies rule @urule from v1 userland format (current). + * to kernel @krule. + * Assume @krule is zeroed. 
+ */ +static void +import_rule1(struct rule_check_info *ci) +{ + struct ip_fw_rule *urule; + struct ip_fw *krule; + + urule = (struct ip_fw_rule *)ci->urule; + krule = (struct ip_fw *)ci->krule; + + /* copy header */ + krule->act_ofs = urule->act_ofs; + krule->cmd_len = urule->cmd_len; + krule->rulenum = urule->rulenum; + krule->set = urule->set; + krule->flags = urule->flags; + + /* Save rulenum offset */ + ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); + + /* Copy opcodes */ + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); +} + +/* + * Export rule into v1 format (Current). + * Layout: + * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT) + * [ ip_fw_rule ] OR + * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs). + * ] + * Assume @data is zeroed. + */ +static void +export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs) +{ + struct ip_fw_bcounter *cntr; + struct ip_fw_rule *urule; + ipfw_obj_tlv *tlv; + + /* Fill in TLV header */ + tlv = (ipfw_obj_tlv *)data; + tlv->type = IPFW_TLV_RULE_ENT; + tlv->length = len; + + if (rcntrs != 0) { + /* Copy counters */ + cntr = (struct ip_fw_bcounter *)(tlv + 1); + urule = (struct ip_fw_rule *)(cntr + 1); + export_cntr1_base(krule, cntr); + } else + urule = (struct ip_fw_rule *)(tlv + 1); + + /* copy header */ + urule->act_ofs = krule->act_ofs; + urule->cmd_len = krule->cmd_len; + urule->rulenum = krule->rulenum; + urule->set = krule->set; + urule->flags = krule->flags; + urule->id = krule->id; + + /* Copy opcodes */ + memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); +} + + +/* + * Copies rule @urule from FreeBSD8 userland format (v0) + * to kernel @krule. + * Assume @krule is zeroed. 
+ */ +static void +import_rule0(struct rule_check_info *ci) +{ + struct ip_fw_rule0 *urule; + struct ip_fw *krule; + int cmdlen, l; + ipfw_insn *cmd; + ipfw_insn_limit *lcmd; + ipfw_insn_if *cmdif; + + urule = (struct ip_fw_rule0 *)ci->urule; + krule = (struct ip_fw *)ci->krule; + + /* copy header */ + krule->act_ofs = urule->act_ofs; + krule->cmd_len = urule->cmd_len; + krule->rulenum = urule->rulenum; + krule->set = urule->set; + if ((urule->_pad & 1) != 0) + krule->flags |= IPFW_RULE_NOOPT; + + /* Save rulenum offset */ + ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum); + + /* Copy opcodes */ + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); + + /* + * Alter opcodes: + * 1) convert tablearg value from 65335 to 0 + * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room for targ). + * 3) convert table number in iface opcodes to u16 + */ + l = krule->cmd_len; + cmd = krule->cmd; + cmdlen = 0; + + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + switch (cmd->opcode) { + /* Opcodes supporting tablearg */ + case O_TAG: + case O_TAGGED: + case O_PIPE: + case O_QUEUE: + case O_DIVERT: + case O_TEE: + case O_SKIPTO: + case O_CALLRETURN: + case O_NETGRAPH: + case O_NGTEE: + case O_NAT: + if (cmd->arg1 == 65535) + cmd->arg1 = IP_FW_TARG; + break; + case O_SETFIB: + case O_SETDSCP: + if (cmd->arg1 == 65535) + cmd->arg1 = IP_FW_TARG; + else + cmd->arg1 |= 0x8000; + break; + case O_LIMIT: + lcmd = (ipfw_insn_limit *)cmd; + if (lcmd->conn_limit == 65535) + lcmd->conn_limit = IP_FW_TARG; + break; + /* Interface tables */ + case O_XMIT: + case O_RECV: + case O_VIA: + /* Interface table, possibly */ + cmdif = (ipfw_insn_if *)cmd; + if (cmdif->name[0] != '\1') + break; + + cmdif->p.kidx = (uint16_t)cmdif->p.glob; + break; + } + } +} + +/* + * Copies rule @krule from kernel to FreeBSD8 userland format (v0) + */ +static void +export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len) +{ + int cmdlen, l; + ipfw_insn 
*cmd; + ipfw_insn_limit *lcmd; + ipfw_insn_if *cmdif; + + /* copy header */ + memset(urule, 0, len); + urule->act_ofs = krule->act_ofs; + urule->cmd_len = krule->cmd_len; + urule->rulenum = krule->rulenum; + urule->set = krule->set; + if ((krule->flags & IPFW_RULE_NOOPT) != 0) + urule->_pad |= 1; + + /* Copy opcodes */ + memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); + + /* Export counters */ + export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt); + + /* + * Alter opcodes: + * 1) convert tablearg value from 0 to 65335 + * 2) Remove highest bit from O_SETFIB/O_SETDSCP values. + * 3) convert table number in iface opcodes to int + */ + l = urule->cmd_len; + cmd = urule->cmd; + cmdlen = 0; + + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + switch (cmd->opcode) { + /* Opcodes supporting tablearg */ + case O_TAG: + case O_TAGGED: + case O_PIPE: + case O_QUEUE: + case O_DIVERT: + case O_TEE: + case O_SKIPTO: + case O_CALLRETURN: + case O_NETGRAPH: + case O_NGTEE: + case O_NAT: + if (cmd->arg1 == IP_FW_TARG) + cmd->arg1 = 65535; + break; + case O_SETFIB: + case O_SETDSCP: + if (cmd->arg1 == IP_FW_TARG) + cmd->arg1 = 65535; + else + cmd->arg1 &= ~0x8000; + break; + case O_LIMIT: + lcmd = (ipfw_insn_limit *)cmd; + if (lcmd->conn_limit == IP_FW_TARG) + lcmd->conn_limit = 65535; + break; + /* Interface tables */ + case O_XMIT: + case O_RECV: + case O_VIA: + /* Interface table, possibly */ + cmdif = (ipfw_insn_if *)cmd; + if (cmdif->name[0] != '\1') + break; + + cmdif->p.glob = cmdif->p.kidx; + break; + } + } +} + +/* + * Add new rule(s) to the list possibly creating rule number for each. + * Update the rule_number in the input struct so the caller knows it as well. 
+ * Must be called without IPFW_UH held + */ +static int +commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) +{ + int error, i, insert_before, tcount; + uint16_t rulenum, *pnum; + struct rule_check_info *ci; + struct ip_fw *krule; + struct ip_fw **map; /* the new array of pointers */ + + /* Check if we need to do table remap */ + tcount = 0; + for (ci = rci, i = 0; i < count; ci++, i++) { + if (ci->table_opcodes == 0) + continue; + + /* + * Rule has some table opcodes. + * Reference & allocate needed tables/ + */ + error = ipfw_rewrite_table_uidx(chain, ci); + if (error != 0) { + + /* + * rewrite failed, state for current rule + * has been reverted. Check if we need to + * revert more. + */ + if (tcount > 0) { + + /* + * We have some more table rules + * we need to rollback. + */ + + IPFW_UH_WLOCK(chain); + while (ci != rci) { + ci--; + if (ci->table_opcodes == 0) + continue; + ipfw_unref_rule_tables(chain,ci->krule); + + } + IPFW_UH_WUNLOCK(chain); + + } + + return (error); + } + + tcount++; + } + + /* get_map returns with IPFW_UH_WLOCK if successful */ + map = get_map(chain, count, 0 /* not locked */); + if (map == NULL) { + if (tcount > 0) { + /* Unbind tables */ + IPFW_UH_WLOCK(chain); + for (ci = rci, i = 0; i < count; ci++, i++) { + if (ci->table_opcodes == 0) + continue; + + ipfw_unref_rule_tables(chain, ci->krule); + } + IPFW_UH_WUNLOCK(chain); + } + + return (ENOSPC); + } + + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; + + /* FIXME: Handle count > 1 */ + ci = rci; + krule = ci->krule; + rulenum = krule->rulenum; + + /* find the insertion point, we will insert before */ + insert_before = rulenum ? 
rulenum + 1 : IPFW_DEFAULT_RULE; + i = ipfw_find_rule(chain, insert_before, 0); + /* duplicate first part */ + if (i > 0) + bcopy(chain->map, map, i * sizeof(struct ip_fw *)); + map[i] = krule; + /* duplicate remaining part, we always have the default rule */ + bcopy(chain->map + i, map + i + 1, + sizeof(struct ip_fw *) *(chain->n_rules - i)); + if (rulenum == 0) { + /* Compute rule number and write it back */ + rulenum = i > 0 ? map[i-1]->rulenum : 0; + if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rulenum += V_autoinc_step; + krule->rulenum = rulenum; + /* Save number to userland rule */ + pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff); + *pnum = rulenum; + } + + krule->id = chain->id + 1; + update_skipto_cache(chain, map); + map = swap_map(chain, map, chain->n_rules + 1); + chain->static_len += RULEUSIZE0(krule); + IPFW_UH_WUNLOCK(chain); + if (map) + free(map, M_IPFW); + return (0); +} + +/* + * Adds @rule to the list of rules to reap + */ +void +ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, + struct ip_fw *rule) +{ + + IPFW_UH_WLOCK_ASSERT(chain); + + /* Unlink rule from everywhere */ + ipfw_unref_rule_tables(chain, rule); + + *((struct ip_fw **)rule) = *head; + *head = rule; +} + +/* + * Reclaim storage associated with a list of rules. This is + * typically the list created using remove_rule. + * A NULL pointer on input is handled correctly. 
+ */ +void +ipfw_reap_rules(struct ip_fw *head) +{ + struct ip_fw *rule; + + while ((rule = head) != NULL) { + head = *((struct ip_fw **)head); + free_rule(rule); + } +} + +/* + * Rules to keep are + * (default || reserved || !match_set || !match_number) + * where + * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) + * // the default rule is always protected + * + * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) + * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") + * + * match_set ::= (cmd == 0 || rule->set == set) + * // set number is ignored for cmd == 0 + * + * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) + * // number is ignored for cmd == 1 or n == 0 + * + */ +int +ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt) +{ + + /* Don't match default rule for modification queries */ + if (rule->rulenum == IPFW_DEFAULT_RULE && + (rt->flags & IPFW_RCFLAG_DEFAULT) == 0) + return (0); + + /* Don't match rules in reserved set for flush requests */ + if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET) + return (0); + + /* If we're filtering by set, don't match other sets */ + if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set) + return (0); + + if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && + (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule)) + return (0); + + return (1); +} + +/* + * Delete rules matching range @rt. + * Saves number of deleted rules in @ndel. + * + * Returns 0 on success. + */ +static int +delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel) +{ + struct ip_fw *reap, *rule, **map; + int end, start; + int i, n, ndyn, ofs; + + reap = NULL; + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + + /* + * Stage 1: Determine range to inspect. + * Range is half-inclusive, e.g [start, end). 
+ */ + start = 0; + end = chain->n_rules - 1; + + if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) { + start = ipfw_find_rule(chain, rt->start_rule, 0); + + end = ipfw_find_rule(chain, rt->end_rule, 0); + if (rt->end_rule != IPFW_DEFAULT_RULE) + while (chain->map[end]->rulenum == rt->end_rule) + end++; + } + + /* Allocate new map of the same size */ + map = get_map(chain, 0, 1 /* locked */); + if (map == NULL) { + IPFW_UH_WUNLOCK(chain); + return (ENOMEM); + } + + n = 0; + ndyn = 0; + ofs = start; + /* 1. bcopy the initial part of the map */ + if (start > 0) + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); + /* 2. copy active rules between start and end */ + for (i = start; i < end; i++) { + rule = chain->map[i]; + if (ipfw_match_range(rule, rt) == 0) { + map[ofs++] = rule; + continue; + } + + n++; + if (ipfw_is_dyn_rule(rule) != 0) + ndyn++; + } + /* 3. copy the final part of the map */ + bcopy(chain->map + end, map + ofs, + (chain->n_rules - end) * sizeof(struct ip_fw *)); + /* 4. recalculate skipto cache */ + update_skipto_cache(chain, map); + /* 5. swap the maps (under UH_WLOCK + WHLOCK) */ + map = swap_map(chain, map, chain->n_rules - n); + /* 6. Remove all dynamic states originated by deleted rules */ + if (ndyn > 0) + ipfw_expire_dyn_rules(chain, rt); + /* 7. now remove the rules deleted from the old map */ + for (i = start; i < end; i++) { + rule = map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + chain->static_len -= RULEUSIZE0(rule); + ipfw_reap_add(chain, &reap, rule); + } + IPFW_UH_WUNLOCK(chain); + + ipfw_reap_rules(reap); + if (map != NULL) + free(map, M_IPFW); + *ndel = n; + return (0); +} + +/* + * Changes set of given rule rannge @rt + * with each other. + * + * Returns 0 on success. + */ +static int +move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt) +{ + struct ip_fw *rule; + int i; + + IPFW_UH_WLOCK(chain); + + /* + * Move rules with matching paramenerts to a new set. + * This one is much more complex. 
We have to ensure + * that all referenced tables (if any) are referenced + * by given rule subset only. Otherwise, we can't move + * them to new set and have to return error. + */ + if (V_fw_tables_sets != 0) { + if (ipfw_move_tables_sets(chain, rt, rt->new_set) != 0) { + IPFW_UH_WUNLOCK(chain); + return (EBUSY); + } + } + + /* XXX: We have to do swap holding WLOCK */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + rule->set = rt->new_set; + } + + IPFW_UH_WUNLOCK(chain); + + return (0); +} + +/* + * Clear counters for a specific rule. + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops + * so we only care that rules do not disappear. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) + IPFW_ZERO_RULE_COUNTER(rule); + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/* + * Flushes rules counters and/or log values on matching range. + * + * Returns number of items cleared. 
+ */ +static int +clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only) +{ + struct ip_fw *rule; + int num; + int i; + + num = 0; + rt->flags |= IPFW_RCFLAG_DEFAULT; + + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + clear_counters(rule, log_only); + num++; + } + IPFW_UH_WUNLOCK(chain); + + return (num); +} + +static int +check_range_tlv(ipfw_range_tlv *rt) +{ + + if (rt->head.length != sizeof(*rt)) + return (1); + if (rt->start_rule > rt->end_rule) + return (1); + if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) + return (1); + + if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) + return (1); + + return (0); +} + +/* + * Delete rules matching specified parameters + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * Reply: [ ipfw_obj_header ipfw_range_tlv ] + * + * Saves number of deleted rules in ipfw_range_tlv->new_set. + * + * Returns 0 on success. + */ +static int +del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + int error, ndel; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (check_range_tlv(&rh->range) != 0) + return (EINVAL); + + ndel = 0; + if ((error = delete_range(chain, &rh->range, &ndel)) != 0) + return (error); + + /* Save number of rules deleted */ + rh->range.new_set = ndel; + return (0); +} + +/* + * Move rules/sets matching specified parameters + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * + * Returns 0 on success. 
+ */ +static int +move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (check_range_tlv(&rh->range) != 0) + return (EINVAL); + + return (move_range(chain, &rh->range)); +} + +/* + * Clear rule accounting data matching specified parameters + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * Reply: [ ipfw_obj_header ipfw_range_tlv ] + * + * Saves number of cleared rules in ipfw_range_tlv->new_set. + * + * Returns 0 on success. + */ +static int +clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + int log_only, num; + char *msg; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (check_range_tlv(&rh->range) != 0) + return (EINVAL); + + log_only = (op3->opcode == IP_FW_XRESETLOG); + + num = clear_range(chain, &rh->range, log_only); + + if (rh->range.flags & IPFW_RCFLAG_ALL) + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + else + msg = log_only ? 
"logging count reset" : "cleared"; + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + log(lev, "ipfw: %s.\n", msg); + } + + /* Save number of rules cleared */ + rh->range.new_set = num; + return (0); +} + +static void +enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt) +{ + uint32_t v_set; + + IPFW_UH_WLOCK_ASSERT(chain); + + /* Change enabled/disabled sets mask */ + v_set = (V_set_disable | rt->set) & ~rt->new_set; + v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */ + IPFW_WLOCK(chain); + V_set_disable = v_set; + IPFW_WUNLOCK(chain); +} + +static void +swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv) +{ + struct ip_fw *rule; + int i; + + IPFW_UH_WLOCK_ASSERT(chain); + + /* Swap or move two sets */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == rt->set) + rule->set = rt->new_set; + else if (rule->set == rt->new_set && mv == 0) + rule->set = rt->set; + } + if (V_fw_tables_sets != 0) + ipfw_swap_tables_sets(chain, rt->set, rt->new_set, mv); +} + +/* + * Swaps or moves set + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_range_tlv ] + * + * Returns 0 on success. + */ +static int +manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_header *rh; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); + + if (rh->range.head.length != sizeof(ipfw_range_tlv)) + return (1); + + IPFW_UH_WLOCK(chain); + switch (op3->opcode) { + case IP_FW_SET_SWAP: + case IP_FW_SET_MOVE: + swap_sets(chain, &rh->range, op3->opcode == IP_FW_SET_MOVE); + break; + case IP_FW_SET_ENABLE: + enable_sets(chain, &rh->range); + break; + } + IPFW_UH_WUNLOCK(chain); + + return (0); +} + +/** + * Remove all rules with given number, or do set manipulation. + * Assumes chain != NULL && *chain != NULL. + * + * The argument is an uint32_t. 
The low 16 bit are the rule or set number; + * the next 8 bits are the new set; the top 8 bits indicate the command: + * + * 0 delete rules numbered "rulenum" + * 1 delete rules in set "rulenum" + * 2 move rules "rulenum" to set "new_set" + * 3 move rules from set "rulenum" to set "new_set" + * 4 swap sets "rulenum" and "new_set" + * 5 delete rules "rulenum" and set "new_set" + */ +static int +del_entry(struct ip_fw_chain *chain, uint32_t arg) +{ + uint32_t num; /* rule number or old_set */ + uint8_t cmd, new_set; + int do_del, ndel; + int error = 0; + ipfw_range_tlv rt; + + num = arg & 0xffff; + cmd = (arg >> 24) & 0xff; + new_set = (arg >> 16) & 0xff; + + if (cmd > 5 || new_set > RESVD_SET) + return EINVAL; + if (cmd == 0 || cmd == 2 || cmd == 5) { + if (num >= IPFW_DEFAULT_RULE) + return EINVAL; + } else { + if (num > RESVD_SET) /* old_set */ + return EINVAL; + } + + /* Convert old requests into new representation */ + memset(&rt, 0, sizeof(rt)); + rt.start_rule = num; + rt.end_rule = num; + rt.set = num; + rt.new_set = new_set; + do_del = 0; + + switch (cmd) { + case 0: /* delete rules numbered "rulenum" */ + if (num == 0) + rt.flags |= IPFW_RCFLAG_ALL; + else + rt.flags |= IPFW_RCFLAG_RANGE; + do_del = 1; + break; + case 1: /* delete rules in set "rulenum" */ + rt.flags |= IPFW_RCFLAG_SET; + do_del = 1; + break; + case 5: /* delete rules "rulenum" and set "new_set" */ + rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET; + rt.set = new_set; + rt.new_set = 0; + do_del = 1; + break; + case 2: /* move rules "rulenum" to set "new_set" */ + rt.flags |= IPFW_RCFLAG_RANGE; + break; + case 3: /* move rules from set "rulenum" to set "new_set" */ + IPFW_UH_WLOCK(chain); + swap_sets(chain, &rt, 1); + IPFW_UH_WUNLOCK(chain); + return (0); + case 4: /* swap sets "rulenum" and "new_set" */ + IPFW_UH_WLOCK(chain); + swap_sets(chain, &rt, 0); + IPFW_UH_WUNLOCK(chain); + return (0); + default: + return (ENOTSUP); + } + + if (do_del != 0) { + if ((error = delete_range(chain, &rt, 
&ndel)) != 0) + return (error); + + if (ndel == 0 && (cmd != 1 && num != 0)) + return (EINVAL); + + return (0); + } + + return (move_range(chain, &rt)); +} + +/** + * Reset some or all counters on firewall rules. + * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, + * the next 8 bits are the set number, the top 8 bits are the command: + * 0 work with rules from all set's; + * 1 work with rules only from specified set. + * Specified rule number is zero if we want to clear all entries. + * log_only is 1 if we only want to reset logs, zero otherwise. + */ +static int +zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) +{ + struct ip_fw *rule; + char *msg; + int i; + + uint16_t rulenum = arg & 0xffff; + uint8_t set = (arg >> 16) & 0xff; + uint8_t cmd = (arg >> 24) & 0xff; + + if (cmd > 1) + return (EINVAL); + if (cmd == 1 && set > RESVD_SET) + return (EINVAL); + + IPFW_UH_RLOCK(chain); + if (rulenum == 0) { + V_norule_counter = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + /* Skip rules not in our set. */ + if (cmd == 1 && rule->set != set) + continue; + clear_counters(rule, log_only); + } + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + } else { + int cleared = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->rulenum == rulenum) { + if (cmd == 0 || rule->set == set) + clear_counters(rule, log_only); + cleared = 1; + } + if (rule->rulenum > rulenum) + break; + } + if (!cleared) { /* we did not find any matching rules */ + IPFW_UH_RUNLOCK(chain); + return (EINVAL); + } + msg = log_only ? 
"logging count reset" : "cleared"; + } + IPFW_UH_RUNLOCK(chain); + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + + if (rulenum) + log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); + else + log(lev, "ipfw: %s.\n", msg); + } + return (0); +} + + +/* + * Check rule head in FreeBSD11 format + * + */ +static int +check_ipfw_rule1(struct ip_fw_rule *rule, int size, + struct rule_check_info *ci) +{ + int l; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + + /* Check for valid cmd_len */ + l = roundup2(RULESIZE(rule), sizeof(uint64_t)); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + + if (rule->rulenum > IPFW_DEFAULT_RULE - 1) + return (EINVAL); + + return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); +} + +/* + * Check rule head in FreeBSD8 format + * + */ +static int +check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, + struct rule_check_info *ci) +{ + int l; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + + /* Check for valid cmd_len */ + l = sizeof(*rule) + rule->cmd_len * 4 - 4; + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + + if (rule->rulenum > IPFW_DEFAULT_RULE - 1) + return (EINVAL); + + return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); +} + +static int +check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) +{ + int cmdlen, l; + int have_action; + + have_action = 0; + + /* + * Now go for the individual checks. Very simple ones, basically only + * instruction sizes. 
+ */ + for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d size truncated\n", + cmd->opcode); + return EINVAL; + } + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_DIVERTED: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_SOCKARG: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_VERSRCREACH: + case O_ANTISPOOF: + case O_IPSEC: +#ifdef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: +#endif + case O_IP4: + case O_TAG: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + break; + + case O_SETFIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if ((cmd->arg1 != IP_FW_TARG) && + ((cmd->arg1 & 0x7FFFF) >= rt_numfibs)) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1 & 0x7FFFF); + return EINVAL; + } + goto check_action; + + case O_UID: + case O_GID: + case O_JAIL: + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + goto bad_size; + break; + + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + goto bad_size; + + ((ipfw_insn_log *)cmd)->log_left = + ((ipfw_insn_log *)cmd)->max_log; + + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ( !(cmdlen & 1) || cmdlen > 31) + goto bad_size; + break; + + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size %d\n", + cmd->arg1); + 
return EINVAL; + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + goto bad_size; + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (cmd->arg1 >= V_fw_tables_max) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + ci->table_opcodes++; + break; + case O_IP_FLOW_LOOKUP: + if (cmd->arg1 >= V_fw_tables_max) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + ci->table_opcodes++; + break; + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + goto bad_size; + break; + + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + case O_TCPDATALEN: + case O_TCPWIN: + case O_TAGGED: + if (cmdlen < 1 || cmdlen > 31) + goto bad_size; + break; + + case O_DSCP: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1) + goto bad_size; + break; + + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + goto bad_size; + break; + + case O_RECV: + case O_XMIT: + case O_VIA: + if (((ipfw_insn_if *)cmd)->name[0] == '\1') + ci->table_opcodes++; + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + goto bad_size; + break; + + case O_ALTQ: + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) + goto bad_size; + break; + + case O_PIPE: + case O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + goto check_action; + + case O_FORWARD_IP: + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + goto bad_size; + goto check_action; +#ifdef INET6 + case O_FORWARD_IP6: + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) + goto bad_size; + goto check_action; +#endif /* INET6 */ + + case O_DIVERT: + case O_TEE: + if (ip_divert_ptr == NULL) + return EINVAL; + else + goto check_size; + case 
O_NETGRAPH: + case O_NGTEE: + if (ng_ipfw_input_p == NULL) + return EINVAL; + else + goto check_size; + case O_NAT: + if (!IPFW_NAT_LOADED) + return EINVAL; + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) + goto bad_size; + goto check_action; + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_CHECK_STATE: + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: + case O_SETDSCP: +#ifdef INET6 + case O_UNREACH6: +#endif + case O_SKIPTO: + case O_REASS: + case O_CALLRETURN: +check_size: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; +check_action: + if (have_action) { + printf("ipfw: opcode %d, multiple actions" + " not allowed\n", + cmd->opcode); + return (EINVAL); + } + have_action = 1; + if (l != cmdlen) { + printf("ipfw: opcode %d, action must be" + " last opcode\n", + cmd->opcode); + return (EINVAL); + } + break; +#ifdef INET6 + case O_IP6_SRC: + case O_IP6_DST: + if (cmdlen != F_INSN_SIZE(struct in6_addr) + + F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FLOW6ID: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + ((ipfw_insn_u32 *)cmd)->o.arg1) + goto bad_size; + break; + + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if ( !(cmdlen & 1) || cmdlen > 127) + goto bad_size; + break; + case O_ICMP6TYPE: + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) + goto bad_size; + break; +#endif + + default: + switch (cmd->opcode) { +#ifndef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: + case O_UNREACH6: + case O_IP6_SRC: + case O_IP6_DST: + case O_FLOW6ID: + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + case O_ICMP6TYPE: + printf("ipfw: no IPv6 support in kernel\n"); + return (EPROTONOSUPPORT); +#endif + default: + printf("ipfw: opcode %d, unknown opcode\n", + cmd->opcode); + return (EINVAL); + } + } + } + if (have_action == 0) { + printf("ipfw: missing action\n"); + return (EINVAL); + } + return 0; + +bad_size: + printf("ipfw: opcode %d size %d wrong\n", + cmd->opcode, cmdlen); + return (EINVAL); +} + + 
+/* + * Translation of requests for compatibility with FreeBSD 7.2/8. + * A static variable (is7) tells us if we have an old client from userland, + * and if necessary we translate requests and responses between the + * two formats. + */ +static int is7 = 0; + +struct ip_fw7 { + struct ip_fw7 *next; /* linked list of rules */ + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + // #define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + // uint32_t id; /* rule id, only in v.8 */ + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands; RULESIZE7 subtracts this first word */ +}; + +static int convert_rule_to_7(struct ip_fw_rule0 *rule); +static int convert_rule_to_8(struct ip_fw_rule0 *rule); + +#ifndef RULESIZE7 +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) +#endif + + +/* + * Copy the static and dynamic rules to the supplied buffer + * and return the amount of space actually used. 
+ * Must be run under IPFW_UH_RLOCK + */ +static size_t +ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) +{ + char *bp = buf; + char *ep = bp + space; + struct ip_fw *rule; + struct ip_fw_rule0 *dst; + int error, i, l, warnflag; + time_t boot_seconds; + + warnflag = 0; + + boot_seconds = boottime.tv_sec; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + + if (is7) { + /* Convert rule to FreeBSd 7.2 format */ + l = RULESIZE7(rule); + if (bp + l + sizeof(uint32_t) <= ep) { + bcopy(rule, bp, l + sizeof(uint32_t)); + error = ipfw_rewrite_table_kidx(chain, + (struct ip_fw_rule0 *)bp); + if (error != 0) + return (0); + error = convert_rule_to_7((struct ip_fw_rule0 *) bp); + if (error) + return 0; /*XXX correct? */ + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, + &(((struct ip_fw7 *)bp)->next_rule), + sizeof(V_set_disable)); + if (((struct ip_fw7 *)bp)->timestamp) + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; + bp += l; + } + continue; /* go to next rule */ + } + + l = RULEUSIZE0(rule); + if (bp + l > ep) { /* should not happen */ + printf("overflow dumping static rules\n"); + break; + } + dst = (struct ip_fw_rule0 *)bp; + export_rule0(rule, dst, l); + error = ipfw_rewrite_table_kidx(chain, dst); + + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + * + * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask + * so we need to fail _after_ saving at least one mask. + */ + bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); + if (dst->timestamp) + dst->timestamp += boot_seconds; + bp += l; + + if (error != 0) { + if (error == 2) { + /* Non-fatal table rewrite error. */ + warnflag = 1; + continue; + } + printf("Stop on rule %d. 
Fail to convert table\n", + rule->rulenum); + break; + } + } + if (warnflag != 0) + printf("ipfw: process %s is using legacy interfaces," + " consider rebuilding\n", ""); + ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */ + return (bp - (char *)buf); +} + + +struct dump_args { + uint32_t b; /* start rule */ + uint32_t e; /* end rule */ + uint32_t rcount; /* number of rules */ + uint32_t rsize; /* rules size */ + uint32_t tcount; /* number of tables */ + int rcounters; /* counters */ +}; + +/* + * Dumps static rules with table TLVs in buffer @sd. + * + * Returns 0 on success. + */ +static int +dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, + uint32_t *bmask, struct sockopt_data *sd) +{ + int error; + int i, l; + uint32_t tcount; + ipfw_obj_ctlv *ctlv; + struct ip_fw *krule; + caddr_t dst; + + /* Dump table names first (if any) */ + if (da->tcount > 0) { + /* Header first */ + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); + if (ctlv == NULL) + return (ENOMEM); + ctlv->head.type = IPFW_TLV_TBLNAME_LIST; + ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) + + sizeof(*ctlv); + ctlv->count = da->tcount; + ctlv->objsize = sizeof(ipfw_obj_ntlv); + } + + i = 0; + tcount = da->tcount; + while (tcount > 0) { + if ((bmask[i / 32] & (1 << (i % 32))) == 0) { + i++; + continue; + } + + if ((error = ipfw_export_table_ntlv(chain, i, sd)) != 0) + return (error); + + i++; + tcount--; + } + + /* Dump rules */ + ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); + if (ctlv == NULL) + return (ENOMEM); + ctlv->head.type = IPFW_TLV_RULE_LIST; + ctlv->head.length = da->rsize + sizeof(*ctlv); + ctlv->count = da->rcount; + + for (i = da->b; i < da->e; i++) { + krule = chain->map[i]; + + l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv); + if (da->rcounters != 0) + l += sizeof(struct ip_fw_bcounter); + dst = (caddr_t)ipfw_get_sopt_space(sd, l); + if (dst == NULL) + return (ENOMEM); + + export_rule1(krule, dst, l, 
da->rcounters); + } + + return (0); +} + +/* + * Dumps requested objects data + * Data layout (version 0)(current): + * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags + * size = ipfw_cfg_lheader.size + * Reply: [ ipfw_cfg_lheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) + * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ] + * ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional) + * ] + * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize. + * The rest (size, count) are set to zero and needs to be ignored. + * + * Returns 0 on success. + */ +static int +dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_cfg_lheader *hdr; + struct ip_fw *rule; + size_t sz, rnum; + uint32_t hdr_flags; + int error, i; + struct dump_args da; + uint32_t *bmask; + + hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); + if (hdr == NULL) + return (EINVAL); + + error = 0; + bmask = NULL; + /* Allocate needed state */ + if (hdr->flags & IPFW_CFG_GET_STATIC) + bmask = malloc(IPFW_TABLES_MAX / 8, M_TEMP, M_WAITOK | M_ZERO); + + IPFW_UH_RLOCK(chain); + + /* + * STAGE 1: Determine size/count for objects in range. + * Prepare used tables bitmask. + */ + sz = sizeof(ipfw_cfg_lheader); + memset(&da, 0, sizeof(da)); + + da.b = 0; + da.e = chain->n_rules; + + if (hdr->end_rule != 0) { + /* Handle custom range */ + if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE) + rnum = IPFW_DEFAULT_RULE; + da.b = ipfw_find_rule(chain, rnum, 0); + rnum = hdr->end_rule; + rnum = (rnum < IPFW_DEFAULT_RULE) ? 
rnum+1 : IPFW_DEFAULT_RULE; + da.e = ipfw_find_rule(chain, rnum, 0) + 1; + } + + if (hdr->flags & IPFW_CFG_GET_STATIC) { + for (i = da.b; i < da.e; i++) { + rule = chain->map[i]; + da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); + da.rcount++; + da.tcount += ipfw_mark_table_kidx(chain, rule, bmask); + } + /* Add counters if requested */ + if (hdr->flags & IPFW_CFG_GET_COUNTERS) { + da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount; + da.rcounters = 1; + } + + if (da.tcount > 0) + sz += da.tcount * sizeof(ipfw_obj_ntlv) + + sizeof(ipfw_obj_ctlv); + sz += da.rsize + sizeof(ipfw_obj_ctlv); + } + + if (hdr->flags & IPFW_CFG_GET_STATES) + sz += ipfw_dyn_get_count() * sizeof(ipfw_obj_dyntlv) + + sizeof(ipfw_obj_ctlv); + + + /* + * Fill header anyway. + * Note we have to save header fields to stable storage + * buffer inside @sd can be flushed after dumping rules + */ + hdr->size = sz; + hdr->set_mask = ~V_set_disable; + hdr_flags = hdr->flags; + hdr = NULL; + + if (sd->valsize < sz) { + error = ENOMEM; + goto cleanup; + } + + /* STAGE2: Store actual data */ + if (hdr_flags & IPFW_CFG_GET_STATIC) { + error = dump_static_rules(chain, &da, bmask, sd); + if (error != 0) + goto cleanup; + } + + if (hdr_flags & IPFW_CFG_GET_STATES) + error = ipfw_dump_states(chain, sd); + +cleanup: + IPFW_UH_RUNLOCK(chain); + + if (bmask != NULL) + free(bmask, M_TEMP); + + return (error); +} + +static int +check_object_name(ipfw_obj_ntlv *ntlv) +{ + int error; + + switch (ntlv->head.type) { + case IPFW_TLV_TBL_NAME: + error = ipfw_check_table_name(ntlv->name); + break; + default: + error = ENOTSUP; + } + + return (0); +} + +/* + * Adds one or more rules to ipfw @chain. 
+ * Data layout (version 0)(current): + * Request: + * [ + * ip_fw3_opheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3) + * ] + * Reply: + * [ + * ip_fw3_opheader + * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) + * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] + * ] + * + * Rules in reply are modified to store their actual ruleset number. + * + * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending + * accoring to their idx field and there has to be no duplicates. + * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. + * (*3) Each ip_fw structure needs to be aligned to u64 boundary. + * + * Returns 0 on success. + */ +static int +add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_ctlv *ctlv, *rtlv, *tstate; + ipfw_obj_ntlv *ntlv; + int clen, error, idx; + uint32_t count, read; + struct ip_fw_rule *r; + struct rule_check_info rci, *ci, *cbuf; + int i, rsize; + + op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); + ctlv = (ipfw_obj_ctlv *)(op3 + 1); + + read = sizeof(ip_fw3_opheader); + rtlv = NULL; + tstate = NULL; + cbuf = NULL; + memset(&rci, 0, sizeof(struct rule_check_info)); + + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + + if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { + clen = ctlv->head.length; + /* Check size and alignment */ + if (clen > sd->valsize || clen < sizeof(*ctlv)) + return (EINVAL); + if ((clen % sizeof(uint64_t)) != 0) + return (EINVAL); + + /* + * Some table names or other named objects. + * Check for validness. + */ + count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); + if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) + return (EINVAL); + + /* + * Check each TLV. + * Ensure TLVs are sorted ascending and + * there are no duplicates. 
+ */ + idx = -1; + ntlv = (ipfw_obj_ntlv *)(ctlv + 1); + while (count > 0) { + if (ntlv->head.length != sizeof(ipfw_obj_ntlv)) + return (EINVAL); + + error = check_object_name(ntlv); + if (error != 0) + return (error); + + if (ntlv->idx <= idx) + return (EINVAL); + + idx = ntlv->idx; + count--; + ntlv++; + } + + tstate = ctlv; + read += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + } + + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + + if (ctlv->head.type == IPFW_TLV_RULE_LIST) { + clen = ctlv->head.length; + if (clen + read > sd->valsize || clen < sizeof(*ctlv)) + return (EINVAL); + if ((clen % sizeof(uint64_t)) != 0) + return (EINVAL); + + /* + * TODO: Permit adding multiple rules at once + */ + if (ctlv->count != 1) + return (ENOTSUP); + + clen -= sizeof(*ctlv); + + if (ctlv->count > clen / sizeof(struct ip_fw_rule)) + return (EINVAL); + + /* Allocate state for each rule or use stack */ + if (ctlv->count == 1) { + memset(&rci, 0, sizeof(struct rule_check_info)); + cbuf = &rci; + } else + cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP, + M_WAITOK | M_ZERO); + ci = cbuf; + + /* + * Check each rule for validness. 
+ * Ensure numbered rules are sorted ascending + * and properly aligned + */ + idx = 0; + r = (struct ip_fw_rule *)(ctlv + 1); + count = 0; + error = 0; + while (clen > 0) { + rsize = roundup2(RULESIZE(r), sizeof(uint64_t)); + if (rsize > clen || ctlv->count <= count) { + error = EINVAL; + break; + } + + ci->ctlv = tstate; + error = check_ipfw_rule1(r, rsize, ci); + if (error != 0) + break; + + /* Check sorting */ + if (r->rulenum != 0 && r->rulenum < idx) { + printf("rulenum %d idx %d\n", r->rulenum, idx); + error = EINVAL; + break; + } + idx = r->rulenum; + + ci->urule = (caddr_t)r; + + rsize = roundup2(rsize, sizeof(uint64_t)); + clen -= rsize; + r = (struct ip_fw_rule *)((caddr_t)r + rsize); + count++; + ci++; + } + + if (ctlv->count != count || error != 0) { + if (cbuf != &rci) + free(cbuf, M_TEMP); + return (EINVAL); + } + + rtlv = ctlv; + read += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + } + + if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) { + if (cbuf != NULL && cbuf != &rci) + free(cbuf, M_TEMP); + return (EINVAL); + } + + /* + * Passed rules seems to be valid. + * Allocate storage and try to add them to chain. + */ + for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) { + clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule); + ci->krule = ipfw_alloc_rule(chain, clen); + import_rule1(ci); + } + + if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) { + /* Free allocate krules */ + for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) + free(ci->krule, M_IPFW); + } + + if (cbuf != NULL && cbuf != &rci) + free(cbuf, M_TEMP); + + return (error); +} + +/* + * Lists all sopts currently registered. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ] + * + * Returns 0 on success + */ +static int +dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_lheader *olh; + ipfw_sopt_info *i; + struct ipfw_sopt_handler *sh; + uint32_t count, n, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + if (sd->valsize < olh->size) + return (EINVAL); + + CTL3_LOCK(); + count = ctl3_hsize; + size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader); + + /* Fill in header regadless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_sopt_info); + + if (size > olh->size) { + olh->size = size; + CTL3_UNLOCK(); + return (ENOMEM); + } + olh->size = size; + + for (n = 1; n <= count; n++) { + i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i)); + KASSERT(i != 0, ("previously checked buffer is not enough")); + sh = &ctl3_handlers[n]; + i->opcode = sh->opcode; + i->version = sh->version; + i->refcnt = sh->refcnt; + } + CTL3_UNLOCK(); + + return (0); +} + +/* + * Compares two sopt handlers (code, version and handler ptr). + * Used both as qsort() and bsearch(). + * Does not compare handler for latter case. + * + * Returns 0 if match is found. 
+ */ +static int +compare_sh(const void *_a, const void *_b) +{ + const struct ipfw_sopt_handler *a, *b; + + a = (const struct ipfw_sopt_handler *)_a; + b = (const struct ipfw_sopt_handler *)_b; + + if (a->opcode < b->opcode) + return (-1); + else if (a->opcode > b->opcode) + return (1); + + if (a->version < b->version) + return (-1); + else if (a->version > b->version) + return (1); + + /* bsearch helper */ + if (a->handler == NULL) + return (0); + + if ((uintptr_t)a->handler < (uintptr_t)b->handler) + return (-1); + else if ((uintptr_t)b->handler > (uintptr_t)b->handler) + return (1); + + return (0); +} + +/* + * Finds sopt handler based on @code and @version. + * + * Returns pointer to handler or NULL. + */ +static struct ipfw_sopt_handler * +find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler) +{ + struct ipfw_sopt_handler *sh, h; + + memset(&h, 0, sizeof(h)); + h.opcode = code; + h.version = version; + h.handler = handler; + + sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers, + ctl3_hsize, sizeof(h), compare_sh); + + return (sh); +} + +static int +find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh) +{ + struct ipfw_sopt_handler *sh; + + CTL3_LOCK(); + if ((sh = find_sh(opcode, version, NULL)) == NULL) { + CTL3_UNLOCK(); + printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n", + opcode, version); + return (EINVAL); + } + sh->refcnt++; + ctl3_refct++; + /* Copy handler data to requested buffer */ + *psh = *sh; + CTL3_UNLOCK(); + + return (0); +} + +static void +find_unref_sh(struct ipfw_sopt_handler *psh) +{ + struct ipfw_sopt_handler *sh; + + CTL3_LOCK(); + sh = find_sh(psh->opcode, psh->version, NULL); + KASSERT(sh != NULL, ("ctl3 handler disappeared")); + sh->refcnt--; + ctl3_refct--; + CTL3_UNLOCK(); +} + +void +ipfw_init_sopt_handler() +{ + + CTL3_LOCK_INIT(); + IPFW_ADD_SOPT_HANDLER(1, scodes); +} + +void +ipfw_destroy_sopt_handler() +{ + + IPFW_DEL_SOPT_HANDLER(1, scodes); + CTL3_LOCK_DESTROY(); +} + +/* + * 
Adds one or more sockopt handlers to the global array. + * Function may sleep. + */ +void +ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) +{ + size_t sz; + struct ipfw_sopt_handler *tmp; + + CTL3_LOCK(); + + for (;;) { + sz = ctl3_hsize + count; + CTL3_UNLOCK(); + tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO); + CTL3_LOCK(); + if (ctl3_hsize + count <= sz) + break; + + /* Retry */ + free(tmp, M_IPFW); + } + + /* Merge old & new arrays */ + sz = ctl3_hsize + count; + memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh)); + memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh)); + qsort(tmp, sz, sizeof(*sh), compare_sh); + /* Switch new and free old */ + if (ctl3_handlers != NULL) + free(ctl3_handlers, M_IPFW); + ctl3_handlers = tmp; + ctl3_hsize = sz; + ctl3_gencnt++; + + CTL3_UNLOCK(); +} + +/* + * Removes one or more sockopt handlers from the global array. + */ +int +ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) +{ + size_t sz; + struct ipfw_sopt_handler *tmp, *h; + int i; + + CTL3_LOCK(); + + for (i = 0; i < count; i++) { + tmp = &sh[i]; + h = find_sh(tmp->opcode, tmp->version, tmp->handler); + if (h == NULL) + continue; + + sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h); + memmove(h, h + 1, sz); + ctl3_hsize--; + } + + if (ctl3_hsize == 0) { + if (ctl3_handlers != NULL) + free(ctl3_handlers, M_IPFW); + ctl3_handlers = NULL; + } + + ctl3_gencnt++; + + CTL3_UNLOCK(); + + return (0); +} + +/* + * Writes data accumulated in @sd to sockopt buffer. + * Zeroes internal @sd buffer. 
+ */ +static int +ipfw_flush_sopt_data(struct sockopt_data *sd) +{ + struct sockopt *sopt; + int error; + size_t sz; + + sz = sd->koff; + if (sz == 0) + return (0); + + sopt = sd->sopt; + + if (sopt->sopt_dir == SOPT_GET) { + error = copyout(sd->kbuf, sopt->sopt_val, sz); + if (error != 0) + return (error); + } + + memset(sd->kbuf, 0, sd->ksize); + sd->ktotal += sz; + sd->koff = 0; + if (sd->ktotal + sd->ksize < sd->valsize) + sd->kavail = sd->ksize; + else + sd->kavail = sd->valsize - sd->ktotal; + + /* Update sopt buffer data */ + sopt->sopt_valsize = sd->ktotal; + sopt->sopt_val = sd->sopt_val + sd->ktotal; + + return (0); +} + +/* + * Ensures that @sd buffer has contigious @neeeded number of + * bytes. + * + * Returns pointer to requested space or NULL. + */ +caddr_t +ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed) +{ + int error; + caddr_t addr; + + if (sd->kavail < needed) { + /* + * Flush data and try another time. + */ + error = ipfw_flush_sopt_data(sd); + + if (sd->kavail < needed || error != 0) + return (NULL); + } + + addr = sd->kbuf + sd->koff; + sd->koff += needed; + sd->kavail -= needed; + return (addr); +} + +/* + * Requests @needed contigious bytes from @sd buffer. + * Function is used to notify subsystem that we are + * interesed in first @needed bytes (request header) + * and the rest buffer can be safely zeroed. + * + * Returns pointer to requested space or NULL. + */ +caddr_t +ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed) +{ + caddr_t addr; + + if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL) + return (NULL); + + if (sd->kavail > 0) + memset(sd->kbuf + sd->koff, 0, sd->kavail); + + return (addr); +} + +/* + * New sockopt handler. 
+ */ +int +ipfw_ctl3(struct sockopt *sopt) +{ + int error, locked; + size_t size, valsize; + struct ip_fw_chain *chain; + char xbuf[256]; + struct sockopt_data sdata; + struct ipfw_sopt_handler h; + ip_fw3_opheader *op3 = NULL; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); + if (error != 0) + return (error); + + if (sopt->sopt_name != IP_FW3) + return (ipfw_ctl(sopt)); + + chain = &V_layer3_chain; + error = 0; + + /* Save original valsize before it is altered via sooptcopyin() */ + valsize = sopt->sopt_valsize; + memset(&sdata, 0, sizeof(sdata)); + /* Read op3 header first to determine actual operation */ + op3 = (ip_fw3_opheader *)xbuf; + error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3)); + if (error != 0) + return (error); + sopt->sopt_valsize = valsize; + + /* + * Find and reference command. + */ + error = find_ref_sh(op3->opcode, op3->version, &h); + if (error != 0) + return (error); + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error != 0) { + find_unref_sh(&h); + return (error); + } + } + + /* + * Fill in sockopt_data structure that may be useful for + * IP_FW3 get requests. + */ + locked = 0; + if (valsize <= sizeof(xbuf)) { + /* use on-stack buffer */ + sdata.kbuf = xbuf; + sdata.ksize = sizeof(xbuf); + sdata.kavail = valsize; + } else { + + /* + * Determine opcode type/buffer size: + * allocate sliding-window buf for data export or + * contigious buffer for special ops. + */ + if ((h.dir & HDIR_SET) != 0) { + /* Set request. Allocate contigous buffer. */ + if (valsize > CTL3_LARGEBUF) { + find_unref_sh(&h); + return (EFBIG); + } + + size = valsize; + } else { + /* Get request. Allocate sliding window buffer */ + size = (valsize<CTL3_SMALLBUF) ? 
valsize:CTL3_SMALLBUF; + + if (size < valsize) { + /* We have to wire user buffer */ + error = vslock(sopt->sopt_val, valsize); + if (error != 0) + return (error); + locked = 1; + } + } + + sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + sdata.ksize = size; + sdata.kavail = size; + } + + sdata.sopt = sopt; + sdata.sopt_val = sopt->sopt_val; + sdata.valsize = valsize; + + /* + * Copy either all request (if valsize < bsize_max) + * or first bsize_max bytes to guarantee most consumers + * that all necessary data has been copied). + * Anyway, copy not less than sizeof(ip_fw3_opheader). + */ + if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize, + sizeof(ip_fw3_opheader))) != 0) + return (error); + op3 = (ip_fw3_opheader *)sdata.kbuf; + + /* Finally, run handler */ + error = h.handler(chain, op3, &sdata); + find_unref_sh(&h); + + /* Flush state and free buffers */ + if (error == 0) + error = ipfw_flush_sopt_data(&sdata); + else + ipfw_flush_sopt_data(&sdata); + + if (locked != 0) + vsunlock(sdata.sopt_val, valsize); + + /* Restore original pointer and set number of bytes written */ + sopt->sopt_val = sdata.sopt_val; + sopt->sopt_valsize = sdata.ktotal; + if (sdata.kbuf != xbuf) + free(sdata.kbuf, M_TEMP); + + return (error); +} + +/** + * {set|get}sockopt parser. + */ +int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (512*sizeof(u_int32_t)) + int error; + size_t size, valsize; + struct ip_fw *buf; + struct ip_fw_rule0 *rule; + struct ip_fw_chain *chain; + u_int32_t rulenum[2]; + uint32_t opt; + struct rule_check_info ci; + IPFW_RLOCK_TRACKER; + + chain = &V_layer3_chain; + error = 0; + + /* Save original valsize before it is altered via sooptcopyin() */ + valsize = sopt->sopt_valsize; + opt = sopt->sopt_name; + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. 
+ */ + if (opt == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error != 0) + return (error); + } + + switch (opt) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number IPFW_DEFAULT_RULE), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + * + * Note that the calculated size is used to bound the + * amount of data returned to the user. The rule set may + * change between calculating the size and returning the + * data in which case we'll just return what fits. + */ + for (;;) { + int len = 0, want; + + size = chain->static_len; + size += ipfw_dyn_len(); + if (size >= sopt->sopt_valsize) + break; + buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + IPFW_UH_RLOCK(chain); + /* check again how much space we need */ + want = chain->static_len + ipfw_dyn_len(); + if (size >= want) + len = ipfw_getrules(chain, buf, size); + IPFW_UH_RUNLOCK(chain); + if (size >= want) + error = sooptcopyout(sopt, buf, len); + free(buf, M_TEMP); + if (size >= want) + break; + } + break; + + case IP_FW_FLUSH: + /* locking is done within del_entry() */ + error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ + break; + + case IP_FW_ADD: + rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, + sizeof(struct ip_fw7) ); + + memset(&ci, 0, sizeof(struct rule_check_info)); + + /* + * If the size of commands equals RULESIZE7 then we assume + * a FreeBSD7.2 binary is talking to us (set is7=1). + * is7 is persistent so the next 'ipfw list' command + * will use this format. + * NOTE: If wrong version is guessed (this can happen if + * the first ipfw command is 'ipfw [pipe] list') + * the ipfw binary may crash or loop infinitly... 
+ */ + size = sopt->sopt_valsize; + if (size == RULESIZE7(rule)) { + is7 = 1; + error = convert_rule_to_8(rule); + if (error) { + free(rule, M_TEMP); + return error; + } + size = RULESIZE(rule); + } else + is7 = 0; + if (error == 0) + error = check_ipfw_rule0(rule, size, &ci); + if (error == 0) { + /* locking is done within add_rule() */ + struct ip_fw *krule; + krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule)); + ci.urule = (caddr_t)rule; + ci.krule = krule; + import_rule0(&ci); + error = commit_rules(chain, &ci, 1); + if (!error && sopt->sopt_dir == SOPT_GET) { + if (is7) { + error = convert_rule_to_7(rule); + size = RULESIZE7(rule); + if (error) { + free(rule, M_TEMP); + return error; + } + } + error = sooptcopyout(sopt, rule, size); + } + } + free(rule, M_TEMP); + break; + + case IP_FW_DEL: + /* + * IP_FW_DEL is used for deleting single rules or sets, + * and (ab)used to atomically manipulate sets. Argument size + * is used to distinguish between the two: + * sizeof(u_int32_t) + * delete single rule or set of rules, + * or reassign rules (or sets) to a different set. + * 2*sizeof(u_int32_t) + * atomic disable/enable sets. + * first u_int32_t contains sets to be disabled, + * second u_int32_t contains sets to be enabled. 
+ */ + error = sooptcopyin(sopt, rulenum, + 2*sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + size = sopt->sopt_valsize; + if (size == sizeof(u_int32_t) && rulenum[0] != 0) { + /* delete or reassign, locking done in del_entry() */ + error = del_entry(chain, rulenum[0]); + } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ + IPFW_UH_WLOCK(chain); + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & + ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ + IPFW_UH_WUNLOCK(chain); + } else + error = EINVAL; + break; + + case IP_FW_ZERO: + case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */ + rulenum[0] = 0; + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, rulenum, + sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + } + error = zero_entry(chain, rulenum[0], + sopt->sopt_name == IP_FW_RESETLOG); + break; + + /*--- TABLE opcodes ---*/ + case IP_FW_TABLE_ADD: + case IP_FW_TABLE_DEL: + { + ipfw_table_entry ent; + struct tentry_info tei; + struct tid_info ti; + struct table_value v; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + + memset(&tei, 0, sizeof(tei)); + tei.paddr = &ent.addr; + tei.subtype = AF_INET; + tei.masklen = ent.masklen; + ipfw_import_table_value_legacy(ent.value, &v); + tei.pvalue = &v; + memset(&ti, 0, sizeof(ti)); + ti.uidx = ent.tbl; + ti.type = IPFW_TABLE_CIDR; + + error = (opt == IP_FW_TABLE_ADD) ? 
+ add_table_entry(chain, &ti, &tei, 0, 1) : + del_table_entry(chain, &ti, &tei, 0, 1); + } + break; + + + case IP_FW_TABLE_FLUSH: + { + u_int16_t tbl; + struct tid_info ti; + + error = sooptcopyin(sopt, &tbl, + sizeof(tbl), sizeof(tbl)); + if (error) + break; + memset(&ti, 0, sizeof(ti)); + ti.uidx = tbl; + error = flush_table(chain, &ti); + } + break; + + case IP_FW_TABLE_GETSIZE: + { + u_int32_t tbl, cnt; + struct tid_info ti; + + if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), + sizeof(tbl)))) + break; + memset(&ti, 0, sizeof(ti)); + ti.uidx = tbl; + IPFW_RLOCK(chain); + error = ipfw_count_table(chain, &ti, &cnt); + IPFW_RUNLOCK(chain); + if (error) + break; + error = sooptcopyout(sopt, &cnt, sizeof(cnt)); + } + break; + + case IP_FW_TABLE_LIST: + { + ipfw_table *tbl; + struct tid_info ti; + + if (sopt->sopt_valsize < sizeof(*tbl)) { + error = EINVAL; + break; + } + size = sopt->sopt_valsize; + tbl = malloc(size, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); + if (error) { + free(tbl, M_TEMP); + break; + } + tbl->size = (size - sizeof(*tbl)) / + sizeof(ipfw_table_entry); + memset(&ti, 0, sizeof(ti)); + ti.uidx = tbl->tbl; + IPFW_RLOCK(chain); + error = ipfw_dump_table_legacy(chain, &ti, tbl); + IPFW_RUNLOCK(chain); + if (error) { + free(tbl, M_TEMP); + break; + } + error = sooptcopyout(sopt, tbl, size); + free(tbl, M_TEMP); + } + break; + + /*--- NAT operations are protected by the IPFW_LOCK ---*/ + case IP_FW_NAT_CFG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_DEL: + if (IPFW_NAT_LOADED) + error = ipfw_nat_del_ptr(sopt); + else { + printf("IP_FW_NAT_DEL: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_CONFIG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_GET_CFG: %s\n", + "ipfw_nat not 
present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_LOG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_log_ptr(sopt); + else { + printf("IP_FW_NAT_GET_LOG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + default: + printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL; + } + + return (error); +#undef RULE_MAXSIZE +} +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + +/* Functions to convert rules 7.2 <==> 8.0 */ +static int +convert_rule_to_7(struct ip_fw_rule0 *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; + /* copy of original rule, version 8 */ + struct ip_fw_rule0 *tmp; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + bcopy(rule, tmp, RULE_MAXSIZE); + + /* Copy fields */ + //rule7->_pad = tmp->_pad; + rule7->set = tmp->set; + rule7->rulenum = tmp->rulenum; + rule7->cmd_len = tmp->cmd_len; + rule7->act_ofs = tmp->act_ofs; + rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; + rule7->cmd_len = tmp->cmd_len; + rule7->pcnt = tmp->pcnt; + rule7->bcnt = tmp->bcnt; + rule7->timestamp = tmp->timestamp; + + /* Copy commands */ + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + + if (dst->opcode > O_NAT) + /* O_REASS doesn't exists in 7.2 version, so + * decrement opcode if it is after O_REASS + */ + dst->opcode--; + + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + free(tmp, M_TEMP); + + return 0; +} + +static int +convert_rule_to_8(struct ip_fw_rule0 *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; + + /* Used to copy commands */ + 
ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + /* Copy of original rule */ + struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + + bcopy(rule7, tmp, RULE_MAXSIZE); + + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + + if (dst->opcode > O_NAT) + /* O_REASS doesn't exists in 7.2 version, so + * increment opcode if it is after O_REASS + */ + dst->opcode++; + + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + + rule->_pad = tmp->_pad; + rule->set = tmp->set; + rule->rulenum = tmp->rulenum; + rule->cmd_len = tmp->cmd_len; + rule->act_ofs = tmp->act_ofs; + rule->next_rule = (struct ip_fw *)tmp->next_rule; + rule->cmd_len = tmp->cmd_len; + rule->id = 0; /* XXX see if is ok = 0 */ + rule->pcnt = tmp->pcnt; + rule->bcnt = tmp->bcnt; + rule->timestamp = tmp->timestamp; + + free (tmp, M_TEMP); + return 0; +} + +/* + * Named object api + * + */ + +/* + * Allocate new bitmask which can be used to enlarge/shrink + * named instance index. + */ +void +ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks) +{ + size_t size; + int max_blocks; + u_long *idx_mask; + + KASSERT((items % BLOCK_ITEMS) == 0, + ("bitmask size needs to power of 2 and greater or equal to %zu", + BLOCK_ITEMS)); + + max_blocks = items / BLOCK_ITEMS; + size = items / 8; + idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK); + /* Mark all as free */ + memset(idx_mask, 0xFF, size * IPFW_MAX_SETS); + *idx_mask &= ~(u_long)1; /* Skip index 0 */ + + *idx = idx_mask; + *pblocks = max_blocks; +} + +/* + * Copy current bitmask index to new one. 
+ */ +void +ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks) +{ + int old_blocks, new_blocks; + u_long *old_idx, *new_idx; + int i; + + old_idx = ni->idx_mask; + old_blocks = ni->max_blocks; + new_idx = *idx; + new_blocks = *blocks; + + for (i = 0; i < IPFW_MAX_SETS; i++) { + memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i], + old_blocks * sizeof(u_long)); + } +} + +/* + * Swaps current @ni index with new one. + */ +void +ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks) +{ + int old_blocks; + u_long *old_idx; + + old_idx = ni->idx_mask; + old_blocks = ni->max_blocks; + + ni->idx_mask = *idx; + ni->max_blocks = *blocks; + + /* Save old values */ + *idx = old_idx; + *blocks = old_blocks; +} + +void +ipfw_objhash_bitmap_free(void *idx, int blocks) +{ + + free(idx, M_IPFW); +} + +/* + * Creates named hash instance. + * Must be called without holding any locks. + * Return pointer to new instance. + */ +struct namedobj_instance * +ipfw_objhash_create(uint32_t items) +{ + struct namedobj_instance *ni; + int i; + size_t size; + + size = sizeof(struct namedobj_instance) + + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE + + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE; + + ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO); + ni->nn_size = NAMEDOBJ_HASH_SIZE; + ni->nv_size = NAMEDOBJ_HASH_SIZE; + + ni->names = (struct namedobjects_head *)(ni +1); + ni->values = &ni->names[ni->nn_size]; + + for (i = 0; i < ni->nn_size; i++) + TAILQ_INIT(&ni->names[i]); + + for (i = 0; i < ni->nv_size; i++) + TAILQ_INIT(&ni->values[i]); + + /* Set default hashing/comparison functions */ + ni->hash_f = objhash_hash_name; + ni->cmp_f = objhash_cmp_name; + + /* Allocate bitmask separately due to possible resize */ + ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, (int *)&ni->max_blocks); + + return (ni); +} + +void +ipfw_objhash_destroy(struct namedobj_instance *ni) +{ + + free(ni->idx_mask, M_IPFW); + 
free(ni, M_IPFW); +} + +void +ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, + objhash_cmp_f *cmp_f) +{ + + ni->hash_f = hash_f; + ni->cmp_f = cmp_f; +} + +static uint32_t +objhash_hash_name(struct namedobj_instance *ni, void *name, uint32_t set) +{ + + return (fnv_32_str((char *)name, FNV1_32_INIT)); +} + +static int +objhash_cmp_name(struct named_object *no, void *name, uint32_t set) +{ + + if ((strcmp(no->name, (char *)name) == 0) && (no->set == set)) + return (0); + + return (1); +} + +static uint32_t +objhash_hash_idx(struct namedobj_instance *ni, uint32_t val) +{ + uint32_t v; + + v = val % (ni->nv_size - 1); + + return (v); +} + +struct named_object * +ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name) +{ + struct named_object *no; + uint32_t hash; + + hash = ni->hash_f(ni, name, set) % ni->nn_size; + + TAILQ_FOREACH(no, &ni->names[hash], nn_next) { + if (ni->cmp_f(no, name, set) == 0) + return (no); + } + + return (NULL); +} + +struct named_object * +ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) +{ + struct named_object *no; + uint32_t hash; + + hash = objhash_hash_idx(ni, kidx); + + TAILQ_FOREACH(no, &ni->values[hash], nv_next) { + if (no->kidx == kidx) + return (no); + } + + return (NULL); +} + +int +ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, + struct named_object *b) +{ + + if ((strcmp(a->name, b->name) == 0) && a->set == b->set) + return (1); + + return (0); +} + +void +ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no) +{ + uint32_t hash; + + hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; + TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next); + + hash = objhash_hash_idx(ni, no->kidx); + TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next); + + ni->count++; +} + +void +ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no) +{ + uint32_t hash; + + hash = ni->hash_f(ni, no->name, no->set) % 
ni->nn_size; + TAILQ_REMOVE(&ni->names[hash], no, nn_next); + + hash = objhash_hash_idx(ni, no->kidx); + TAILQ_REMOVE(&ni->values[hash], no, nv_next); + + ni->count--; +} + +uint32_t +ipfw_objhash_count(struct namedobj_instance *ni) +{ + + return (ni->count); +} + +/* + * Runs @func for each found named object. + * It is safe to delete objects from callback + */ +void +ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg) +{ + struct named_object *no, *no_tmp; + int i; + + for (i = 0; i < ni->nn_size; i++) { + TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) + f(ni, no, arg); + } +} + +/* + * Removes index from given set. + * Returns 0 on success. + */ +int +ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx) +{ + u_long *mask; + int i, v; + + i = idx / BLOCK_ITEMS; + v = idx % BLOCK_ITEMS; + + if (i >= ni->max_blocks) + return (1); + + mask = &ni->idx_mask[i]; + + if ((*mask & ((u_long)1 << v)) != 0) + return (1); + + /* Mark as free */ + *mask |= (u_long)1 << v; + + /* Update free offset */ + if (ni->free_off[0] > i) + ni->free_off[0] = i; + + return (0); +} + +/* + * Allocate new index in given instance and stores in in @pidx. + * Returns 0 on success. 
+ */ +int +ipfw_objhash_alloc_idx(void *n, uint16_t *pidx) +{ + struct namedobj_instance *ni; + u_long *mask; + int i, off, v; + + ni = (struct namedobj_instance *)n; + + off = ni->free_off[0]; + mask = &ni->idx_mask[off]; + + for (i = off; i < ni->max_blocks; i++, mask++) { + if ((v = ffsl(*mask)) == 0) + continue; + + /* Mark as busy */ + *mask &= ~ ((u_long)1 << (v - 1)); + + ni->free_off[0] = i; + + v = BLOCK_ITEMS * i + v - 1; + + *pidx = v; + return (0); + } + + return (1); +} + +/* end of file */ diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.c new file mode 100644 index 0000000..2994528 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.c @@ -0,0 +1,3674 @@ +/*- + * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table.c 272840 2014-10-09 19:32:35Z melifaro $"); + +/* + * Lookup table support for ipfw. + * + * This file contains handlers for all generic tables' operations: + * add/del/flush entries, list/dump tables etc.. + * + * Table data modification is protected by both UH and runtime lock + * while reading configuration/data is protected by UH lock. + * + * Lookup algorithms for all table types are located in ip_fw_table_algo.c + */ + +#include "opt_ipfw.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/queue.h> +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> + + /* + * Table has the following `type` concepts: + * + * `no.type` represents lookup key type (addr, ifp, uid, etc..) + * vmask represents bitmask of table values which are present at the moment. + * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old + * single-value-for-all approach. 
+ */ +struct table_config { + struct named_object no; + uint8_t tflags; /* type flags */ + uint8_t locked; /* 1 if locked from changes */ + uint8_t linked; /* 1 if already linked */ + uint8_t ochanged; /* used by set swapping */ + uint8_t vshared; /* 1 if using shared value array */ + uint8_t spare[3]; + uint32_t count; /* Number of records */ + uint32_t limit; /* Max number of records */ + uint32_t vmask; /* bitmask with supported values */ + uint32_t ocount; /* used by set swapping */ + uint64_t gencnt; /* generation count */ + char tablename[64]; /* table name */ + struct table_algo *ta; /* Callbacks for given algo */ + void *astate; /* algorithm state */ + struct table_info ti_copy; /* data to put to table_info */ + struct namedobj_instance *vi; +}; + +static struct table_config *find_table(struct namedobj_instance *ni, + struct tid_info *ti); +static struct table_config *alloc_table_config(struct ip_fw_chain *ch, + struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags); +static void free_table_config(struct namedobj_instance *ni, + struct table_config *tc); +static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, + char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref); +static void link_table(struct ip_fw_chain *ch, struct table_config *tc); +static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc); +static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc); +#define OP_ADD 1 +#define OP_DEL 0 +static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, + struct sockopt_data *sd); +static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc, + ipfw_xtable_info *i); +static int dump_table_tentry(void *e, void *arg); +static int dump_table_xentry(void *e, void *arg); + +static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a, + struct tid_info *b); + +static int 
check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, + struct table_config *tc, struct table_info *ti, uint32_t count); +static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti); + +static struct table_algo *find_table_algo(struct tables_config *tableconf, + struct tid_info *ti, char *name); + +static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti); +static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti); +static int classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); + +#define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash) +#define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k])) + +#define TA_BUF_SZ 128 /* On-stack buffer for add/delete state */ + +void +rollback_toperation_state(struct ip_fw_chain *ch, void *object) +{ + struct tables_config *tcfg; + struct op_state *os; + + tcfg = CHAIN_TO_TCFG(ch); + TAILQ_FOREACH(os, &tcfg->state_list, next) + os->func(object, os); +} + +void +add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) +{ + struct tables_config *tcfg; + + tcfg = CHAIN_TO_TCFG(ch); + TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next); +} + +void +del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) +{ + struct tables_config *tcfg; + + tcfg = CHAIN_TO_TCFG(ch); + TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next); +} + +void +tc_ref(struct table_config *tc) +{ + + tc->no.refcnt++; +} + +void +tc_unref(struct table_config *tc) +{ + + tc->no.refcnt--; +} + +static struct table_value * +get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx) +{ + struct table_value *pval; + + pval = (struct table_value *)ch->valuestate; + + return (&pval[kidx]); +} + + +/* + * Checks if we're able to insert/update entry @tei into table + * w.r.t @tc limits. + * May alter @tei to indicate insertion error / insert + * options. 
+ * + * Returns 0 if operation can be performed/ + */ +static int +check_table_limit(struct table_config *tc, struct tentry_info *tei) +{ + + if (tc->limit == 0 || tc->count < tc->limit) + return (0); + + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) { + /* Notify userland on error cause */ + tei->flags |= TEI_FLAGS_LIMIT; + return (EFBIG); + } + + /* + * We have UPDATE flag set. + * Permit updating record (if found), + * but restrict adding new one since we've + * already hit the limit. + */ + tei->flags |= TEI_FLAGS_DONTADD; + + return (0); +} + +/* + * Convert algorithm callback return code into + * one of pre-defined states known by userland. + */ +static void +store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num) +{ + int flag; + + flag = 0; + + switch (error) { + case 0: + if (op == OP_ADD && num != 0) + flag = TEI_FLAGS_ADDED; + if (op == OP_DEL) + flag = TEI_FLAGS_DELETED; + break; + case ENOENT: + flag = TEI_FLAGS_NOTFOUND; + break; + case EEXIST: + flag = TEI_FLAGS_EXISTS; + break; + default: + flag = TEI_FLAGS_ERROR; + } + + tei->flags |= flag; +} + +/* + * Creates and references table with default parameters. + * Saves table config, algo and allocated kidx info @ptc, @pta and + * @pkidx if non-zero. + * Used for table auto-creation to support old binaries. + * + * Returns 0 on success. + */ +static int +create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti, + uint16_t *pkidx) +{ + ipfw_xtable_info xi; + int error; + + memset(&xi, 0, sizeof(xi)); + /* Set default value mask for legacy clients */ + xi.vmask = IPFW_VTYPE_LEGACY; + + error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1); + if (error != 0) + return (error); + + return (0); +} + +/* + * Find and reference existing table optionally + * creating new one. + * + * Saves found table config into @ptc. + * Note function may drop/acquire UH_WLOCK. + * Returns 0 if table was found/created and referenced + * or non-zero return code. 
+ */ +static int +find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint32_t count, int op, + struct table_config **ptc) +{ + struct namedobj_instance *ni; + struct table_config *tc; + uint16_t kidx; + int error; + + IPFW_UH_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + tc = NULL; + if ((tc = find_table(ni, ti)) != NULL) { + /* check table type */ + if (tc->no.type != ti->type) + return (EINVAL); + + if (tc->locked != 0) + return (EACCES); + + /* Try to exit early on limit hit */ + if (op == OP_ADD && count == 1 && + check_table_limit(tc, tei) != 0) + return (EFBIG); + + /* Reference and return */ + tc->no.refcnt++; + *ptc = tc; + return (0); + } + + if (op == OP_DEL) + return (ESRCH); + + /* Compability mode: create new table for old clients */ + if ((tei->flags & TEI_FLAGS_COMPAT) == 0) + return (ESRCH); + + IPFW_UH_WUNLOCK(ch); + error = create_table_compat(ch, ti, &kidx); + IPFW_UH_WLOCK(ch); + + if (error != 0) + return (error); + + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx)); + + /* OK, now we've got referenced table. */ + *ptc = tc; + return (0); +} + +/* + * Rolls back already @added to @tc entries using state array @ta_buf_m. + * Assume the following layout: + * 1) ADD state (ta_buf_m[0] ... t_buf_m[added - 1]) for handling update cases + * 2) DEL state (ta_buf_m[count[ ... 
t_buf_m[count + added - 1]) + * for storing deleted state + */ +static void +rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc, + struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m, + uint32_t count, uint32_t added) +{ + struct table_algo *ta; + struct tentry_info *ptei; + caddr_t v, vv; + size_t ta_buf_sz; + int error, i; + uint32_t num; + + IPFW_UH_WLOCK_ASSERT(ch); + + ta = tc->ta; + ta_buf_sz = ta->ta_buf_size; + v = ta_buf_m; + vv = v + count * ta_buf_sz; + for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) { + ptei = &tei[i]; + if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) { + + /* + * We have old value stored by previous + * call in @ptei->value. Do add once again + * to restore it. + */ + error = ta->add(tc->astate, tinfo, ptei, v, &num); + KASSERT(error == 0, ("rollback UPDATE fail")); + KASSERT(num == 0, ("rollback UPDATE fail2")); + continue; + } + + error = ta->prepare_del(ch, ptei, vv); + KASSERT(error == 0, ("pre-rollback INSERT failed")); + error = ta->del(tc->astate, tinfo, ptei, vv, &num); + KASSERT(error == 0, ("rollback INSERT failed")); + tc->count -= num; + } +} + +/* + * Prepares add/del state for all @count entries in @tei. + * Uses either stack buffer (@ta_buf) or allocates a new one. + * Stores pointer to allocated buffer back to @ta_buf. + * + * Returns 0 on success. 
+ */ +static int +prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, + struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf) +{ + caddr_t ta_buf_m, v; + size_t ta_buf_sz, sz; + struct tentry_info *ptei; + int error, i; + + error = 0; + ta_buf_sz = ta->ta_buf_size; + if (count == 1) { + /* Sigle add/delete, use on-stack buffer */ + memset(*ta_buf, 0, TA_BUF_SZ); + ta_buf_m = *ta_buf; + } else { + + /* + * Multiple adds/deletes, allocate larger buffer + * + * Note we need 2xcount buffer for add case: + * we have hold both ADD state + * and DELETE state (this may be needed + * if we need to rollback all changes) + */ + sz = count * ta_buf_sz; + ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP, + M_WAITOK | M_ZERO); + } + + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta_buf_sz) { + ptei = &tei[i]; + error = (op == OP_ADD) ? + ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v); + + /* + * Some syntax error (incorrect mask, or address, or + * anything). Return error regardless of atomicity + * settings. + */ + if (error != 0) + break; + } + + *ta_buf = ta_buf_m; + return (error); +} + +/* + * Flushes allocated state for each @count entries in @tei. + * Frees @ta_buf_m if differs from stack buffer @ta_buf. 
+ */ +static void +flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, + struct tentry_info *tei, uint32_t count, int rollback, + caddr_t ta_buf_m, caddr_t ta_buf) +{ + caddr_t v; + struct tentry_info *ptei; + size_t ta_buf_sz; + int i; + + ta_buf_sz = ta->ta_buf_size; + + /* Run cleaning callback anyway */ + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta_buf_sz) { + ptei = &tei[i]; + ta->flush_entry(ch, ptei, v); + if (ptei->ptv != NULL) { + free(ptei->ptv, M_IPFW); + ptei->ptv = NULL; + } + } + + /* Clean up "deleted" state in case of rollback */ + if (rollback != 0) { + v = ta_buf_m + count * ta_buf_sz; + for (i = 0; i < count; i++, v += ta_buf_sz) + ta->flush_entry(ch, &tei[i], v); + } + + if (ta_buf_m != ta_buf) + free(ta_buf_m, M_TEMP); +} + + +static void +rollback_add_entry(void *object, struct op_state *_state) +{ + struct ip_fw_chain *ch; + struct tableop_state *ts; + + ts = (struct tableop_state *)_state; + + if (ts->tc != object && ts->ch != object) + return; + + ch = ts->ch; + + IPFW_UH_WLOCK_ASSERT(ch); + + /* Call specifid unlockers */ + rollback_table_values(ts); + + /* Indicate we've called */ + ts->modified = 1; +} + +/* + * Adds/updates one or more entries in table @ti. + * + * Function may drop/reacquire UH wlock multiple times due to + * items alloc, algorithm callbacks (check_space), value linkage + * (new values, value storage realloc), etc.. + * Other processes like other adds (which may involve storage resize), + * table swaps (which changes table data and may change algo type), + * table modify (which may change value mask) may be executed + * simultaneously so we need to deal with it. + * + * The following approach was implemented: + * we have per-chain linked list, protected with UH lock. + * add_table_entry prepares special on-stack structure wthich is passed + * to its descendants. Users add this structure to this list before unlock. 
+ * After performing needed operations and acquiring UH lock back, each user + * checks if structure has changed. If true, it rolls local state back and + * returns without error to the caller. + * add_table_entry() on its own checks if structure has changed and restarts + * its operation from the beginning (goto restart). + * + * Functions which are modifying fields of interest (currently + * resize_shared_value_storage() and swap_tables() ) + * traverses given list while holding UH lock immediately before + * performing their operations calling function provided be list entry + * ( currently rollback_add_entry ) which performs rollback for all necessary + * state and sets appropriate values in structure indicating rollback + * has happened. + * + * Algo interaction: + * Function references @ti first to ensure table won't + * disappear or change its type. + * After that, prepare_add callback is called for each @tei entry. + * Next, we try to add each entry under UH+WHLOCK + * using add() callback. + * Finally, we free all state by calling flush_entry callback + * for each @tei. + * + * Returns 0 on success. + */ +int +add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count) +{ + struct table_config *tc; + struct table_algo *ta; + uint16_t kidx; + int error, first_error, i, rollback; + uint32_t num, numadd; + struct tentry_info *ptei; + struct tableop_state ts; + char ta_buf[TA_BUF_SZ]; + caddr_t ta_buf_m = NULL, v; + + memset(&ts, 0, sizeof(ts)); + ta = NULL; + IPFW_UH_WLOCK(ch); + + /* + * Find and reference existing table. 
+ */ +restart: + if (ts.modified != 0) { + IPFW_UH_WUNLOCK(ch); + flush_batch_buffer(ch, ta, tei, count, rollback, + ta_buf_m, ta_buf); + memset(&ts, 0, sizeof(ts)); + ta = NULL; + IPFW_UH_WLOCK(ch); + } + + error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc); + if (error != 0) { + IPFW_UH_WUNLOCK(ch); + return (error); + } + ta = tc->ta; + + /* Fill in tablestate */ + ts.ch = ch; + ts.opstate.func = rollback_add_entry; + ts.tc = tc; + ts.vshared = tc->vshared; + ts.vmask = tc->vmask; + ts.ta = ta; + ts.tei = tei; + ts.count = count; + rollback = 0; + add_toperation_state(ch, &ts); + IPFW_UH_WUNLOCK(ch); + + /* Allocate memory and prepare record(s) */ + /* Pass stack buffer by default */ + ta_buf_m = ta_buf; + error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m); + if (error != 0) + goto cleanup; + + IPFW_UH_WLOCK(ch); + /* Drop reference we've used in first search */ + tc->no.refcnt--; + + /* + * Check if table swap has happened. + * (so table algo might be changed). + * Restart operation to achieve consistent behavior. + */ + del_toperation_state(ch, &ts); + if (ts.modified != 0) + goto restart; + + /* + * Link all values values to shared/per-table value array. + * + * May release/reacquire UH_WLOCK. + */ + error = ipfw_link_table_values(ch, &ts); + if (error != 0) + goto cleanup; + if (ts.modified != 0) + goto restart; + + /* + * Ensure we are able to add all entries without additional + * memory allocations. May release/reacquire UH_WLOCK. + */ + kidx = tc->no.kidx; + error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count); + if (error != 0) + goto cleanup; + if (ts.modified != 0) + goto restart; + + /* We've got valid table in @tc. 
Let's try to add data */ + kidx = tc->no.kidx; + ta = tc->ta; + numadd = 0; + first_error = 0; + + IPFW_WLOCK(ch); + + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta->ta_buf_size) { + ptei = &tei[i]; + num = 0; + /* check limit before adding */ + if ((error = check_table_limit(tc, ptei)) == 0) { + error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx), + ptei, v, &num); + /* Set status flag to inform userland */ + store_tei_result(ptei, OP_ADD, error, num); + } + if (error == 0) { + /* Update number of records to ease limit checking */ + tc->count += num; + numadd += num; + continue; + } + + if (first_error == 0) + first_error = error; + + /* + * Some error have happened. Check our atomicity + * settings: continue if atomicity is not required, + * rollback changes otherwise. + */ + if ((flags & IPFW_CTF_ATOMIC) == 0) + continue; + + rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx), + tei, ta_buf_m, count, i); + + rollback = 1; + break; + } + + IPFW_WUNLOCK(ch); + + ipfw_garbage_table_values(ch, tc, tei, count, rollback); + + /* Permit post-add algorithm grow/rehash. */ + if (numadd != 0) + check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); + + /* Return first error to user, if any */ + error = first_error; + +cleanup: + IPFW_UH_WUNLOCK(ch); + + flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); + + return (error); +} + +/* + * Deletes one or more entries in table @ti. + * + * Returns 0 on success. + */ +int +del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count) +{ + struct table_config *tc; + struct table_algo *ta; + struct tentry_info *ptei; + uint16_t kidx; + int error, first_error, i; + uint32_t num, numdel; + char ta_buf[TA_BUF_SZ]; + caddr_t ta_buf_m, v; + + /* + * Find and reference existing table. 
+ */ + IPFW_UH_WLOCK(ch); + error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc); + if (error != 0) { + IPFW_UH_WUNLOCK(ch); + return (error); + } + ta = tc->ta; + IPFW_UH_WUNLOCK(ch); + + /* Allocate memory and prepare record(s) */ + /* Pass stack buffer by default */ + ta_buf_m = ta_buf; + error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m); + if (error != 0) + goto cleanup; + + IPFW_UH_WLOCK(ch); + + /* Drop reference we've used in first search */ + tc->no.refcnt--; + + /* + * Check if table algo is still the same. + * (changed ta may be the result of table swap). + */ + if (ta != tc->ta) { + IPFW_UH_WUNLOCK(ch); + error = EINVAL; + goto cleanup; + } + + kidx = tc->no.kidx; + numdel = 0; + first_error = 0; + + IPFW_WLOCK(ch); + v = ta_buf_m; + for (i = 0; i < count; i++, v += ta->ta_buf_size) { + ptei = &tei[i]; + num = 0; + error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, + &num); + /* Save state for userland */ + store_tei_result(ptei, OP_DEL, error, num); + if (error != 0 && first_error == 0) + first_error = error; + tc->count -= num; + numdel += num; + } + IPFW_WUNLOCK(ch); + + /* Unlink non-used values */ + ipfw_garbage_table_values(ch, tc, tei, count, 0); + + if (numdel != 0) { + /* Run post-del hook to permit shrinking */ + check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); + } + + IPFW_UH_WUNLOCK(ch); + + /* Return first error to user, if any */ + error = first_error; + +cleanup: + flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf); + + return (error); +} + +/* + * Ensure that table @tc has enough space to add @count entries without + * need for reallocation. + * + * Callbacks order: + * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize. + * + * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags. 
+ * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage + * 3) modify (UH_WLOCK + WLOCK) - switch pointers + * 4) flush_modify (UH_WLOCK) - free state, if needed + * + * Returns 0 on success. + */ +static int +check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, + struct table_config *tc, struct table_info *ti, uint32_t count) +{ + struct table_algo *ta; + uint64_t pflags; + char ta_buf[TA_BUF_SZ]; + int error; + + IPFW_UH_WLOCK_ASSERT(ch); + + error = 0; + ta = tc->ta; + if (ta->need_modify == NULL) + return (0); + + /* Acquire reference not to loose @tc between locks/unlocks */ + tc->no.refcnt++; + + /* + * TODO: think about avoiding race between large add/large delete + * operation on algorithm which implements shrinking along with + * growing. + */ + while (true) { + pflags = 0; + if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { + error = 0; + break; + } + + /* We have to shrink/grow table */ + if (ts != NULL) + add_toperation_state(ch, ts); + IPFW_UH_WUNLOCK(ch); + + memset(&ta_buf, 0, sizeof(ta_buf)); + error = ta->prepare_mod(ta_buf, &pflags); + + IPFW_UH_WLOCK(ch); + if (ts != NULL) + del_toperation_state(ch, ts); + + if (error != 0) + break; + + if (ts != NULL && ts->modified != 0) { + + /* + * Swap operation has happened + * so we're currently operating on other + * table data. Stop doing this. + */ + ta->flush_mod(ta_buf); + break; + } + + /* Check if we still need to alter table */ + ti = KIDX_TO_TI(ch, tc->no.kidx); + if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { + IPFW_UH_WUNLOCK(ch); + + /* + * Other thread has already performed resize. + * Flush our state and return. 
+ */ + ta->flush_mod(ta_buf); + break; + } + + error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags); + if (error == 0) { + /* Do actual modification */ + IPFW_WLOCK(ch); + ta->modify(tc->astate, ti, ta_buf, pflags); + IPFW_WUNLOCK(ch); + } + + /* Anyway, flush data and retry */ + ta->flush_mod(ta_buf); + } + + tc->no.refcnt--; + return (error); +} + +/* + * Adds or deletes record in table. + * Data layout (v0): + * Request: [ ip_fw3_opheader ipfw_table_xentry ] + * + * Returns 0 on success + */ +static int +manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_table_xentry *xent; + struct tentry_info tei; + struct tid_info ti; + struct table_value v; + int error, hdrlen, read; + + hdrlen = offsetof(ipfw_table_xentry, k); + + /* Check minimum header size */ + if (sd->valsize < (sizeof(*op3) + hdrlen)) + return (EINVAL); + + read = sizeof(ip_fw3_opheader); + + /* Check if xentry len field is valid */ + xent = (ipfw_table_xentry *)(op3 + 1); + if (xent->len < hdrlen || xent->len + read > sd->valsize) + return (EINVAL); + + memset(&tei, 0, sizeof(tei)); + tei.paddr = &xent->k; + tei.masklen = xent->masklen; + ipfw_import_table_value_legacy(xent->value, &v); + tei.pvalue = &v; + /* Old requests compability */ + tei.flags = TEI_FLAGS_COMPAT; + if (xent->type == IPFW_TABLE_ADDR) { + if (xent->len - hdrlen == sizeof(in_addr_t)) + tei.subtype = AF_INET; + else + tei.subtype = AF_INET6; + } + + memset(&ti, 0, sizeof(ti)); + ti.uidx = xent->tbl; + ti.type = xent->type; + + error = (op3->opcode == IP_FW_TABLE_XADD) ? + add_table_entry(ch, &ti, &tei, 0, 1) : + del_table_entry(ch, &ti, &tei, 0, 1); + + return (error); +} + +/* + * Adds or deletes record in table. 
+ * Data layout (v1)(current): + * Request: [ ipfw_obj_header + * ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ] + * ] + * + * Returns 0 on success + */ +static int +manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_tentry *tent, *ptent; + ipfw_obj_ctlv *ctlv; + ipfw_obj_header *oh; + struct tentry_info *ptei, tei, *tei_buf; + struct tid_info ti; + int error, i, kidx, read; + + /* Check minimum header size */ + if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv))) + return (EINVAL); + + /* Check if passed data is too long */ + if (sd->valsize != sd->kavail) + return (EINVAL); + + oh = (ipfw_obj_header *)sd->kbuf; + + /* Basic length checks for TLVs */ + if (oh->ntlv.head.length != sizeof(oh->ntlv)) + return (EINVAL); + + read = sizeof(*oh); + + ctlv = (ipfw_obj_ctlv *)(oh + 1); + if (ctlv->head.length + read != sd->valsize) + return (EINVAL); + + read += sizeof(*ctlv); + tent = (ipfw_obj_tentry *)(ctlv + 1); + if (ctlv->count * sizeof(*tent) + read != sd->valsize) + return (EINVAL); + + if (ctlv->count == 0) + return (0); + + /* + * Mark entire buffer as "read". + * This instructs sopt api write it back + * after function return. 
+ */ + ipfw_get_sopt_header(sd, sd->valsize); + + /* Perform basic checks for each entry */ + ptent = tent; + kidx = tent->idx; + for (i = 0; i < ctlv->count; i++, ptent++) { + if (ptent->head.length != sizeof(*ptent)) + return (EINVAL); + if (ptent->idx != kidx) + return (ENOTSUP); + } + + /* Convert data into kernel request objects */ + objheader_to_ti(oh, &ti); + ti.type = oh->ntlv.type; + ti.uidx = kidx; + + /* Use on-stack buffer for single add/del */ + if (ctlv->count == 1) { + memset(&tei, 0, sizeof(tei)); + tei_buf = &tei; + } else + tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP, + M_WAITOK | M_ZERO); + + ptei = tei_buf; + ptent = tent; + for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { + ptei->paddr = &ptent->k; + ptei->subtype = ptent->subtype; + ptei->masklen = ptent->masklen; + if (ptent->head.flags & IPFW_TF_UPDATE) + ptei->flags |= TEI_FLAGS_UPDATE; + + ipfw_import_table_value_v1(&ptent->v.value); + ptei->pvalue = (struct table_value *)&ptent->v.value; + } + + error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ? 
+ add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) : + del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count); + + /* Translate result back to userland */ + ptei = tei_buf; + ptent = tent; + for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { + if (ptei->flags & TEI_FLAGS_ADDED) + ptent->result = IPFW_TR_ADDED; + else if (ptei->flags & TEI_FLAGS_DELETED) + ptent->result = IPFW_TR_DELETED; + else if (ptei->flags & TEI_FLAGS_UPDATED) + ptent->result = IPFW_TR_UPDATED; + else if (ptei->flags & TEI_FLAGS_LIMIT) + ptent->result = IPFW_TR_LIMIT; + else if (ptei->flags & TEI_FLAGS_ERROR) + ptent->result = IPFW_TR_ERROR; + else if (ptei->flags & TEI_FLAGS_NOTFOUND) + ptent->result = IPFW_TR_NOTFOUND; + else if (ptei->flags & TEI_FLAGS_EXISTS) + ptent->result = IPFW_TR_EXISTS; + ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value); + } + + if (tei_buf != &tei) + free(tei_buf, M_TEMP); + + return (error); +} + +/* + * Looks up an entry in given table. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_obj_tentry ] + * Reply: [ ipfw_obj_header ipfw_obj_tentry ] + * + * Returns 0 on success + */ +static int +find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_tentry *tent; + ipfw_obj_header *oh; + struct tid_info ti; + struct table_config *tc; + struct table_algo *ta; + struct table_info *kti; + struct namedobj_instance *ni; + int error; + size_t sz; + + /* Check minimum header size */ + sz = sizeof(*oh) + sizeof(*tent); + if (sd->valsize != sz) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + tent = (ipfw_obj_tentry *)(oh + 1); + + /* Basic length checks for TLVs */ + if (oh->ntlv.head.length != sizeof(oh->ntlv)) + return (EINVAL); + + objheader_to_ti(oh, &ti); + ti.type = oh->ntlv.type; + ti.uidx = tent->idx; + + IPFW_UH_RLOCK(ch); + ni = CHAIN_TO_NI(ch); + + /* + * Find existing table and check its type . 
+ */ + ta = NULL; + if ((tc = find_table(ni, &ti)) == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + + /* check table type */ + if (tc->no.type != ti.type) { + IPFW_UH_RUNLOCK(ch); + return (EINVAL); + } + + kti = KIDX_TO_TI(ch, tc->no.kidx); + ta = tc->ta; + + if (ta->find_tentry == NULL) + return (ENOTSUP); + + error = ta->find_tentry(tc->astate, kti, tent); + + IPFW_UH_RUNLOCK(ch); + + return (error); +} + +/* + * Flushes all entries or destroys given table. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ] + * + * Returns 0 on success + */ +static int +flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + int error; + struct _ipfw_obj_header *oh; + struct tid_info ti; + + if (sd->valsize != sizeof(*oh)) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)op3; + objheader_to_ti(oh, &ti); + + if (op3->opcode == IP_FW_TABLE_XDESTROY) + error = destroy_table(ch, &ti); + else if (op3->opcode == IP_FW_TABLE_XFLUSH) + error = flush_table(ch, &ti); + else + return (ENOTSUP); + + return (error); +} + +static void +restart_flush(void *object, struct op_state *_state) +{ + struct tableop_state *ts; + + ts = (struct tableop_state *)_state; + + if (ts->tc != object) + return; + + /* Indicate we've called */ + ts->modified = 1; +} + +/* + * Flushes given table. + * + * Function create new table instance with the same + * parameters, swaps it with old one and + * flushes state without holding runtime WLOCK. + * + * Returns 0 on success. + */ +int +flush_table(struct ip_fw_chain *ch, struct tid_info *ti) +{ + struct namedobj_instance *ni; + struct table_config *tc; + struct table_algo *ta; + struct table_info ti_old, ti_new, *tablestate; + void *astate_old, *astate_new; + char algostate[64], *pstate; + struct tableop_state ts; + int error; + uint16_t kidx; + uint8_t tflags; + + /* + * Stage 1: save table algoritm. + * Reference found table to ensure it won't disappear. 
+ */ + IPFW_UH_WLOCK(ch); + ni = CHAIN_TO_NI(ch); + if ((tc = find_table(ni, ti)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } +restart: + /* Set up swap handler */ + memset(&ts, 0, sizeof(ts)); + ts.opstate.func = restart_flush; + ts.tc = tc; + + ta = tc->ta; + /* Do not flush readonly tables */ + if ((ta->flags & TA_FLAG_READONLY) != 0) { + IPFW_UH_WUNLOCK(ch); + return (EACCES); + } + /* Save startup algo parameters */ + if (ta->print_config != NULL) { + ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx), + algostate, sizeof(algostate)); + pstate = algostate; + } else + pstate = NULL; + tflags = tc->tflags; + tc->no.refcnt++; + add_toperation_state(ch, &ts); + IPFW_UH_WUNLOCK(ch); + + /* + * Stage 2: allocate new table instance using same algo. + */ + memset(&ti_new, 0, sizeof(struct table_info)); + error = ta->init(ch, &astate_new, &ti_new, pstate, tflags); + + /* + * Stage 3: swap old state pointers with newly-allocated ones. + * Decrease refcount. + */ + IPFW_UH_WLOCK(ch); + tc->no.refcnt--; + del_toperation_state(ch, &ts); + + if (error != 0) { + IPFW_UH_WUNLOCK(ch); + return (error); + } + + /* + * Restart operation if table swap has happened: + * even if algo may be the same, algo init parameters + * may change. Restart operation instead of doing + * complex checks. + */ + if (ts.modified != 0) { + ta->destroy(astate_new, &ti_new); + goto restart; + } + + ni = CHAIN_TO_NI(ch); + kidx = tc->no.kidx; + tablestate = (struct table_info *)ch->tablestate; + + IPFW_WLOCK(ch); + ti_old = tablestate[kidx]; + tablestate[kidx] = ti_new; + IPFW_WUNLOCK(ch); + + astate_old = tc->astate; + tc->astate = astate_new; + tc->ti_copy = ti_new; + tc->count = 0; + + /* Notify algo on real @ti address */ + if (ta->change_ti != NULL) + ta->change_ti(tc->astate, &tablestate[kidx]); + + /* + * Stage 4: unref values. + */ + ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old); + IPFW_UH_WUNLOCK(ch); + + /* + * Stage 5: perform real flush/destroy. 
+ */ + ta->destroy(astate_old, &ti_old); + + return (0); +} + +/* + * Swaps two tables. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_obj_ntlv ] + * + * Returns 0 on success + */ +static int +swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + int error; + struct _ipfw_obj_header *oh; + struct tid_info ti_a, ti_b; + + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv)) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)op3; + ntlv_to_ti(&oh->ntlv, &ti_a); + ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b); + + error = swap_tables(ch, &ti_a, &ti_b); + + return (error); +} + +/* + * Swaps two tables of the same type/valtype. + * + * Checks if tables are compatible and limits + * permits swap, than actually perform swap. + * + * Each table consists of 2 different parts: + * config: + * @tc (with name, set, kidx) and rule bindings, which is "stable". + * number of items + * table algo + * runtime: + * runtime data @ti (ch->tablestate) + * runtime cache in @tc + * algo-specific data (@tc->astate) + * + * So we switch: + * all runtime data + * number of items + * table algo + * + * After that we call @ti change handler for each table. + * + * Note that referencing @tc won't protect tc->ta from change. + * XXX: Do we need to restrict swap between locked tables? + * XXX: Do we need to exchange ftype? + * + * Returns 0 on success. + */ +static int +swap_tables(struct ip_fw_chain *ch, struct tid_info *a, + struct tid_info *b) +{ + struct namedobj_instance *ni; + struct table_config *tc_a, *tc_b; + struct table_algo *ta; + struct table_info ti, *tablestate; + void *astate; + uint32_t count; + + /* + * Stage 1: find both tables and ensure they are of + * the same type. 
+ */ + IPFW_UH_WLOCK(ch); + ni = CHAIN_TO_NI(ch); + if ((tc_a = find_table(ni, a)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + if ((tc_b = find_table(ni, b)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + + /* It is very easy to swap between the same table */ + if (tc_a == tc_b) { + IPFW_UH_WUNLOCK(ch); + return (0); + } + + /* Check type and value are the same */ + if (tc_a->no.type != tc_b->no.type || tc_a->tflags != tc_b->tflags) { + IPFW_UH_WUNLOCK(ch); + return (EINVAL); + } + + /* Check limits before swap */ + if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) || + (tc_b->limit != 0 && tc_a->count > tc_b->limit)) { + IPFW_UH_WUNLOCK(ch); + return (EFBIG); + } + + /* Check if one of the tables is readonly */ + if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) { + IPFW_UH_WUNLOCK(ch); + return (EACCES); + } + + /* Notify we're going to swap */ + rollback_toperation_state(ch, tc_a); + rollback_toperation_state(ch, tc_b); + + /* Everything is fine, prepare to swap */ + tablestate = (struct table_info *)ch->tablestate; + ti = tablestate[tc_a->no.kidx]; + ta = tc_a->ta; + astate = tc_a->astate; + count = tc_a->count; + + IPFW_WLOCK(ch); + /* a <- b */ + tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx]; + tc_a->ta = tc_b->ta; + tc_a->astate = tc_b->astate; + tc_a->count = tc_b->count; + /* b <- a */ + tablestate[tc_b->no.kidx] = ti; + tc_b->ta = ta; + tc_b->astate = astate; + tc_b->count = count; + IPFW_WUNLOCK(ch); + + /* Ensure tc.ti copies are in sync */ + tc_a->ti_copy = tablestate[tc_a->no.kidx]; + tc_b->ti_copy = tablestate[tc_b->no.kidx]; + + /* Notify both tables on @ti change */ + if (tc_a->ta->change_ti != NULL) + tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]); + if (tc_b->ta->change_ti != NULL) + tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]); + + IPFW_UH_WUNLOCK(ch); + + return (0); +} + +/* + * Destroys table specified by @ti. 
+ * Data layout (v0)(current): + * Request: [ ip_fw3_opheader ] + * + * Returns 0 on success + */ +static int +destroy_table(struct ip_fw_chain *ch, struct tid_info *ti) +{ + struct namedobj_instance *ni; + struct table_config *tc; + + IPFW_UH_WLOCK(ch); + + ni = CHAIN_TO_NI(ch); + if ((tc = find_table(ni, ti)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + + /* Do not permit destroying referenced tables */ + if (tc->no.refcnt > 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + + IPFW_WLOCK(ch); + unlink_table(ch, tc); + IPFW_WUNLOCK(ch); + + /* Free obj index */ + if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0) + printf("Error unlinking kidx %d from table %s\n", + tc->no.kidx, tc->tablename); + + /* Unref values used in tables while holding UH lock */ + ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy); + IPFW_UH_WUNLOCK(ch); + + free_table_config(ni, tc); + + return (0); +} + +static uint32_t +roundup2p(uint32_t v) +{ + + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + + return (v); +} + +/* + * Grow tables index. + * + * Returns 0 on success. + */ +int +ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables) +{ + unsigned int ntables_old, tbl; + struct namedobj_instance *ni; + void *new_idx, *old_tablestate, *tablestate; + struct table_info *ti; + struct table_config *tc; + int i, new_blocks; + + /* Check new value for validity */ + if (ntables == 0) + return (EINVAL); + if (ntables > IPFW_TABLES_MAX) + ntables = IPFW_TABLES_MAX; + /* Alight to nearest power of 2 */ + ntables = (unsigned int)roundup2p(ntables); + + /* Allocate new pointers */ + tablestate = malloc(ntables * sizeof(struct table_info), + M_IPFW, M_WAITOK | M_ZERO); + + ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks); + + IPFW_UH_WLOCK(ch); + + tbl = (ntables >= V_fw_tables_max) ? 
V_fw_tables_max : ntables; + ni = CHAIN_TO_NI(ch); + + /* Temporary restrict decreasing max_tables */ + if (ntables < V_fw_tables_max) { + + /* + * FIXME: Check if we really can shrink + */ + IPFW_UH_WUNLOCK(ch); + /* Nothing was installed: release the buffers allocated above */ + free(tablestate, M_IPFW); + ipfw_objhash_bitmap_free(new_idx, new_blocks); + return (EINVAL); + } + + /* Copy table info/indices */ + memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl); + ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks); + + IPFW_WLOCK(ch); + + /* Change pointers */ + old_tablestate = ch->tablestate; + ch->tablestate = tablestate; + ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks); + + ntables_old = V_fw_tables_max; + V_fw_tables_max = ntables; + + IPFW_WUNLOCK(ch); + + /* Notify all consumers that their @ti pointer has changed */ + ti = (struct table_info *)ch->tablestate; + for (i = 0; i < tbl; i++, ti++) { + if (ti->lookup == NULL) + continue; + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i); + if (tc == NULL || tc->ta->change_ti == NULL) + continue; + + tc->ta->change_ti(tc->astate, ti); + } + + IPFW_UH_WUNLOCK(ch); + + /* Free old pointers */ + free(old_tablestate, M_IPFW); + ipfw_objhash_bitmap_free(new_idx, new_blocks); + + return (0); +} + +/* + * Switch between "set 0" and "rule's set" table binding, + * Check all ruleset bindings and permits changing + * IFF each binding has both rule AND table in default set (set 0). + * + * Returns 0 on success. + */ +int +ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) +{ + struct namedobj_instance *ni; + struct named_object *no; + struct ip_fw *rule; + ipfw_insn *cmd; + int cmdlen, i, l; + uint16_t kidx; + uint8_t type; + + IPFW_UH_WLOCK(ch); + + if (V_fw_tables_sets == sets) { + IPFW_UH_WUNLOCK(ch); + return (0); + } + + ni = CHAIN_TO_NI(ch); + + /* + * Scan all rules and examine tables opcodes. 
+ */ + for (i = 0; i < ch->n_rules; i++) { + rule = ch->map[i]; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + + no = ipfw_objhash_lookup_kidx(ni, kidx); + + /* Check if both table object and rule has the set 0 */ + if (no->set != 0 || rule->set != 0) { + IPFW_UH_WUNLOCK(ch); + return (EBUSY); + } + + } + } + V_fw_tables_sets = sets; + + IPFW_UH_WUNLOCK(ch); + + return (0); +} + +/* + * Lookup an IP @addr in table @tbl. + * Stores found value in @val. + * + * Returns 1 if @addr was found. + */ +int +ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val) +{ + struct table_info *ti; + + ti = KIDX_TO_TI(ch, tbl); + + return (ti->lookup(ti, &addr, sizeof(in_addr_t), val)); +} + +/* + * Lookup an arbtrary key @paddr of legth @plen in table @tbl. + * Stores found value in @val. + * + * Returns 1 if key was found. + */ +int +ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, + void *paddr, uint32_t *val) +{ + struct table_info *ti; + + ti = KIDX_TO_TI(ch, tbl); + + return (ti->lookup(ti, paddr, plen, val)); +} + +/* + * Info/List/dump support for tables. + * + */ + +/* + * High-level 'get' cmds sysctl handlers + */ + +/* + * Lists all tables currently available in kernel. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ] + * + * Returns 0 on success + */ +static int +list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_lheader *olh; + int error; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + if (sd->valsize < olh->size) + return (EINVAL); + + IPFW_UH_RLOCK(ch); + error = export_tables(ch, olh, sd); + IPFW_UH_RUNLOCK(ch); + + return (error); +} + +/* + * Store table info to buffer provided by @sd. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_xtable_info(empty)] + * Reply: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success. + */ +static int +describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_header *oh; + struct table_config *tc; + struct tid_info ti; + size_t sz; + + sz = sizeof(*oh) + sizeof(ipfw_xtable_info); + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + if (oh == NULL) + return (EINVAL); + + objheader_to_ti(oh, &ti); + + IPFW_UH_RLOCK(ch); + if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + + export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1)); + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +/* + * Modifies existing table. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success + */ +static int +modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_header *oh; + ipfw_xtable_info *i; + char *tname; + struct tid_info ti; + struct namedobj_instance *ni; + struct table_config *tc; + + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)sd->kbuf; + i = (ipfw_xtable_info *)(oh + 1); + + /* + * Verify user-supplied strings. + * Check for null-terminated/zero-length strings/ + */ + tname = oh->ntlv.name; + if (ipfw_check_table_name(tname) != 0) + return (EINVAL); + + objheader_to_ti(oh, &ti); + ti.type = i->type; + + IPFW_UH_WLOCK(ch); + ni = CHAIN_TO_NI(ch); + if ((tc = find_table(ni, &ti)) == NULL) { + IPFW_UH_WUNLOCK(ch); + return (ESRCH); + } + + /* Do not support any modifications for readonly tables */ + if ((tc->ta->flags & TA_FLAG_READONLY) != 0) { + IPFW_UH_WUNLOCK(ch); + return (EACCES); + } + + if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0) + tc->limit = i->limit; + if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0) + tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0); + IPFW_UH_WUNLOCK(ch); + + return (0); +} + +/* + * Creates new table. + * Data layout (v0)(current): + * Request: [ ipfw_obj_header ipfw_xtable_info ] + * + * Returns 0 on success + */ +static int +create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_header *oh; + ipfw_xtable_info *i; + char *tname, *aname; + struct tid_info ti; + struct namedobj_instance *ni; + struct table_config *tc; + + if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) + return (EINVAL); + + oh = (struct _ipfw_obj_header *)sd->kbuf; + i = (ipfw_xtable_info *)(oh + 1); + + /* + * Verify user-supplied strings. 
+ * Check for null-terminated/zero-length strings/ + */ + tname = oh->ntlv.name; + aname = i->algoname; + if (ipfw_check_table_name(tname) != 0 || + strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname)) + return (EINVAL); + + if (aname[0] == '\0') { + /* Use default algorithm */ + aname = NULL; + } + + objheader_to_ti(oh, &ti); + ti.type = i->type; + + ni = CHAIN_TO_NI(ch); + + IPFW_UH_RLOCK(ch); + if ((tc = find_table(ni, &ti)) != NULL) { + IPFW_UH_RUNLOCK(ch); + return (EEXIST); + } + IPFW_UH_RUNLOCK(ch); + + return (create_table_internal(ch, &ti, aname, i, NULL, 0)); +} + +/* + * Creates new table based on @ti and @aname. + * + * Relies on table name checking inside find_name_tlv() + * Assume @aname to be checked and valid. + * Stores allocated table kidx inside @pkidx (if non-NULL). + * Reference created table if @compat is non-zero. + * + * Returns 0 on success. + */ +static int +create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, + char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat) +{ + struct namedobj_instance *ni; + struct table_config *tc, *tc_new, *tmp; + struct table_algo *ta; + uint16_t kidx; + + ni = CHAIN_TO_NI(ch); + + ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname); + if (ta == NULL) + return (ENOTSUP); + + tc = alloc_table_config(ch, ti, ta, aname, i->tflags); + if (tc == NULL) + return (ENOMEM); + + tc->vmask = i->vmask; + tc->limit = i->limit; + if (ta->flags & TA_FLAG_READONLY) + tc->locked = 1; + else + tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0; + + IPFW_UH_WLOCK(ch); + + /* Check if table has been already created */ + tc_new = find_table(ni, ti); + if (tc_new != NULL) { + + /* + * Compat: do not fail if we're + * requesting to create existing table + * which has the same type + */ + if (compat == 0 || tc_new->no.type != tc->no.type) { + IPFW_UH_WUNLOCK(ch); + free_table_config(ni, tc); + return (EEXIST); + } + + /* Exchange tc and tc_new for proper refcounting & freeing */ + tmp = tc; + tc = tc_new; 
+ tc_new = tmp; + } else { + /* New table */ + if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) { + IPFW_UH_WUNLOCK(ch); + /* Terminate the diagnostic with a newline so it is not glued to the next message */ + printf("Unable to allocate table index." + " Consider increasing net.inet.ip.fw.tables_max\n"); + free_table_config(ni, tc); + return (EBUSY); + } + tc->no.kidx = kidx; + + IPFW_WLOCK(ch); + link_table(ch, tc); + IPFW_WUNLOCK(ch); + } + + if (compat != 0) + tc->no.refcnt++; + if (pkidx != NULL) + *pkidx = tc->no.kidx; + + IPFW_UH_WUNLOCK(ch); + + if (tc_new != NULL) + free_table_config(ni, tc_new); + + return (0); +} + +static void +ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti) +{ + + memset(ti, 0, sizeof(struct tid_info)); + ti->set = ntlv->set; + ti->uidx = ntlv->idx; + ti->tlvs = ntlv; + ti->tlen = ntlv->head.length; +} + +static void +objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti) +{ + + ntlv_to_ti(&oh->ntlv, ti); +} + +/* + * Exports basic table info as name TLV. + * Used inside dump_static_rules() to provide info + * about all tables referenced by current ruleset. + * + * Returns 0 on success. + */ +int +ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, + struct sockopt_data *sd) +{ + struct namedobj_instance *ni; + struct named_object *no; + ipfw_obj_ntlv *ntlv; + + ni = CHAIN_TO_NI(ch); + + no = ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(no != NULL, ("invalid table kidx passed")); + + ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); + if (ntlv == NULL) + return (ENOMEM); + + ntlv->head.type = IPFW_TLV_TBL_NAME; + ntlv->head.length = sizeof(*ntlv); + ntlv->idx = no->kidx; + strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); + + return (0); +} + +/* + * Marks every table kidx used in @rule with bit in @bmask. + * Used to generate bitmask of referenced tables for given ruleset. + * + * Returns number of newly-referenced tables. 
+ */ +int +ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, + uint32_t *bmask) +{ + int cmdlen, l, count; + ipfw_insn *cmd; + uint16_t kidx; + uint8_t type; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + count = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + + if ((bmask[kidx / 32] & (1 << (kidx % 32))) == 0) + count++; + + bmask[kidx / 32] |= 1 << (kidx % 32); + } + + return (count); +} + +struct dump_args { + struct ip_fw_chain *ch; + struct table_info *ti; + struct table_config *tc; + struct sockopt_data *sd; + uint32_t cnt; + uint16_t uidx; + int error; + uint32_t size; + ipfw_table_entry *ent; + ta_foreach_f *f; + void *farg; + ipfw_obj_tentry tent; +}; + +static int +count_ext_entries(void *e, void *arg) +{ + struct dump_args *da; + + da = (struct dump_args *)arg; + da->cnt++; + + return (0); +} + +/* + * Gets number of items from table either using + * internal counter or calling algo callback for + * externally-managed tables. + * + * Returns number of records. + */ +static uint32_t +table_get_count(struct ip_fw_chain *ch, struct table_config *tc) +{ + struct table_info *ti; + struct table_algo *ta; + struct dump_args da; + + ti = KIDX_TO_TI(ch, tc->no.kidx); + ta = tc->ta; + + /* Use internal counter for self-managed tables */ + if ((ta->flags & TA_FLAG_READONLY) == 0) + return (tc->count); + + /* Use callback to quickly get number of items */ + if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0) + return (ta->get_count(tc->astate, ti)); + + /* Count number of iterms ourselves */ + memset(&da, 0, sizeof(da)); + ta->foreach(tc->astate, ti, count_ext_entries, &da); + + return (da.cnt); +} + +/* + * Exports table @tc info into standard ipfw_xtable_info format. 
+ */ +static void +export_table_info(struct ip_fw_chain *ch, struct table_config *tc, + ipfw_xtable_info *i) +{ + struct table_info *ti; + struct table_algo *ta; + + i->type = tc->no.type; + i->tflags = tc->tflags; + i->vmask = tc->vmask; + i->set = tc->no.set; + i->kidx = tc->no.kidx; + i->refcnt = tc->no.refcnt; + i->count = table_get_count(ch, tc); + i->limit = tc->limit; + i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0; + /* Size the dump from the reported count so both fields agree */ + i->size = i->count * sizeof(ipfw_obj_tentry); + i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); + strlcpy(i->tablename, tc->tablename, sizeof(i->tablename)); + ti = KIDX_TO_TI(ch, tc->no.kidx); + ta = tc->ta; + if (ta->print_config != NULL) { + /* Use algo function to print table config to string */ + ta->print_config(tc->astate, ti, i->algoname, + sizeof(i->algoname)); + } else + strlcpy(i->algoname, ta->name, sizeof(i->algoname)); + /* Dump algo-specific data, if possible */ + if (ta->dump_tinfo != NULL) { + ta->dump_tinfo(tc->astate, ti, &i->ta_info); + i->ta_info.flags |= IPFW_TATFLAGS_DATA; + } +} + +struct dump_table_args { + struct ip_fw_chain *ch; + struct sockopt_data *sd; +}; + +static void +export_table_internal(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + ipfw_xtable_info *i; + struct dump_table_args *dta; + + dta = (struct dump_table_args *)arg; + + i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i)); + KASSERT(i != 0, ("previously checked buffer is not enough")); + + export_table_info(dta->ch, (struct table_config *)no, i); +} + +/* + * Export all tables as ipfw_xtable_info structures to + * storage provided by @sd. + * + * If supplied buffer is too small, fills in required size + * and returns ENOMEM. + * Returns 0 on success. 
+ */ +static int +export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, + struct sockopt_data *sd) +{ + uint32_t size; + uint32_t count; + struct dump_table_args dta; + + count = ipfw_objhash_count(CHAIN_TO_NI(ch)); + size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader); + + /* Fill in header regardless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_xtable_info); + + if (size > olh->size) { + olh->size = size; + return (ENOMEM); + } + + olh->size = size; + + dta.ch = ch; + dta.sd = sd; + + ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta); + + return (0); +} + +/* + * Dumps all table data + * Data layout (v1)(current): + * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size + * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ] + * + * Returns 0 on success, EINVAL if the request header cannot be + * obtained, ESRCH if the table is not found and ENOMEM if the + * supplied buffer is too small. + */ +static int +dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_header *oh; + ipfw_xtable_info *i; + struct tid_info ti; + struct table_config *tc; + struct table_algo *ta; + struct dump_args da; + uint32_t sz; + + sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); + oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); + if (oh == NULL) + return (EINVAL); + + /* Table info record immediately follows the object header */ + i = (ipfw_xtable_info *)(oh + 1); + objheader_to_ti(oh, &ti); + + IPFW_UH_RLOCK(ch); + if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { + IPFW_UH_RUNLOCK(ch); + return (ESRCH); + } + export_table_info(ch, tc, i); + + if (sd->valsize < i->size) { + + /* + * Submitted buffer size is not enough. + * We've already filled in @i structure with + * relevant table info including size, so we + * can return. Buffer will be flushed automatically. 
+ */ + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + + /* + * Do the actual dump in eXtended format + */ + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.ti = KIDX_TO_TI(ch, tc->no.kidx); + da.tc = tc; + da.sd = sd; + + ta = tc->ta; + + ta->foreach(tc->astate, da.ti, dump_table_tentry, &da); + IPFW_UH_RUNLOCK(ch); + + /* dump_table_tentry() sets da.error when the buffer runs out */ + return (da.error); +} + +/* + * Dumps all table data + * Data layout (version 0)(legacy): + * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE() + * Reply: [ ipfw_xtable ipfw_table_xentry x N ] + * + * Returns 0 on success (including the case when the table does not + * exist), EINVAL if the request header cannot be obtained and ENOMEM + * if the supplied buffer is too small. + */ +static int +dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_xtable *xtbl; + struct tid_info ti; + struct table_config *tc; + struct table_algo *ta; + struct dump_args da; + size_t sz, count; + + xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable)); + if (xtbl == NULL) + return (EINVAL); + + memset(&ti, 0, sizeof(ti)); + ti.uidx = xtbl->tbl; + + IPFW_UH_RLOCK(ch); + if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { + IPFW_UH_RUNLOCK(ch); + /* Missing table is not reported as an error here */ + return (0); + } + count = table_get_count(ch, tc); + sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable); + + xtbl->cnt = count; + xtbl->size = sz; + xtbl->type = tc->no.type; + xtbl->tbl = ti.uidx; + + if (sd->valsize < sz) { + + /* + * Submitted buffer size is not enough. + * We've already filled in @xtbl header with + * relevant table info including size, so we + * can return. Buffer will be flushed automatically. + */ + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + + /* Do the actual dump in eXtended format */ + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.ti = KIDX_TO_TI(ch, tc->no.kidx); + da.tc = tc; + da.sd = sd; + + ta = tc->ta; + + ta->foreach(tc->astate, da.ti, dump_table_xentry, &da); + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +/* + * Legacy function to retrieve number of items in table. 
+ */ +static int +get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + uint32_t *tbl; + struct tid_info ti; + size_t sz; + int error; + + sz = sizeof(*op3) + sizeof(uint32_t); + op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz); + if (op3 == NULL) + return (EINVAL); + + /* + * The 32-bit word after the opheader carries the table index on + * input and is overwritten with the computed size on output. + */ + tbl = (uint32_t *)(op3 + 1); + memset(&ti, 0, sizeof(ti)); + ti.uidx = *tbl; + IPFW_UH_RLOCK(ch); + error = ipfw_count_xtable(ch, &ti, tbl); + IPFW_UH_RUNLOCK(ch); + return (error); +} + +/* + * Legacy IP_FW_TABLE_GETSIZE handler + * + * Stores number of table records in @cnt. + * Returns 0 on success or ESRCH if the table is not found. + */ +int +ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) +{ + struct table_config *tc; + + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) + return (ESRCH); + *cnt = table_get_count(ch, tc); + return (0); +} + +/* + * Legacy IP_FW_TABLE_XGETSIZE handler + * + * Stores size in bytes in @cnt: one ipfw_table_xentry per record, + * plus the ipfw_xtable header when the table is not empty. + * Always returns 0; a missing table yields size 0. + */ +int +ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) +{ + struct table_config *tc; + uint32_t count; + + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) { + *cnt = 0; + return (0); /* 'table all list' requires success */ + } + + count = table_get_count(ch, tc); + *cnt = count * sizeof(ipfw_table_xentry); + if (count > 0) + *cnt += sizeof(ipfw_xtable); + return (0); +} + +/* + * Dumps one table record (@e) as ipfw_table_entry into the array + * tracked by struct dump_args (@arg). + * Returns 0 on success, 1 if the array is full, or the error + * returned by the algo's dump_tentry callback. + */ +static int +dump_table_entry(void *e, void *arg) +{ + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + ipfw_table_entry *ent; + struct table_value *pval; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + /* Output array is full, returning */ + if (da->cnt == da->size) + return (1); + ent = da->ent++; + ent->tbl = da->uidx; + da->cnt++; + + error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); + if (error != 0) + return (error); + + ent->addr = da->tent.k.addr.s_addr; + ent->masklen = da->tent.masklen; + pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); + ent->value = ipfw_export_table_value_legacy(pval); + + return (0); +} + +/* + * Dumps table in pre-8.1 legacy format. 
+ */ +int +ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, + ipfw_table *tbl) +{ + struct table_config *tc; + struct table_algo *ta; + struct dump_args da; + + tbl->cnt = 0; + + if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) + return (0); /* XXX: We should return ESRCH */ + + ta = tc->ta; + + /* This dump format supports IPv4 only */ + if (tc->no.type != IPFW_TABLE_ADDR) + return (0); + + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.ti = KIDX_TO_TI(ch, tc->no.kidx); + da.tc = tc; + /* Output array and its capacity are supplied by the caller in @tbl */ + da.ent = &tbl->ent[0]; + da.size = tbl->size; + + tbl->cnt = 0; + ta->foreach(tc->astate, da.ti, dump_table_entry, &da); + /* Number of entries dump_table_entry() has consumed */ + tbl->cnt = da.cnt; + + return (0); +} + +/* + * Dumps table entry in eXtended format (v1)(current). + * + * Returns 0 on success, 1 (with da->error set to ENOMEM) when the + * reply buffer has no room left, or the error returned by the + * algo's dump_tentry callback. + */ +static int +dump_table_tentry(void *e, void *arg) +{ + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + struct table_value *pval; + ipfw_obj_tentry *tent; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent)); + /* Out of memory, returning */ + if (tent == NULL) { + da->error = ENOMEM; + return (1); + } + tent->head.length = sizeof(ipfw_obj_tentry); + tent->idx = da->uidx; + + error = ta->dump_tentry(tc->astate, da->ti, e, tent); + if (error != 0) + return (error); + + /* The algo stored a value index in v.kidx; export the value itself */ + pval = get_table_value(da->ch, da->tc, tent->v.kidx); + ipfw_export_table_value_v1(pval, &tent->v.value); + + return (0); +} + +/* + * Dumps table entry in eXtended format (v0). 
+ */ +static int +dump_table_xentry(void *e, void *arg) +{ + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + ipfw_table_xentry *xent; + ipfw_obj_tentry *tent; + struct table_value *pval; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent)); + /* Out of memory, returning (note: da->error is not set here) */ + if (xent == NULL) + return (1); + xent->len = sizeof(ipfw_table_xentry); + xent->tbl = da->uidx; + + /* Dump into the scratch tentry kept inside @da first */ + memset(&da->tent, 0, sizeof(da->tent)); + tent = &da->tent; + error = ta->dump_tentry(tc->astate, da->ti, e, tent); + if (error != 0) + return (error); + + /* Convert current format to previous one */ + xent->masklen = tent->masklen; + pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); + xent->value = ipfw_export_table_value_legacy(pval); + /* + * IPv4 keys of address tables are stored in the last 32 bits + * of the addr6 field and flagged with IPFW_TCF_INET; + * every other key is copied as is. + */ + if (tc->no.type == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { + xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr; + xent->flags = IPFW_TCF_INET; + } else + memcpy(&xent->k, &tent->k, sizeof(xent->k)); + + return (0); +} + +/* + * Helper function to export table algo data + * to tentry format before calling user function. + * + * Returns 0 on success. + */ +static int +prepare_table_tentry(void *e, void *arg) +{ + struct dump_args *da; + struct table_config *tc; + struct table_algo *ta; + int error; + + da = (struct dump_args *)arg; + + tc = da->tc; + ta = tc->ta; + + error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); + if (error != 0) + return (error); + + /* Return value of the user callback is ignored */ + da->f(&da->tent, da->farg); + + return (0); +} + +/* + * Allow external consumers to read table entries in standard format. 
+ */ +int +ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, + ta_foreach_f *f, void *arg) +{ + struct namedobj_instance *ni; + struct table_config *tc; + struct table_algo *ta; + struct dump_args da; + + ni = CHAIN_TO_NI(ch); + + tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); + if (tc == NULL) + return (ESRCH); + + ta = tc->ta; + + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.ti = KIDX_TO_TI(ch, tc->no.kidx); + da.tc = tc; + da.f = f; + da.farg = arg; + + /* prepare_table_tentry() converts each record and hands it to @f */ + ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da); + + return (0); +} + +/* + * Table algorithms + */ + +/* + * Finds algorithm by index, table type or supplied name. + * + * Lookup order: algo index (@ti->atype) if it is non-zero, then the + * default algo for @ti->type if @name is NULL, otherwise a search + * by the first word of @name. + * + * Returns pointer to algo or NULL. + */ +static struct table_algo * +find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name) +{ + int i, l; + struct table_algo *ta; + + if (ti->type > IPFW_TABLE_MAXTYPE) + return (NULL); + + /* Search by index */ + if (ti->atype != 0) { + if (ti->atype > tcfg->algo_count) + return (NULL); + return (tcfg->algo[ti->atype]); + } + + if (name == NULL) { + /* Return default algorithm for given type if set */ + return (tcfg->def_algo[ti->type]); + } + + /* Search by name */ + /* TODO: better search */ + for (i = 1; i <= tcfg->algo_count; i++) { + ta = tcfg->algo[i]; + + /* + * One can supply additional algorithm + * parameters so we compare only the first word + * of supplied name: + * 'addr:chash hsize=32' + * '^^^^^^^^^' + * + */ + l = strlen(ta->name); + if (strncmp(name, ta->name, l) != 0) + continue; + /* The matched prefix must end the word: NUL or a space */ + if (name[l] != '\0' && name[l] != ' ') + continue; + /* Check if we're requesting proper table type */ + if (ti->type != 0 && ti->type != ta->type) + return (NULL); + return (ta); + } + + return (NULL); +} + +/* + * Register new table algo @ta. + * Stores algo id inside @idx. + * + * Returns 0 on success. 
+ */ +int +ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size, + uint32_t *idx) +{ + struct tables_config *tcfg; + struct table_algo *ta_new; + size_t sz; + + /* Caller's structure must not be larger than the one known here */ + if (size > sizeof(struct table_algo)) + return (EINVAL); + + /* Check for the required on-stack size for add/del */ + sz = roundup2(ta->ta_buf_size, sizeof(void *)); + if (sz > TA_BUF_SZ) + return (EINVAL); + + KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE")); + + /* + * Copy algorithm data to stable storage. + * Only @size bytes are copied; the rest stays zeroed (M_ZERO). + */ + ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO); + memcpy(ta_new, ta, size); + + tcfg = CHAIN_TO_TCFG(ch); + + KASSERT(tcfg->algo_count < 255, ("Increase algo array size")); + + /* The count is bumped first, so the new algo id equals the new count */ + tcfg->algo[++tcfg->algo_count] = ta_new; + ta_new->idx = tcfg->algo_count; + + /* Set algorithm as default one for given type */ + if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 && + tcfg->def_algo[ta_new->type] == NULL) + tcfg->def_algo[ta_new->type] = ta_new; + + *idx = ta_new->idx; + + return (0); +} + +/* + * Unregisters table algo using @idx as id. + * XXX: It is NOT safe to call this function in any place + * other than ipfw instance destroy handler. + */ +void +ipfw_del_table_algo(struct ip_fw_chain *ch, int idx) +{ + struct tables_config *tcfg; + struct table_algo *ta; + + tcfg = CHAIN_TO_TCFG(ch); + + KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d", + idx, tcfg->algo_count)); + + ta = tcfg->algo[idx]; + KASSERT(ta != NULL, ("algo idx %d is NULL", idx)); + + if (tcfg->def_algo[ta->type] == ta) + tcfg->def_algo[ta->type] = NULL; + + /* NOTE(review): tcfg->algo[idx] keeps the freed pointer and algo_count is not changed */ + free(ta, M_IPFW); +} + +/* + * Lists all table algorithms currently available. 
+ * Data layout (v0)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ] + * + * Returns 0 on success + */ +static int +list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_lheader *olh; + struct tables_config *tcfg; + ipfw_ta_info *i; + struct table_algo *ta; + uint32_t count, n, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + /* Size claimed in the header must not exceed the supplied buffer */ + if (sd->valsize < olh->size) + return (EINVAL); + + IPFW_UH_RLOCK(ch); + tcfg = CHAIN_TO_TCFG(ch); + count = tcfg->algo_count; + size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader); + + /* Fill in header regardless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_ta_info); + + if (size > olh->size) { + olh->size = size; + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + olh->size = size; + + /* Algo slots are numbered from 1 to algo_count */ + for (n = 1; n <= count; n++) { + i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i)); + KASSERT(i != 0, ("previously checked buffer is not enough")); + ta = tcfg->algo[n]; + strlcpy(i->algoname, ta->name, sizeof(i->algoname)); + i->type = ta->type; + i->refcnt = ta->refcnt; + } + + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +/* + * Tables rewriting code + */ + +/* + * Determine table number and lookup type for @cmd. + * Fill @puidx and @ptype with appropriate values. + * Returns 0 for relevant opcodes, 1 otherwise. + */ +static int +classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +{ + ipfw_insn_if *cmdif; + int skip; + uint16_t v; + + skip = 1; + + switch (cmd->opcode) { + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + /* Basic IPv4/IPv6 or u32 lookups */ + *puidx = cmd->arg1; + /* Assume ADDR by default */ + *ptype = IPFW_TABLE_ADDR; + skip = 0; + + if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) { + /* + * generic lookup. The key must be + * in 32bit big-endian format. 
+ */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + /* Values not listed below keep the ADDR default */ + switch (v) { + case 0: + case 1: + /* IPv4 src/dst */ + break; + case 2: + case 3: + /* src/dst port */ + *ptype = IPFW_TABLE_NUMBER; + break; + case 4: + /* uid/gid */ + *ptype = IPFW_TABLE_NUMBER; + break; + case 5: + /* jid */ + *ptype = IPFW_TABLE_NUMBER; + break; + case 6: + /* dscp */ + *ptype = IPFW_TABLE_NUMBER; + break; + } + } + break; + case O_XMIT: + case O_RECV: + case O_VIA: + /* Interface table, possibly */ + cmdif = (ipfw_insn_if *)cmd; + /* Only names starting with '\1' are treated as table references */ + if (cmdif->name[0] != '\1') + break; + + *ptype = IPFW_TABLE_INTERFACE; + *puidx = cmdif->p.kidx; + skip = 0; + break; + case O_IP_FLOW_LOOKUP: + *puidx = cmd->arg1; + *ptype = IPFW_TABLE_FLOW; + skip = 0; + break; + } + + return (skip); +} + +/* + * Sets new table value for given opcode. + * Assume the same opcodes as classify_table_opcode() + */ +static void +update_table_opcode(ipfw_insn *cmd, uint16_t idx) +{ + ipfw_insn_if *cmdif; + + switch (cmd->opcode) { + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + /* Basic IPv4/IPv6 or u32 lookups */ + cmd->arg1 = idx; + break; + case O_XMIT: + case O_RECV: + case O_VIA: + /* + * Interface table, possibly. + * NOTE(review): name[0] is not checked here, unlike in + * classify_table_opcode(); callers in this file classify first. + */ + cmdif = (ipfw_insn_if *)cmd; + cmdif->p.kidx = idx; + break; + case O_IP_FLOW_LOOKUP: + cmd->arg1 = idx; + break; + } +} + +/* + * Checks table name for validity. + * Enforce basic length checks, the rest + * should be done in userland. + * + * Returns 0 if name is considered valid. + */ +int +ipfw_check_table_name(char *name) +{ + int nsize; + ipfw_obj_ntlv *ntlv = NULL; + + /* @ntlv is used for sizeof() only and is never dereferenced */ + nsize = sizeof(ntlv->name); + + /* No terminating NUL within the first nsize bytes: name is too long */ + if (strnlen(name, nsize) == nsize) + return (EINVAL); + + /* Empty names are not allowed */ + if (name[0] == '\0') + return (EINVAL); + + /* + * TODO: do some more complicated checks + */ + + return (0); +} + +/* + * Find tablename TLV by @uid. + * Check @tlvs for valid data inside. + * + * Returns pointer to found TLV or NULL. 
+ */ +static ipfw_obj_ntlv * +find_name_tlv(void *tlvs, int len, uint16_t uidx) +{ + ipfw_obj_ntlv *ntlv; + uintptr_t pa, pe; + int l; + + pa = (uintptr_t)tlvs; + pe = pa + len; + l = 0; + /* NOTE(review): head.length is read before checking that a whole TLV fits below @pe */ + for (; pa < pe; pa += l) { + ntlv = (ipfw_obj_ntlv *)pa; + l = ntlv->head.length; + + /* Every record must have the exact ntlv size, otherwise give up */ + if (l != sizeof(*ntlv)) + return (NULL); + + if (ntlv->head.type != IPFW_TLV_TBL_NAME) + continue; + + if (ntlv->idx != uidx) + continue; + + /* First TLV with matching index decides: a bad name ends the search */ + if (ipfw_check_table_name(ntlv->name) != 0) + return (NULL); + + return (ntlv); + } + + return (NULL); +} + +/* + * Finds table config based on either legacy index + * or name in ntlv. + * Note @ti structure contains unchecked data from userland. + * + * Returns pointer to table_config or NULL. + */ +static struct table_config * +find_table(struct namedobj_instance *ni, struct tid_info *ti) +{ + char *name, bname[16]; + struct named_object *no; + ipfw_obj_ntlv *ntlv; + uint32_t set; + + if (ti->tlvs != NULL) { + ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); + if (ntlv == NULL) + return (NULL); + name = ntlv->name; + + /* + * Use set provided by @ti instead of @ntlv one. + * This is needed due to different sets behavior + * controlled by V_fw_tables_sets. + */ + set = ti->set; + } else { + /* No TLVs: the table name is the decimal index, looked up in set 0 */ + snprintf(bname, sizeof(bname), "%d", ti->uidx); + name = bname; + set = 0; + } + + no = ipfw_objhash_lookup_name(ni, set, name); + + return ((struct table_config *)no); +} + +/* + * Allocate new table config structure using + * specified @algo and @aname. + * + * Returns pointer to config or NULL. 
+ */ +static struct table_config * +alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, + struct table_algo *ta, char *aname, uint8_t tflags) +{ + char *name, bname[16]; + struct table_config *tc; + int error; + ipfw_obj_ntlv *ntlv; + uint32_t set; + + if (ti->tlvs != NULL) { + ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); + if (ntlv == NULL) + return (NULL); + name = ntlv->name; + /* NOTE(review): takes the set from the TLV, while find_table() uses ti->set - confirm this is intended */ + set = ntlv->set; + } else { + /* No TLVs: the table is named after its decimal index, in set 0 */ + snprintf(bname, sizeof(bname), "%d", ti->uidx); + name = bname; + set = 0; + } + + tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO); + /* Object name points at the config's own tablename buffer */ + tc->no.name = tc->tablename; + tc->no.type = ta->type; + tc->no.set = set; + tc->tflags = tflags; + tc->ta = ta; + strlcpy(tc->tablename, name, sizeof(tc->tablename)); + /* Set "shared" value type by default */ + tc->vshared = 1; + + /* Index-only request: mark the object as compat and remember the user index */ + if (ti->tlvs == NULL) { + tc->no.compat = 1; + tc->no.uidx = ti->uidx; + } + + /* Preallocate data structures for new tables */ + error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags); + if (error != 0) { + free(tc, M_IPFW); + return (NULL); + } + + return (tc); +} + +/* + * Destroys table state and config. + */ +static void +free_table_config(struct namedobj_instance *ni, struct table_config *tc) +{ + + KASSERT(tc->linked == 0, ("free() on linked config")); + + /* + * We're using ta without any locking/referencing. + * TODO: fix this if we're going to use unloadable algos. + */ + tc->ta->destroy(tc->astate, &tc->ti_copy); + free(tc, M_IPFW); +} + +/* + * Links @tc to @chain table named instance. + * Sets appropriate type/states in @chain table info. 
+ */ +static void +link_table(struct ip_fw_chain *ch, struct table_config *tc) +{ + struct namedobj_instance *ni; + struct table_info *ti; + uint16_t kidx; + + IPFW_UH_WLOCK_ASSERT(ch); + IPFW_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + kidx = tc->no.kidx; + + ipfw_objhash_add(ni, &tc->no); + + /* Publish the saved table_info copy in the chain's slot for @kidx */ + ti = KIDX_TO_TI(ch, kidx); + *ti = tc->ti_copy; + + /* Notify algo on real @ti address */ + if (tc->ta->change_ti != NULL) + tc->ta->change_ti(tc->astate, ti); + + tc->linked = 1; + tc->ta->refcnt++; +} + +/* + * Unlinks @tc from @chain table named instance. + * Zeroes states in @chain and stores them in @tc. + */ +static void +unlink_table(struct ip_fw_chain *ch, struct table_config *tc) +{ + struct namedobj_instance *ni; + struct table_info *ti; + uint16_t kidx; + + IPFW_UH_WLOCK_ASSERT(ch); + IPFW_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + kidx = tc->no.kidx; + + /* Clear state. @ti copy is already saved inside @tc */ + ipfw_objhash_del(ni, &tc->no); + ti = KIDX_TO_TI(ch, kidx); + memset(ti, 0, sizeof(struct table_info)); + tc->linked = 0; + tc->ta->refcnt--; + + /* Tell algo that its runtime @ti is gone (NULL is passed) */ + if (tc->ta->change_ti != NULL) + tc->ta->change_ti(tc->astate, NULL); +} + +/* Arguments passed to the set-swapping object-hash callbacks */ +struct swap_table_args { + int set; + int new_set; + int mv; +}; + +/* + * Change set for each matching table. + * + * Ensure we dispatch each table once by setting/checking ochanged + * field. + */ +static void +swap_table_set(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct table_config *tc; + struct swap_table_args *sta; + + tc = (struct table_config *)no; + sta = (struct swap_table_args *)arg; + + /* + * Tables in @set are always handled; tables in @new_set are + * handled only when mv == 0. Everything else is skipped. + */ + if (no->set != sta->set && (no->set != sta->new_set || sta->mv != 0)) + return; + + if (tc->ochanged != 0) + return; + + tc->ochanged = 1; + /* Re-insert the object so it is hashed under its new set */ + ipfw_objhash_del(ni, no); + if (no->set == sta->set) + no->set = sta->new_set; + else + no->set = sta->set; + ipfw_objhash_add(ni, no); +} + +/* + * Cleans up ochange field for all tables. 
+ */ +static void +clean_table_set_data(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + struct table_config *tc; + struct swap_table_args *sta; + + tc = (struct table_config *)no; + /* @sta is assigned but not used in this callback */ + sta = (struct swap_table_args *)arg; + + tc->ochanged = 0; +} + +/* + * Swaps tables within two sets. + */ +void +ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t set, + uint32_t new_set, int mv) +{ + struct swap_table_args sta; + + IPFW_UH_WLOCK_ASSERT(ch); + + sta.set = set; + sta.new_set = new_set; + sta.mv = mv; + + /* First pass changes sets, second pass resets the ochanged markers */ + ipfw_objhash_foreach(CHAIN_TO_NI(ch), swap_table_set, &sta); + ipfw_objhash_foreach(CHAIN_TO_NI(ch), clean_table_set_data, &sta); +} + +/* + * Move all tables which are referenced by rules in @rt to set @new_set. + * Makes sure that all relevant tables are referenced ONLY by given rules. + * + * Returns 0 on success, 1 if some table is also referenced elsewhere + * (in that case no table is moved). + */ +int +ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, + uint32_t new_set) +{ + struct ip_fw *rule; + struct table_config *tc; + struct named_object *no; + struct namedobj_instance *ni; + int bad, i, l, cmdlen; + uint16_t kidx; + uint8_t type; + ipfw_insn *cmd; + + IPFW_UH_WLOCK_ASSERT(ch); + + ni = CHAIN_TO_NI(ch); + + /* Stage 1: count number of references by given rules */ + for (i = 0; i < ch->n_rules - 1; i++) { + rule = ch->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + no = ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(no != NULL, + ("objhash lookup failed on index %d", kidx)); + tc = (struct table_config *)no; + tc->ocount++; + } + + } + + /* Stage 2: verify "ownership" */ + bad = 0; + for (i = 0; i < ch->n_rules - 1; i++) { + rule = ch->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { 
+ cmdlen = F_LEN(cmd); + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + no = ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(no != NULL, + ("objhash lookup failed on index %d", kidx)); + tc = (struct table_config *)no; + if (tc->no.refcnt != tc->ocount) { + + /* + * Number of references differ: + * Other rule(s) are holding reference to given + * table, so it is not possible to change its set. + * + * Note that refcnt may account + * references to some going-to-be-added rules. + * Since we don't know their numbers (and even + * if they will be added) it is perfectly OK + * to return error here. + */ + bad = 1; + break; + } + } + + if (bad != 0) + break; + } + + /* Stage 3: change set or cleanup (ocount is reset in both cases) */ + for (i = 0; i < ch->n_rules - 1; i++) { + rule = ch->map[i]; + if (ipfw_match_range(rule, rt) == 0) + continue; + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + no = ipfw_objhash_lookup_kidx(ni, kidx); + KASSERT(no != NULL, + ("objhash lookup failed on index %d", kidx)); + tc = (struct table_config *)no; + + tc->ocount = 0; + if (bad != 0) + continue; + + /* Actually change set. */ + ipfw_objhash_del(ni, no); + no->set = new_set; + ipfw_objhash_add(ni, no); + } + } + + return (bad); +} + +/* + * Finds and bumps refcount for tables referenced by given @rule. + * Auto-creates non-existing tables. + * Fills in @oib array with userland/kernel indexes. + * First free oidx pointer is saved back in @oib. + * + * Returns 0 on success. 
+ */ +static int +find_ref_rule_tables(struct ip_fw_chain *ch, struct ip_fw *rule, + struct rule_check_info *ci, struct obj_idx **oib, struct tid_info *ti) +{ + struct table_config *tc; + struct namedobj_instance *ni; + struct named_object *no; + int cmdlen, error, l, numnew; + uint16_t kidx; + ipfw_insn *cmd; + struct obj_idx *pidx, *pidx_first, *p; + + pidx_first = *oib; + pidx = pidx_first; + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + error = 0; + numnew = 0; + + IPFW_UH_WLOCK(ch); + ni = CHAIN_TO_NI(ch); + + /* Increase refcount on each existing referenced table. */ + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + if (classify_table_opcode(cmd, &ti->uidx, &ti->type) != 0) + continue; + + pidx->uidx = ti->uidx; + pidx->type = ti->type; + + if ((tc = find_table(ni, ti)) != NULL) { + if (tc->no.type != ti->type) { + /* Incompatible types */ + error = EINVAL; + break; + } + + /* Reference found table and save kidx */ + tc->no.refcnt++; + pidx->kidx = tc->no.kidx; + pidx++; + continue; + } + + /* + * Compability stuff for old clients: + * prepare to manually create non-existing tables. + */ + pidx++; + numnew++; + } + + if (error != 0) { + /* Unref everything we have already done */ + for (p = *oib; p < pidx; p++) { + if (p->kidx == 0) + continue; + + /* Find & unref by existing idx */ + no = ipfw_objhash_lookup_kidx(ni, p->kidx); + KASSERT(no != NULL, ("Ref'd table %d disappeared", + p->kidx)); + + no->refcnt--; + } + } + + IPFW_UH_WUNLOCK(ch); + + if (numnew == 0) { + *oib = pidx; + return (error); + } + + /* + * Compatibility stuff: do actual creation for non-existing, + * but referenced tables. + */ + for (p = pidx_first; p < pidx; p++) { + if (p->kidx != 0) + continue; + + ti->uidx = p->uidx; + ti->type = p->type; + ti->atype = 0; + + error = create_table_compat(ch, ti, &kidx); + if (error == 0) { + p->kidx = kidx; + continue; + } + + /* Error. 
We have to drop references */ + IPFW_UH_WLOCK(ch); + for (p = pidx_first; p < pidx; p++) { + if (p->kidx == 0) + continue; + + /* Find & unref by existing idx */ + no = ipfw_objhash_lookup_kidx(ni, p->kidx); + KASSERT(no != NULL, ("Ref'd table %d disappeared", + p->kidx)); + + no->refcnt--; + } + IPFW_UH_WUNLOCK(ch); + + return (error); + } + + *oib = pidx; + + return (error); +} + +/* + * Remove references from every table used in @rule. + */ +void +ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule) +{ + int cmdlen, l; + ipfw_insn *cmd; + struct namedobj_instance *ni; + struct named_object *no; + uint16_t kidx; + uint8_t type; + + IPFW_UH_WLOCK_ASSERT(chain); + ni = CHAIN_TO_NI(chain); + + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + + no = ipfw_objhash_lookup_kidx(ni, kidx); + + KASSERT(no != NULL, ("table id %d not found", kidx)); + KASSERT(no->type == type, ("wrong type %d (%d) for table id %d", + no->type, type, kidx)); + KASSERT(no->refcnt > 0, ("refcount for table %d is %d", + kidx, no->refcnt)); + + no->refcnt--; + } +} + +/* + * Compatibility function for old ipfw(8) binaries. + * Rewrites table kernel indices with userland ones. + * Convert tables matching '/^\d+$/' to their atoi() value. + * Use number 65535 for other tables. + * + * Returns 0 on success. 
+ */ +int +ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, struct ip_fw_rule0 *rule) +{ + int cmdlen, error, l; + ipfw_insn *cmd; + uint16_t kidx, uidx; + uint8_t type; + struct named_object *no; + struct namedobj_instance *ni; + + ni = CHAIN_TO_NI(chain); + error = 0; + + /* + * Non-zero results used below: 1 if a referenced kidx is unknown, + * 2 if at least one referenced table has no->compat == 0. + */ + l = rule->cmd_len; + cmd = rule->cmd; + cmdlen = 0; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + + if (classify_table_opcode(cmd, &kidx, &type) != 0) + continue; + + if ((no = ipfw_objhash_lookup_kidx(ni, kidx)) == NULL) + return (1); + + uidx = no->uidx; + if (no->compat == 0) { + + /* + * We are called via legacy opcode. + * Save error and show table as fake number + * not to make ipfw(8) hang. + */ + uidx = 65535; + error = 2; + } + + update_table_opcode(cmd, uidx); + } + + return (error); +} + +/* + * Checks if opcode is referencing table of appropriate type. + * Adds reference count for found table if true. + * Rewrites user-supplied opcode values with kernel ones. + * + * Returns 0 on success and appropriate error code otherwise. + */ +int +ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, + struct rule_check_info *ci) +{ + int cmdlen, error, l; + ipfw_insn *cmd; + uint16_t uidx; + uint8_t type; + struct namedobj_instance *ni; + struct obj_idx *p, *pidx_first, *pidx_last; + struct tid_info ti; + + ni = CHAIN_TO_NI(chain); + + /* + * Prepare an array for storing opcode indices. + * Use stack allocation by default. + */ + if (ci->table_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { + /* Stack */ + pidx_first = ci->obuf; + } else + pidx_first = malloc(ci->table_opcodes * sizeof(struct obj_idx), + M_IPFW, M_WAITOK | M_ZERO); + + pidx_last = pidx_first; + error = 0; + type = 0; + memset(&ti, 0, sizeof(ti)); + + /* + * Use default set for looking up tables (old way) or + * use set rule is assigned to (new way). + */ + ti.set = (V_fw_tables_sets != 0) ? 
ci->krule->set : 0; + if (ci->ctlv != NULL) { + /* Name TLVs follow the container header; tlen excludes that header */ + ti.tlvs = (void *)(ci->ctlv + 1); + ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); + } + + /* Reference all used tables */ + error = find_ref_rule_tables(chain, ci->krule, ci, &pidx_last, &ti); + if (error != 0) + goto free; + + IPFW_UH_WLOCK(chain); + + /* + * Perform rule rewrite. + * Index entries are consumed in the order find_ref_rule_tables() + * filled them: one entry per table opcode. + */ + l = ci->krule->cmd_len; + cmd = ci->krule->cmd; + cmdlen = 0; + p = pidx_first; + for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (classify_table_opcode(cmd, &uidx, &type) != 0) + continue; + update_table_opcode(cmd, p->kidx); + p++; + } + + IPFW_UH_WUNLOCK(chain); + +free: + /* Only the heap-allocated array needs to be released */ + if (pidx_first != ci->obuf) + free(pidx_first, M_IPFW); + + return (error); +} + +/* Sockopt handlers of the tables module: { opcode, version, direction, handler } */ +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_TABLE_XCREATE, 0, HDIR_SET, create_table }, + { IP_FW_TABLE_XDESTROY, 0, HDIR_SET, flush_table_v0 }, + { IP_FW_TABLE_XFLUSH, 0, HDIR_SET, flush_table_v0 }, + { IP_FW_TABLE_XMODIFY, 0, HDIR_BOTH, modify_table }, + { IP_FW_TABLE_XINFO, 0, HDIR_GET, describe_table }, + { IP_FW_TABLES_XLIST, 0, HDIR_GET, list_tables }, + { IP_FW_TABLE_XLIST, 0, HDIR_GET, dump_table_v0 }, + { IP_FW_TABLE_XLIST, 1, HDIR_GET, dump_table_v1 }, + { IP_FW_TABLE_XADD, 0, HDIR_BOTH, manage_table_ent_v0 }, + { IP_FW_TABLE_XADD, 1, HDIR_BOTH, manage_table_ent_v1 }, + { IP_FW_TABLE_XDEL, 0, HDIR_BOTH, manage_table_ent_v0 }, + { IP_FW_TABLE_XDEL, 1, HDIR_BOTH, manage_table_ent_v1 }, + { IP_FW_TABLE_XFIND, 0, HDIR_GET, find_table_entry }, + { IP_FW_TABLE_XSWAP, 0, HDIR_SET, swap_table }, + { IP_FW_TABLES_ALIST, 0, HDIR_GET, list_table_algo }, + { IP_FW_TABLE_XGETSIZE, 0, HDIR_GET, get_table_size }, +}; + +/* + * Object-hash callback: unlinks, releases the kidx of and frees one + * table. @arg is the chain; ipfw_destroy_tables() calls it with both + * chain write locks held. + */ +static void +destroy_table_locked(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + + unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no); + if (ipfw_objhash_free_idx(ni, no->kidx) != 0) + printf("Error unlinking kidx %d from table %s\n", + no->kidx, no->name); + free_table_config(ni, (struct 
table_config *)no); +} + +/* + * Shuts tables module down. + */ +void +ipfw_destroy_tables(struct ip_fw_chain *ch, int last) +{ + + IPFW_DEL_SOPT_HANDLER(last, scodes); + + /* Remove all tables from working set */ + IPFW_UH_WLOCK(ch); + IPFW_WLOCK(ch); + ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch); + IPFW_WUNLOCK(ch); + IPFW_UH_WUNLOCK(ch); + + /* Free the table_info array itself */ + free(ch->tablestate, M_IPFW); + + ipfw_table_value_destroy(ch, last); + ipfw_table_algo_destroy(ch); + + ipfw_objhash_destroy(CHAIN_TO_NI(ch)); + free(CHAIN_TO_TCFG(ch), M_IPFW); +} + +/* + * Starts tables module. + */ +int +ipfw_init_tables(struct ip_fw_chain *ch, int first) +{ + struct tables_config *tcfg; + + /* Allocate runtime state: V_fw_tables_max zeroed table_info slots */ + ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info), + M_IPFW, M_WAITOK | M_ZERO); + + tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO); + tcfg->namehash = ipfw_objhash_create(V_fw_tables_max); + ch->tblcfg = tcfg; + + ipfw_table_value_init(ch, first); + ipfw_table_algo_init(ch); + + IPFW_ADD_SOPT_HANDLER(first, scodes); + return (0); +} + + + diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table.h b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.h new file mode 100644 index 0000000..216d713 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table.h @@ -0,0 +1,246 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: head/sys/netpfil/ipfw/ip_fw_table.h 272840 2014-10-09 19:32:35Z melifaro $ + */ + +#ifndef _IPFW2_TABLE_H +#define _IPFW2_TABLE_H + +/* + * Internal constants and data structures used by ipfw tables + * not meant to be exported outside the kernel. 
+ */ +#ifdef _KERNEL + +struct table_algo; +struct tables_config { + struct namedobj_instance *namehash; + struct namedobj_instance *valhash; + uint32_t val_size; + uint32_t algo_count; + struct table_algo *algo[256]; + struct table_algo *def_algo[IPFW_TABLE_MAXTYPE + 1]; + TAILQ_HEAD(op_state_l,op_state) state_list; +}; +#define CHAIN_TO_TCFG(chain) ((struct tables_config *)(chain)->tblcfg) + +struct table_info { + table_lookup_t *lookup; /* Lookup function */ + void *state; /* Lookup radix/other structure */ + void *xstate; /* eXtended state */ + u_long data; /* Hints for given func */ +}; + +/* Internal structures for handling sockopt data */ +struct tid_info { + uint32_t set; /* table set */ + uint16_t uidx; /* table index */ + uint8_t type; /* table type */ + uint8_t atype; + void *tlvs; /* Pointer to first TLV */ + int tlen; /* Total TLV size block */ +}; + +struct table_value; +struct tentry_info { + void *paddr; + struct table_value *pvalue; + void *ptv; /* Temporary field to hold obj */ + uint8_t masklen; /* mask length */ + uint8_t subtype; + uint16_t flags; /* record flags */ + uint32_t value; /* value index */ +}; +#define TEI_FLAGS_UPDATE 0x0001 /* Add or update rec if exists */ +#define TEI_FLAGS_UPDATED 0x0002 /* Entry has been updated */ +#define TEI_FLAGS_COMPAT 0x0004 /* Called from old ABI */ +#define TEI_FLAGS_DONTADD 0x0008 /* Do not create new rec */ +#define TEI_FLAGS_ADDED 0x0010 /* Entry was added */ +#define TEI_FLAGS_DELETED 0x0020 /* Entry was deleted */ +#define TEI_FLAGS_LIMIT 0x0040 /* Limit was hit */ +#define TEI_FLAGS_ERROR 0x0080 /* Unknown request error */ +#define TEI_FLAGS_NOTFOUND 0x0100 /* Entry was not found */ +#define TEI_FLAGS_EXISTS 0x0200 /* Entry already exists */ + +typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +typedef void (ta_destroy)(void *ta_state, struct table_info *ti); +typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info 
*tei, + void *ta_buf); +typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +typedef int (ta_add)(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +typedef int (ta_del)(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +typedef void (ta_flush_entry)(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); + +typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); +typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +typedef void (ta_modify)(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t pflags); +typedef void (ta_flush_mod)(void *ta_buf); + +typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); +typedef void (ta_print_config)(void *ta_state, struct table_info *ti, char *buf, + size_t bufsize); + +typedef int ta_foreach_f(void *node, void *arg); +typedef void ta_foreach(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg); +typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent); +typedef int ta_find_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +typedef uint32_t ta_get_count(void *ta_state, struct table_info *ti); + +struct table_algo { + char name[16]; + uint32_t idx; + uint32_t type; + uint32_t refcnt; + uint32_t flags; + uint32_t vlimit; + size_t ta_buf_size; + ta_init *init; + ta_destroy *destroy; + ta_prepare_add *prepare_add; + ta_prepare_del *prepare_del; + ta_add *add; + ta_del *del; + ta_flush_entry *flush_entry; + ta_find_tentry *find_tentry; + ta_need_modify *need_modify; + ta_prepare_mod *prepare_mod; + ta_fill_mod *fill_mod; + ta_modify *modify; 
+ ta_flush_mod *flush_mod; + ta_change_ti *change_ti; + ta_foreach *foreach; + ta_dump_tentry *dump_tentry; + ta_print_config *print_config; + ta_dump_tinfo *dump_tinfo; + ta_get_count *get_count; +}; +#define TA_FLAG_DEFAULT 0x01 /* Algo is default for given type */ +#define TA_FLAG_READONLY 0x02 /* Algo does not support modifications*/ +#define TA_FLAG_EXTCOUNTER 0x04 /* Algo has external counter available*/ + +int ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, + size_t size, uint32_t *idx); +void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx); + +void ipfw_table_algo_init(struct ip_fw_chain *chain); +void ipfw_table_algo_destroy(struct ip_fw_chain *chain); + +MALLOC_DECLARE(M_IPFW_TBL); +/* Exported to support legacy opcodes */ +int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count); +int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, + struct tentry_info *tei, uint8_t flags, uint32_t count); +int flush_table(struct ip_fw_chain *ch, struct tid_info *ti); +void ipfw_import_table_value_legacy(uint32_t value, struct table_value *v); +uint32_t ipfw_export_table_value_legacy(struct table_value *v); +int ipfw_get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +/* ipfw_table_value.c functions */ +struct table_config; +struct tableop_state; +void ipfw_table_value_init(struct ip_fw_chain *ch, int first); +void ipfw_table_value_destroy(struct ip_fw_chain *ch, int last); +int ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts); +void ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct tentry_info *tei, uint32_t count, int rollback); +void ipfw_import_table_value_v1(ipfw_table_value *iv); +void ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *iv); +void ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct table_algo *ta, void 
*astate, struct table_info *ti); +void rollback_table_values(struct tableop_state *ts); + +int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, + struct rule_check_info *ci); +int ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, + struct ip_fw_rule0 *rule); +int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, + uint32_t *bmask); +int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, + struct sockopt_data *sd); +void ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule); + +/* utility functions */ +int ipfw_check_table_name(char *name); +int ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, + uint32_t new_set); +void ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t old_set, + uint32_t new_set, int mv); +int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, + ta_foreach_f f, void *arg); + +/* internal functions */ +void tc_ref(struct table_config *tc); +void tc_unref(struct table_config *tc); + +struct op_state; +typedef void (op_rollback_f)(void *object, struct op_state *state); +struct op_state { + TAILQ_ENTRY(op_state) next; /* chain link */ + op_rollback_f *func; +}; + +struct tableop_state { + struct op_state opstate; + struct ip_fw_chain *ch; + struct table_config *tc; + struct table_algo *ta; + struct tentry_info *tei; + uint32_t count; + uint32_t vmask; + int vshared; + int modified; +}; + +void add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); +void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); +void rollback_toperation_state(struct ip_fw_chain *ch, void *object); + +/* Legacy interfaces */ +int ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, + uint32_t *cnt); +int ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, + uint32_t *cnt); +int ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, + ipfw_table *tbl); + + +#endif /* _KERNEL */ +#endif /* _IPFW2_TABLE_H */ diff --git 
a/example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c new file mode 100644 index 0000000..d9d0547 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_algo.c @@ -0,0 +1,4081 @@ +/*- + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table_algo.c 272912 2014-10-10 20:37:06Z melifaro $"); + +/* + * Lookup table algorithms. + * + */ + +#include "opt_ipfw.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. 
+#endif /* INET */ +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ +#include <net/radix.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> + + +/* + * IPFW table lookup algorithms. + * + * What is needed to add another table algo? + * + * Algo init: + * * struct table_algo has to be filled with: + * name: "type:algoname" format, e.g. "addr:radix". Currently + * there are the following types: "addr", "iface", "number" and "flow". + * type: one of IPFW_TABLE_* types + * flags: one or more TA_FLAGS_* + * ta_buf_size: size of structure used to store add/del item state. + * Needs to be less than TA_BUF_SZ. + * callbacks: see below for description. + * * ipfw_add_table_algo / ipfw_del_table_algo has to be called + * + * Callbacks description: + * + * -init: request to initialize new table instance. + * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, + * struct table_info *ti, char *data, uint8_t tflags); + * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. + * + * Allocate all structures needed for normal operations. + * * Caller may want to parse @data for some algo-specific + * options provided by userland. + * * Caller may want to save configuration state pointer to @ta_state + * * Caller needs to save desired runtime structure pointer(s) + * inside @ti fields. Note that it is not correct to save + * @ti pointer at this moment. Use -change_ti hook for that. + * * Caller has to fill in ti->lookup to appropriate function + * pointer. + * + * + * + * -destroy: request to destroy table instance. 
+ * typedef void (ta_destroy)(void *ta_state, struct table_info *ti); + * MANDATORY, may be locked (UH+WLOCK). (M_NOWAIT). + * + * Frees all table entries and all table structures allocated by -init. + * + * + * + * -prepare_add: request to allocate state for adding new entry. + * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, + * void *ta_buf); + * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. + * + * Allocates state and fills it in with all necessary data (EXCEPT value) + * from @tei to minimize operations needed to be done under WLOCK. + * "value" field has to be copied to new entry in the -add callback. + * Buffer ta_buf of size ta->ta_buf_size may be used to store + * allocated state. + * + * + * + * -prepare_del: request to set state for deleting existing entry. + * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, + * void *ta_buf); + * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success. + * + * Buffer ta_buf of size ta->ta_buf_size may be used to store + * allocated state. Caller should use on-stack ta_buf allocation + * instead of doing malloc(). + * + * + * + * -add: request to insert new entry into runtime/config structures. + * typedef int (ta_add)(void *ta_state, struct table_info *ti, + * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); + * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. + * + * Insert new entry using previously-allocated state in @ta_buf. + * * @tei may have the following flags: + * TEI_FLAGS_UPDATE: request to add or update entry. + * TEI_FLAGS_DONTADD: request to update (but not add) entry. + * * Caller is required to do the following: + * copy real entry value from @tei + * entry added: return 0, set 1 to @pnum + * entry updated: return 0, store 0 to @pnum, store old value in @tei, + * add TEI_FLAGS_UPDATED flag to @tei. + * entry exists: return EEXIST + * entry not found: return ENOENT + * other error: return non-zero error code. 
+ * + * + * + * -del: request to delete existing entry from runtime/config structures. + * typedef int (ta_del)(void *ta_state, struct table_info *ti, + * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); + * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. + * + * Delete entry using the state previously set up in @ta_buf. + * * Caller is required to do the following: + * entry deleted: return 0, set 1 to @pnum, store old value in @tei. + * entry not found: return ENOENT + * other error: return non-zero error code. + * + * + * + * -flush_entry: flush entry state created by -prepare_add / -del / others + * typedef void (ta_flush_entry)(struct ip_fw_chain *ch, + * struct tentry_info *tei, void *ta_buf); + * MANDATORY, may be locked. (M_NOWAIT). + * + * Delete state allocated by: + * -prepare_add (-add returned EEXIST|UPDATED) + * -prepare_del (if any) + * -del + * * Caller is required to handle empty @ta_buf correctly. + * + * + * -find_tentry: finds entry specified by key @tent + * typedef int ta_find_tentry(void *ta_state, struct table_info *ti, + * ipfw_obj_tentry *tent); + * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success. + * + * Finds entry specified by given key. + * * Caller is required to do the following: + * entry found: returns 0, export entry to @tent + * entry not found: returns ENOENT + * + * + * -need_modify: checks if @ti has enough space to hold another @count items. + * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, + * uint32_t count, uint64_t *pflags); + * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if it has. + * + * Checks if given table has enough space to add @count items without + * resize. Caller may use @pflags to store desired modification data. + * + * + * + * -prepare_mod: allocate structures for table modification. + * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); + * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success. + * + * Allocate all needed state for table modification. 
Caller + * should use `struct mod_item` to store new state in @ta_buf. + * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf. + * + * + * + * -fill_mod: copy some data to new state. + * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, + * void *ta_buf, uint64_t *pflags); + * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success. + * + * Copy as much data as we can to minimize changes under WLOCK. + * For example, arrays can be merged inside this callback. + * + * + * + * -modify: perform final modification. + * typedef void (ta_modify)(void *ta_state, struct table_info *ti, + * void *ta_buf, uint64_t pflags); + * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). + * + * Performs all changes necessary to switch to new structures. + * * Caller should save old pointers to @ta_buf storage. + * + * + * + * -flush_mod: flush table modification state. + * typedef void (ta_flush_mod)(void *ta_buf); + * OPTIONAL(need_modify), unlocked. (M_WAITOK). + * + * Performs flush for the following: + * - prepare_mod (modification was not necessary) + * - modify (for the old state) + * + * + * + * -change_ti: monitor table info pointer changes + * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); + * OPTIONAL, locked (UH). (M_NOWAIT). + * + * Called when the @ti pointer changes. Called immediately after -init + * to set initial state. + * + * + * + * -foreach: calls @f for each table entry + * typedef void ta_foreach(void *ta_state, struct table_info *ti, + * ta_foreach_f *f, void *arg); + * MANDATORY, locked(UH). (M_NOWAIT). + * + * Runs callback with specified argument for each table entry. + * Typically used for dumping table entries. + * + * + * + * -dump_tentry: dump table entry in current @tentry format. + * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, + * ipfw_obj_tentry *tent); + * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success. + * + * Dumps entry @e to @tent. 
+ * + * + * -print_config: prints custom algorithm options into buffer. + * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, + * char *buf, size_t bufsize); + * OPTIONAL. locked(UH). (M_NOWAIT). + * + * Prints custom algorithm options in the format suitable to pass + * back to -init callback. + * + * + * + * -dump_tinfo: dumps algo-specific info. + * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, + * ipfw_ta_tinfo *tinfo); + * OPTIONAL. locked(UH). (M_NOWAIT). + * + * Dumps options like items size/hash size, etc. + */ + +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); + +/* + * Utility structures/functions common to more than one algo + */ + +struct mod_item { + void *main_ptr; + size_t size; + void *main_ptr6; + size_t size6; +}; + +static int badd(const void *key, void *item, void *base, size_t nmemb, + size_t size, int (*compar) (const void *, const void *)); +static int bdel(const void *key, void *base, size_t nmemb, size_t size, + int (*compar) (const void *, const void *)); + + +/* + * ADDR implementation using radix + * + */ + +/* + * The radix code expects addr and mask to be arrays of bytes, + * with the first byte being the length of the array. rn_inithead + * is called with the offset in bits of the lookup key within the + * array. If we use a sockaddr_in as the underlying type, + * sin_len is conveniently located at offset 0, sin_addr is at + * offset 4 and normally aligned. 
+ * But for portability, let's avoid assumption and make the code explicit + */ +#define KEY_LEN(v) *((uint8_t *)&(v)) +/* + * Do not require radix to compare more than actual IPv4/IPv6 address + */ +#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) +#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) + +#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) +#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) + +struct radix_addr_entry { + struct radix_node rn[2]; + struct sockaddr_in addr; + uint32_t value; + uint8_t masklen; +}; + +struct sa_in6 { + uint8_t sin6_len; + uint8_t sin6_family; + uint8_t pad[2]; + struct in6_addr sin6_addr; +}; + +struct radix_addr_xentry { + struct radix_node rn[2]; + struct sa_in6 addr6; + uint32_t value; + uint8_t masklen; +}; + +struct radix_cfg { + struct radix_node_head *head4; + struct radix_node_head *head6; + size_t count4; + size_t count6; +}; + +struct ta_buf_radix +{ + void *ent_ptr; + struct sockaddr *addr_ptr; + struct sockaddr *mask_ptr; + union { + struct { + struct sockaddr_in sa; + struct sockaddr_in ma; + } a4; + struct { + struct sa_in6 sa; + struct sa_in6 ma; + } a6; + } addr; +}; + +static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static int flush_radix_entry(struct radix_node *rn, void *arg); +static void ta_destroy_radix(void *ta_state, struct table_info *ti); +static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_radix(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); 
+static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, + struct sockaddr *ma, int *set_mask); +static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_add_radix(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_radix(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_need_modify_radix(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); + +static int +ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct radix_node_head *rnh; + + if (keylen == sizeof(in_addr_t)) { + struct radix_addr_entry *ent; + struct sockaddr_in sa; + KEY_LEN(sa) = KEY_LEN_INET; + sa.sin_addr.s_addr = *((in_addr_t *)key); + rnh = (struct radix_node_head *)ti->state; + ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, rnh)); + if (ent != NULL) { + *val = ent->value; + return (1); + } + } else { + struct radix_addr_xentry *xent; + struct sa_in6 sa6; + KEY_LEN(sa6) = KEY_LEN_INET6; + memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); + rnh = (struct radix_node_head *)ti->xstate; + xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, rnh)); + if (xent != NULL) { + *val = xent->value; + return (1); + } + } + + return (0); +} + +/* + * New table + */ +static int +ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + struct radix_cfg *cfg; + + if (!rn_inithead(&ti->state, OFF_LEN_INET)) + return (ENOMEM); + if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { + rn_detachhead(&ti->state); + return (ENOMEM); + } + + cfg = malloc(sizeof(struct radix_cfg), M_IPFW, 
M_WAITOK | M_ZERO); + + *ta_state = cfg; + ti->lookup = ta_lookup_radix; + + return (0); +} + +static int +flush_radix_entry(struct radix_node *rn, void *arg) +{ + struct radix_node_head * const rnh = arg; + struct radix_addr_entry *ent; + + ent = (struct radix_addr_entry *) + rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); + if (ent != NULL) + free(ent, M_IPFW_TBL); + return (0); +} + +static void +ta_destroy_radix(void *ta_state, struct table_info *ti) +{ + struct radix_cfg *cfg; + struct radix_node_head *rnh; + + cfg = (struct radix_cfg *)ta_state; + + rnh = (struct radix_node_head *)(ti->state); + rnh->rnh_walktree(rnh, flush_radix_entry, rnh); + rn_detachhead(&ti->state); + + rnh = (struct radix_node_head *)(ti->xstate); + rnh->rnh_walktree(rnh, flush_radix_entry, rnh); + rn_detachhead(&ti->xstate); + + free(cfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct radix_cfg *cfg; + + cfg = (struct radix_cfg *)ta_state; + + tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; + tinfo->taclass4 = IPFW_TACLASS_RADIX; + tinfo->count4 = cfg->count4; + tinfo->itemsize4 = sizeof(struct radix_addr_entry); + tinfo->taclass6 = IPFW_TACLASS_RADIX; + tinfo->count6 = cfg->count6; + tinfo->itemsize6 = sizeof(struct radix_addr_xentry); +} + +static int +ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct radix_addr_entry *n; +#ifdef INET6 + struct radix_addr_xentry *xn; +#endif + + n = (struct radix_addr_entry *)e; + + /* Guess IPv4/IPv6 radix by sockaddr family */ + if (n->addr.sin_family == AF_INET) { + tent->k.addr.s_addr = n->addr.sin_addr.s_addr; + tent->masklen = n->masklen; + tent->subtype = AF_INET; + tent->v.kidx = n->value; +#ifdef INET6 + } else { + xn = (struct radix_addr_xentry *)e; + memcpy(&tent->k, &xn->addr6.sin6_addr, sizeof(struct in6_addr)); + tent->masklen = xn->masklen; + 
tent->subtype = AF_INET6; + tent->v.kidx = xn->value; +#endif + } + + return (0); +} + +static int +ta_find_radix_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct radix_node_head *rnh; + void *e; + + e = NULL; + if (tent->subtype == AF_INET) { + struct sockaddr_in sa; + KEY_LEN(sa) = KEY_LEN_INET; + sa.sin_addr.s_addr = tent->k.addr.s_addr; + rnh = (struct radix_node_head *)ti->state; + e = rnh->rnh_matchaddr(&sa, rnh); + } else { + struct sa_in6 sa6; + KEY_LEN(sa6) = KEY_LEN_INET6; + memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr)); + rnh = (struct radix_node_head *)ti->xstate; + e = rnh->rnh_matchaddr(&sa6, rnh); + } + + if (e != NULL) { + ta_dump_radix_tentry(ta_state, ti, e, tent); + return (0); + } + + return (ENOENT); +} + +static void +ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct radix_node_head *rnh; + + rnh = (struct radix_node_head *)(ti->state); + rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); + + rnh = (struct radix_node_head *)(ti->xstate); + rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); +} + + +#ifdef INET6 +static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask); + +static inline void +ipv6_writemask(struct in6_addr *addr6, uint8_t mask) +{ + uint32_t *cp; + + for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) + *cp++ = 0xFFFFFFFF; + *cp = htonl(mask ? 
~((1 << (32 - mask)) - 1) : 0); +} +#endif + +static void +tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, + struct sockaddr *ma, int *set_mask) +{ + int mlen; +#ifdef INET + struct sockaddr_in *addr, *mask; +#endif +#ifdef INET6 + struct sa_in6 *addr6, *mask6; +#endif + in_addr_t a4; + + mlen = tei->masklen; + + if (tei->subtype == AF_INET) { +#ifdef INET + addr = (struct sockaddr_in *)sa; + mask = (struct sockaddr_in *)ma; + /* Set 'total' structure length */ + KEY_LEN(*addr) = KEY_LEN_INET; + KEY_LEN(*mask) = KEY_LEN_INET; + addr->sin_family = AF_INET; + mask->sin_addr.s_addr = + htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); + a4 = *((in_addr_t *)tei->paddr); + addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr; + if (mlen != 32) + *set_mask = 1; + else + *set_mask = 0; +#endif +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + /* IPv6 case */ + addr6 = (struct sa_in6 *)sa; + mask6 = (struct sa_in6 *)ma; + /* Set 'total' structure length */ + KEY_LEN(*addr6) = KEY_LEN_INET6; + KEY_LEN(*mask6) = KEY_LEN_INET6; + addr6->sin6_family = AF_INET6; + ipv6_writemask(&mask6->sin6_addr, mlen); + memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr)); + APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr); + if (mlen != 128) + *set_mask = 1; + else + *set_mask = 0; +#endif + } +} + +static int +ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_radix *tb; + struct radix_addr_entry *ent; +#ifdef INET6 + struct radix_addr_xentry *xent; +#endif + struct sockaddr *addr, *mask; + int mlen, set_mask; + + tb = (struct ta_buf_radix *)ta_buf; + + mlen = tei->masklen; + set_mask = 0; + + if (tei->subtype == AF_INET) { +#ifdef INET + if (mlen > 32) + return (EINVAL); + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); + ent->masklen = mlen; + + addr = (struct sockaddr *)&ent->addr; + mask = (struct sockaddr *)&tb->addr.a4.ma; + tb->ent_ptr = ent; +#endif +#ifdef INET6 + } else if (tei->subtype 
== AF_INET6) { + /* IPv6 case */ + if (mlen > 128) + return (EINVAL); + xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); + xent->masklen = mlen; + + addr = (struct sockaddr *)&xent->addr6; + mask = (struct sockaddr *)&tb->addr.a6.ma; + tb->ent_ptr = xent; +#endif + } else { + /* Unknown CIDR type */ + return (EINVAL); + } + + tei_to_sockaddr_ent(tei, addr, mask, &set_mask); + /* Set pointers */ + tb->addr_ptr = addr; + if (set_mask != 0) + tb->mask_ptr = mask; + + return (0); +} + +static int +ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct radix_cfg *cfg; + struct radix_node_head *rnh; + struct radix_node *rn; + struct ta_buf_radix *tb; + uint32_t *old_value, value; + + cfg = (struct radix_cfg *)ta_state; + tb = (struct ta_buf_radix *)ta_buf; + + /* Save current entry value from @tei */ + if (tei->subtype == AF_INET) { + rnh = ti->state; + ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value; + } else { + rnh = ti->xstate; + ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value; + } + + /* Search for an entry first */ + rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, rnh); + if (rn != NULL) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + /* Record already exists. 
Update value if we're asked to */ + if (tei->subtype == AF_INET) + old_value = &((struct radix_addr_entry *)rn)->value; + else + old_value = &((struct radix_addr_xentry *)rn)->value; + + value = *old_value; + *old_value = tei->value; + tei->value = value; + + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + + return (0); + } + + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, rnh, tb->ent_ptr); + if (rn == NULL) { + /* Unknown error */ + return (EINVAL); + } + + if (tei->subtype == AF_INET) + cfg->count4++; + else + cfg->count6++; + tb->ent_ptr = NULL; + *pnum = 1; + + return (0); +} + +static int +ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_radix *tb; + struct sockaddr *addr, *mask; + int mlen, set_mask; + + tb = (struct ta_buf_radix *)ta_buf; + + mlen = tei->masklen; + set_mask = 0; + + if (tei->subtype == AF_INET) { + if (mlen > 32) + return (EINVAL); + + addr = (struct sockaddr *)&tb->addr.a4.sa; + mask = (struct sockaddr *)&tb->addr.a4.ma; +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + if (mlen > 128) + return (EINVAL); + + addr = (struct sockaddr *)&tb->addr.a6.sa; + mask = (struct sockaddr *)&tb->addr.a6.ma; +#endif + } else + return (EINVAL); + + tei_to_sockaddr_ent(tei, addr, mask, &set_mask); + tb->addr_ptr = addr; + if (set_mask != 0) + tb->mask_ptr = mask; + + return (0); +} + +static int +ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct radix_cfg *cfg; + struct radix_node_head *rnh; + struct radix_node *rn; + struct ta_buf_radix *tb; + + cfg = (struct radix_cfg *)ta_state; + tb = (struct ta_buf_radix *)ta_buf; + + if (tei->subtype == AF_INET) + rnh = ti->state; + else + rnh = ti->xstate; + + rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, rnh); + + if (rn == NULL) + return (ENOENT); + 
+ /* Save entry value to @tei */ + if (tei->subtype == AF_INET) + tei->value = ((struct radix_addr_entry *)rn)->value; + else + tei->value = ((struct radix_addr_xentry *)rn)->value; + + tb->ent_ptr = rn; + + if (tei->subtype == AF_INET) + cfg->count4--; + else + cfg->count6--; + *pnum = 1; + + return (0); +} + +static void +ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_radix *tb; + + tb = (struct ta_buf_radix *)ta_buf; + + if (tb->ent_ptr != NULL) + free(tb->ent_ptr, M_IPFW_TBL); +} + +static int +ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + + /* + * radix does not require additional memory allocations + * other than nodes itself. Adding new masks to the tree do + * but we don't have any API to call (and we don't known which + * sizes do we need). + */ + return (0); +} + +struct table_algo addr_radix = { + .name = "addr:radix", + .type = IPFW_TABLE_ADDR, + .flags = TA_FLAG_DEFAULT, + .ta_buf_size = sizeof(struct ta_buf_radix), + .init = ta_init_radix, + .destroy = ta_destroy_radix, + .prepare_add = ta_prepare_add_radix, + .prepare_del = ta_prepare_del_radix, + .add = ta_add_radix, + .del = ta_del_radix, + .flush_entry = ta_flush_radix_entry, + .foreach = ta_foreach_radix, + .dump_tentry = ta_dump_radix_tentry, + .find_tentry = ta_find_radix_tentry, + .dump_tinfo = ta_dump_radix_tinfo, + .need_modify = ta_need_modify_radix, +}; + + +/* + * addr:hash cmds + * + * + * ti->data: + * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] + * [ 8][ 8[ 8][ 8] + * + * inv.mask4: 32 - mask + * inv.mask6: + * 1) _slow lookup: mask + * 2) _aligned: (128 - mask) / 8 + * 3) _64: 8 + * + * + * pflags: + * [v4=1/v6=0][hsize] + * [ 32][ 32] + */ + +struct chashentry; + +SLIST_HEAD(chashbhead, chashentry); + +struct chash_cfg { + struct chashbhead *head4; + struct chashbhead *head6; + size_t size4; + size_t size6; + size_t items4; + size_t items6; + uint8_t mask4; + uint8_t 
mask6; +}; + +struct chashentry { + SLIST_ENTRY(chashentry) next; + uint32_t value; + uint32_t type; + union { + uint32_t a4; /* Host format */ + struct in6_addr a6; /* Network format */ + } a; +}; + +struct ta_buf_chash +{ + void *ent_ptr; + struct chashentry ent; +}; + +#ifdef INET +static __inline uint32_t hash_ip(uint32_t addr, int hsize); +#endif +#ifdef INET6 +static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize); +static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize); +static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, + int mask, int hsize); +static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, + int hsize); +#endif +static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_lookup_chash_aligned(struct table_info *ti, void *key, + uint32_t keylen, uint32_t *val); +static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int chash_parse_opts(struct chash_cfg *cfg, char *data); +static void ta_print_chash_config(void *ta_state, struct table_info *ti, + char *buf, size_t bufsize); +static int ta_log2(uint32_t v); +static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_chash(void *ta_state, struct table_info *ti); +static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, + uint32_t size); +static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent); +static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_chash(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); +static int 
ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_add_chash(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_chash(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_need_modify_chash(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags); +static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags); +static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags); +static void ta_flush_mod_chash(void *ta_buf); + + +#ifdef INET +static __inline uint32_t +hash_ip(uint32_t addr, int hsize) +{ + + return (addr % (hsize - 1)); +} +#endif + +#ifdef INET6 +static __inline uint32_t +hash_ip6(struct in6_addr *addr6, int hsize) +{ + uint32_t i; + + i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^ + addr6->s6_addr32[2] ^ addr6->s6_addr32[3]; + + return (i % (hsize - 1)); +} + + +static __inline uint16_t +hash_ip64(struct in6_addr *addr6, int hsize) +{ + uint32_t i; + + i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1]; + + return (i % (hsize - 1)); +} + + +static __inline uint32_t +hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize) +{ + struct in6_addr mask6; + + ipv6_writemask(&mask6, mask); + memcpy(addr6, key, sizeof(struct in6_addr)); + APPLY_MASK(addr6, &mask6); + return (hash_ip6(addr6, hsize)); +} + +static __inline uint32_t +hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize) +{ + uint64_t *paddr; + + paddr = (uint64_t *)addr6; + *paddr = 0; + *(paddr + 1) = 0; + memcpy(addr6, key, mask); + return 
(hash_ip6(addr6, hsize)); +} +#endif + +static int +ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct chashbhead *head; + struct chashentry *ent; + uint16_t hash, hsize; + uint8_t imask; + + if (keylen == sizeof(in_addr_t)) { +#ifdef INET + head = (struct chashbhead *)ti->state; + imask = ti->data >> 24; + hsize = 1 << ((ti->data & 0xFFFF) >> 8); + uint32_t a; + a = ntohl(*((in_addr_t *)key)); + a = a >> imask; + hash = hash_ip(a, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (ent->a.a4 == a) { + *val = ent->value; + return (1); + } + } +#endif + } else { +#ifdef INET6 + /* IPv6: worst scenario: non-round mask */ + struct in6_addr addr6; + head = (struct chashbhead *)ti->xstate; + imask = (ti->data & 0xFF0000) >> 16; + hsize = 1 << (ti->data & 0xFF); + hash = hash_ip6_slow(&addr6, key, imask, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (memcmp(&ent->a.a6, &addr6, 16) == 0) { + *val = ent->value; + return (1); + } + } +#endif + } + + return (0); +} + +static int +ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct chashbhead *head; + struct chashentry *ent; + uint16_t hash, hsize; + uint8_t imask; + + if (keylen == sizeof(in_addr_t)) { +#ifdef INET + head = (struct chashbhead *)ti->state; + imask = ti->data >> 24; + hsize = 1 << ((ti->data & 0xFFFF) >> 8); + uint32_t a; + a = ntohl(*((in_addr_t *)key)); + a = a >> imask; + hash = hash_ip(a, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (ent->a.a4 == a) { + *val = ent->value; + return (1); + } + } +#endif + } else { +#ifdef INET6 + /* IPv6: aligned to 8bit mask */ + struct in6_addr addr6; + uint64_t *paddr, *ptmp; + head = (struct chashbhead *)ti->xstate; + imask = (ti->data & 0xFF0000) >> 16; + hsize = 1 << (ti->data & 0xFF); + + hash = hash_ip6_al(&addr6, key, imask, hsize); + paddr = (uint64_t *)&addr6; + SLIST_FOREACH(ent, &head[hash], next) { + ptmp = (uint64_t *)&ent->a.a6; + if 
(paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) { + *val = ent->value; + return (1); + } + } +#endif + } + + return (0); +} + +static int +ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct chashbhead *head; + struct chashentry *ent; + uint16_t hash, hsize; + uint8_t imask; + + if (keylen == sizeof(in_addr_t)) { +#ifdef INET + head = (struct chashbhead *)ti->state; + imask = ti->data >> 24; + hsize = 1 << ((ti->data & 0xFFFF) >> 8); + uint32_t a; + a = ntohl(*((in_addr_t *)key)); + a = a >> imask; + hash = hash_ip(a, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (ent->a.a4 == a) { + *val = ent->value; + return (1); + } + } +#endif + } else { +#ifdef INET6 + /* IPv6: /64 */ + uint64_t a6, *paddr; + head = (struct chashbhead *)ti->xstate; + paddr = (uint64_t *)key; + hsize = 1 << (ti->data & 0xFF); + a6 = *paddr; + hash = hash_ip64((struct in6_addr *)key, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + paddr = (uint64_t *)&ent->a.a6; + if (a6 == *paddr) { + *val = ent->value; + return (1); + } + } +#endif + } + + return (0); +} + +static int +chash_parse_opts(struct chash_cfg *cfg, char *data) +{ + char *pdel, *pend, *s; + int mask4, mask6; + + mask4 = cfg->mask4; + mask6 = cfg->mask6; + + if (data == NULL) + return (0); + if ((pdel = strchr(data, ' ')) == NULL) + return (0); + while (*pdel == ' ') + pdel++; + if (strncmp(pdel, "masks=", 6) != 0) + return (EINVAL); + if ((s = strchr(pdel, ' ')) != NULL) + *s++ = '\0'; + + pdel += 6; + /* Need /XX[,/YY] */ + if (*pdel++ != '/') + return (EINVAL); + mask4 = strtol(pdel, &pend, 10); + if (*pend == ',') { + /* ,/YY */ + pdel = pend + 1; + if (*pdel++ != '/') + return (EINVAL); + mask6 = strtol(pdel, &pend, 10); + if (*pend != '\0') + return (EINVAL); + } else if (*pend != '\0') + return (EINVAL); + + if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128) + return (EINVAL); + + cfg->mask4 = mask4; + cfg->mask6 = mask6; + + return (0); +} + +static void 
+ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, + size_t bufsize) +{ + struct chash_cfg *cfg; + + cfg = (struct chash_cfg *)ta_state; + + if (cfg->mask4 != 32 || cfg->mask6 != 128) + snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash", + cfg->mask4, cfg->mask6); + else + snprintf(buf, bufsize, "%s", "addr:hash"); +} + +static int +ta_log2(uint32_t v) +{ + uint32_t r; + + r = 0; + while (v >>= 1) + r++; + + return (r); +} + +/* + * New table. + * We assume 'data' to be either NULL or the following format: + * 'addr:hash [masks=/32[,/128]]' + */ +static int +ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + int error, i; + uint32_t hsize; + struct chash_cfg *cfg; + + cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO); + + cfg->mask4 = 32; + cfg->mask6 = 128; + + if ((error = chash_parse_opts(cfg, data)) != 0) { + free(cfg, M_IPFW); + return (error); + } + + cfg->size4 = 128; + cfg->size6 = 128; + + cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW, + M_WAITOK | M_ZERO); + cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW, + M_WAITOK | M_ZERO); + for (i = 0; i < cfg->size4; i++) + SLIST_INIT(&cfg->head4[i]); + for (i = 0; i < cfg->size6; i++) + SLIST_INIT(&cfg->head6[i]); + + + *ta_state = cfg; + ti->state = cfg->head4; + ti->xstate = cfg->head6; + + /* Store data depending on v6 mask length */ + hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); + if (cfg->mask6 == 64) { + ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16| + hsize; + ti->lookup = ta_lookup_chash_64; + } else if ((cfg->mask6 % 8) == 0) { + ti->data = (32 - cfg->mask4) << 24 | + cfg->mask6 << 13 | hsize; + ti->lookup = ta_lookup_chash_aligned; + } else { + /* don't do that! 
*/ + ti->data = (32 - cfg->mask4) << 24 | + cfg->mask6 << 16 | hsize; + ti->lookup = ta_lookup_chash_slow; + } + + return (0); +} + +static void +ta_destroy_chash(void *ta_state, struct table_info *ti) +{ + struct chash_cfg *cfg; + struct chashentry *ent, *ent_next; + int i; + + cfg = (struct chash_cfg *)ta_state; + + for (i = 0; i < cfg->size4; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) + free(ent, M_IPFW_TBL); + + for (i = 0; i < cfg->size6; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) + free(ent, M_IPFW_TBL); + + free(cfg->head4, M_IPFW); + free(cfg->head6, M_IPFW); + + free(cfg, M_IPFW); +} + +static void +ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct chash_cfg *cfg; + + cfg = (struct chash_cfg *)ta_state; + + tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; + tinfo->taclass4 = IPFW_TACLASS_HASH; + tinfo->size4 = cfg->size4; + tinfo->count4 = cfg->items4; + tinfo->itemsize4 = sizeof(struct chashentry); + tinfo->taclass6 = IPFW_TACLASS_HASH; + tinfo->size6 = cfg->size6; + tinfo->count6 = cfg->items6; + tinfo->itemsize6 = sizeof(struct chashentry); +} + +static int +ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct chash_cfg *cfg; + struct chashentry *ent; + + cfg = (struct chash_cfg *)ta_state; + ent = (struct chashentry *)e; + + if (ent->type == AF_INET) { + tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4)); + tent->masklen = cfg->mask4; + tent->subtype = AF_INET; + tent->v.kidx = ent->value; +#ifdef INET6 + } else { + memcpy(&tent->k, &ent->a.a6, sizeof(struct in6_addr)); + tent->masklen = cfg->mask6; + tent->subtype = AF_INET6; + tent->v.kidx = ent->value; +#endif + } + + return (0); +} + +static uint32_t +hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size) +{ + uint32_t hash; + + hash = 0; + + if (af == AF_INET) { +#ifdef INET + hash = hash_ip(ent->a.a4, size); +#endif + } else { 
+#ifdef INET6 + if (mlen == 64) + hash = hash_ip64(&ent->a.a6, size); + else + hash = hash_ip6(&ent->a.a6, size); +#endif + } + + return (hash); +} + +static int +tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent) +{ + int mlen; +#ifdef INET6 + struct in6_addr mask6; +#endif + + + mlen = tei->masklen; + + if (tei->subtype == AF_INET) { +#ifdef INET + if (mlen > 32) + return (EINVAL); + ent->type = AF_INET; + + /* Calculate masked address */ + ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen); +#endif +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + /* IPv6 case */ + if (mlen > 128) + return (EINVAL); + ent->type = AF_INET6; + + ipv6_writemask(&mask6, mlen); + memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr)); + APPLY_MASK(&ent->a.a6, &mask6); +#endif + } else { + /* Unknown CIDR type */ + return (EINVAL); + } + + return (0); +} + +static int +ta_find_chash_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct chash_cfg *cfg; + struct chashbhead *head; + struct chashentry ent, *tmp; + struct tentry_info tei; + int error; + uint32_t hash; + + cfg = (struct chash_cfg *)ta_state; + + memset(&ent, 0, sizeof(ent)); + memset(&tei, 0, sizeof(tei)); + + if (tent->subtype == AF_INET) { + tei.paddr = &tent->k.addr; + tei.masklen = cfg->mask4; + tei.subtype = AF_INET; + + if ((error = tei_to_chash_ent(&tei, &ent)) != 0) + return (error); + + head = cfg->head4; + hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4); + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (tmp->a.a4 != ent.a.a4) + continue; + + ta_dump_chash_tentry(ta_state, ti, tmp, tent); + return (0); + } + } else { + tei.paddr = &tent->k.addr6; + tei.masklen = cfg->mask6; + tei.subtype = AF_INET6; + + if ((error = tei_to_chash_ent(&tei, &ent)) != 0) + return (error); + + head = cfg->head6; + hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6); + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + 
if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0) + continue; + ta_dump_chash_tentry(ta_state, ti, tmp, tent); + return (0); + } + } + + return (ENOENT); +} + +static void +ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct chash_cfg *cfg; + struct chashentry *ent, *ent_next; + int i; + + cfg = (struct chash_cfg *)ta_state; + + for (i = 0; i < cfg->size4; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) + f(ent, arg); + + for (i = 0; i < cfg->size6; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) + f(ent, arg); +} + +static int +ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_chash *tb; + struct chashentry *ent; + int error; + + tb = (struct ta_buf_chash *)ta_buf; + + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); + + error = tei_to_chash_ent(tei, ent); + if (error != 0) { + free(ent, M_IPFW_TBL); + return (error); + } + tb->ent_ptr = ent; + + return (0); +} + +static int +ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct chash_cfg *cfg; + struct chashbhead *head; + struct chashentry *ent, *tmp; + struct ta_buf_chash *tb; + int exists; + uint32_t hash, value; + + cfg = (struct chash_cfg *)ta_state; + tb = (struct ta_buf_chash *)ta_buf; + ent = (struct chashentry *)tb->ent_ptr; + hash = 0; + exists = 0; + + /* Read current value from @tei */ + ent->value = tei->value; + + /* Read cuurrent value */ + if (tei->subtype == AF_INET) { + if (tei->masklen != cfg->mask4) + return (EINVAL); + head = cfg->head4; + hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (tmp->a.a4 == ent->a.a4) { + exists = 1; + break; + } + } + } else { + if (tei->masklen != cfg->mask6) + return (EINVAL); + head = cfg->head6; + hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); + /* Check for existence */ + 
SLIST_FOREACH(tmp, &head[hash], next) { + if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) { + exists = 1; + break; + } + } + } + + if (exists == 1) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + /* Record already exists. Update value if we're asked to */ + value = tmp->value; + tmp->value = tei->value; + tei->value = value; + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + } else { + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + SLIST_INSERT_HEAD(&head[hash], ent, next); + tb->ent_ptr = NULL; + *pnum = 1; + + /* Update counters */ + if (tei->subtype == AF_INET) + cfg->items4++; + else + cfg->items6++; + } + + return (0); +} + +static int +ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_chash *tb; + + tb = (struct ta_buf_chash *)ta_buf; + + return (tei_to_chash_ent(tei, &tb->ent)); +} + +static int +ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct chash_cfg *cfg; + struct chashbhead *head; + struct chashentry *tmp, *tmp_next, *ent; + struct ta_buf_chash *tb; + uint32_t hash; + + cfg = (struct chash_cfg *)ta_state; + tb = (struct ta_buf_chash *)ta_buf; + ent = &tb->ent; + + if (tei->subtype == AF_INET) { + if (tei->masklen != cfg->mask4) + return (EINVAL); + head = cfg->head4; + hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); + + SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { + if (tmp->a.a4 != ent->a.a4) + continue; + + SLIST_REMOVE(&head[hash], tmp, chashentry, next); + cfg->items4--; + tb->ent_ptr = tmp; + tei->value = tmp->value; + *pnum = 1; + return (0); + } + } else { + if (tei->masklen != cfg->mask6) + return (EINVAL); + head = cfg->head6; + hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); + SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { + if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0) + continue; + + 
SLIST_REMOVE(&head[hash], tmp, chashentry, next); + cfg->items6--; + tb->ent_ptr = tmp; + tei->value = tmp->value; + *pnum = 1; + return (0); + } + } + + return (ENOENT); +} + +static void +ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_chash *tb; + + tb = (struct ta_buf_chash *)ta_buf; + + if (tb->ent_ptr != NULL) + free(tb->ent_ptr, M_IPFW_TBL); +} + +/* + * Hash growing callbacks. + */ + +static int +ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct chash_cfg *cfg; + uint64_t data; + + /* + * Since we don't know exact number of IPv4/IPv6 records in @count, + * ignore non-zero @count value at all. Check current hash sizes + * and return appropriate data. + */ + + cfg = (struct chash_cfg *)ta_state; + + data = 0; + if (cfg->items4 > cfg->size4 && cfg->size4 < 65536) + data |= (cfg->size4 * 2) << 16; + if (cfg->items6 > cfg->size6 && cfg->size6 < 65536) + data |= cfg->size6 * 2; + + if (data != 0) { + *pflags = data; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger chash. + */ +static int +ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + struct chashbhead *head; + int i; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = (*pflags >> 16) & 0xFFFF; + mi->size6 = *pflags & 0xFFFF; + if (mi->size > 0) { + head = malloc(sizeof(struct chashbhead) * mi->size, + M_IPFW, M_WAITOK | M_ZERO); + for (i = 0; i < mi->size; i++) + SLIST_INIT(&head[i]); + mi->main_ptr = head; + } + + if (mi->size6 > 0) { + head = malloc(sizeof(struct chashbhead) * mi->size6, + M_IPFW, M_WAITOK | M_ZERO); + for (i = 0; i < mi->size6; i++) + SLIST_INIT(&head[i]); + mi->main_ptr6 = head; + } + + return (0); +} + +/* + * Copy data from old runtime array to new one. 
+ */ +static int +ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + + /* In is not possible to do rehash if we're not holidng WLOCK. */ + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct chash_cfg *cfg; + struct chashbhead *old_head, *new_head; + struct chashentry *ent, *ent_next; + int af, i, mlen; + uint32_t nhash; + size_t old_size, new_size; + + mi = (struct mod_item *)ta_buf; + cfg = (struct chash_cfg *)ta_state; + + /* Check which hash we need to grow and do we still need that */ + if (mi->size > 0 && cfg->size4 < mi->size) { + new_head = (struct chashbhead *)mi->main_ptr; + new_size = mi->size; + old_size = cfg->size4; + old_head = ti->state; + mlen = cfg->mask4; + af = AF_INET; + + for (i = 0; i < old_size; i++) { + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { + nhash = hash_ent(ent, af, mlen, new_size); + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); + } + } + + ti->state = new_head; + cfg->head4 = new_head; + cfg->size4 = mi->size; + mi->main_ptr = old_head; + } + + if (mi->size6 > 0 && cfg->size6 < mi->size6) { + new_head = (struct chashbhead *)mi->main_ptr6; + new_size = mi->size6; + old_size = cfg->size6; + old_head = ti->xstate; + mlen = cfg->mask6; + af = AF_INET6; + + for (i = 0; i < old_size; i++) { + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { + nhash = hash_ent(ent, af, mlen, new_size); + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); + } + } + + ti->xstate = new_head; + cfg->head6 = new_head; + cfg->size6 = mi->size6; + mi->main_ptr6 = old_head; + } + + /* Update lower 32 bits with new values */ + ti->data &= 0xFFFFFFFF00000000; + ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); +} + +/* + * Free unneded array. 
+ */ +static void +ta_flush_mod_chash(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); + if (mi->main_ptr6 != NULL) + free(mi->main_ptr6, M_IPFW); +} + +struct table_algo addr_hash = { + .name = "addr:hash", + .type = IPFW_TABLE_ADDR, + .ta_buf_size = sizeof(struct ta_buf_chash), + .init = ta_init_chash, + .destroy = ta_destroy_chash, + .prepare_add = ta_prepare_add_chash, + .prepare_del = ta_prepare_del_chash, + .add = ta_add_chash, + .del = ta_del_chash, + .flush_entry = ta_flush_chash_entry, + .foreach = ta_foreach_chash, + .dump_tentry = ta_dump_chash_tentry, + .find_tentry = ta_find_chash_tentry, + .print_config = ta_print_chash_config, + .dump_tinfo = ta_dump_chash_tinfo, + .need_modify = ta_need_modify_chash, + .prepare_mod = ta_prepare_mod_chash, + .fill_mod = ta_fill_mod_chash, + .modify = ta_modify_chash, + .flush_mod = ta_flush_mod_chash, +}; + + +/* + * Iface table cmds. + * + * Implementation: + * + * Runtime part: + * - sorted array of "struct ifidx" pointed by ti->state. + * Array is allocated with rounding up to IFIDX_CHUNK. Only existing + * interfaces are stored in array, however its allocated size is + * sufficient to hold all table records if needed. + * - current array size is stored in ti->data + * + * Table data: + * - "struct iftable_cfg" is allocated to store table state (ta_state). + * - All table records are stored inside namedobj instance. 
+ * + */ + +struct ifidx { + uint16_t kidx; + uint16_t spare; + uint32_t value; +}; +#define DEFAULT_IFIDX_SIZE 64 + +struct iftable_cfg; + +struct ifentry { + struct named_object no; + struct ipfw_ifc ic; + struct iftable_cfg *icfg; + uint32_t value; + int linked; +}; + +struct iftable_cfg { + struct namedobj_instance *ii; + struct ip_fw_chain *ch; + struct table_info *ti; + void *main_ptr; + size_t size; /* Number of items allocated in array */ + size_t count; /* Number of all items */ + size_t used; /* Number of items _active_ now */ +}; + +struct ta_buf_ifidx +{ + struct ifentry *ife; + uint32_t value; +}; + +int compare_ifidx(const void *k, const void *v); +static struct ifidx * ifidx_find(struct table_info *ti, void *key); +static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti); +static void destroy_ifidx_locked(struct namedobj_instance *ii, + struct named_object *no, void *arg); +static void ta_destroy_ifidx(void *ta_state, struct table_info *ti); +static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_add_ifidx(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_ifidx(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); +static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, + 
uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags); +static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags); +static void ta_flush_mod_ifidx(void *ta_buf); +static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent); +static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, + void *arg); +static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); + +int +compare_ifidx(const void *k, const void *v) +{ + const struct ifidx *ifidx; + uint16_t key; + + key = *((const uint16_t *)k); + ifidx = (const struct ifidx *)v; + + if (key < ifidx->kidx) + return (-1); + else if (key > ifidx->kidx) + return (1); + + return (0); +} + +/* + * Adds item @item with key @key into ascending-sorted array @base. + * Assumes @base has enough additional storage. + * + * Returns 1 on success, 0 on duplicate key. + */ +static int +badd(const void *key, void *item, void *base, size_t nmemb, + size_t size, int (*compar) (const void *, const void *)) +{ + int min, max, mid, shift, res; + caddr_t paddr; + + if (nmemb == 0) { + memcpy(base, item, size); + return (1); + } + + /* Binary search */ + min = 0; + max = nmemb - 1; + mid = 0; + while (min <= max) { + mid = (min + max) / 2; + res = compar(key, (const void *)((caddr_t)base + mid * size)); + if (res == 0) + return (0); + + if (res > 0) + min = mid + 1; + else + max = mid - 1; + } + + /* Item not found. 
*/ + res = compar(key, (const void *)((caddr_t)base + mid * size)); + if (res > 0) + shift = mid + 1; + else + shift = mid; + + paddr = (caddr_t)base + shift * size; + if (nmemb > shift) + memmove(paddr + size, paddr, (nmemb - shift) * size); + + memcpy(paddr, item, size); + + return (1); +} + +/* + * Deletes item with key @key from ascending-sorted array @base. + * + * Returns 1 on success, 0 for non-existent key. + */ +static int +bdel(const void *key, void *base, size_t nmemb, size_t size, + int (*compar) (const void *, const void *)) +{ + caddr_t item; + size_t sz; + + item = (caddr_t)bsearch(key, base, nmemb, size, compar); + + if (item == NULL) + return (0); + + sz = (caddr_t)base + nmemb * size - item; + + if (sz > 0) + memmove(item, item + size, sz); + + return (1); +} + +static struct ifidx * +ifidx_find(struct table_info *ti, void *key) +{ + struct ifidx *ifi; + + ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), + compare_ifidx); + + return (ifi); +} + +static int +ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct ifidx *ifi; + + ifi = ifidx_find(ti, key); + + if (ifi != NULL) { + *val = ifi->value; + return (1); + } + + return (0); +} + +static int +ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + struct iftable_cfg *icfg; + + icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); + + icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); + icfg->size = DEFAULT_IFIDX_SIZE; + icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, + M_WAITOK | M_ZERO); + icfg->ch = ch; + + *ta_state = icfg; + ti->state = icfg->main_ptr; + ti->lookup = ta_lookup_ifidx; + + return (0); +} + +/* + * Handle tableinfo @ti pointer change (on table array resize). 
+ */ +static void +ta_change_ti_ifidx(void *ta_state, struct table_info *ti) +{ + struct iftable_cfg *icfg; + + icfg = (struct iftable_cfg *)ta_state; + icfg->ti = ti; +} + +static void +destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + struct ifentry *ife; + struct ip_fw_chain *ch; + + ch = (struct ip_fw_chain *)arg; + ife = (struct ifentry *)no; + + ipfw_iface_del_notify(ch, &ife->ic); + free(ife, M_IPFW_TBL); +} + + +/* + * Destroys table @ti + */ +static void +ta_destroy_ifidx(void *ta_state, struct table_info *ti) +{ + struct iftable_cfg *icfg; + struct ip_fw_chain *ch; + + icfg = (struct iftable_cfg *)ta_state; + ch = icfg->ch; + + if (icfg->main_ptr != NULL) + free(icfg->main_ptr, M_IPFW); + + ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); + + ipfw_objhash_destroy(icfg->ii); + + free(icfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct iftable_cfg *cfg; + + cfg = (struct iftable_cfg *)ta_state; + + tinfo->taclass4 = IPFW_TACLASS_ARRAY; + tinfo->size4 = cfg->size; + tinfo->count4 = cfg->used; + tinfo->itemsize4 = sizeof(struct ifidx); +} + +/* + * Prepare state to add to the table: + * allocate ifentry and reference needed interface. 
+ */ +static int +ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_ifidx *tb; + char *ifname; + struct ifentry *ife; + + tb = (struct ta_buf_ifidx *)ta_buf; + + /* Check if string is terminated */ + ifname = (char *)tei->paddr; + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) + return (EINVAL); + + ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); + ife->ic.cb = if_notifier; + ife->ic.cbdata = ife; + + if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { + free(ife, M_IPFW_TBL); + return (EINVAL); + } + + /* Use ipfw_iface 'ifname' field as stable storage */ + ife->no.name = ife->ic.iface->ifname; + + tb->ife = ife; + + return (0); +} + +static int +ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct iftable_cfg *icfg; + struct ifentry *ife, *tmp; + struct ta_buf_ifidx *tb; + struct ipfw_iface *iif; + struct ifidx *ifi; + char *ifname; + uint32_t value; + + tb = (struct ta_buf_ifidx *)ta_buf; + ifname = (char *)tei->paddr; + icfg = (struct iftable_cfg *)ta_state; + ife = tb->ife; + + ife->icfg = icfg; + ife->value = tei->value; + + tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); + + if (tmp != NULL) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + + /* Exchange values in @tmp and @tei */ + value = tmp->value; + tmp->value = tei->value; + tei->value = value; + + iif = tmp->ic.iface; + if (iif->resolved != 0) { + /* We have to update runtime value, too */ + ifi = ifidx_find(ti, &iif->ifindex); + ifi->value = ife->value; + } + + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + return (0); + } + + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + /* Link to internal list */ + ipfw_objhash_add(icfg->ii, &ife->no); + + /* Link notifier (possible running its callback) */ + ipfw_iface_add_notify(icfg->ch, &ife->ic); + 
icfg->count++; + + tb->ife = NULL; + *pnum = 1; + + return (0); +} + +/* + * Prepare to delete key from table. + * Do basic interface name checks. + */ +static int +ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_ifidx *tb; + char *ifname; + + tb = (struct ta_buf_ifidx *)ta_buf; + + /* Check if string is terminated */ + ifname = (char *)tei->paddr; + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) + return (EINVAL); + + return (0); +} + +/* + * Remove key from both configuration list and + * runtime array. Removed interface notification. + */ +static int +ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct iftable_cfg *icfg; + struct ifentry *ife; + struct ta_buf_ifidx *tb; + char *ifname; + uint16_t ifindex; + int res; + + tb = (struct ta_buf_ifidx *)ta_buf; + ifname = (char *)tei->paddr; + icfg = (struct iftable_cfg *)ta_state; + ife = tb->ife; + + ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); + + if (ife == NULL) + return (ENOENT); + + if (ife->linked != 0) { + /* We have to remove item from runtime */ + ifindex = ife->ic.iface->ifindex; + + res = bdel(&ifindex, icfg->main_ptr, icfg->used, + sizeof(struct ifidx), compare_ifidx); + + KASSERT(res == 1, ("index %d does not exist", ifindex)); + icfg->used--; + ti->data = icfg->used; + ife->linked = 0; + } + + /* Unlink from local list */ + ipfw_objhash_del(icfg->ii, &ife->no); + /* Unlink notifier */ + ipfw_iface_del_notify(icfg->ch, &ife->ic); + + icfg->count--; + tei->value = ife->value; + + tb->ife = ife; + *pnum = 1; + + return (0); +} + +/* + * Flush deleted entry. + * Drops interface reference and frees entry. 
+ */ +static void +ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_ifidx *tb; + + tb = (struct ta_buf_ifidx *)ta_buf; + + if (tb->ife != NULL) { + /* Unlink first */ + ipfw_iface_unref(ch, &tb->ife->ic); + free(tb->ife, M_IPFW_TBL); + } +} + + +/* + * Handle interface announce/withdrawal for particular table. + * Every real runtime array modification happens here. + */ +static void +if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex) +{ + struct ifentry *ife; + struct ifidx ifi; + struct iftable_cfg *icfg; + struct table_info *ti; + int res; + + ife = (struct ifentry *)cbdata; + icfg = ife->icfg; + ti = icfg->ti; + + KASSERT(ti != NULL, ("ti=NULL, check change_ti handler")); + + if (ife->linked == 0 && ifindex != 0) { + /* Interface announce */ + ifi.kidx = ifindex; + ifi.spare = 0; + ifi.value = ife->value; + res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used, + sizeof(struct ifidx), compare_ifidx); + KASSERT(res == 1, ("index %d already exists", ifindex)); + icfg->used++; + ti->data = icfg->used; + ife->linked = 1; + } else if (ife->linked != 0 && ifindex == 0) { + /* Interface withdrawal */ + ifindex = ife->ic.iface->ifindex; + + res = bdel(&ifindex, icfg->main_ptr, icfg->used, + sizeof(struct ifidx), compare_ifidx); + + KASSERT(res == 1, ("index %d does not exist", ifindex)); + icfg->used--; + ti->data = icfg->used; + ife->linked = 0; + } +} + + +/* + * Table growing callbacks. + */ + +static int +ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct iftable_cfg *cfg; + uint32_t size; + + cfg = (struct iftable_cfg *)ta_state; + + size = cfg->size; + while (size < cfg->count + count) + size *= 2; + + if (size != cfg->size) { + *pflags = size; + return (1); + } + + return (0); +} + +/* + * Allocate ned, larger runtime ifidx array. 
+ */ +static int +ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = *pflags; + mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW, + M_WAITOK | M_ZERO); + + return (0); +} + +/* + * Copy data from old runtime array to new one. + */ +static int +ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + struct mod_item *mi; + struct iftable_cfg *icfg; + + mi = (struct mod_item *)ta_buf; + icfg = (struct iftable_cfg *)ta_state; + + /* Check if we still need to grow array */ + if (icfg->size >= mi->size) { + *pflags = 0; + return (0); + } + + memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); + + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct iftable_cfg *icfg; + void *old_ptr; + + mi = (struct mod_item *)ta_buf; + icfg = (struct iftable_cfg *)ta_state; + + old_ptr = icfg->main_ptr; + icfg->main_ptr = mi->main_ptr; + icfg->size = mi->size; + ti->state = icfg->main_ptr; + + mi->main_ptr = old_ptr; +} + +/* + * Free unneded array. 
+ */ +static void +ta_flush_mod_ifidx(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); +} + +static int +ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct ifentry *ife; + + ife = (struct ifentry *)e; + + tent->masklen = 8 * IF_NAMESIZE; + memcpy(&tent->k, ife->no.name, IF_NAMESIZE); + tent->v.kidx = ife->value; + + return (0); +} + +static int +ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct iftable_cfg *icfg; + struct ifentry *ife; + char *ifname; + + icfg = (struct iftable_cfg *)ta_state; + ifname = tent->k.iface; + + if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) + return (EINVAL); + + ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); + + if (ife != NULL) { + ta_dump_ifidx_tentry(ta_state, ti, ife, tent); + return (0); + } + + return (ENOENT); +} + +struct wa_ifidx { + ta_foreach_f *f; + void *arg; +}; + +static void +foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, + void *arg) +{ + struct ifentry *ife; + struct wa_ifidx *wa; + + ife = (struct ifentry *)no; + wa = (struct wa_ifidx *)arg; + + wa->f(ife, wa->arg); +} + +static void +ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct iftable_cfg *icfg; + struct wa_ifidx wa; + + icfg = (struct iftable_cfg *)ta_state; + + wa.f = f; + wa.arg = arg; + + ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); +} + +struct table_algo iface_idx = { + .name = "iface:array", + .type = IPFW_TABLE_INTERFACE, + .flags = TA_FLAG_DEFAULT, + .ta_buf_size = sizeof(struct ta_buf_ifidx), + .init = ta_init_ifidx, + .destroy = ta_destroy_ifidx, + .prepare_add = ta_prepare_add_ifidx, + .prepare_del = ta_prepare_del_ifidx, + .add = ta_add_ifidx, + .del = ta_del_ifidx, + .flush_entry = ta_flush_ifidx_entry, + .foreach = ta_foreach_ifidx, + .dump_tentry = 
ta_dump_ifidx_tentry, + .find_tentry = ta_find_ifidx_tentry, + .dump_tinfo = ta_dump_ifidx_tinfo, + .need_modify = ta_need_modify_ifidx, + .prepare_mod = ta_prepare_mod_ifidx, + .fill_mod = ta_fill_mod_ifidx, + .modify = ta_modify_ifidx, + .flush_mod = ta_flush_mod_ifidx, + .change_ti = ta_change_ti_ifidx, +}; + +/* + * Number array cmds. + * + * Implementation: + * + * Runtime part: + * - sorted array of "struct numarray" pointed by ti->state. + * Array is allocated with rounding up to NUMARRAY_CHUNK. + * - current array size is stored in ti->data + * + */ + +struct numarray { + uint32_t number; + uint32_t value; +}; + +struct numarray_cfg { + void *main_ptr; + size_t size; /* Number of items allocated in array */ + size_t used; /* Number of items _active_ now */ +}; + +struct ta_buf_numarray +{ + struct numarray na; +}; + +int compare_numarray(const void *k, const void *v); +static struct numarray *numarray_find(struct table_info *ti, void *key); +static int ta_lookup_numarray(struct table_info *ti, void *key, + uint32_t keylen, uint32_t *val); +static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_numarray(void *ta_state, struct table_info *ti); +static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_prepare_add_numarray(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static int ta_add_numarray(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_del_numarray(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_numarray_entry(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); 
+static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +static void ta_modify_numarray(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t pflags); +static void ta_flush_mod_numarray(void *ta_buf); +static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_numarray(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); + +int +compare_numarray(const void *k, const void *v) +{ + const struct numarray *na; + uint32_t key; + + key = *((const uint32_t *)k); + na = (const struct numarray *)v; + + if (key < na->number) + return (-1); + else if (key > na->number) + return (1); + + return (0); +} + +static struct numarray * +numarray_find(struct table_info *ti, void *key) +{ + struct numarray *ri; + + ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), + compare_ifidx); + + return (ri); +} + +static int +ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct numarray *ri; + + ri = numarray_find(ti, key); + + if (ri != NULL) { + *val = ri->value; + return (1); + } + + return (0); +} + +static int +ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + struct numarray_cfg *cfg; + + cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); + + cfg->size = 16; + cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW, + M_WAITOK | M_ZERO); + + *ta_state = cfg; + ti->state = cfg->main_ptr; + ti->lookup = ta_lookup_numarray; + + return (0); +} + +/* + * Destroys table @ti + */ +static void +ta_destroy_numarray(void *ta_state, struct table_info *ti) +{ + struct numarray_cfg *cfg; + + cfg = (struct numarray_cfg *)ta_state; + + if (cfg->main_ptr != NULL) + free(cfg->main_ptr, M_IPFW); + + free(cfg, M_IPFW); 
+} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct numarray_cfg *cfg; + + cfg = (struct numarray_cfg *)ta_state; + + tinfo->taclass4 = IPFW_TACLASS_ARRAY; + tinfo->size4 = cfg->size; + tinfo->count4 = cfg->used; + tinfo->itemsize4 = sizeof(struct numarray); +} + +/* + * Prepare for addition/deletion to an array. + */ +static int +ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_numarray *tb; + + tb = (struct ta_buf_numarray *)ta_buf; + + tb->na.number = *((uint32_t *)tei->paddr); + + return (0); +} + +static int +ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct numarray_cfg *cfg; + struct ta_buf_numarray *tb; + struct numarray *ri; + int res; + uint32_t value; + + tb = (struct ta_buf_numarray *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + /* Read current value from @tei */ + tb->na.value = tei->value; + + ri = numarray_find(ti, &tb->na.number); + + if (ri != NULL) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + + /* Exchange values between ri and @tei */ + value = ri->value; + ri->value = tei->value; + tei->value = value; + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + return (0); + } + + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, + sizeof(struct numarray), compare_numarray); + + KASSERT(res == 1, ("number %d already exists", tb->na.number)); + cfg->used++; + ti->data = cfg->used; + *pnum = 1; + + return (0); +} + +/* + * Remove key from both configuration list and + * runtime array. Removed interface notification. 
+ */ +static int +ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct numarray_cfg *cfg; + struct ta_buf_numarray *tb; + struct numarray *ri; + int res; + + tb = (struct ta_buf_numarray *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + ri = numarray_find(ti, &tb->na.number); + if (ri == NULL) + return (ENOENT); + + tei->value = ri->value; + + res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, + sizeof(struct numarray), compare_numarray); + + KASSERT(res == 1, ("number %u does not exist", tb->na.number)); + cfg->used--; + ti->data = cfg->used; + *pnum = 1; + + return (0); +} + +static void +ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + + /* We don't have any state, do nothing */ +} + + +/* + * Table growing callbacks. + */ + +static int +ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct numarray_cfg *cfg; + size_t size; + + cfg = (struct numarray_cfg *)ta_state; + + size = cfg->size; + while (size < cfg->used + count) + size *= 2; + + if (size != cfg->size) { + *pflags = size; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger runtime array. + */ +static int +ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = *pflags; + mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, + M_WAITOK | M_ZERO); + + return (0); +} + +/* + * Copy data from old runtime array to new one. 
+ */ +static int +ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + struct mod_item *mi; + struct numarray_cfg *cfg; + + mi = (struct mod_item *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + /* Check if we still need to grow array */ + if (cfg->size >= mi->size) { + *pflags = 0; + return (0); + } + + memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); + + return (0); +} + +/* + * Switch old & new arrays. + */ +static void +ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct numarray_cfg *cfg; + void *old_ptr; + + mi = (struct mod_item *)ta_buf; + cfg = (struct numarray_cfg *)ta_state; + + old_ptr = cfg->main_ptr; + cfg->main_ptr = mi->main_ptr; + cfg->size = mi->size; + ti->state = cfg->main_ptr; + + mi->main_ptr = old_ptr; +} + +/* + * Free unneded array. + */ +static void +ta_flush_mod_numarray(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); +} + +static int +ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct numarray *na; + + na = (struct numarray *)e; + + tent->k.key = na->number; + tent->v.kidx = na->value; + + return (0); +} + +static int +ta_find_numarray_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct numarray_cfg *cfg; + struct numarray *ri; + + cfg = (struct numarray_cfg *)ta_state; + + ri = numarray_find(ti, &tent->k.key); + + if (ri != NULL) { + ta_dump_numarray_tentry(ta_state, ti, ri, tent); + return (0); + } + + return (ENOENT); +} + +static void +ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct numarray_cfg *cfg; + struct numarray *array; + int i; + + cfg = (struct numarray_cfg *)ta_state; + array = cfg->main_ptr; + + for (i = 0; i < cfg->used; i++) + f(&array[i], arg); +} 
+ +struct table_algo number_array = { + .name = "number:array", + .type = IPFW_TABLE_NUMBER, + .ta_buf_size = sizeof(struct ta_buf_numarray), + .init = ta_init_numarray, + .destroy = ta_destroy_numarray, + .prepare_add = ta_prepare_add_numarray, + .prepare_del = ta_prepare_add_numarray, + .add = ta_add_numarray, + .del = ta_del_numarray, + .flush_entry = ta_flush_numarray_entry, + .foreach = ta_foreach_numarray, + .dump_tentry = ta_dump_numarray_tentry, + .find_tentry = ta_find_numarray_tentry, + .dump_tinfo = ta_dump_numarray_tinfo, + .need_modify = ta_need_modify_numarray, + .prepare_mod = ta_prepare_mod_numarray, + .fill_mod = ta_fill_mod_numarray, + .modify = ta_modify_numarray, + .flush_mod = ta_flush_mod_numarray, +}; + +/* + * flow:hash cmds + * + * + * ti->data: + * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] + * [ 8][ 8[ 8][ 8] + * + * inv.mask4: 32 - mask + * inv.mask6: + * 1) _slow lookup: mask + * 2) _aligned: (128 - mask) / 8 + * 3) _64: 8 + * + * + * pflags: + * [hsize4][hsize6] + * [ 16][ 16] + */ + +struct fhashentry; + +SLIST_HEAD(fhashbhead, fhashentry); + +struct fhashentry { + SLIST_ENTRY(fhashentry) next; + uint8_t af; + uint8_t proto; + uint16_t spare0; + uint16_t dport; + uint16_t sport; + uint32_t value; + uint32_t spare1; +}; + +struct fhashentry4 { + struct fhashentry e; + struct in_addr dip; + struct in_addr sip; +}; + +struct fhashentry6 { + struct fhashentry e; + struct in6_addr dip6; + struct in6_addr sip6; +}; + +struct fhash_cfg { + struct fhashbhead *head; + size_t size; + size_t items; + struct fhashentry4 fe4; + struct fhashentry6 fe6; +}; + +struct ta_buf_fhash { + void *ent_ptr; + struct fhashentry6 fe6; +}; + +static __inline int cmp_flow_ent(struct fhashentry *a, + struct fhashentry *b, size_t sz); +static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); +static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); +static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); +static int 
ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, +struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_fhash(void *ta_state, struct table_info *ti); +static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, + void *e, ipfw_obj_tentry *tent); +static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); +static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_fhash(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); +static int ta_prepare_add_fhash(struct ip_fw_chain *ch, + struct tentry_info *tei, void *ta_buf); +static int ta_add_fhash(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_del_fhash(void *ta_state, struct table_info *ti, + struct tentry_info *tei, void *ta_buf, uint32_t *pnum); +static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf); +static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, + uint32_t count, uint64_t *pflags); +static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); +static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, + void *ta_buf, uint64_t *pflags); +static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags); +static void ta_flush_mod_fhash(void *ta_buf); + +static __inline int +cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) +{ + uint64_t *ka, *kb; + + ka = (uint64_t *)(&a->next + 1); + kb = (uint64_t *)(&b->next + 1); + + if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) + return (1); + + return (0); 
+} + +static __inline uint32_t +hash_flow4(struct fhashentry4 *f, int hsize) +{ + uint32_t i; + + i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); + + return (i % (hsize - 1)); +} + +static __inline uint32_t +hash_flow6(struct fhashentry6 *f, int hsize) +{ + uint32_t i; + + i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ + (f->dip6.__u6_addr.__u6_addr32[3]) ^ + (f->sip6.__u6_addr.__u6_addr32[2]) ^ + (f->sip6.__u6_addr.__u6_addr32[3]) ^ + (f->e.dport) ^ (f->e.sport); + + return (i % (hsize - 1)); +} + +static uint32_t +hash_flow_ent(struct fhashentry *ent, uint32_t size) +{ + uint32_t hash; + + if (ent->af == AF_INET) { + hash = hash_flow4((struct fhashentry4 *)ent, size); + } else { + hash = hash_flow6((struct fhashentry6 *)ent, size); + } + + return (hash); +} + +static int +ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct fhashbhead *head; + struct fhashentry *ent; + struct fhashentry4 *m4; + struct ipfw_flow_id *id; + uint16_t hash, hsize; + + id = (struct ipfw_flow_id *)key; + head = (struct fhashbhead *)ti->state; + hsize = ti->data; + m4 = (struct fhashentry4 *)ti->xstate; + + if (id->addr_type == 4) { + struct fhashentry4 f; + + /* Copy hash mask */ + f = *m4; + + f.dip.s_addr &= id->dst_ip; + f.sip.s_addr &= id->src_ip; + f.e.dport &= id->dst_port; + f.e.sport &= id->src_port; + f.e.proto &= id->proto; + hash = hash_flow4(&f, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { + *val = ent->value; + return (1); + } + } + } else if (id->addr_type == 6) { + struct fhashentry6 f; + uint64_t *fp, *idp; + + /* Copy hash mask */ + f = *((struct fhashentry6 *)(m4 + 1)); + + /* Handle lack of __u6_addr.__u6_addr64 */ + fp = (uint64_t *)&f.dip6; + idp = (uint64_t *)&id->dst_ip6; + /* src IPv6 is stored after dst IPv6 */ + *fp++ &= *idp++; + *fp++ &= *idp++; + *fp++ &= *idp++; + *fp &= *idp; + f.e.dport &= id->dst_port; + f.e.sport &= id->src_port; + f.e.proto &= 
id->proto; + hash = hash_flow6(&f, hsize); + SLIST_FOREACH(ent, &head[hash], next) { + if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { + *val = ent->value; + return (1); + } + } + } + + return (0); +} + +/* + * New table. + */ +static int +ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + int i; + struct fhash_cfg *cfg; + struct fhashentry4 *fe4; + struct fhashentry6 *fe6; + + cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); + + cfg->size = 512; + + cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, + M_WAITOK | M_ZERO); + for (i = 0; i < cfg->size; i++) + SLIST_INIT(&cfg->head[i]); + + /* Fill in fe masks based on @tflags */ + fe4 = &cfg->fe4; + fe6 = &cfg->fe6; + if (tflags & IPFW_TFFLAG_SRCIP) { + memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); + memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); + } + if (tflags & IPFW_TFFLAG_DSTIP) { + memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); + memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); + } + if (tflags & IPFW_TFFLAG_SRCPORT) { + memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); + memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); + } + if (tflags & IPFW_TFFLAG_DSTPORT) { + memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); + memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); + } + if (tflags & IPFW_TFFLAG_PROTO) { + memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); + memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); + } + + fe4->e.af = AF_INET; + fe6->e.af = AF_INET6; + + *ta_state = cfg; + ti->state = cfg->head; + ti->xstate = &cfg->fe4; + ti->data = cfg->size; + ti->lookup = ta_lookup_fhash; + + return (0); +} + +static void +ta_destroy_fhash(void *ta_state, struct table_info *ti) +{ + struct fhash_cfg *cfg; + struct fhashentry *ent, *ent_next; + int i; + + cfg = (struct fhash_cfg *)ta_state; + + for (i = 0; i < cfg->size; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) + free(ent, M_IPFW_TBL); + + free(cfg->head, M_IPFW); 
+ free(cfg, M_IPFW); +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + struct fhash_cfg *cfg; + + cfg = (struct fhash_cfg *)ta_state; + + tinfo->flags = IPFW_TATFLAGS_AFITEM; + tinfo->taclass4 = IPFW_TACLASS_HASH; + tinfo->size4 = cfg->size; + tinfo->count4 = cfg->items; + tinfo->itemsize4 = sizeof(struct fhashentry4); + tinfo->itemsize6 = sizeof(struct fhashentry6); +} + +static int +ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct fhash_cfg *cfg; + struct fhashentry *ent; + struct fhashentry4 *fe4; +#ifdef INET6 + struct fhashentry6 *fe6; +#endif + struct tflow_entry *tfe; + + cfg = (struct fhash_cfg *)ta_state; + ent = (struct fhashentry *)e; + tfe = &tent->k.flow; + + tfe->af = ent->af; + tfe->proto = ent->proto; + tfe->dport = htons(ent->dport); + tfe->sport = htons(ent->sport); + tent->v.kidx = ent->value; + tent->subtype = ent->af; + + if (ent->af == AF_INET) { + fe4 = (struct fhashentry4 *)ent; + tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); + tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); + tent->masklen = 32; +#ifdef INET6 + } else { + fe6 = (struct fhashentry6 *)ent; + tfe->a.a6.sip6 = fe6->sip6; + tfe->a.a6.dip6 = fe6->dip6; + tent->masklen = 128; +#endif + } + + return (0); +} + +static int +tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) +{ +#ifdef INET + struct fhashentry4 *fe4; +#endif +#ifdef INET6 + struct fhashentry6 *fe6; +#endif + struct tflow_entry *tfe; + + tfe = (struct tflow_entry *)tei->paddr; + + ent->af = tei->subtype; + ent->proto = tfe->proto; + ent->dport = ntohs(tfe->dport); + ent->sport = ntohs(tfe->sport); + + if (tei->subtype == AF_INET) { +#ifdef INET + fe4 = (struct fhashentry4 *)ent; + fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); + fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr); +#endif +#ifdef INET6 + } else if (tei->subtype == AF_INET6) { + fe6 = 
(struct fhashentry6 *)ent; + fe6->sip6 = tfe->a.a6.sip6; + fe6->dip6 = tfe->a.a6.dip6; +#endif + } else { + /* Unknown CIDR type */ + return (EINVAL); + } + + return (0); +} + + +static int +ta_find_fhash_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct fhash_cfg *cfg; + struct fhashbhead *head; + struct fhashentry *ent, *tmp; + struct fhashentry6 fe6; + struct tentry_info tei; + int error; + uint32_t hash; + size_t sz; + + cfg = (struct fhash_cfg *)ta_state; + + ent = &fe6.e; + + memset(&fe6, 0, sizeof(fe6)); + memset(&tei, 0, sizeof(tei)); + + tei.paddr = &tent->k.flow; + tei.subtype = tent->subtype; + + if ((error = tei_to_fhash_ent(&tei, ent)) != 0) + return (error); + + head = cfg->head; + hash = hash_flow_ent(ent, cfg->size); + + if (tei.subtype == AF_INET) + sz = 2 * sizeof(struct in_addr); + else + sz = 2 * sizeof(struct in6_addr); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (cmp_flow_ent(tmp, ent, sz) != 0) { + ta_dump_fhash_tentry(ta_state, ti, tmp, tent); + return (0); + } + } + + return (ENOENT); +} + +static void +ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct fhash_cfg *cfg; + struct fhashentry *ent, *ent_next; + int i; + + cfg = (struct fhash_cfg *)ta_state; + + for (i = 0; i < cfg->size; i++) + SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) + f(ent, arg); +} + +static int +ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_fhash *tb; + struct fhashentry *ent; + size_t sz; + int error; + + tb = (struct ta_buf_fhash *)ta_buf; + + if (tei->subtype == AF_INET) + sz = sizeof(struct fhashentry4); + else if (tei->subtype == AF_INET6) + sz = sizeof(struct fhashentry6); + else + return (EINVAL); + + ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO); + + error = tei_to_fhash_ent(tei, ent); + if (error != 0) { + free(ent, M_IPFW_TBL); + return (error); + } + tb->ent_ptr = ent; + + 
return (0); +} + +static int +ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct fhash_cfg *cfg; + struct fhashbhead *head; + struct fhashentry *ent, *tmp; + struct ta_buf_fhash *tb; + int exists; + uint32_t hash, value; + size_t sz; + + cfg = (struct fhash_cfg *)ta_state; + tb = (struct ta_buf_fhash *)ta_buf; + ent = (struct fhashentry *)tb->ent_ptr; + exists = 0; + + /* Read current value from @tei */ + ent->value = tei->value; + + head = cfg->head; + hash = hash_flow_ent(ent, cfg->size); + + if (tei->subtype == AF_INET) + sz = 2 * sizeof(struct in_addr); + else + sz = 2 * sizeof(struct in6_addr); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (cmp_flow_ent(tmp, ent, sz) != 0) { + exists = 1; + break; + } + } + + if (exists == 1) { + if ((tei->flags & TEI_FLAGS_UPDATE) == 0) + return (EEXIST); + /* Record already exists. Update value if we're asked to */ + /* Exchange values between tmp and @tei */ + value = tmp->value; + tmp->value = tei->value; + tei->value = value; + /* Indicate that update has happened instead of addition */ + tei->flags |= TEI_FLAGS_UPDATED; + *pnum = 0; + } else { + if ((tei->flags & TEI_FLAGS_DONTADD) != 0) + return (EFBIG); + + SLIST_INSERT_HEAD(&head[hash], ent, next); + tb->ent_ptr = NULL; + *pnum = 1; + + /* Update counters and check if we need to grow hash */ + cfg->items++; + } + + return (0); +} + +static int +ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_fhash *tb; + + tb = (struct ta_buf_fhash *)ta_buf; + + return (tei_to_fhash_ent(tei, &tb->fe6.e)); +} + +static int +ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, + void *ta_buf, uint32_t *pnum) +{ + struct fhash_cfg *cfg; + struct fhashbhead *head; + struct fhashentry *ent, *tmp; + struct ta_buf_fhash *tb; + uint32_t hash; + size_t sz; + + cfg = (struct fhash_cfg *)ta_state; + tb = (struct 
ta_buf_fhash *)ta_buf; + ent = &tb->fe6.e; + + head = cfg->head; + hash = hash_flow_ent(ent, cfg->size); + + if (tei->subtype == AF_INET) + sz = 2 * sizeof(struct in_addr); + else + sz = 2 * sizeof(struct in6_addr); + + /* Check for existence */ + SLIST_FOREACH(tmp, &head[hash], next) { + if (cmp_flow_ent(tmp, ent, sz) == 0) + continue; + + SLIST_REMOVE(&head[hash], tmp, fhashentry, next); + tei->value = tmp->value; + *pnum = 1; + cfg->items--; + tb->ent_ptr = tmp; + return (0); + } + + return (ENOENT); +} + +static void +ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, + void *ta_buf) +{ + struct ta_buf_fhash *tb; + + tb = (struct ta_buf_fhash *)ta_buf; + + if (tb->ent_ptr != NULL) + free(tb->ent_ptr, M_IPFW_TBL); +} + +/* + * Hash growing callbacks. + */ + +static int +ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, + uint64_t *pflags) +{ + struct fhash_cfg *cfg; + + cfg = (struct fhash_cfg *)ta_state; + + if (cfg->items > cfg->size && cfg->size < 65536) { + *pflags = cfg->size * 2; + return (1); + } + + return (0); +} + +/* + * Allocate new, larger fhash. + */ +static int +ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) +{ + struct mod_item *mi; + struct fhashbhead *head; + int i; + + mi = (struct mod_item *)ta_buf; + + memset(mi, 0, sizeof(struct mod_item)); + mi->size = *pflags; + head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, + M_WAITOK | M_ZERO); + for (i = 0; i < mi->size; i++) + SLIST_INIT(&head[i]); + + mi->main_ptr = head; + + return (0); +} + +/* + * Copy data from old runtime array to new one. + */ +static int +ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t *pflags) +{ + + /* In is not possible to do rehash if we're not holidng WLOCK. */ + return (0); +} + +/* + * Switch old & new arrays. 
+ */ +static void +ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, + uint64_t pflags) +{ + struct mod_item *mi; + struct fhash_cfg *cfg; + struct fhashbhead *old_head, *new_head; + struct fhashentry *ent, *ent_next; + int i; + uint32_t nhash; + size_t old_size; + + mi = (struct mod_item *)ta_buf; + cfg = (struct fhash_cfg *)ta_state; + + old_size = cfg->size; + old_head = ti->state; + + new_head = (struct fhashbhead *)mi->main_ptr; + for (i = 0; i < old_size; i++) { + SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { + nhash = hash_flow_ent(ent, mi->size); + SLIST_INSERT_HEAD(&new_head[nhash], ent, next); + } + } + + ti->state = new_head; + ti->data = mi->size; + cfg->head = new_head; + cfg->size = mi->size; + + mi->main_ptr = old_head; +} + +/* + * Free unneded array. + */ +static void +ta_flush_mod_fhash(void *ta_buf) +{ + struct mod_item *mi; + + mi = (struct mod_item *)ta_buf; + if (mi->main_ptr != NULL) + free(mi->main_ptr, M_IPFW); +} + +struct table_algo flow_hash = { + .name = "flow:hash", + .type = IPFW_TABLE_FLOW, + .flags = TA_FLAG_DEFAULT, + .ta_buf_size = sizeof(struct ta_buf_fhash), + .init = ta_init_fhash, + .destroy = ta_destroy_fhash, + .prepare_add = ta_prepare_add_fhash, + .prepare_del = ta_prepare_del_fhash, + .add = ta_add_fhash, + .del = ta_del_fhash, + .flush_entry = ta_flush_fhash_entry, + .foreach = ta_foreach_fhash, + .dump_tentry = ta_dump_fhash_tentry, + .find_tentry = ta_find_fhash_tentry, + .dump_tinfo = ta_dump_fhash_tinfo, + .need_modify = ta_need_modify_fhash, + .prepare_mod = ta_prepare_mod_fhash, + .fill_mod = ta_fill_mod_fhash, + .modify = ta_modify_fhash, + .flush_mod = ta_flush_mod_fhash, +}; + +/* + * Kernel fibs bindings. 
+ * + * Implementation: + * + * Runtime part: + * - fully relies on route API + * - fib number is stored in ti->data + * + */ + +static struct rtentry *lookup_kfib(void *key, int keylen, int fib); +static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val); +static int kfib_parse_opts(int *pfib, char *data); +static void ta_print_kfib_config(void *ta_state, struct table_info *ti, + char *buf, size_t bufsize); +static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, + struct table_info *ti, char *data, uint8_t tflags); +static void ta_destroy_kfib(void *ta_state, struct table_info *ti); +static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, + ipfw_ta_tinfo *tinfo); +static int contigmask(uint8_t *p, int len); +static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent); +static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent); +static void ta_foreach_kfib(void *ta_state, struct table_info *ti, + ta_foreach_f *f, void *arg); + +static struct rtentry * +lookup_kfib(void *key, int keylen, int fib) +{ + struct sockaddr *s; + + if (keylen == 4) { + struct sockaddr_in sin; + bzero(&sin, sizeof(sin)); + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = *(in_addr_t *)key; + s = (struct sockaddr *)&sin; + } else { + struct sockaddr_in6 sin6; + bzero(&sin6, sizeof(sin6)); + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = *(struct in6_addr *)key; + s = (struct sockaddr *)&sin6; + } + + return (rtalloc1_fib(s, 0, 0, fib)); +} + +static int +ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, + uint32_t *val) +{ + struct rtentry *rte; + + if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) + return (0); + + *val = 0; + RTFREE_LOCKED(rte); + + return (1); +} + +/* Parse 'fib=%d' */ +static int +kfib_parse_opts(int *pfib, char *data) 
+{ + char *pdel, *pend, *s; + int fibnum; + + if (data == NULL) + return (0); + if ((pdel = strchr(data, ' ')) == NULL) + return (0); + while (*pdel == ' ') + pdel++; + if (strncmp(pdel, "fib=", 4) != 0) + return (EINVAL); + if ((s = strchr(pdel, ' ')) != NULL) + *s++ = '\0'; + + pdel += 4; + /* Need \d+ */ + fibnum = strtol(pdel, &pend, 10); + if (*pend != '\0') + return (EINVAL); + + *pfib = fibnum; + + return (0); +} + +static void +ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, + size_t bufsize) +{ + + if (ti->data != 0) + snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); + else + snprintf(buf, bufsize, "%s", "addr:kfib"); +} + +static int +ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, + char *data, uint8_t tflags) +{ + int error, fibnum; + + fibnum = 0; + if ((error = kfib_parse_opts(&fibnum, data)) != 0) + return (error); + + if (fibnum >= rt_numfibs) + return (E2BIG); + + ti->data = fibnum; + ti->lookup = ta_lookup_kfib; + + return (0); +} + +/* + * Destroys table @ti + */ +static void +ta_destroy_kfib(void *ta_state, struct table_info *ti) +{ + +} + +/* + * Provide algo-specific table info + */ +static void +ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) +{ + + tinfo->flags = IPFW_TATFLAGS_AFDATA; + tinfo->taclass4 = IPFW_TACLASS_RADIX; + tinfo->count4 = 0; + tinfo->itemsize4 = sizeof(struct rtentry); + tinfo->taclass6 = IPFW_TACLASS_RADIX; + tinfo->count6 = 0; + tinfo->itemsize6 = sizeof(struct rtentry); +} + +static int +contigmask(uint8_t *p, int len) +{ + int i, n; + + for (i = 0; i < len ; i++) + if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ + break; + for (n= i + 1; n < len; n++) + if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) + return (-1); /* mask not contiguous */ + return (i); +} + + +static int +ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, + ipfw_obj_tentry *tent) +{ + struct rtentry *rte; +#ifdef INET + struct 
sockaddr_in *addr, *mask; +#endif +#ifdef INET6 + struct sockaddr_in6 *addr6, *mask6; +#endif + int len; + + rte = (struct rtentry *)e; + addr = (struct sockaddr_in *)rt_key(rte); + mask = (struct sockaddr_in *)rt_mask(rte); + len = 0; + + /* Guess IPv4/IPv6 radix by sockaddr family */ +#ifdef INET + if (addr->sin_family == AF_INET) { + tent->k.addr.s_addr = addr->sin_addr.s_addr; + len = 32; + if (mask != NULL) + len = contigmask((uint8_t *)&mask->sin_addr, 32); + if (len == -1) + len = 0; + tent->masklen = len; + tent->subtype = AF_INET; + tent->v.kidx = 0; /* Do we need to put GW here? */ + } +#endif +#ifdef INET6 + if (addr->sin_family == AF_INET6) { + addr6 = (struct sockaddr_in6 *)addr; + mask6 = (struct sockaddr_in6 *)mask; + memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr)); + len = 128; + if (mask6 != NULL) + len = contigmask((uint8_t *)&mask6->sin6_addr, 128); + if (len == -1) + len = 0; + tent->masklen = len; + tent->subtype = AF_INET6; + tent->v.kidx = 0; + } +#endif + + return (0); +} + +static int +ta_find_kfib_tentry(void *ta_state, struct table_info *ti, + ipfw_obj_tentry *tent) +{ + struct rtentry *rte; + void *key; + int keylen; + + if (tent->subtype == AF_INET) { + key = &tent->k.addr; + keylen = sizeof(struct in_addr); + } else { + key = &tent->k.addr6; + keylen = sizeof(struct in6_addr); + } + + if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) + return (0); + + if (rte != NULL) { + ta_dump_kfib_tentry(ta_state, ti, rte, tent); + RTFREE_LOCKED(rte); + return (0); + } + + return (ENOENT); +} + +static void +ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, + void *arg) +{ + struct radix_node_head *rnh; + int error; + + rnh = rt_tables_get_rnh(ti->data, AF_INET); + if (rnh != NULL) { + RADIX_NODE_HEAD_RLOCK(rnh); + error = rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); + RADIX_NODE_HEAD_RUNLOCK(rnh); + } + + rnh = rt_tables_get_rnh(ti->data, AF_INET6); + if (rnh != NULL) { + RADIX_NODE_HEAD_RLOCK(rnh); + 
error = rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); + RADIX_NODE_HEAD_RUNLOCK(rnh); + } +} + +struct table_algo addr_kfib = { + .name = "addr:kfib", + .type = IPFW_TABLE_ADDR, + .flags = TA_FLAG_READONLY, + .ta_buf_size = 0, + .init = ta_init_kfib, + .destroy = ta_destroy_kfib, + .foreach = ta_foreach_kfib, + .dump_tentry = ta_dump_kfib_tentry, + .find_tentry = ta_find_kfib_tentry, + .dump_tinfo = ta_dump_kfib_tinfo, + .print_config = ta_print_kfib_config, +}; + +void +ipfw_table_algo_init(struct ip_fw_chain *ch) +{ + size_t sz; + + /* + * Register all algorithms presented here. + */ + sz = sizeof(struct table_algo); + ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); + ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); + ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); + ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); + ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); + ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); +} + +void +ipfw_table_algo_destroy(struct ip_fw_chain *ch) +{ + + ipfw_del_table_algo(ch, addr_radix.idx); + ipfw_del_table_algo(ch, addr_hash.idx); + ipfw_del_table_algo(ch, iface_idx.idx); + ipfw_del_table_algo(ch, number_array.idx); + ipfw_del_table_algo(ch, flow_hash.idx); + ipfw_del_table_algo(ch, addr_kfib.idx); +} + + diff --git a/example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c new file mode 100644 index 0000000..4b7b193 --- /dev/null +++ b/example/ipfw/sys/netpfil/ipfw/ip_fw_table_value.c @@ -0,0 +1,812 @@ +/*- + * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: head/sys/netpfil/ipfw/ip_fw_table_value.c 272940 2014-10-11 15:04:50Z melifaro $"); + +/* + * Multi-field value support for ipfw tables. + * + * This file contains necessary functions to convert + * large multi-field values into u32 indices suitable to be fed + * to various table algorithms. Other machinery like proper refcounting, + * internal structures resizing are also kept here. 
+ */ + +#include "opt_ipfw.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/hash.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/queue.h> +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> + +#include <netpfil/ipfw/ip_fw_private.h> +#include <netpfil/ipfw/ip_fw_table.h> + +static uint32_t hash_table_value(struct namedobj_instance *ni, void *key, + uint32_t kopt); +static int cmp_table_value(struct named_object *no, void *key, uint32_t kopt); + +static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd); + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values }, +}; + +#define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash) + +struct table_val_link +{ + struct named_object no; + struct table_value *pval; /* Pointer to real table value */ +}; +#define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */ + +struct vdump_args { + struct ip_fw_chain *ch; + struct sockopt_data *sd; + struct table_value *pval; + int error; +}; + + +static uint32_t +hash_table_value(struct namedobj_instance *ni, void *key, uint32_t kopt) +{ + + return (hash32_buf(key, 56, 0)); +} + +static int +cmp_table_value(struct named_object *no, void *key, uint32_t kopt) +{ + + return (memcmp(((struct table_val_link *)no)->pval, key, 56)); +} + +static void +mask_table_value(struct table_value *src, struct table_value *dst, + uint32_t mask) +{ +#define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; } + + memset(dst, 0, sizeof(*dst)); + _MCPY(tag, IPFW_VTYPE_TAG); + _MCPY(pipe, IPFW_VTYPE_PIPE); + _MCPY(divert, IPFW_VTYPE_DIVERT); + _MCPY(skipto, IPFW_VTYPE_SKIPTO); + _MCPY(netgraph, IPFW_VTYPE_NETGRAPH); + _MCPY(fib, 
IPFW_VTYPE_FIB); + _MCPY(nat, IPFW_VTYPE_NAT); + _MCPY(dscp, IPFW_VTYPE_DSCP); + _MCPY(nh4, IPFW_VTYPE_NH4); + _MCPY(nh6, IPFW_VTYPE_NH6); +#undef _MCPY +} + +static void +get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared, + struct table_value **ptv, struct namedobj_instance **pvi) +{ + struct table_value *pval; + struct namedobj_instance *vi; + + if (vshared != 0) { + pval = (struct table_value *)ch->valuestate; + vi = CHAIN_TO_VI(ch); + } else { + pval = NULL; + vi = NULL; + //pval = (struct table_value *)&tc->ti.data; + } + + if (ptv != NULL) + *ptv = pval; + if (pvi != NULL) + *pvi = vi; +} + +/* + * Update pointers to real vaues after @pval change. + */ +static void +update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) +{ + struct vdump_args *da; + struct table_val_link *ptv; + struct table_value *pval; + + da = (struct vdump_args *)arg; + ptv = (struct table_val_link *)no; + + pval = da->pval; + ptv->pval = &pval[ptv->no.kidx]; + +} + +/* + * Grows value storage shared among all tables. + * Drops/reacquires UH locks. + * Notifies other running adds on @ch shared storage resize. + * Note function does not guarantee that free space + * will be available after invocation, so one caller needs + * to roll cycle himself. + * + * Returns 0 if case of no errors. 
+ */ +static int +resize_shared_value_storage(struct ip_fw_chain *ch) +{ + struct tables_config *tcfg; + struct namedobj_instance *vi; + struct table_value *pval, *valuestate, *old_valuestate; + void *new_idx; + struct vdump_args da; + int new_blocks; + int val_size, val_size_old; + + IPFW_UH_WLOCK_ASSERT(ch); + + valuestate = NULL; + new_idx = NULL; + + pval = (struct table_value *)ch->valuestate; + vi = CHAIN_TO_VI(ch); + tcfg = CHAIN_TO_TCFG(ch); + + val_size = tcfg->val_size * 2; + + if (val_size == (1 << 30)) + return (ENOSPC); + + IPFW_UH_WUNLOCK(ch); + + valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW, + M_WAITOK | M_ZERO); + ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx, + &new_blocks); + + IPFW_UH_WLOCK(ch); + + /* + * Check if we still need to resize + */ + if (tcfg->val_size >= val_size) + goto done; + + /* Update pointers and notify everyone we're changing @ch */ + pval = (struct table_value *)ch->valuestate; + rollback_toperation_state(ch, ch); + + /* Good. Let's merge */ + memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size); + ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks); + + IPFW_WLOCK(ch); + /* Change pointers */ + old_valuestate = ch->valuestate; + ch->valuestate = valuestate; + valuestate = old_valuestate; + ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks); + + val_size_old = tcfg->val_size; + tcfg->val_size = val_size; + val_size = val_size_old; + IPFW_WUNLOCK(ch); + /* Update pointers to reflect resize */ + memset(&da, 0, sizeof(da)); + da.pval = (struct table_value *)ch->valuestate; + ipfw_objhash_foreach(vi, update_tvalue, &da); + +done: + free(valuestate, M_IPFW); + ipfw_objhash_bitmap_free(new_idx, new_blocks); + + return (0); +} + +/* + * Drops reference for table value with index @kidx, stored in @pval and + * @vi. Frees value if it has no references. 
+ */ +static void +unref_table_value(struct namedobj_instance *vi, struct table_value *pval, + uint32_t kidx) +{ + struct table_val_link *ptvl; + + KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx)); + if (--pval[kidx].refcnt > 0) + return; + + /* Last reference, delete item */ + ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx); + KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx)); + ipfw_objhash_del(vi, &ptvl->no); + ipfw_objhash_free_idx(vi, kidx); + free(ptvl, M_IPFW); +} + +struct flush_args { + struct ip_fw_chain *ch; + struct table_algo *ta; + struct table_info *ti; + void *astate; + ipfw_obj_tentry tent; +}; + +static int +unref_table_value_cb(void *e, void *arg) +{ + struct flush_args *fa; + struct ip_fw_chain *ch; + struct table_algo *ta; + ipfw_obj_tentry *tent; + int error; + + fa = (struct flush_args *)arg; + + ta = fa->ta; + memset(&fa->tent, 0, sizeof(fa->tent)); + tent = &fa->tent; + error = ta->dump_tentry(fa->astate, fa->ti, e, tent); + if (error != 0) + return (error); + + ch = fa->ch; + + unref_table_value(CHAIN_TO_VI(ch), + (struct table_value *)ch->valuestate, tent->v.kidx); + + return (0); +} + +/* + * Drop references for each value used in @tc. + */ +void +ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct table_algo *ta, void *astate, struct table_info *ti) +{ + struct flush_args fa; + + IPFW_UH_WLOCK_ASSERT(ch); + + memset(&fa, 0, sizeof(fa)); + fa.ch = ch; + fa.ta = ta; + fa.astate = astate; + fa.ti = ti; + + ta->foreach(astate, ti, unref_table_value_cb, &fa); +} + +/* + * Table operation state handler. + * Called when we are going to change something in @tc which + * may lead to inconsistencies in on-going table data addition. + * + * Here we rollback all already committed state (table values, currently) + * and set "modified" field to non-zero value to indicate + * that we need to restart original operation. 
+ */ +void +rollback_table_values(struct tableop_state *ts) +{ + struct ip_fw_chain *ch; + struct table_value *pval; + struct tentry_info *ptei; + struct namedobj_instance *vi; + int i; + + ch = ts->ch; + + IPFW_UH_WLOCK_ASSERT(ch); + + /* Get current table value pointer */ + get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); + + for (i = 0; i < ts->count; i++) { + ptei = &ts->tei[i]; + + if (ptei->value == 0) + continue; + + unref_table_value(vi, pval, ptei->value); + } +} + +/* + * Allocate new value index in either shared or per-table array. + * Function may drop/reacquire UH lock. + * + * Returns 0 on success. + */ +static int +alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts, + struct namedobj_instance *vi, uint16_t *pvidx) +{ + int error, vlimit; + uint16_t vidx; + + IPFW_UH_WLOCK_ASSERT(ch); + + error = ipfw_objhash_alloc_idx(vi, &vidx); + if (error != 0) { + + /* + * We need to resize array. This involves + * lock/unlock, so we need to check "modified" + * state. + */ + ts->opstate.func(ts->tc, &ts->opstate); + error = resize_shared_value_storage(ch); + return (error); /* ts->modified should be set, we will restart */ + } + + vlimit = ts->ta->vlimit; + if (vlimit != 0 && vidx >= vlimit) { + + /* + * Algorithm is not able to store given index. + * We have to rollback state, start using + * per-table value array or return error + * if we're already using it. + * + * TODO: do not rollback state if + * atomicity is not required. + */ + if (ts->vshared != 0) { + /* shared -> per-table */ + return (ENOSPC); /* TODO: proper error */ + } + + /* per-table. Fail for now. */ + return (ENOSPC); /* TODO: proper error */ + } + + *pvidx = vidx; + return (0); +} + +/* + * Drops value reference for unused values (updates, deletes, partially + * successful adds or rollbacks). 
+ */ +void +ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, + struct tentry_info *tei, uint32_t count, int rollback) +{ + int i; + struct tentry_info *ptei; + struct table_value *pval; + struct namedobj_instance *vi; + + /* + * We have two slightly different ADD cases here: + * either (1) we are successful / partially successful, + * in that case we need + * * to ignore ADDED entries values + * * rollback every other values (either UPDATED since + * old value has been stored there, or some failure like + * EXISTS or LIMIT or simply "ignored" case. + * + * (2): atomic rollback of partially successful operation + * in that case we simply need to unref all entries. + * + * DELETE case is simpler: no atomic support there, so + * we simply unref all non-zero values. + */ + + /* + * Get current table value pointers. + * XXX: Properly read vshared + */ + get_value_ptrs(ch, tc, 1, &pval, &vi); + + for (i = 0; i < count; i++) { + ptei = &tei[i]; + + if (ptei->value == 0) { + + /* + * We may be deleting non-existing record. + * Skip. + */ + continue; + } + + if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) { + ptei->value = 0; + continue; + } + + unref_table_value(vi, pval, ptei->value); + ptei->value = 0; + } +} + +/* + * Main function used to link values of entries going to be added, + * to the index. Since we may perform many UH locks drops/acquires, + * handle changes by checking tablestate "modified" field. + * + * Success: return 0. + */ +int +ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts) +{ + int error, i, found; + struct namedobj_instance *vi; + struct table_config *tc; + struct tentry_info *tei, *ptei; + uint32_t count, vlimit; + uint16_t vidx; + struct table_val_link *ptv; + struct table_value tval, *pval; + + /* + * Stage 1: reference all existing values and + * save their indices. 
+ */ + IPFW_UH_WLOCK_ASSERT(ch); + get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); + + error = 0; + found = 0; + vlimit = ts->ta->vlimit; + vidx = 0; + tc = ts->tc; + tei = ts->tei; + count = ts->count; + for (i = 0; i < count; i++) { + ptei = &tei[i]; + ptei->value = 0; /* Ensure value is always 0 in the beginnig */ + mask_table_value(ptei->pvalue, &tval, ts->vmask); + ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0, + (char *)&tval); + if (ptv == NULL) + continue; + /* Deal with vlimit later */ + if (vlimit > 0 && vlimit <= ptv->no.kidx) + continue; + + /* Value found. Bump refcount */ + ptv->pval->refcnt++; + ptei->value = ptv->no.kidx; + found++; + } + + if (ts->count == found) { + /* We've found all values , no need ts create new ones */ + return (0); + } + + /* + * we have added some state here, let's attach operation + * state ts the list ts be able ts rollback if necessary. + */ + add_toperation_state(ch, ts); + /* Ensure table won't disappear */ + tc_ref(tc); + IPFW_UH_WUNLOCK(ch); + + /* + * Stage 2: allocate objects for non-existing values. + */ + for (i = 0; i < count; i++) { + ptei = &tei[i]; + if (ptei->value != 0) + continue; + if (ptei->ptv != NULL) + continue; + ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW, + M_WAITOK | M_ZERO); + } + + /* + * Stage 3: allocate index numbers for new values + * and link them to index. + */ + IPFW_UH_WLOCK(ch); + tc_unref(tc); + del_toperation_state(ch, ts); + if (ts->modified != 0) { + + /* + * In general, we should free all state/indexes here + * and return. However, we keep allocated state instead + * to ensure we achieve some progress on each restart. + */ + return (0); + } + + KASSERT(pval == ch->valuestate, ("resize_storage() notify failure")); + + /* Let's try to link values */ + for (i = 0; i < count; i++) { + ptei = &tei[i]; + if (ptei->value != 0) { + + /* + * We may be here after several process restarts, + * so we need to update all fields that might + * have changed. 
+ */ + ptv = (struct table_val_link *)ptei->ptv; + ptv->pval = &pval[i]; + continue; + } + + /* Check if record has appeared */ + mask_table_value(ptei->pvalue, &tval, ts->vmask); + ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0, + (char *)&tval); + if (ptv != NULL) { + ptv->pval->refcnt++; + ptei->value = ptv->no.kidx; + continue; + } + + /* May perform UH unlock/lock */ + error = alloc_table_vidx(ch, ts, vi, &vidx); + if (error != 0) { + ts->opstate.func(ts->tc, &ts->opstate); + return (error); + } + /* value storage resize has happened, return */ + if (ts->modified != 0) + return (0); + + /* Finally, we have allocated valid index, let's add entry */ + ptei->value = vidx; + ptv = (struct table_val_link *)ptei->ptv; + ptei->ptv = NULL; + + ptv->no.kidx = vidx; + ptv->no.name = (char *)&pval[vidx]; + ptv->pval = &pval[vidx]; + memcpy(ptv->pval, &tval, sizeof(struct table_value)); + pval[vidx].refcnt = 1; + ipfw_objhash_add(vi, &ptv->no); + } + + return (0); +} + +/* + * Compability function used to import data from old + * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes. + */ +void +ipfw_import_table_value_legacy(uint32_t value, struct table_value *v) +{ + + memset(v, 0, sizeof(*v)); + v->tag = value; + v->pipe = value; + v->divert = value; + v->skipto = value; + v->netgraph = value; + v->fib = value; + v->nat = value; + v->nh4 = value; /* host format */ + v->dscp = value; + v->limit = value; +} + +/* + * Export data to legacy table dumps opcodes. + */ +uint32_t +ipfw_export_table_value_legacy(struct table_value *v) +{ + + /* + * TODO: provide more compatibility depending on + * vmask value. + */ + return (v->tag); +} + +/* + * Imports table value from current userland format. + * Saves value in kernel format to the same place. 
+ */ +void +ipfw_import_table_value_v1(ipfw_table_value *iv) +{ + struct table_value v; + + memset(&v, 0, sizeof(v)); + v.tag = iv->tag; + v.pipe = iv->pipe; + v.divert = iv->divert; + v.skipto = iv->skipto; + v.netgraph = iv->netgraph; + v.fib = iv->fib; + v.nat = iv->nat; + v.dscp = iv->dscp; + v.nh4 = iv->nh4; + v.nh6 = iv->nh6; + v.limit = iv->limit; + + memcpy(iv, &v, sizeof(ipfw_table_value)); +} + +/* + * Export real table value @v to current userland format. + * Note that @v and @piv may point to the same memory. + */ +void +ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv) +{ + ipfw_table_value iv; + + memset(&iv, 0, sizeof(iv)); + iv.tag = v->tag; + iv.pipe = v->pipe; + iv.divert = v->divert; + iv.skipto = v->skipto; + iv.netgraph = v->netgraph; + iv.fib = v->fib; + iv.nat = v->nat; + iv.dscp = v->dscp; + iv.limit = v->limit; + iv.nh4 = v->nh4; + iv.nh6 = v->nh6; + + memcpy(piv, &iv, sizeof(iv)); +} + +/* + * Exports real value data into ipfw_table_value structure. + * Utilizes "spare1" field to store kernel index. 
+ */ +static void +dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) +{ + struct vdump_args *da; + struct table_val_link *ptv; + struct table_value *v; + + da = (struct vdump_args *)arg; + ptv = (struct table_val_link *)no; + + v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v)); + /* Out of memory, returning */ + if (v == NULL) { + da->error = ENOMEM; + return; + } + + memcpy(v, ptv->pval, sizeof(*v)); + v->spare1 = ptv->no.kidx; +} + +/* + * Dumps all shared/table value data + * Data layout (v1)(current): + * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size + * Reply: [ ipfw_obj_lheader ipfw_table_value x N ] + * + * Returns 0 on success + */ +static int +list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + struct _ipfw_obj_lheader *olh; + struct namedobj_instance *vi; + struct vdump_args da; + uint32_t count, size; + + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + if (olh == NULL) + return (EINVAL); + if (sd->valsize < olh->size) + return (EINVAL); + + IPFW_UH_RLOCK(ch); + vi = CHAIN_TO_VI(ch); + + count = ipfw_objhash_count(vi); + size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader); + + /* Fill in header regadless of buffer size */ + olh->count = count; + olh->objsize = sizeof(ipfw_table_value); + + if (size > olh->size) { + olh->size = size; + IPFW_UH_RUNLOCK(ch); + return (ENOMEM); + } + olh->size = size; + + /* + * Do the actual value dump + */ + memset(&da, 0, sizeof(da)); + da.ch = ch; + da.sd = sd; + ipfw_objhash_foreach(vi, dump_tvalue, &da); + + IPFW_UH_RUNLOCK(ch); + + return (0); +} + +void +ipfw_table_value_init(struct ip_fw_chain *ch, int first) +{ + struct tables_config *tcfg; + + ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value), + M_IPFW, M_WAITOK | M_ZERO); + + tcfg = ch->tblcfg; + + tcfg->val_size = VALDATA_START_SIZE; + tcfg->valhash = ipfw_objhash_create(tcfg->val_size); + 
ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value, + cmp_table_value); + + IPFW_ADD_SOPT_HANDLER(first, scodes); +} + +static void +destroy_value(struct namedobj_instance *ni, struct named_object *no, + void *arg) +{ + + free(no, M_IPFW); +} + +void +ipfw_table_value_destroy(struct ip_fw_chain *ch, int last) +{ + + IPFW_DEL_SOPT_HANDLER(last, scodes); + + free(ch->valuestate, M_IPFW); + ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch); + ipfw_objhash_destroy(CHAIN_TO_VI(ch)); +} + diff --git a/example/ipfw/sys/sys/fnv_hash.h b/example/ipfw/sys/sys/fnv_hash.h new file mode 100644 index 0000000..f070e6e --- /dev/null +++ b/example/ipfw/sys/sys/fnv_hash.h @@ -0,0 +1,71 @@ +/*- + * Fowler / Noll / Vo Hash (FNV Hash) + * http://www.isthe.com/chongo/tech/comp/fnv/ + * + * This is an implementation of the algorithms posted above. + * This file is placed in the public domain by Peter Wemm. + * + * $FreeBSD: head/sys/sys/fnv_hash.h 268351 2014-07-07 00:27:09Z marcel $ + */ +#ifndef _SYS_FNV_HASH_H_ +#define _SYS_FNV_HASH_H_ + +typedef u_int32_t Fnv32_t; +typedef u_int64_t Fnv64_t; + +#define FNV1_32_INIT ((Fnv32_t) 33554467UL) +#define FNV1_64_INIT ((Fnv64_t) 0xcbf29ce484222325ULL) + +#define FNV_32_PRIME ((Fnv32_t) 0x01000193UL) +#define FNV_64_PRIME ((Fnv64_t) 0x100000001b3ULL) + +static __inline Fnv32_t +fnv_32_buf(const void *buf, size_t len, Fnv32_t hval) +{ + const u_int8_t *s = (const u_int8_t *)buf; + + while (len-- != 0) { + hval *= FNV_32_PRIME; + hval ^= *s++; + } + return hval; +} + +static __inline Fnv32_t +fnv_32_str(const char *str, Fnv32_t hval) +{ + const u_int8_t *s = (const u_int8_t *)str; + Fnv32_t c; + + while ((c = *s++) != 0) { + hval *= FNV_32_PRIME; + hval ^= c; + } + return hval; +} + +static __inline Fnv64_t +fnv_64_buf(const void *buf, size_t len, Fnv64_t hval) +{ + const u_int8_t *s = (const u_int8_t *)buf; + + while (len-- != 0) { + hval *= FNV_64_PRIME; + hval ^= *s++; + } + return hval; +} + +static __inline Fnv64_t 
+fnv_64_str(const char *str, Fnv64_t hval) +{ + const u_int8_t *s = (const u_int8_t *)str; + u_register_t c; /* 32 bit on i386, 64 bit on alpha */ + + while ((c = *s++) != 0) { + hval *= FNV_64_PRIME; + hval ^= c; + } + return hval; +} +#endif /* _SYS_FNV_HASH_H_ */ diff --git a/example/ipfw/sys/sys/hash.h b/example/ipfw/sys/sys/hash.h new file mode 100644 index 0000000..bd8fa69 --- /dev/null +++ b/example/ipfw/sys/sys/hash.h @@ -0,0 +1,133 @@ +/*- + * Copyright (c) 2001 Tobias Weingartner + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * $OpenBSD: hash.h,v 1.4 2004/05/25 18:37:23 jmc Exp $
 * $FreeBSD: head/sys/sys/hash.h 272906 2014-10-10 19:26:26Z gnn $
 */

#ifndef _SYS_HASH_H_
#define _SYS_HASH_H_
#include <sys/types.h>

/*
 * Convenience: a default seed and the per-byte mixing step, which
 * computes x * 33 + c.  Every macro argument is parenthesized so that
 * any expression (e.g. "a | b") can be passed as the running hash;
 * note that x is still evaluated twice.
 */
#ifndef HASHINIT
#define HASHINIT	5381
#define HASHSTEP(x,c)	((((x) << 5) + (x)) + (c))
#endif

/*
 * Return a 32-bit hash of the len bytes at buf.  The hash argument
 * should be 0, or a previous hash value in order to extend that hash.
 * With len == 0 the hash argument is returned unchanged.
 */
static __inline uint32_t
hash32_buf(const void *buf, size_t len, uint32_t hash)
{
	const unsigned char *p = buf;

	while (len--)
		hash = HASHSTEP(hash, *p++);

	return hash;
}

/*
 * Return a 32-bit hash of the NUL-terminated string at buf.  The
 * terminator itself is not hashed.
 */
static __inline uint32_t
hash32_str(const void *buf, uint32_t hash)
{
	const unsigned char *p = buf;

	while (*p)
		hash = HASHSTEP(hash, *p++);

	return hash;
}

/*
 * Return a 32-bit hash of the string at buf, hashing at most len bytes
 * and stopping early at a NUL.  buf only needs len readable bytes; it
 * does not have to be NUL-terminated.
 */
static __inline uint32_t
hash32_strn(const void *buf, size_t len, uint32_t hash)
{
	const unsigned char *p = buf;

	/* Check the remaining budget first so buf[len] is never read. */
	while (len != 0 && *p != '\0') {
		hash = HASHSTEP(hash, *p++);
		len--;
	}

	return hash;
}

/*
 * Return a 32-bit hash of the string at buf, which is terminated by
 * the byte value end (as well as by 0).  If ep is not NULL, *ep is set
 * to the byte at which hashing stopped.  This is mainly here as a
 * helper for the namei() hashing of path name parts.
 */
static __inline uint32_t
hash32_stre(const void *buf, int end, const char **ep, uint32_t hash)
{
	const unsigned char *p = buf;

	while (*p && (*p != end))
		hash = HASHSTEP(hash, *p++);

	if (ep)
		*ep = (const char *)p;

	return hash;
}

/*
 * Return a 32-bit hash of the string at buf, hashing at most len bytes
 * and stopping early at the byte value end or at 0.  If ep is not
 * NULL, *ep is set to the position at which hashing stopped (buf + len
 * when the length limit was reached).  This is mainly here as a helper
 * for the namei() hashing of path name parts.
 */
static __inline uint32_t
hash32_strne(const void *buf, size_t len, int end, const char **ep,
    uint32_t hash)
{
	const unsigned char *p = buf;

	/* Check the remaining budget first so buf[len] is never read. */
	while (len != 0 && *p != '\0' && *p != end) {
		hash = HASHSTEP(hash, *p++);
		len--;
	}

	if (ep)
		*ep = (const char *)p;

	return hash;
}

#ifdef _KERNEL
/*
 * Hashing function from Bob Jenkins. Implementation in libkern/jenkins_hash.c.
 */
uint32_t jenkins_hash(const void *, size_t, uint32_t);
uint32_t jenkins_hash32(const uint32_t *, size_t, uint32_t);

uint32_t murmur3_aligned_32(const void *data, size_t len, uint32_t seed);

#endif /* _KERNEL */

#endif /* !_SYS_HASH_H_ */