Index: amd64/conf/VIMAGE =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- amd64/conf/VIMAGE Mon Aug 25 00:28:59 2008 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC + ident VIMAGE + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: amd64/conf/VIMAGE_NODEBUG =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- amd64/conf/VIMAGE_NODEBUG Mon Aug 25 00:28:59 2008 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC_NODEBUG + ident VIMAGE_NODEBUG + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: amd64/conf/VLINT =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- amd64/conf/VLINT Mon Aug 25 00:29:00 2008 *************** *** 0 **** --- 1,15 ---- + # + # VLINT = LINT + options vimage + nooptions SCTP + # + # $FreeBSD$ + # + include LINT + ident VLINT + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. 
+ # + nooptions SCTP Index: compat/linprocfs/linprocfs.c =========================================================================== --- compat/linprocfs/linprocfs.c 2008/08/25 00:28:58 #8 +++ compat/linprocfs/linprocfs.c 2008/08/25 00:28:58 @@ -44,6 +44,8 @@ #include __FBSDID("$FreeBSD: src/sys/compat/linprocfs/linprocfs.c,v 1.121 2008/08/25 04:55:29 julian Exp $"); +#include "opt_compat.h" + #include #include #include @@ -75,6 +77,7 @@ #include #include +#include #include #include Index: compat/linux/linux_ioctl.c =========================================================================== --- compat/linux/linux_ioctl.c 2008/08/25 00:28:58 #11 +++ compat/linux/linux_ioctl.c 2008/08/25 00:28:58 @@ -31,6 +31,8 @@ #include __FBSDID("$FreeBSD: src/sys/compat/linux/linux_ioctl.c,v 1.143 2008/08/25 04:55:29 julian Exp $"); +#include "opt_compat.h" + #include #include #include @@ -60,6 +62,7 @@ #include #include +#include #include #include #include Index: compat/linux/linux_socket.c =========================================================================== --- compat/linux/linux_socket.c 2008/08/25 00:28:58 #5 +++ compat/linux/linux_socket.c 2008/08/25 00:28:58 @@ -52,9 +52,12 @@ #include #include +#include + #include #include #include +#include #ifdef INET6 #include #include @@ -546,6 +549,9 @@ static int linux_socket(struct thread *td, struct linux_socket_args *args) { +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif struct linux_socket_args linux_args; struct socket_args /* { int domain; @@ -584,13 +590,7 @@ * for Linux apps if the sysctl value is set to 1. */ if (bsd_args.domain == PF_INET6 && retval_socket >= 0 -#ifndef KLD_MODULE - /* - * XXX: Avoid undefined symbol error with an IPv4 only - * kernel. 
- */ && V_ip6_v6only -#endif ) { int v6only; Index: compat/svr4/svr4_sockio.c =========================================================================== --- compat/svr4/svr4_sockio.c 2008/08/25 00:28:58 #7 +++ compat/svr4/svr4_sockio.c 2008/08/25 00:28:58 @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -88,6 +89,7 @@ switch (cmd) { case SVR4_SIOCGIFNUM: { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; int ifnum = 0; Index: compat/svr4/svr4_stat.c =========================================================================== --- compat/svr4/svr4_stat.c 2008/08/25 00:28:58 #6 +++ compat/svr4/svr4_stat.c 2008/08/25 00:28:58 @@ -412,6 +412,8 @@ struct thread *td; struct svr4_sys_systeminfo_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); + char *str = NULL; int error = 0; register_t *retval = td->td_retval; Index: conf/files =========================================================================== --- conf/files 2008/08/25 00:28:58 #10 +++ conf/files 2008/08/25 00:28:58 @@ -1602,6 +1602,7 @@ kern/kern_timeout.c standard kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vimage.c optional vimage kern/kern_xxx.c standard kern/link_elf.c standard kern/linker_if.m standard @@ -1898,6 +1899,7 @@ netgraph/ng_nat.c optional netgraph_nat netgraph/ng_one2many.c optional netgraph_one2many netgraph/ng_parse.c optional netgraph +netgraph/ng_pipe.c optional netgraph_pipe netgraph/ng_ppp.c optional netgraph_ppp netgraph/ng_pppoe.c optional netgraph_pppoe netgraph/ng_pptpgre.c optional netgraph_pptpgre @@ -1911,6 +1913,7 @@ netgraph/ng_tee.c optional netgraph_tee netgraph/ng_tty.c optional netgraph_tty netgraph/ng_vjc.c optional netgraph_vjc +netgraph/ng_wormhole.c optional netgraph_wormhole vimage netinet/accf_data.c optional accept_filter_data netinet/accf_dns.c optional accept_filter_dns netinet/accf_http.c optional accept_filter_http Index: conf/options =========================================================================== --- 
conf/options 2008/08/25 00:28:58 #8 +++ conf/options 2008/08/25 00:28:58 @@ -467,6 +467,7 @@ NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h +NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h @@ -481,6 +482,7 @@ NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h +NETGRAPH_WORMHOLE opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h Index: contrib/altq/altq/altq_subr.c =========================================================================== --- contrib/altq/altq/altq_subr.c 2008/08/25 00:28:58 #5 +++ contrib/altq/altq/altq_subr.c 2008/08/25 00:28:58 @@ -29,12 +29,10 @@ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" -#if (__FreeBSD__ != 2) -#include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" +#include "opt_inet.h" #endif -#endif #endif /* __FreeBSD__ || __NetBSD__ */ #include @@ -49,10 +47,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -461,6 +461,8 @@ #if defined(__FreeBSD__) && (__FreeBSD_version >= 500000) IFNET_RLOCK(); #endif + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(curvnet); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { /* read from if_snd unlocked */ if (!TBR_IS_ENABLED(&ifp->if_snd)) @@ -469,6 +471,7 @@ if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) (*ifp->if_start)(ifp); } + VNET_ITERLOOP_END() #if defined(__FreeBSD__) && (__FreeBSD_version >= 500000) IFNET_RUNLOCK(); #endif Index: contrib/ipfilter/netinet/ip_fil_freebsd.c =========================================================================== --- contrib/ipfilter/netinet/ip_fil_freebsd.c 2008/08/25 00:28:58 #8 +++ contrib/ipfilter/netinet/ip_fil_freebsd.c 2008/08/25 00:28:58 @@ -25,6 +25,8 @@ # include "opt_random_ip_id.h" #endif #include +#if __FreeBSD_version > 800000 +#endif #if defined(__FreeBSD__) && 
!defined(__FreeBSD_version) # if defined(IPFILTER_LKM) # ifndef __FreeBSD_cc_version @@ -121,9 +123,15 @@ #endif extern int ip_optcopy __P((struct ip *, struct ip *)); +#if __FreeBSD_version > 800000 +# include +# include +#endif #if (__FreeBSD_version > 460000) +#ifndef VIMAGE extern int path_mtu_discovery; #endif +#endif # ifdef IPFILTER_M_IPFILTER MALLOC_DEFINE(M_IPFILTER, "ipfilter", "IP Filter packet filter data structures"); @@ -239,8 +247,10 @@ bzero((char *)frcache, sizeof(frcache)); fr_running = 1; - if (fr_control_forwarding & 1) + if (fr_control_forwarding & 1) { + INIT_VNET_INET(curvnet); V_ipforwarding = 1; + } SPL_X(s); #if (__FreeBSD_version >= 300000) @@ -262,8 +272,10 @@ #ifdef USE_SPL int s; #endif - if (fr_control_forwarding & 2) + if (fr_control_forwarding & 2) { + INIT_VNET_INET(curvnet); V_ipforwarding = 0; + } SPL_NET(s); @@ -641,6 +653,7 @@ fr_info_t *fin; mb_t *m, **mpp; { + INIT_VNET_INET(curvnet); fr_info_t fnew; ip_t *ip, *oip; int hlen; Index: contrib/pf/net/pf.c =========================================================================== --- contrib/pf/net/pf.c 2008/08/25 00:28:58 #7 +++ contrib/pf/net/pf.c 2008/08/25 00:28:58 @@ -120,6 +120,7 @@ #include #include #include +#include #ifndef __FreeBSD__ #include @@ -139,6 +140,7 @@ #ifdef __FreeBSD__ #include #include +#include #endif #endif /* INET6 */ @@ -1759,6 +1761,7 @@ u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp) { + INIT_VNET_INET(curvnet); struct mbuf *m; int len, tlen; #ifdef INET @@ -2922,6 +2925,7 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) #endif { + INIT_VNET_INET(curvnet); struct pf_addr *saddr, *daddr; u_int16_t sport, dport; #ifdef __FreeBSD__ @@ -3101,6 +3105,7 @@ u_int16_t pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { + INIT_VNET_INET(curvnet); int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; @@ -3140,6 +3145,7 @@ pf_calc_mss(struct pf_addr *addr, 
sa_family_t af, u_int16_t offer) { #ifdef INET + INIT_VNET_INET(curvnet); struct sockaddr_in *dst; struct route ro; #endif /* INET */ @@ -3242,6 +3248,7 @@ struct ifqueue *ifq) #endif { + INIT_VNET_INET(curvnet); struct pf_rule *nr = NULL; struct pf_addr *saddr = pd->src, *daddr = pd->dst; struct tcphdr *th = pd->hdr.tcp; @@ -6096,6 +6103,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd) { + INIT_VNET_INET(curvnet); struct mbuf *m0, *m1; struct route iproute; struct route *ro = NULL; @@ -6633,18 +6641,30 @@ if (sum) { switch (p) { case IPPROTO_TCP: + { + INIT_VNET_INET(curvnet); V_tcpstat.tcps_rcvbadsum++; break; + } case IPPROTO_UDP: + { + INIT_VNET_INET(curvnet); V_udpstat.udps_badsum++; break; + } case IPPROTO_ICMP: + { + INIT_VNET_INET(curvnet); V_icmpstat.icps_checksum++; break; + } #ifdef INET6 case IPPROTO_ICMPV6: + { + INIT_VNET_INET6(curvnet); V_icmp6stat.icp6s_checksum++; break; + } #endif /* INET6 */ } return (1); Index: contrib/pf/net/pf_if.c =========================================================================== --- contrib/pf/net/pf_if.c 2008/08/25 00:28:58 #5 +++ contrib/pf/net/pf_if.c 2008/08/25 00:28:58 @@ -58,6 +58,7 @@ #include #include +#include #include #include @@ -110,8 +111,10 @@ void pfi_detach_group_event(void * __unused, struct ifg_group *); void pfi_ifaddr_event(void * __unused, struct ifnet *); +#ifndef VIMAGE extern struct ifgrouphead ifg_head; #endif +#endif RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); @@ -122,6 +125,7 @@ void pfi_initialize(void) { + INIT_VNET_NET(curvnet); if (pfi_all != NULL) /* already initialized */ return; Index: contrib/pf/net/pf_ioctl.c =========================================================================== --- contrib/pf/net/pf_ioctl.c 2008/08/25 00:28:58 #7 +++ contrib/pf/net/pf_ioctl.c 2008/08/25 00:28:58 @@ -97,11 +97,14 @@ #ifndef __FreeBSD__ #include 
#include +#else +#include #endif #include #include #include +#include #include #include @@ -3704,6 +3707,7 @@ pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { + INIT_VNET_NET(curvnet); /* * IPv6 is not affected by ip_len/ip_off byte order changes. */ Index: contrib/pf/net/pf_subr.c =========================================================================== --- contrib/pf/net/pf_subr.c 2008/08/25 00:28:58 #4 +++ contrib/pf/net/pf_subr.c 2008/08/25 00:28:58 @@ -65,6 +65,8 @@ #include #include #include +#include + #include /* @@ -115,10 +117,12 @@ #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) +#ifndef VIMAGE static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset; static MD5_CTX isn_ctx; +#endif u_int32_t pf_new_isn(struct pf_state *s) Index: contrib/pf/net/pfvar.h =========================================================================== --- contrib/pf/net/pfvar.h 2008/08/25 00:28:58 #2 +++ contrib/pf/net/pfvar.h 2008/08/25 00:28:58 @@ -1856,6 +1856,15 @@ pf_osfp_validate(void); /* + * Stack virtualization support. + */ +#ifdef VIMAGE +struct vnet_pf { + struct vnet *parent_vnet; +}; +#endif + +/* * Symbol translation macros */ #define INIT_VNET_PF(vnet) \ Index: ddb/db_command.c =========================================================================== --- ddb/db_command.c 2008/08/25 00:28:58 #1 +++ ddb/db_command.c 2008/08/25 00:28:58 @@ -270,24 +270,40 @@ return (result); } +/* + * Print out a sorted command table. 
+ */ static void db_cmd_list(table) struct command_table *table; { - register struct command *cmd; - register struct command **aux_cmdp; + struct command *cmd; + struct command **aux_cmdp; + char *last; + char *next = ""; - for (cmd = table->table; cmd->name != 0; cmd++) { - db_printf("%-12s", cmd->name); - db_end_line(12); - } - if (table->aux_tablep == NULL) - return; - for (aux_cmdp = table->aux_tablep; aux_cmdp < table->aux_tablep_end; - aux_cmdp++) { - db_printf("%-12s", (*aux_cmdp)->name); - db_end_line(12); - } + do { + last = next; + for (cmd = table->table; cmd->name != 0; cmd++) { + if (strcmp(cmd->name, last) > 0 && + (last == next || strcmp(cmd->name, next) < 0)) + next = cmd->name; + } + if (table->aux_tablep != NULL) { + for (aux_cmdp = table->aux_tablep; + aux_cmdp < table->aux_tablep_end; aux_cmdp++) { + cmd = *aux_cmdp; + if (strcmp(cmd->name, last) > 0 && + (last == next || + strcmp(cmd->name, next) < 0)) + next = cmd->name; + } + } + if (next != last) { + db_printf("%-12s", next); + db_end_line(12); + } + } while (next != last); } static void Index: ddb/db_textdump.c =========================================================================== --- ddb/db_textdump.c 2008/08/25 00:28:58 #6 +++ ddb/db_textdump.c 2008/08/25 00:28:58 @@ -184,7 +184,6 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { - INIT_VPROCG(TD_TO_VPROCG(&thread0)); bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, TEXTDUMPMAGIC, sizeof(kdh->magic)); Index: dev/cxgb/ulp/iw_cxgb/iw_cxgb.c =========================================================================== --- dev/cxgb/ulp/iw_cxgb/iw_cxgb.c 2008/08/25 00:28:58 #1 +++ dev/cxgb/ulp/iw_cxgb/iw_cxgb.c 2008/08/25 00:28:58 @@ -53,7 +53,9 @@ #include #include #include +#include +#include #include #include @@ -237,9 +239,12 @@ /* Register existing TOE interfaces by walking the ifnet chain */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + VNET_ITERLOOP_BEGIN(); + 
INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { (void)ifaddr_event_handler(NULL, ifp); } + VNET_ITERLOOP_END(); IFNET_RUNLOCK(); return 0; } Index: dev/cxgb/ulp/tom/cxgb_cpl_io.c =========================================================================== --- dev/cxgb/ulp/tom/cxgb_cpl_io.c 2008/08/25 00:28:58 #7 +++ dev/cxgb/ulp/tom/cxgb_cpl_io.c 2008/08/25 00:28:58 @@ -53,6 +53,7 @@ #include #include +#include #include #include #include @@ -145,10 +146,6 @@ #define TCP_CLOSE 2 #define TCP_DROP 3 -extern int tcp_do_autorcvbuf; -extern int tcp_do_autosndbuf; -extern int tcp_autorcvbuf_max; -extern int tcp_autosndbuf_max; static void t3_send_reset(struct toepcb *toep); static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); @@ -267,6 +264,7 @@ static inline void make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) { + INIT_VNET_INET(so->so_vnet); struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct tx_data_wr *req; @@ -1228,6 +1226,7 @@ static unsigned long select_rcv_wnd(struct toedev *dev, struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct tom_data *d = TOM_DATA(dev); unsigned int wnd; unsigned int max_rcv_wnd; @@ -3773,6 +3772,7 @@ static void socket_act_establish(struct socket *so, struct mbuf *m) { + INIT_VNET_INET(so->so_vnet); struct cpl_act_establish *req = cplhdr(m); u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ struct tcpcb *tp = so_sototcpcb(so); Index: i386/conf/.cvsignore =========================================================================== --- i386/conf/.cvsignore 2008/08/25 00:28:58 #1 +++ i386/conf/.cvsignore 2008/08/25 00:28:58 @@ -1,1 +1,0 @@ -[A-Za-z0-9]* Index: i386/conf/VIMAGE =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- i386/conf/VIMAGE Mon Aug 25 00:29:00 2008 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a 
virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC + ident VIMAGE + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: i386/conf/VIMAGE_NODEBUG =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- i386/conf/VIMAGE_NODEBUG Mon Aug 25 00:29:01 2008 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC_NODEBUG + ident VIMAGE_NODEBUG + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: i386/conf/VLINT =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- i386/conf/VLINT Mon Aug 25 00:29:01 2008 *************** *** 0 **** --- 1,15 ---- + # + # VLINT = LINT + options vimage + nooptions SCTP + # + # $FreeBSD$ + # + include LINT + ident VLINT + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. 
+ # + nooptions SCTP Index: i386/ibcs2/ibcs2_socksys.c =========================================================================== --- i386/ibcs2/ibcs2_socksys.c 2008/08/25 00:28:58 #7 +++ i386/ibcs2/ibcs2_socksys.c 2008/08/25 00:28:58 @@ -174,6 +174,7 @@ struct thread *td; struct setipdomainname_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); char hname[MAXHOSTNAMELEN], *ptr; int error, sctl[2], hlen; Index: kern/init_main.c =========================================================================== --- kern/init_main.c 2008/08/25 00:28:58 #3 +++ kern/init_main.c 2008/08/25 00:28:58 @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,7 @@ #include #include #include +#include #include @@ -451,6 +453,11 @@ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. */ +#ifdef VIMAGE + P_TO_VIMAGE(p) = LIST_FIRST(&vimage_head); + refcount_acquire(&P_TO_VIMAGE(p)->vi_ucredrefc); + LIST_FIRST(&vprocg_head)->nprocs++; +#endif #ifdef AUDIT audit_cred_kproc0(p->p_ucred); #endif Index: kern/kern_clock.c =========================================================================== --- kern/kern_clock.c 2008/08/25 00:28:58 #1 +++ kern/kern_clock.c 2008/08/25 00:28:58 @@ -65,6 +65,7 @@ #include #include #include +#include #ifdef GPROF #include @@ -223,6 +224,11 @@ int ticks; int psratio; +#ifdef VIMAGE +u_int tot_acc_statcalls; +int last_acc_ticks; +#endif + /* * Initialize clock frequencies and start both clocks running. 
*/ @@ -447,9 +453,11 @@ struct proc *p; long rss; long *cp_time; + int sel; td = curthread; p = td->td_proc; + INIT_VCPU(TD_TO_VCPU(td)); cp_time = (long *)PCPU_PTR(cp_time); if (usermode) { @@ -458,9 +466,9 @@ */ td->td_uticks++; if (p->p_nice > NZERO) - cp_time[CP_NICE]++; + sel = CP_NICE; else - cp_time[CP_USER]++; + sel = CP_USER; } else { /* * Came from kernel mode, so we were: @@ -477,16 +485,53 @@ if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks++; - cp_time[CP_INTR]++; + sel = CP_INTR; } else { td->td_pticks++; td->td_sticks++; if (!TD_IS_IDLETHREAD(td)) - cp_time[CP_SYS]++; + sel = CP_SYS; else - cp_time[CP_IDLE]++; + sel = CP_IDLE; + } + } + cp_time[sel]++; + +#ifdef VIMAGE + if (sel != CP_INTR) + sel = CP_IDLE; + + /* Per-vcpu average accounting */ + mtx_lock_spin(&vcpu_list_mtx); + tot_acc_statcalls++; + if (!TD_IS_IDLETHREAD(td)) + V_acc_statcalls++; + + /* Decay processing every 1/16 seconds */ + if (last_acc_ticks + (hz >> 4) <= ticks) { + u_int weight_fixp; + u_int avg0; + + last_acc_ticks = ticks; + /* + * avg0, avg1 and avg2 are stored in 16.16 fixed point format. + * weight_fixp is in 1.31 format for better accuracy. + * + * avg1 loses half of its value in roughly 150 ms. + * avg2 loses half of its value in roughly 1350 ms. + */ + weight_fixp = 0x80000000 / tot_acc_statcalls; + /* XXX list locking? */ + LIST_FOREACH(vcpu, &vcpu_head, vcpu_le) { + avg0 = (weight_fixp * V_acc_statcalls) >> 15; + V_avg1_fixp = (3 * V_avg1_fixp + avg0) >> 2; + V_avg2_fixp = (31 * V_avg2_fixp + avg0) >> 5; + V_acc_statcalls = 0; } + tot_acc_statcalls = 0; } + mtx_unlock_spin(&vcpu_list_mtx); +#endif /* Update resource usage integrals and maximums.
*/ MPASS(p->p_vmspace != NULL); Index: kern/kern_exit.c =========================================================================== --- kern/kern_exit.c 2008/08/25 00:28:58 #2 +++ kern/kern_exit.c 2008/08/25 00:28:58 @@ -41,6 +41,7 @@ #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_mac.h" +#include "opt_sched.h" #include #include @@ -69,6 +70,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -729,6 +731,7 @@ nfound++; PROC_SLOCK(p); if (p->p_state == PRS_ZOMBIE) { + INIT_VPROCG(P_TO_VPROCG(p)); if (rusage) { *rusage = p->p_ru; calcru(p, &rusage->ru_utime, &rusage->ru_stime); @@ -816,6 +819,9 @@ uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; +#ifdef VIMAGE + vprocg->nprocs--; +#endif sx_xunlock(&allproc_lock); return (0); } Index: kern/kern_fork.c =========================================================================== --- kern/kern_fork.c 2008/08/25 00:28:58 #3 +++ kern/kern_fork.c 2008/08/25 00:28:58 @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -343,6 +344,9 @@ * are hard-limits as to the number of processes that can run. */ nprocs++; +#ifdef VIMAGE + P_TO_VPROCG(p1)->nprocs++; +#endif /* * Find an unused process ID. We remember a range of unused IDs @@ -512,6 +516,11 @@ td2->td_sigmask = td->td_sigmask; td2->td_flags = TDF_INMEM; +#ifdef VIMAGE + td2->td_vnet = NULL; + td2->td_vnet_lpush = NULL; +#endif + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. 
Index: kern/kern_jail.c =========================================================================== --- kern/kern_jail.c 2008/08/25 00:28:58 #9 +++ kern/kern_jail.c 2008/08/25 00:28:58 @@ -35,6 +35,7 @@ #include #include #include + #include #include @@ -455,6 +456,10 @@ if (cred2->cr_prison != cred1->cr_prison) return (ESRCH); } +#ifdef VIMAGE + if (cred2->cr_vimage->v_procg != cred1->cr_vimage->v_procg) + return (ESRCH); +#endif return (0); } Index: kern/kern_linker.c =========================================================================== --- kern/kern_linker.c 2008/08/25 00:28:58 #1 +++ kern/kern_linker.c 2008/08/25 00:28:58 @@ -51,6 +51,9 @@ #include #include #include +#include + +#include #include @@ -957,6 +960,18 @@ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); +#ifdef VIMAGE + /* For now permit only the default vimage to kldload modules */ + if (!IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))) + return (EPERM); + + /* + * It's possible that a kldloaded module will attach a new ifnet, + * so vnet context must be set when this occurs. + */ + CURVNET_SET(TD_TO_VNET(td)); +#endif + /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface @@ -984,6 +999,7 @@ *fileid = lf->id; unlock: KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1021,6 +1037,11 @@ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); + /* XXX should suser catch this for us?
*/ + VNET_ASSERT(IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))); + + CURVNET_SET(TD_TO_VNET(td)); + KLD_LOCK(); lf = linker_find_file_by_id(fileid); if (lf) { @@ -1057,6 +1078,7 @@ PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm); #endif KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1274,12 +1296,28 @@ lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, - sizeof(lookup)); + sizeof(lookup)); break; } } +#ifdef VIMAGE + /* + * If symbol not found in global namespace, look up + * for it in the current vnet. + */ + if (lf == NULL) { + CURVNET_SET(TD_TO_VNET(td)); + error = vi_symlookup(&lookup, symstr); + CURVNET_RESTORE(); + if (error == 0) { + error = copyout(&lookup, uap->data, + sizeof(lookup)); + } + } +#else if (lf == NULL) error = ENOENT; +#endif } KLD_UNLOCK(); out: Index: kern/kern_mib.c =========================================================================== --- kern/kern_mib.c 2008/08/25 00:28:58 #8 +++ kern/kern_mib.c 2008/08/25 00:28:58 @@ -355,6 +355,7 @@ static int sysctl_domainname(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); char tmpdomainname[MAXHOSTNAMELEN]; int error; Index: kern/kern_poll.c =========================================================================== --- kern/kern_poll.c 2008/08/25 00:28:58 #6 +++ kern/kern_poll.c 2008/08/25 00:28:58 @@ -37,9 +37,11 @@ #include #include #include +#include #include /* for IFF_* flags */ #include /* for NETISR_POLL */ +#include /* for ifnet pointer */ #include #include @@ -521,6 +523,7 @@ static int poll_switch(SYSCTL_HANDLER_ARGS) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int error; int val = polling; Index: kern/kern_prot.c =========================================================================== --- kern/kern_prot.c 2008/08/25 00:28:58 #1 +++ kern/kern_prot.c 2008/08/25 00:28:58 @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -1720,6 +1721,9 @@ KASSERT(td == curthread, ("%s: td not 
curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); +#ifdef VIMAGE + if (!vi_child_of(TD_TO_VIMAGE(td), P_TO_VIMAGE(p))) +#endif if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC @@ -1789,6 +1793,10 @@ */ if (jailed(cr)) prison_free(cr->cr_prison); +#ifdef VIMAGE + if (cr->cr_vimage != NULL) + refcount_release(&cr->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_destroy(cr); #endif @@ -1824,6 +1832,10 @@ uihold(dest->cr_ruidinfo); if (jailed(dest)) prison_hold(dest->cr_prison); +#ifdef VIMAGE + KASSERT(src->cr_vimage != NULL, ("cr_vimage == NULL")); + refcount_acquire(&dest->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_copy(src, dest); #endif Index: kern/kern_synch.c =========================================================================== --- kern/kern_synch.c 2008/08/25 00:28:58 #3 +++ kern/kern_synch.c 2008/08/25 00:28:58 @@ -62,9 +62,12 @@ #include #include #endif +#include #include +#include + static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); @@ -496,12 +499,14 @@ int i, nrun; struct loadavg *avg; + VPROCG_ITERLOOP_BEGIN(); nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; + VPROCG_ITERLOOP_END(); /* * Schedule the next update to occur after 5 seconds, but add a Index: kern/kern_sysctl.c =========================================================================== --- kern/kern_sysctl.c 2008/08/25 00:28:58 #1 +++ kern/kern_sysctl.c 2008/08/25 00:28:58 @@ -52,6 +52,7 @@ #include #include #include +#include #include @@ -845,6 +846,32 @@ } +#ifdef VIMAGE +int +sysctl_handle_v_int(SYSCTL_HANDLER_ARGS) +{ + int tmpout, error = 0; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by making a copy of the data. 
+ */ + tmpout = *(int *)arg1; + error = SYSCTL_OUT(req, &tmpout, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} +#endif + + /* * Based on on sysctl_handle_int() convert milliseconds into ticks. */ @@ -979,6 +1006,48 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_string(SYSCTL_HANDLER_ARGS) +{ + int error=0; + char *tmparg; + size_t outlen; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by copying to a + * temporary kernel buffer. + */ +retry: + outlen = strlen((char *)arg1)+1; + tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); + + if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { + free(tmparg, M_SYSCTLTMP); + goto retry; + } + + error = SYSCTL_OUT(req, tmparg, outlen); + free(tmparg, M_SYSCTLTMP); + + if (error || !req->newptr) + return (error); + + if ((req->newlen - req->newidx) >= arg2) { + error = EINVAL; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} +#endif + + /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. @@ -1016,6 +1085,35 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_opaque(SYSCTL_HANDLER_ARGS) +{ + int error, tries; + u_int generation; + struct sysctl_req req2; + + SYSCTL_RESOLVE_V_ARG1(); + + tries = 0; + req2 = *req; +retry: + generation = curthread->td_generation; + error = SYSCTL_OUT(req, arg1, arg2); + if (error) + return (error); + tries++; + if (generation != curthread->td_generation && tries < 3) { + *req = req2; + goto retry; + } + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} +#endif + /* * Transfer functions to/from kernel space. 
* XXX: rather untested at this point @@ -1413,6 +1511,7 @@ req.lock = REQ_LOCKED; SYSCTL_LOCK(); + CURVNET_SET(TD_TO_VNET(curthread)); do { req.oldidx = 0; @@ -1423,6 +1522,7 @@ if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); + CURVNET_RESTORE(); SYSCTL_UNLOCK(); if (error && error != ENOMEM) Index: kern/kern_thread.c =========================================================================== --- kern/kern_thread.c 2008/08/25 00:28:58 #3 +++ kern/kern_thread.c 2008/08/25 00:28:58 @@ -47,6 +47,7 @@ #include #include #include +#include #include Index: kern/kern_uuid.c =========================================================================== --- kern/kern_uuid.c 2008/08/25 00:28:58 #7 +++ kern/kern_uuid.c 2008/08/25 00:28:58 @@ -39,6 +39,7 @@ #include #include +#include #include #include #include Index: kern/kern_vimage.c =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- kern/kern_vimage.c Mon Aug 25 00:29:01 2008 *************** *** 0 **** --- 1,992 ---- + /*- + * Copyright (c) 2004-2008 University of Zagreb + * Copyright (c) 2006-2008 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + #include "opt_ddb.h" + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #ifdef DDB + #include + #endif + + #include + #include + #include + #include + #include + + //#define DEBUG_ORDERING + + MALLOC_DEFINE(M_VIMAGE, "vimage", "virtual image resource container"); + MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); + MALLOC_DEFINE(M_VPROCG, "vprocg", "process group control block"); + MALLOC_DEFINE(M_VCPU, "vcpu", "cpu resource control block"); + + static struct vimage *vi_alloc(struct vimage *, char *); + static int vi_destroy(struct vimage *); + static void vnet_mod_complete_registration(struct vnet_modlink *); + static int vnet_mod_constructor(struct vnet_modlink *); + static int vnet_mod_destructor(struct vnet_modlink *); + + #ifdef VI_PREALLOC_SIZE + /* + * A private memory allocator can be enabled by setting VI_PREALLOC_SIZE + * to amount of memory (in bytes) to be reserved for the allocator at + * boot time. This pool is guaranteed to reside on a 4M superpage(s) on + * i386 and amd64, thus potentially reducing TLB thrashing.
+ * + * So far I couldn't observe any significant performance impact of using + * this allocator vs. the standard malloc(), whereas in FreeBSD 4.11 + * days I recall using "uninitialized data" storage vs. malloc() would + * be an instant win... Is it possible that these days all malloc'ed + * kernel storage is automagically placed on 4M superpages, so that this + * effort is redundant? Who knows... Therefore this code is disabled by + * default, so vi_alloc() and vi_free() simply resolve to standard + * malloc() and free(). + */ + + static void *vi_malloc(unsigned long, struct malloc_type *, int); + static void vi_free(void *, struct malloc_type *); + + struct vi_mtrack { + LIST_ENTRY(vi_mtrack) vmt_le; + char *vmt_addr; + size_t vmt_size; + int vmt_flags; + }; + + static char vi_mpool[VI_PREALLOC_SIZE]; + static struct uma_zone *vi_mtrack_zone; + static LIST_HEAD(, vi_mtrack) vi_mem_free_head; + static LIST_HEAD(, vi_mtrack) vi_mem_alloc_head; + static int vi_mpool_fail_cnt = 0; + #else + #define vi_malloc(addr, type, flags) malloc((addr), (type), (flags)) + #define vi_free(addr, type) free((addr), (type)) + #endif /* VI_PREALLOC_SIZE */ + + struct vimage_list_head vimage_head; + struct vnet_list_head vnet_head; + struct vprocg_list_head vprocg_head; + struct vcpu_list_head vcpu_head; + + struct cv vnet_list_condvar; + struct mtx vnet_list_refc_mtx; + int vnet_list_refc = 0; + + struct mtx vcpu_list_mtx; + + #define VNET_LIST_LOCK() \ + mtx_lock(&vnet_list_refc_mtx); \ + while (vnet_list_refc != 0) \ + cv_wait(&vnet_list_condvar, &vnet_list_refc_mtx); + + #define VNET_LIST_UNLOCK() \ + mtx_unlock(&vnet_list_refc_mtx); + + static u_int last_vi_id = 0; + static u_int last_vnet_id = 0; + static u_int last_vprocg_id = 0; + static u_int last_vcpu_id = 0; + + static TAILQ_HEAD(vnet_modlink_head, vnet_modlink) vnet_modlink_head; + static TAILQ_HEAD(vnet_modpending_head, vnet_modlink) vnet_modpending_head; + + void + vnet_mod_register(const struct vnet_modinfo *vmi) + { + 
vnet_mod_register_multi(vmi, NULL, NULL); + } + + /* + * Register one instance of a vnet module, identified by the {vmi, iarg} + * pair; iarg and iname must be either both NULL or both non-NULL. If the + * module named by vmi_dependson is not registered yet, the new instance is + * queued on vnet_modpending_head, otherwise registration is completed + * right away. Panics on a duplicate registration or when the link + * structure cannot be allocated. + */ + void + vnet_mod_register_multi(const struct vnet_modinfo *vmi, void *iarg, + char *iname) + { + struct vnet_modlink *vml, *vml_iter; + + /* Do not register the same module instance more than once */ + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo == vmi && vml_iter->vml_iarg == iarg) + break; + if (vml_iter != NULL) + panic("attempt to register an already registered vnet module"); + + /* M_NOWAIT may fail and a void function cannot report that */ + vml = vi_malloc(sizeof(struct vnet_modlink), M_VIMAGE, M_NOWAIT); + if (vml == NULL) + panic("vnet_mod_register_multi: malloc failed for vnet module %s", + vmi->vmi_name); + + /* + * XXX we support only statically assigned module IDs at the time. + * In principle modules should be able to get a dynamically + * assigned ID at registration time. + * + * vmi_id is used as an index into the per-vnet mod_data[] array, + * so it must be a valid slot number (the array is presumably + * VNET_MOD_MAX entries long). + */ + VNET_ASSERT(vmi->vmi_id >= 0 && vmi->vmi_id < VNET_MOD_MAX); + VNET_ASSERT(!((iarg == NULL) ^ (iname == NULL))); + + vml->vml_modinfo = vmi; + vml->vml_iarg = iarg; + vml->vml_iname = iname; + + /* Check whether the module we depend on is already registered */ + if (vmi->vmi_dependson != VNET_MOD_NONE) { + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_id == + vmi->vmi_dependson) + break; /* Dependency found, we are done */ + if (vml_iter == NULL) { + #ifdef DEBUG_ORDERING + printf("dependency %d missing for vnet mod %s," + "postponing registration\n", + vmi->vmi_dependson, vmi->vmi_name); + #endif /* DEBUG_ORDERING */ + TAILQ_INSERT_TAIL(&vnet_modpending_head, vml, + vml_mod_le); + return; + } + } + + vnet_mod_complete_registration(vml); + } + + void + vnet_mod_complete_registration(struct vnet_modlink *vml) + { + struct vnet_modlink *vml_iter; + + TAILQ_INSERT_TAIL(&vnet_modlink_head, vml, vml_mod_le); + + VNET_ITERLOOP_BEGIN_QUIET(); + vnet_mod_constructor(vml); + VNET_ITERLOOP_END(); + + /* Check for pending modules depending on us */ + do { + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_dependson == + vml->vml_modinfo->vmi_id) + break; + if (vml_iter != NULL) { +
#ifdef DEBUG_ORDERING + printf("vnet mod %s now registering," + "dependency %d loaded\n", + vml_iter->vml_modinfo->vmi_name, + vml->vml_modinfo->vmi_id); + #endif /* DEBUG_ORDERING */ + TAILQ_REMOVE(&vnet_modpending_head, vml_iter, + vml_mod_le); + vnet_mod_complete_registration(vml_iter); + } + } while (vml_iter != NULL); + } + + void + vnet_mod_deregister(const struct vnet_modinfo *vmi) + { + vnet_mod_deregister_multi(vmi, NULL, NULL); + } + + void + vnet_mod_deregister_multi(const struct vnet_modinfo *vmi, void *iarg, + char *iname) + { + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + if (vml->vml_modinfo == vmi && vml->vml_iarg == iarg) + break; + if (vml == NULL) + panic("cannot deregister unregistered vnet module %s", + vmi->vmi_name); + + VNET_ITERLOOP_BEGIN_QUIET(); + vnet_mod_destructor(vml); + VNET_ITERLOOP_END(); + + TAILQ_REMOVE(&vnet_modlink_head, vml, vml_mod_le); + vi_free(vml, M_VIMAGE); + } + + struct vimage * + vnet2vimage(struct vnet *vnet) + { + struct vimage *vip; + + LIST_FOREACH(vip, &vimage_head, vi_le) + if (vip->v_net == vnet) + return(vip); + + panic("vnet2vimage"); /* must never happen */ + } + + char * + vnet_name(struct vnet *vnet) + { + return(vnet2vimage(vnet)->vi_name); + } + + + int + vi_child_of(struct vimage *parent, struct vimage *child) + { + if (child == parent) + return (0); + for (; child; child = child->vi_parent) + if (child == parent) + return (1); + return (0); + } + + /* + * if_reassign_common() should be called by all device specific + * ifnet reassignment routines after the interface is detached from + * current vnet and before the interface gets attached to the target + * vnet. This routine attempts to shrink if_index in current vnet, + * find an unused if_index in target vnet and calls if_grow() if + * necessary, and finally finds an unused if_xname for the target + * vnet. 
+ * + * XXX this routine should hold a lock over if_index and return with + * such a lock held, and the caller should release that lock + * after ifattach completes! + * + * If dname is NULL, the driver name (if_dname) is used as the base for + * the new if_xname when the interface lands in a non-home vnet. + */ + void + if_reassign_common(struct ifnet *ifp, struct vnet *new_vnet, const char *dname) + { + /* do/while construct needed to confine scope of INIT_VNET_NET() */ + do { + INIT_VNET_NET(curvnet); + + ifnet_setbyindex(ifp->if_index, NULL); + /* XXX: should be locked with if_findindex() */ + while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) + V_if_index--; + } while (0); + + CURVNET_SET_QUIET(new_vnet); + INIT_VNET_NET(new_vnet); + /* + * Try to find an empty slot below if_index. If we fail, take + * the next slot. + * + * XXX: should be locked! + */ + for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { + if (ifnet_byindex(ifp->if_index) == NULL) + break; + } + /* Catch if_index overflow. */ + if (ifp->if_index < 1) + panic("if_reassign_common: if_index overflow"); + + if (ifp->if_index > V_if_index) + V_if_index = ifp->if_index; + if (V_if_index >= V_if_indexlim) + if_grow(); + ifnet_setbyindex(ifp->if_index, ifp); + + /* Rename the ifnet */ + if (new_vnet == ifp->if_home_vnet) { + /* always restore the original name on return to home vnet */ + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", ifp->if_dname, + ifp->if_dunit); + } else { + int unit = 0; + struct ifnet *iter; + + /* No base name requested: do not hand NULL to "%s" below */ + if (dname == NULL) + dname = ifp->if_dname; + + /* Pick the lowest unit whose name is unused in the target vnet */ + do { + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", dname, unit); + TAILQ_FOREACH(iter, &V_ifnet, if_link) + if (strcmp(ifp->if_xname, iter->if_xname) == 0) + break; + unit++; + } while (iter); + } + CURVNET_RESTORE(); + } + + /* + * Move the interface to another vnet. The interface can be specified either + * by ifp argument, or by name contained in vi_req->vi_chroot if NULL is + * passed as ifp. The interface will be renamed to vi_req->vi_if_xname + * if vi_req->vi_if_xname is not an empty string (uff ugly ugly)... + * Similarly, the target vnet can be specified either by vnet argument or + * by name.
If vnet name equals ".." or vi_req is set to NULL the + * interface is moved to the parent vnet. + * + * Returns 0 on success, ENXIO if the target vimage or the interface + * cannot be found, ENODEV if the driver has no if_reassign() method, and + * EEXIST if the requested name is already taken in the target vnet. + */ + int + vi_if_move(struct vi_req *vi_req, struct ifnet *ifp, struct vimage *vip) + { + struct vimage *new_vip; + struct vnet *new_vnet = NULL; + + if (vi_req == NULL || strcmp(vi_req->vi_name, "..") == 0) { + if (IS_DEFAULT_VIMAGE(vip)) + return (ENXIO); + new_vnet = vip->vi_parent->v_net; + } else { + new_vip = vimage_by_name(vip, vi_req->vi_name); + if (new_vip == NULL) + return (ENXIO); + new_vnet = new_vip->v_net; + } + + if (ifp == NULL) { + /* Neither ifp nor a request: nothing names the interface */ + if (vi_req == NULL) + return (ENXIO); + ifp = ifunit(vi_req->vi_chroot); + } + if (ifp == NULL) + return (ENXIO); + + /* Abort if driver did not provide a if_reassign() method */ + if (ifp->if_reassign == NULL) + return (ENODEV); + + if (vi_req != NULL) { + struct ifnet *t_ifp; + + /* The requested name must not be in use in the target vnet */ + CURVNET_SET_QUIET(new_vnet); + t_ifp = ifunit(vi_req->vi_if_xname); + CURVNET_RESTORE(); + if (t_ifp != NULL) + return (EEXIST); + } + + if (vi_req && strlen(vi_req->vi_if_xname) > 0) + ifp->if_reassign(ifp, new_vnet, vi_req->vi_if_xname); + else + ifp->if_reassign(ifp, new_vnet, NULL); + getmicrotime(&ifp->if_lastchange); + + /* Report the new if_xname back to the userland */ + if (vi_req != NULL) + sprintf(vi_req->vi_chroot, "%s", ifp->if_xname); + + return (0); + } + + + struct vimage * + vimage_by_name(struct vimage *top, char *name) + { + struct vimage *vip; + char *next_name; + int namelen; + + next_name = strchr(name, '.'); + if (next_name != NULL) { + namelen = next_name - name; + next_name++; + if (namelen == 0) { + if (strlen(next_name) == 0) + return(top); /* '.'
== this vimage */ + else + return(NULL); + } + } else + namelen = strlen(name); + if (namelen == 0) + return(NULL); + LIST_FOREACH(vip, &top->vi_child_head, vi_sibling) + if (strlen(vip->vi_name) == namelen && + strncmp(name, vip->vi_name, namelen) == 0) { + if (next_name != NULL) + return(vimage_by_name(vip, next_name)); + else + return(vip); + } + return(NULL); + } + + + static void + vimage_relative_name(struct vimage *top, struct vimage *where, + char *buffer, int bufflen) + { + int used = 1; + + if (where == top) { + sprintf(buffer, "."); + return; + } else + *buffer = 0; + + do { + int namelen = strlen(where->vi_name); + + if (namelen + used + 1 >= bufflen) + panic("buffer overflow"); + + if (used > 1) { + bcopy(buffer, &buffer[namelen + 1], used); + buffer[namelen] = '.'; + used++; + } else + bcopy(buffer, &buffer[namelen], used); + bcopy(where->vi_name, buffer, namelen); + used += namelen; + where = where->vi_parent; + } while (where != top); + } + + + static struct vimage * + vimage_get_next(struct vimage *top, struct vimage *where, int recurse) + { + struct vimage *next; + + if (recurse) { + /* Try to go deeper in the hierarchy */ + next = LIST_FIRST(&where->vi_child_head); + if (next != NULL) + return(next); + } + + do { + /* Try to find next sibling */ + next = LIST_NEXT(where, vi_sibling); + if (!recurse || next != NULL) + return(next); + + /* Nothing left on this level, go one level up */ + where = where->vi_parent; + } while (where != top->vi_parent); + + /* Nothing left to be visited, we are done */ + return(NULL); + } + + + int + vi_td_ioctl(u_long cmd, struct vi_req *vi_req, struct thread *td) + { + int error; + struct vimage *vip = TD_TO_VIMAGE(td); + struct vimage *vip_r = NULL; + + error = suser(td); /* XXX replace with priv(9) */ + if (error) + return (error); + + vip_r = vimage_by_name(vip, vi_req->vi_name); + if (vip_r == NULL && !(vi_req->req_action & VI_CREATE)) + return (ESRCH); + if (vip_r != NULL && vi_req->req_action & VI_CREATE) + 
return (EADDRINUSE); + if (vi_req->req_action == VI_GETNEXT) { + vip_r = vimage_get_next(vip, vip_r, 0); + if (vip_r == NULL) + return (ESRCH); + } + if (vi_req->req_action == VI_GETNEXT_RECURSE) { + vip_r = vimage_get_next(vip, vip_r, 1); + if (vip_r == NULL) + return (ESRCH); + } + + if (vip_r && !vi_child_of(vip, vip_r) && /* XXX delete the rest? */ + vi_req->req_action != VI_GET && vi_req->req_action != VI_GETNEXT) + return (EPERM); + + switch (cmd) { + + case SIOCGPVIMAGE: + vimage_relative_name(vip, vip_r, vi_req->vi_name, + sizeof (vi_req->vi_name)); + vi_req->vi_proc_count = vip_r->v_procg->nprocs; + vi_req->vi_if_count = vip_r->v_net->ifccnt; + vi_req->vi_sock_count = vip_r->v_net->sockcnt; + vi_req->cp_time_avg = vip_r->v_cpu->_avg2_fixp; + break; + + case SIOCSPVIMAGE: + if (vi_req->req_action == VI_DESTROY) { + error = vi_destroy(vip_r); + break; + } + + if (vi_req->req_action == VI_SWITCHTO) { + struct proc *p = td->td_proc; + struct ucred *oldcred, *newcred; + + /* + * XXX priv_check()? + * XXX allow only a single td per proc here? 
+ */ + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + setsugid(p); + crcopy(newcred, oldcred); + refcount_release(&newcred->cr_vimage->vi_ucredrefc); + newcred->cr_vimage = vip_r; + refcount_acquire(&newcred->cr_vimage->vi_ucredrefc); + p->p_ucred = newcred; + PROC_UNLOCK(p); + sx_xlock(&allproc_lock); + oldcred->cr_vimage->v_procg->nprocs--; + refcount_release(&oldcred->cr_vimage->vi_ucredrefc); + P_TO_VPROCG(p)->nprocs++; + #if 0 + sched_load_reassign(oldcred->cr_vimage->v_procg, + newcred->cr_vimage->v_procg); + #endif + sx_xunlock(&allproc_lock); + crfree(oldcred); + break; + } + + if (vi_req->req_action & VI_CREATE) { + char *dotpos; + + dotpos = strrchr(vi_req->vi_name, '.'); + if (dotpos != NULL) { + *dotpos = 0; + vip = vimage_by_name(vip, vi_req->vi_name); + if (vip == NULL) + return (ESRCH); + dotpos++; + vip_r = vi_alloc(vip, dotpos); + } else + vip_r = vi_alloc(vip, vi_req->vi_name); + if (vip_r == NULL) + return (ENOMEM); + } + + /* XXX What the hell is this doing here? 
*/ + if (vip == vip_r && !IS_DEFAULT_VIMAGE(vip)) + return (EPERM); + } + + return (error); + } + + + int + vi_symlookup(struct kld_sym_lookup *lookup, char *symstr) + { + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) { + struct vnet_symmap *mapentry; + + if (vml->vml_modinfo->vmi_symmap == NULL) + continue; + + for (mapentry = vml->vml_modinfo->vmi_symmap; + mapentry->name != NULL; mapentry++) { + if (strcmp(symstr, mapentry->name) == 0) { + lookup->symvalue = + (u_long) curvnet->mod_data[vml->vml_modinfo->vmi_id]; + lookup->symvalue += mapentry->offset; + lookup->symsize = mapentry->size; + return 0; + } + } + } + + return ENOENT; + } + + + struct vimage * + vi_alloc(struct vimage *parent, char *name) + { + struct vimage *vip; + struct vnet *vnet; + struct vprocg *vprocg; + struct vcpu *vcpu; + struct vnet_modlink *vml; + + /* + * XXX don't forget the locking + */ + + /* A brute force check whether there's enough mem for a new vimage */ + vip = malloc(512*1024, M_VIMAGE, M_NOWAIT); /* XXX aaaargh... 
*/ + if (vip == NULL) + goto vi_alloc_done; + free(vip, M_VIMAGE); + + vip = vi_malloc(sizeof(struct vimage), M_VIMAGE, M_NOWAIT | M_ZERO); + if (vip == NULL) + panic("vi_alloc: malloc failed for vimage \"%s\"\n", name); + vip->vi_id = last_vi_id++; + LIST_INIT(&vip->vi_child_head); + /* + * name may originate from an ioctl request, so bound the copy to + * the vi_name buffer; an over-long name is silently truncated. + */ + snprintf(vip->vi_name, sizeof(vip->vi_name), "%s", name); + vip->vi_parent = parent; + /* XXX locking */ + if (parent != NULL) + LIST_INSERT_HEAD(&parent->vi_child_head, vip, vi_sibling); + else if (!LIST_EMPTY(&vimage_head)) + panic("there can be only one default vimage!"); + LIST_INSERT_HEAD(&vimage_head, vip, vi_le); + + vnet = vi_malloc(sizeof(struct vnet), M_VNET, M_NOWAIT | M_ZERO); + if (vnet == NULL) + panic("vi_alloc: malloc failed for vnet \"%s\"\n", name); + vip->v_net = vnet; + vnet->vnet_id = last_vnet_id++; + vnet->vnet_magic_n = VNET_MAGIC_N; + + vprocg = vi_malloc(sizeof(struct vprocg), M_VPROCG, M_NOWAIT | M_ZERO); + if (vprocg == NULL) + panic("vi_alloc: malloc failed for vprocg \"%s\"\n", name); + vip->v_procg = vprocg; + vprocg->vprocg_id = last_vprocg_id++; + + vcpu = vi_malloc(sizeof(struct vcpu), M_VCPU, M_NOWAIT | M_ZERO); + if (vcpu == NULL) + panic ("vi_alloc: malloc failed for vcpu \"%s\"\n", name); + vip->v_cpu = vcpu; + vcpu->vcpu_id = last_vcpu_id++; + + /* Initialize / attach vnet module instances. */ + CURVNET_SET_QUIET(vnet); + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + vnet_mod_constructor(vml); + CURVNET_RESTORE(); + + /* Publish the new containers on the global lists only once set up */ + VNET_LIST_LOCK(); + LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); + VNET_LIST_UNLOCK(); + + /* XXX locking */ + LIST_INSERT_HEAD(&vprocg_head, vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_INSERT_HEAD(&vcpu_head, vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + + vi_alloc_done: + return (vip); + } + + + /* + * Destroy a vnet - unlink all linked lists, free all the memory, stop all + * the timers... How can one ever be sure to have done *all* the necessary + * steps?
+ */ + static int + vi_destroy(struct vimage *vip) + { + struct vnet *vnet = vip->v_net; + struct vprocg *vprocg = vip->v_procg; + struct vcpu *vcpu = vip->v_cpu; + struct ifnet *ifp, *nifp; + struct vnet_modlink *vml; + + /* XXX Beware of races -> more locking to be done... */ + if (!LIST_EMPTY(&vip->vi_child_head)) + return (EBUSY); + + if (vprocg->nprocs != 0) + return (EBUSY); + + if (vnet->sockcnt != 0) + return (EBUSY); + + if (vip->vi_ucredrefc != 0) + printf("vi_destroy: %s ucredrefc %d\n", + vip->vi_name, vip->vi_ucredrefc); + + /* Point with no return - cleanup MUST succeed! */ + /* XXX locking */ + LIST_REMOVE(vip, vi_le); + LIST_REMOVE(vip, vi_sibling); + + /* XXX locking */ + LIST_REMOVE(vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_REMOVE(vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + + VNET_LIST_LOCK(); + LIST_REMOVE(vnet, vnet_le); + VNET_LIST_UNLOCK(); + + CURVNET_SET_QUIET(vnet); + INIT_VNET_NET(vnet); + + /* + * Return all inherited interfaces to their parent vnets, + * alternatively attempt to kill cloning ifnets. + */ + TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + if (ifp->if_home_vnet != ifp->if_vnet) + vi_if_move(NULL, ifp, vip); + else + if_clone_destroy(ifp->if_xname); + } + + /* Detach / free per-module state instances. */ + TAILQ_FOREACH_REVERSE(vml, &vnet_modlink_head, + vnet_modlink_head, vml_mod_le) + vnet_mod_destructor(vml); + + #if 0 + free((caddr_t)vnet->ifnet_addrs, M_IFADDR); + free((caddr_t)vnet->ifindex2ifnet, M_IFADDR); + #endif + + CURVNET_RESTORE(); + + /* hopefully, we are finally OK to free the vnet container itself! 
*/ + vnet->vnet_magic_n = 0xdeadbeef; + vi_free(vnet, M_VNET); + vi_free(vprocg, M_VPROCG); + vi_free(vcpu, M_VCPU); + vi_free(vip, M_VIMAGE); + + return (0); + } + + static int vnet_mod_constructor(struct vnet_modlink *vml) + { + const struct vnet_modinfo *vmi = vml->vml_modinfo; + + #ifdef DEBUG_ORDERING + printf("instatiating vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_struct_size) + printf("malloc(%d); ", vmi->vmi_struct_size); + if (vmi->vmi_iattach != NULL) + printf("iattach()"); + printf("\n"); + #endif + + if (vmi->vmi_struct_size) { + void *mem = vi_malloc(vmi->vmi_struct_size, M_VNET, + M_NOWAIT | M_ZERO); + if (mem == NULL) /* XXX should return error, not panic */ + panic("vi_alloc: malloc for %s\n", vmi->vmi_name); + curvnet->mod_data[vmi->vmi_id] = mem; + } + + if (vmi->vmi_iattach != NULL) + vmi->vmi_iattach(vml->vml_iarg); + + return 0; + } + + static int vnet_mod_destructor(struct vnet_modlink *vml) + { + const struct vnet_modinfo *vmi = vml->vml_modinfo; + + #ifdef DEBUG_ORDERING + printf("destroying vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_idetach != NULL) + printf("idetach(); "); + if (vmi->vmi_struct_size) + printf("free()"); + printf("\n"); + #endif + + if (vmi->vmi_idetach) + vmi->vmi_idetach(vml->vml_iarg); + + if (vmi->vmi_struct_size) { + if (curvnet->mod_data[vmi->vmi_id] == NULL) + panic("vi_destroy: %s\n", vmi->vmi_name); + vi_free(curvnet->mod_data[vmi->vmi_id], M_VNET); + curvnet->mod_data[vmi->vmi_id] = NULL; + } + + return 0; + } + + static void + vi_init(void *unused) + { + #ifdef VI_PREALLOC_SIZE + struct vi_mtrack *vmt; + + /* Initialize our private memory allocator */ + LIST_INIT(&vi_mem_free_head); + LIST_INIT(&vi_mem_alloc_head); + vi_mtrack_zone = uma_zcreate("vi_mtrack", sizeof(struct vi_mtrack), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + vmt = uma_zalloc(vi_mtrack_zone, 
M_NOWAIT); + vmt->vmt_addr = vi_mpool; + vmt->vmt_size = VI_PREALLOC_SIZE; + LIST_INSERT_HEAD(&vi_mem_free_head, vmt, vmt_le); + #endif /* VI_PREALLOC_SIZE */ + + /* vnet module list is both forward and reverse traversable */ + TAILQ_INIT(&vnet_modlink_head); + TAILQ_INIT(&vnet_modpending_head); + + LIST_INIT(&vimage_head); + LIST_INIT(&vnet_head); + LIST_INIT(&vprocg_head); + LIST_INIT(&vcpu_head); + + mtx_init(&vnet_list_refc_mtx, "vnet_list_refc_mtx", NULL, MTX_DEF); + cv_init(&vnet_list_condvar, "vnet_list_condvar"); + + mtx_init(&vcpu_list_mtx, "vcpu_list_mtx", NULL, MTX_SPIN); + + vi_alloc(NULL, ""); /* Default vimage has no name */ + + /* We MUST clear curvnet in vi_init_done before going SMP. */ + curvnet = LIST_FIRST(&vnet_head); + } + + static void + vi_init_done(void *unused) + { + struct vnet_modlink *vml_iter; + + curvnet = NULL; + + if (TAILQ_EMPTY(&vnet_modpending_head)) + return; + + printf("vnet modules with unresolved dependencies:\n"); + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + printf(" %s depending on %d:\n", + vml_iter->vml_modinfo->vmi_name, + vml_iter->vml_modinfo->vmi_dependson); + panic("going nowhere without my vnet modules!"); + } + + SYSINIT(vimage, SI_SUB_VIMAGE, SI_ORDER_FIRST, vi_init, NULL); + SYSINIT(vimage_done, SI_SUB_VIMAGE_DONE, SI_ORDER_FIRST, vi_init_done, NULL); + + #ifdef VI_PREALLOC_SIZE + void * + vi_malloc(unsigned long size, struct malloc_type *type, int flags) + { + void *addr; + struct vi_mtrack *vmt = NULL; + struct vi_mtrack *vmt_iter; + + /* Attempt to find a free chunk in our private pool */ + LIST_FOREACH(vmt_iter, &vi_mem_free_head, vmt_le) + if (vmt_iter->vmt_size >= size && + (vmt == NULL || vmt_iter->vmt_size < vmt->vmt_size)) { + vmt = vmt_iter; + /* Exact fit is an optimal choice, we are done. 
*/ + if (vmt_iter->vmt_size == size) + break; + } + + /* Not (enough) free space in our pool, resort to malloc() */ + if (vmt == NULL) { + if (vi_mpool_fail_cnt == 0) + printf("vi_mpool exhausted," + "consider increasing VI_PREALLOC_SIZE\n"); + vi_mpool_fail_cnt++; + addr = malloc(size, type, flags); + return addr; + } + + addr = vmt->vmt_addr; + if (vmt->vmt_size == size) { + /* Move the descriptor from free to allocated list */ + LIST_REMOVE(vmt, vmt_le); + LIST_INSERT_HEAD(&vi_mem_alloc_head, vmt, vmt_le); + } else { + /* Shrink the existing free space block */ + vmt->vmt_addr += size; + vmt->vmt_size -= size; + + /* Create a new descriptor and place it on allocated list */ + vmt = uma_zalloc(vi_mtrack_zone, M_NOWAIT); + vmt->vmt_addr = addr; + vmt->vmt_size = size; + LIST_INSERT_HEAD(&vi_mem_alloc_head, vmt, vmt_le); + } + + bzero(addr, size); + return addr; + } + + void + vi_free(void *addr, struct malloc_type *type) + { + struct vi_mtrack *vmt; + + /* Attempt to find the chunk in our allocated pool */ + LIST_FOREACH(vmt, &vi_mem_alloc_head, vmt_le) + if (vmt->vmt_addr == addr) + break; + + /* Not found in our private pool, resort to free() */ + if (vmt == NULL) { + free(addr, type); + return; + } + + /* Move the descriptor from allocated to free list */ + LIST_REMOVE(vmt, vmt_le); + LIST_INSERT_HEAD(&vi_mem_free_head, vmt, vmt_le); + } + #endif /* VI_PREALLOC_SIZE */ + + #ifdef DDB + static void + db_vnet_ptr(void *arg) + { + if (arg) + db_printf(" %p", arg); + else + db_printf(" 0"); + } + + DB_SHOW_COMMAND(vnets, db_show_vnets) + { + db_printf(" vnet ifs socks"); + db_printf(" net inet inet6 ipsec netgraph\n"); + VNET_ITERLOOP_BEGIN_QUIET(); + db_printf("%p %3d %5d", + vnet_iter, vnet_iter->ifccnt, vnet_iter->sockcnt); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET6]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_IPSEC]); + 
db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NETGRAPH]); + db_printf("\n"); + VNET_ITERLOOP_END(); + } + #endif Index: kern/kern_xxx.c =========================================================================== --- kern/kern_xxx.c 2008/08/25 00:28:58 #8 +++ kern/kern_xxx.c 2008/08/25 00:28:58 @@ -286,7 +286,7 @@ if (error == 0) { tmpdomainname[domainnamelen] = 0; mtx_lock(&hostname_mtx); - bcopy(tmpdomainname, V_domainname, sizeof(domainname)); + bcopy(tmpdomainname, V_domainname, sizeof(V_domainname)); mtx_unlock(&hostname_mtx); } return (error); Index: kern/subr_pcpu.c =========================================================================== --- kern/subr_pcpu.c 2008/08/25 00:28:58 #1 +++ kern/subr_pcpu.c 2008/08/25 00:28:58 @@ -132,6 +132,10 @@ db_printf("none\n"); db_show_mdpcpu(pc); +#ifdef VIMAGE + db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); +#endif + #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks); Index: kern/sys_socket.c =========================================================================== --- kern/sys_socket.c 2008/08/25 00:28:58 #1 +++ kern/sys_socket.c 2008/08/25 00:28:58 @@ -50,7 +50,9 @@ #include #include #include +#include +#include #include #include @@ -74,16 +76,19 @@ int flags, struct thread *td) { struct socket *so = fp->f_data; -#ifdef MAC int error; +#ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(active_cred, so); SOCK_UNLOCK(so); if (error) return (error); #endif - return (soreceive(so, 0, uio, 0, 0, 0)); + CURVNET_SET(so->so_vnet); + error = soreceive(so, 0, uio, 0, 0, 0); + CURVNET_RESTORE(); + return (error); } /* ARGSUSED */ @@ -125,6 +130,7 @@ struct socket *so = fp->f_data; int error = 0; + CURVNET_SET(so->so_vnet); switch (cmd) { case FIONBIO: SOCK_LOCK(so); @@ -205,6 +211,7 @@ (so, cmd, data, 0, td)); break; } + CURVNET_RESTORE(); return (error); } Index: kern/tty.c =========================================================================== --- kern/tty.c 2008/08/25 
00:28:58 #4 +++ kern/tty.c 2008/08/25 00:28:58 @@ -725,12 +725,6 @@ done: tty_unlock(tp); return (error); } -#if 0 - /* in the old code we did this in the ioctl code.. remember this! */ - CURVNET_SET(TD_TO_VNET(curthread)); - /* fumble line discipline */ - CURVNET_RESTORE(); -#endif static struct cdevsw ttyil_cdevsw = { .d_version = D_VERSION, @@ -1508,6 +1502,8 @@ tty_flush(tp, flags); return (0); } + case TIOCDRAIN: + /* Drain TTY output. */ return tty_drain(tp); case TIOCCONS: /* Set terminal as console TTY. */ Index: kern/uipc_accf.c =========================================================================== --- kern/uipc_accf.c 2008/08/25 00:28:58 #1 +++ kern/uipc_accf.c 2008/08/25 00:28:58 @@ -58,11 +58,12 @@ MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); -static int unloadable = 0; +int accf_unloadable = 0; SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */ SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); -SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, +SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, + &accf_unloadable, 0, "Allow unload of accept filters (not recommended)"); /* @@ -144,7 +145,7 @@ * having it called is a bad thing. A simple fix would be to * track the refcount in the struct accept_filter. 
*/ - if (unloadable != 0) { + if (accf_unloadable != 0) { error = accept_filt_del(accfp->accf_name); } else error = EOPNOTSUPP; Index: kern/uipc_domain.c =========================================================================== --- kern/uipc_domain.c 2008/08/25 00:28:58 #1 +++ kern/uipc_domain.c 2008/08/25 00:28:58 @@ -43,6 +43,7 @@ #include #include #include +#include #include /* @@ -64,6 +65,11 @@ SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL); +static vnet_attach_fn net_init_domain; +#ifdef VIMAGE +static vnet_detach_fn net_detach_domain; +#endif + static struct callout pffast_callout; static struct callout pfslow_callout; @@ -100,6 +106,9 @@ .pru_sopoll = pru_sopoll_notsupp, }; +VNET_MOD_DECLARE_STATELESS(DOMAIN, domain, net_init_domain, net_detach_domain, + NET) + static void protosw_init(struct protosw *pr) { @@ -128,13 +137,12 @@ } /* - * Add a new protocol domain to the list of supported domains - * Note: you cant unload it again because a socket may be using it. - * XXX can't fail at this time. + * Initialize a domain instance. */ -static void -net_init_domain(struct domain *dp) +static int +net_init_domain(const void *arg) { + const struct domain *dp = arg; struct protosw *pr; if (dp->dom_init) @@ -148,9 +156,30 @@ max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); + return 0; } +#ifdef VIMAGE /* + * Detach / free a domain instance. + */ +static int +net_detach_domain(const void *arg) +{ + const struct domain *dp = arg; + struct protosw *pr; + + if (dp->dom_destroy) + (*dp->dom_destroy)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_destroy) + (*pr->pr_destroy)(); + + return 0; +} +#endif + +/* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. * XXX can't fail at this time. 
@@ -183,7 +212,11 @@ "domainfinalize()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); +#ifdef VIMAGE + vnet_mod_register_multi(&vnet_domain_modinfo, dp, dp->dom_name); +#else net_init_domain(dp); +#endif } static void Index: kern/uipc_socket.c =========================================================================== --- kern/uipc_socket.c 2008/08/25 00:28:58 #4 +++ kern/uipc_socket.c 2008/08/25 00:28:58 @@ -129,6 +129,9 @@ #include #include #include +#include + +#include #include @@ -260,7 +263,7 @@ * soalloc() returns a socket with a ref count of 0. */ static struct socket * -soalloc(void) +soalloc(struct vnet *vnet) { struct socket *so; @@ -281,6 +284,10 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; +#ifdef VIMAGE + so->so_vnet = vnet; + vnet->sockcnt++; +#endif mtx_unlock(&so_global_mtx); return (so); } @@ -300,6 +307,9 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ +#ifdef VIMAGE + so->so_vnet->sockcnt--; +#endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, @@ -353,7 +363,11 @@ if (prp->pr_type != type) return (EPROTOTYPE); - so = soalloc(); +#ifdef VIMAGE + so = soalloc(TD_TO_VNET(td)); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (ENOBUFS); @@ -379,7 +393,9 @@ * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. 
*/ + CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); + CURVNET_RESTORE(); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", so->so_count)); @@ -421,7 +437,12 @@ if (over) #endif return (NULL); - so = soalloc(); +#ifdef VIMAGE + VNET_ASSERT(head->so_vnet); + so = soalloc(head->so_vnet); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (NULL); if ((head->so_options & SO_ACCEPTFILTER) != 0) @@ -493,8 +514,12 @@ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { + int error; - return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); + CURVNET_SET(so->so_vnet); + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); + CURVNET_RESTORE(); + return error; } /* @@ -642,6 +667,7 @@ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); + CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { @@ -693,6 +719,7 @@ KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); + CURVNET_RESTORE(); return (error); } @@ -768,7 +795,9 @@ * biting us. */ so->so_error = 0; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); + CURVNET_RESTORE(); } return (error); @@ -1284,13 +1313,17 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { + int error; /* XXXRW: Temporary debugging. 
*/ KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend, ("sosend: protocol calls sosend")); - return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, - control, flags, td)); + CURVNET_SET(so->so_vnet); + error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, + control, flags, td); + CURVNET_RESTORE(); + return (error); } /* @@ -2104,8 +2137,13 @@ } if (how != SHUT_WR) sorflush(so); - if (how != SHUT_RD) - return ((*pr->pr_usrreqs->pru_shutdown)(so)); + if (how != SHUT_RD) { + int error; + CURVNET_SET(so->so_vnet); + error = (*pr->pr_usrreqs->pru_shutdown)(so); + CURVNET_RESTORE(); + return (error); + } return (0); } @@ -2129,6 +2167,7 @@ * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ + CURVNET_SET(so->so_vnet); socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); @@ -2152,6 +2191,7 @@ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease_internal(&asb, so); + CURVNET_RESTORE(); } /* Index: kern/uipc_syscalls.c =========================================================================== --- kern/uipc_syscalls.c 2008/08/25 00:28:58 #1 +++ kern/uipc_syscalls.c 2008/08/25 00:28:58 @@ -64,6 +64,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -264,7 +265,9 @@ if (error) goto done; #endif + CURVNET_SET(so->so_vnet); error = solisten(so, uap->backlog, td); + CURVNET_RESTORE(); #ifdef MAC done: #endif @@ -429,7 +432,9 @@ tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; + CURVNET_SET(so->so_vnet); error = soaccept(so, &sa); + CURVNET_RESTORE(); if (error) { /* * return a namelen of zero for older code which might @@ -976,9 +981,11 @@ ktruio = cloneuio(&auio); #endif len = auio.uio_resid; + CURVNET_SET(so->so_vnet); error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? 
&control : (struct mbuf **)0, &mp->msg_flags); + CURVNET_RESTORE(); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) @@ -1322,7 +1329,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sosetopt(so, &sopt); + CURVNET_RESTORE(); fdrop(fp, td); } return(error); @@ -1400,7 +1409,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sogetopt(so, &sopt); + CURVNET_RESTORE(); *valsize = sopt.sopt_valsize; fdrop(fp, td); } @@ -1463,7 +1474,9 @@ return (error); so = fp->f_data; *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -1564,7 +1577,9 @@ goto done; } *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -2176,9 +2191,11 @@ goto done; } SOCKBUF_UNLOCK(&so->so_snd); + CURVNET_SET(so->so_vnet); /* Avoid error aliasing. 
*/ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); + CURVNET_RESTORE(); if (err == 0) { /* * We need two counters to get the Index: kern/uipc_usrreq.c =========================================================================== --- kern/uipc_usrreq.c 2008/08/25 00:28:58 #3 +++ kern/uipc_usrreq.c 2008/08/25 00:28:58 @@ -90,6 +90,7 @@ #include #include #include +#include #ifdef DDB #include @@ -1651,6 +1652,10 @@ unp_init(void) { +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) Index: kern/vfs_export.c =========================================================================== --- kern/vfs_export.c 2008/08/25 00:28:58 #1 +++ kern/vfs_export.c 2008/08/25 00:28:58 @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -135,6 +136,7 @@ } #endif + CURVNET_SET(TD_TO_VNET(curthread)); /* XXX MARKO */ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); @@ -204,8 +206,10 @@ bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); refcount_init(&np->netc_anon.cr_ref, 1); + CURVNET_RESTORE(); return (0); out: + CURVNET_RESTORE(); free(np, M_NETADDR); return (error); } Index: kern/vfs_lookup.c =========================================================================== --- kern/vfs_lookup.c 2008/08/25 00:28:58 #1 +++ kern/vfs_lookup.c 2008/08/25 00:28:58 @@ -54,6 +54,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -66,6 +67,15 @@ #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC +#ifdef VIMAGE +#define IMUNES_SYMLINK_HACK +#endif + +#ifdef IMUNES_SYMLINK_HACK +SYSCTL_V_INT(V_PROCG, vprocg, _vfs, OID_AUTO, morphing_symlinks, CTLFLAG_RW, + morphing_symlinks, 0, "Resolve @ to vimage name in symlinks"); +#endif + /* * Allocation zone for 
namei */ @@ -130,6 +140,9 @@ struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; int vfslocked; +#ifdef IMUNES_SYMLINK_HACK + INIT_VPROCG(TD_TO_VPROCG(td)); +#endif KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0, ("NOT MPSAFE and Giant not held")); @@ -304,6 +317,25 @@ error = ENOENT; break; } +#ifdef IMUNES_SYMLINK_HACK + if (V_morphing_symlinks) { + char *sp = strchr(cp, '@'); + int vnamelen = strlen(TD_TO_VIMAGE(td)->vi_name); + + if (sp) { + if (vnamelen >= auio.uio_resid) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + bcopy(sp + 1, sp + vnamelen, + linklen - (sp - cp)); + bcopy(TD_TO_VIMAGE(td)->vi_name, sp, vnamelen); + linklen += (vnamelen - 1); + } + } +#endif if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); Index: modules/netgraph/Makefile =========================================================================== --- modules/netgraph/Makefile 2008/08/25 00:28:58 #2 +++ modules/netgraph/Makefile 2008/08/25 00:28:58 @@ -34,6 +34,7 @@ netflow \ netgraph \ one2many \ + pipe \ ppp \ pppoe \ pptpgre \ @@ -50,7 +51,8 @@ tee \ UI \ vjc \ - vlan + vlan \ + ${_wormhole} .if ${MACHINE_ARCH} == "i386" _sync_ar= sync_ar @@ -65,4 +67,9 @@ _mppc= mppc .endif +VIMAGE!= grep VIMAGE ${KERNBUILDDIR}/opt_vimage.h | cut -d" " -f3 || true +.if ${VIMAGE} == 1 +_wormhole= wormhole +.endif + .include Index: modules/netgraph/pipe/Makefile =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- modules/netgraph/pipe/Makefile Mon Aug 25 00:29:02 2008 *************** *** 0 **** --- 1,6 ---- + # $FreeBSD: $ + + KMOD= ng_pipe + SRCS= ng_pipe.c + + .include Index: modules/netgraph/wormhole/Makefile =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- modules/netgraph/wormhole/Makefile Mon Aug 25 00:29:02 2008 *************** *** 0 
**** --- 1,6 ---- + # $FreeBSD: $ + + KMOD= ng_wormhole + SRCS= ng_wormhole.c opt_vimage.h + + .include Index: net/bpf.c =========================================================================== --- net/bpf.c 2008/08/25 00:28:58 #4 +++ net/bpf.c 2008/08/25 00:28:58 @@ -61,9 +61,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -878,9 +880,11 @@ BPFD_UNLOCK(d); #endif + CURVNET_SET(ifp->if_vnet); error = (*ifp->if_output)(ifp, m, &dst, NULL); if (error) d->bd_wdcount++; + CURVNET_RESTORE(); if (mc != NULL) { if (error == 0) @@ -993,6 +997,7 @@ return (EPERM); } } + CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: @@ -1322,6 +1327,7 @@ case BIOCROTZBUF: return (bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr)); } + CURVNET_RESTORE(); return (error); } @@ -1418,9 +1424,33 @@ struct bpf_if *bp; struct ifnet *theywant; +#define IMUNES_BPF_HACK +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + struct vnet *target_vnet = curvnet; + char *c; + + /* Hack to support tapping in foreign vnets */ + c = rindex(ifr->ifr_name, '@'); + if ( c != NULL ) { +printf("bpf_setif: %s\n", c); + struct vimage *target_vimage; + + *c++ = 0; + target_vimage = vimage_by_name(TD_TO_VIMAGE(curthread), c); + if (target_vimage == NULL) + return ENXIO; + target_vnet = target_vimage->v_net; + } + CURVNET_SET_QUIET(target_vnet); +#endif + theywant = ifunit(ifr->ifr_name); - if (theywant == NULL || theywant->if_bpf == NULL) + if (theywant == NULL || theywant->if_bpf == NULL) { +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (ENXIO); + } bp = theywant->if_bpf; @@ -1460,6 +1490,9 @@ BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (0); } Index: net/bridgestp.c =========================================================================== --- net/bridgestp.c 2008/08/25 00:28:58 #6 +++ net/bridgestp.c 2008/08/25 00:28:58 @@ -56,6 +56,7 @@ #include 
#include #include +#include #include #include @@ -2017,6 +2018,7 @@ void bstp_reinit(struct bstp_state *bs) { + INIT_VNET_NET(curvnet); struct bstp_port *bp; struct ifnet *ifp, *mif; u_char *e_addr; Index: net/if.c =========================================================================== --- net/if.c 2008/08/25 00:28:58 #12 +++ net/if.c 2008/08/25 00:28:58 @@ -60,6 +60,7 @@ #include +#include #include #include #include @@ -111,7 +112,6 @@ static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); -static void if_grow(void); static void if_init(void *); static void if_qflush(struct ifaltq *); static void if_route(struct ifnet *, int flag, int fam); @@ -134,21 +134,28 @@ extern void nd6_setmtu(struct ifnet *); #endif -int if_index = 0; +static int vnet_net_iattach(const void *); +#ifdef VIMAGE +static int vnet_net_idetach(const void *); +#endif + int ifqmaxlen = IFQ_MAXLEN; -struct ifnethead ifnet; /* depend on static init XXX */ -struct ifgrouphead ifg_head; struct mtx ifnet_lock; static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; +#ifndef VIMAGE +int if_index = 0; +struct ifnethead ifnet; /* depend on static init XXX */ +struct ifgrouphead ifg_head; -static int if_indexlim = 8; +static int if_indexlim; static struct knlist ifklist; /* * Table of ifnet/cdev by index. Locked with ifnet_lock. 
*/ static struct ifindex_entry *ifindex_table = NULL; +#endif /* !VIMAGE */ static void filt_netdetach(struct knote *kn); static int filt_netdev(struct knote *kn, long hint); @@ -156,6 +163,19 @@ static struct filterops netdev_filtops = { 1, NULL, filt_netdetach, filt_netdev }; +#ifdef VIMAGE +static struct vnet_symmap vnet_net_symmap[] = { + VNET_SYMMAP(net, ifnet), + VNET_SYMMAP(net, rt_tables), + VNET_SYMMAP(net, rtstat), + VNET_SYMMAP(net, rttrash), + VNET_SYMMAP_END +}; +#endif + +VNET_MOD_DECLARE(NET, net, vnet_net_iattach, vnet_net_idetach, + NONE, vnet_net_symmap) + /* * System initialization */ @@ -169,6 +189,7 @@ struct ifnet * ifnet_byindex(u_short idx) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); @@ -177,10 +198,11 @@ return (ifp); } -static void +void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { + INIT_VNET_NET(curvnet); IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = ifp; @@ -200,6 +222,7 @@ struct cdev * ifdev_byindex(u_short idx) { + INIT_VNET_NET(curvnet); struct cdev *cdev; IFNET_RLOCK(); @@ -212,6 +235,7 @@ ifdev_setbyindex(u_short idx, struct cdev *cdev) { + INIT_VNET_NET(curvnet); IFNET_WLOCK(); V_ifindex_table[idx].ife_dev = cdev; IFNET_WUNLOCK(); @@ -350,20 +374,60 @@ static void if_init(void *dummy __unused) { + INIT_VNET_NET(curvnet); +#ifdef VIMAGE + vnet_mod_register(&vnet_net_modinfo); +#else + vnet_net_iattach(NULL); +#endif IFNET_LOCK_INIT(); TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); - if_grow(); /* create initial table */ ifdev_setbyindex(0, make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "network")); if_clone_init(); } -static void +static int +vnet_net_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + TAILQ_INIT(&V_ifnet); + TAILQ_INIT(&V_ifg_head); + knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); + V_if_indexlim = 8; + if_grow(); /* create initial table */ + + return 0; +} + +#ifdef VIMAGE +static int 
+vnet_net_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + VNET_ASSERT(TAILQ_EMPTY(&V_ifnet)); +#ifdef NOTYET + VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head)); +#endif + VNET_ASSERT(SLIST_EMPTY(&V_ifklist.kl_list)); + + free((caddr_t)V_ifindex_table, M_IFNET); + + return 0; +} +#endif + +void if_grow(void) { + INIT_VNET_NET(curvnet); u_int n; struct ifindex_entry *e; @@ -496,6 +560,11 @@ panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); +#ifdef VIMAGE + ifp->if_vnet = curvnet; + if (ifp->if_home_vnet == NULL) + ifp->if_home_vnet = curvnet; +#endif TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); @@ -518,11 +587,17 @@ mac_ifnet_create(ifp); #endif +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif ifdev_setbyindex(ifp->if_index, make_dev(&net_cdevsw, unit2minor(ifp->if_index), UID_ROOT, GID_WHEEL, 0600, "%s/%s", net_cdevsw.d_name, ifp->if_xname)); make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", net_cdevsw.d_name, ifp->if_index); +#ifdef VIMAGE + } +#endif mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); @@ -578,12 +653,18 @@ IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); +#ifdef VIMAGE + curvnet->ifccnt++; +#endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ @@ -610,7 +691,7 @@ splx(s); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, - if_attachdomain, NULL); + if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) @@ -720,6 +801,14 @@ struct ifnet *iter; int found = 0; + /* + * Detach from any vlan, bridge or lagg ifnets linked to us. 
+ * A small though unlikely window for a race from here to ifp + * unlinking from ifnet list is possible, hence we repeat the + * procedure once again further below. XXX. + */ + EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); + IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { @@ -727,6 +816,10 @@ found = 1; break; } +#ifdef VIMAGE + if (found) + curvnet->ifccnt--; +#endif IFNET_WUNLOCK(); if (!found) return; @@ -770,7 +863,13 @@ * Clean up all addresses. */ ifp->if_addr = NULL; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif destroy_dev(ifdev_byindex(ifp->if_index)); +#ifdef VIMAGE + } +#endif ifdev_setbyindex(ifp->if_index, NULL); /* We can now free link ifaddr. */ @@ -800,6 +899,9 @@ /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); IF_AFDATA_LOCK(ifp); @@ -818,6 +920,9 @@ knlist_destroy(&ifp->if_klist); mtx_destroy(&ifp->if_snd.ifq_mtx); IF_AFDATA_DESTROY(ifp); +#ifdef VIMAGE + ifp->if_vnet = NULL; +#endif splx(s); } @@ -1459,6 +1564,9 @@ (*lagg_linkstate_p)(ifp, link_state); } +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) @@ -1920,6 +2028,24 @@ ifr = (struct ifreq *)data; switch (cmd) { +#ifdef VIMAGE + case SIOCSIFVIMAGE: + error = suser(td); + if (error == 0) + error = vi_if_move((struct vi_req *) data, NULL, + TD_TO_VIMAGE(td)); + return (error); + + /* + * XXX Should be implemented as separate system calls. This is + * just a temporary hack!
+ */ + case SIOCSPVIMAGE: + case SIOCGPVIMAGE: + error = vi_td_ioctl(cmd, (struct vi_req *) data, td); + return (error); +#endif + case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); @@ -2532,6 +2658,9 @@ void if_delmulti_ifma(struct ifmultiaddr *ifma) { +#ifdef DIAGNOSTIC + INIT_VNET_NET(curvnet); +#endif struct ifnet *ifp; int lastref; @@ -2812,7 +2941,7 @@ if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { - + KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, @@ -2825,7 +2954,7 @@ void if_deregister_com_alloc(u_char type) { - + KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, Index: net/if.h =========================================================================== --- net/if.h 2008/08/25 00:28:58 #3 +++ net/if.h 2008/08/25 00:28:58 @@ -457,8 +457,4 @@ #include #endif -#ifdef _KERNEL -#include -#endif - #endif /* !_NET_IF_H_ */ Index: net/if_bridge.c =========================================================================== --- net/if_bridge.c 2008/08/25 00:28:58 #7 +++ net/if_bridge.c 2008/08/25 00:28:58 @@ -114,10 +114,12 @@ #include #include #include +#include #include #ifdef INET6 #include #include +#include #endif #ifdef DEV_CARP #include @@ -128,10 +130,12 @@ #include #include #include +#include #include #include #include +#include /* * Size of the route hash table. Must be a power of two. 
@@ -3034,6 +3038,7 @@ } if (IPFW_LOADED && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) { + INIT_VNET_IPFW(curvnet); error = -1; args.rule = ip_dn_claim_rule(*mp); if (args.rule != NULL && V_fw_one_pass) @@ -3218,6 +3223,7 @@ static int bridge_ip_checkbasic(struct mbuf **mp) { + INIT_VNET_INET(curvnet); struct mbuf *m = *mp; struct ip *ip; int len, hlen; @@ -3313,6 +3319,7 @@ static int bridge_ip6_checkbasic(struct mbuf **mp) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct ip6_hdr *ip6; @@ -3403,8 +3410,10 @@ m_freem(m); } - if (error == 0) + if (error == 0) { + INIT_VNET_INET(curvnet); V_ipstat.ips_fragmented++; + } return (error); Index: net/if_clone.c =========================================================================== --- net/if_clone.c 2008/08/25 00:28:58 #1 +++ net/if_clone.c 2008/08/25 00:28:58 @@ -39,7 +39,9 @@ #include #include #include +#include +#include #include #include #if 0 @@ -204,15 +206,14 @@ { int err; - if (ifc->ifc_destroy == NULL) { - err = EOPNOTSUPP; - goto done; - } + if (ifc->ifc_destroy == NULL) + return(EOPNOTSUPP); IF_CLONE_LOCK(ifc); IFC_IFLIST_REMOVE(ifc, ifp); IF_CLONE_UNLOCK(ifc); + CURVNET_SET_QUIET(ifp->if_vnet); if_delgroup(ifp, ifc->ifc_name); err = (*ifc->ifc_destroy)(ifc, ifp); @@ -224,8 +225,7 @@ IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } - -done: + CURVNET_RESTORE(); return (err); } @@ -402,6 +402,24 @@ * Find a free unit if none was given. 
*/ if (wildcard) { +#ifdef VIMAGE + INIT_VNET_NET(curvnet); + char name[IFNAMSIZ]; + struct ifnet *ifp; + int i = 0; + + IFNET_RLOCK(); +again: + snprintf(name, sizeof(name), "%s%d", ifc->ifc_name, i); /* candidate name depends only on i; bounded by IFNAMSIZ */ + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (strcmp(name, ifp->if_xname) == 0) { + i++; + goto again; + } + } + IFNET_RUNLOCK(); + *unit = i; +#else while ((bytoff < ifc->ifc_bmlen) && (ifc->ifc_units[bytoff] == 0xff)) bytoff++; @@ -412,6 +430,7 @@ while ((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) bitoff++; *unit = (bytoff << 3) + bitoff; +#endif } if (*unit > ifc->ifc_maxunit) { @@ -419,6 +438,7 @@ goto done; } +#ifndef VIMAGE if (!wildcard) { bytoff = *unit >> 3; bitoff = *unit - (bytoff << 3); @@ -434,6 +454,7 @@ KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) == 0, ("%s: bit is already set", __func__)); ifc->ifc_units[bytoff] |= (1 << bitoff); +#endif IF_CLONE_ADDREF_LOCKED(ifc); done: @@ -444,6 +465,7 @@ void ifc_free_unit(struct if_clone *ifc, int unit) { +#ifndef VIMAGE int bytoff, bitoff; @@ -458,6 +480,7 @@ ("%s: bit is already cleared", __func__)); ifc->ifc_units[bytoff] &= ~(1 << bitoff); IF_CLONE_REMREF_LOCKED(ifc); /* releases lock */ +#endif } void Index: net/if_ef.c =========================================================================== --- net/if_ef.c 2008/08/25 00:28:58 #5 +++ net/if_ef.c 2008/08/25 00:28:58 @@ -50,6 +50,7 @@ #include #include #include +#include #ifdef INET #include @@ -490,35 +491,40 @@ int error = 0, d; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type != IFT_ETHER) continue; - EFDEBUG("Found interface %s\n", ifp->if_xname); - efl = (struct ef_link*)malloc(sizeof(struct ef_link), - M_IFADDR, M_WAITOK | M_ZERO); - if (efl == NULL) { - error = ENOMEM; - break; - } + { + VNET_ITERLOOP_BEGIN_QUIET(); + INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type != IFT_ETHER) continue; + EFDEBUG("Found interface %s\n", ifp->if_xname); + efl = (struct ef_link*)malloc(sizeof(struct ef_link),
+ M_IFADDR, M_WAITOK | M_ZERO); + if (efl == NULL) { + error = ENOMEM; + break; + } - efl->el_ifp = ifp; + efl->el_ifp = ifp; #ifdef ETHER_II - error = ef_clone(efl, ETHER_FT_EII); - if (error) break; + error = ef_clone(efl, ETHER_FT_EII); + if (error) break; #endif #ifdef ETHER_8023 - error = ef_clone(efl, ETHER_FT_8023); - if (error) break; + error = ef_clone(efl, ETHER_FT_8023); + if (error) break; #endif #ifdef ETHER_8022 - error = ef_clone(efl, ETHER_FT_8022); - if (error) break; + error = ef_clone(efl, ETHER_FT_8022); + if (error) break; #endif #ifdef ETHER_SNAP - error = ef_clone(efl, ETHER_FT_SNAP); - if (error) break; + error = ef_clone(efl, ETHER_FT_SNAP); + if (error) break; #endif - efcount++; - SLIST_INSERT_HEAD(&efdev, efl, el_next); + efcount++; + SLIST_INSERT_HEAD(&efdev, efl, el_next); + } + VNET_ITERLOOP_END(); } IFNET_RUNLOCK(); if (error) { Index: net/if_ethersubr.c =========================================================================== --- net/if_ethersubr.c 2008/08/25 00:28:58 #7 +++ net/if_ethersubr.c 2008/08/25 00:28:58 @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -137,8 +138,10 @@ int ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared); +#ifndef VIMAGE static int ether_ipfw; #endif +#endif /* * Ethernet output routine. @@ -562,6 +565,8 @@ } #endif + CURVNET_SET_QUIET(ifp->if_vnet); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; @@ -598,6 +603,7 @@ /* Allow monitor mode to claim this frame, after stats are updated. 
*/ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); + CURVNET_RESTORE(); return; } @@ -646,8 +652,10 @@ ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } /* @@ -658,8 +666,10 @@ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } #ifdef DEV_CARP @@ -695,6 +705,7 @@ random_harvest(m, 16, 3, 0, RANDOM_NET); ether_demux(ifp, m); + CURVNET_RESTORE(); } /* @@ -876,6 +887,25 @@ return (etherbuf); } +#ifdef VIMAGE +static void +ether_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + u_char eaddr[6]; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "eth"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + CURVNET_RESTORE(); +} +#endif + /* * Perform common duties while attaching to interface list */ @@ -885,6 +915,9 @@ int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; +#ifdef VIMAGE + struct vnet *home_vnet_0 = ifp->if_home_vnet; +#endif ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; @@ -893,6 +926,9 @@ ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; +#ifdef VIMAGE + ifp->if_reassign = ether_reassign; +#endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; @@ -912,7 +948,11 @@ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; +#ifdef VIMAGE + if (i != ifp->if_addrlen && home_vnet_0 != ifp->if_home_vnet) +#else if (i != ifp->if_addrlen) +#endif if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); } @@ -936,7 +976,7 @@ SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if defined(INET) || defined(INET6) SYSCTL_V_INT(V_NET, 
vnet_net, _net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, - ether_ipfw, 0, "Pass ether pkts through firewall"); + ether_ipfw, 0, "Pass ether pkts through firewall"); #endif #if 0 Index: net/if_faith.c =========================================================================== --- net/if_faith.c 2008/08/25 00:28:58 #6 +++ net/if_faith.c 2008/08/25 00:28:58 @@ -77,6 +77,7 @@ #include #include #include +#include #endif #define FAITHNAME "faith" Index: net/if_gif.c =========================================================================== --- net/if_gif.c 2008/08/25 00:28:58 #7 +++ net/if_gif.c 2008/08/25 00:28:58 @@ -95,7 +95,9 @@ */ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); +#ifndef VIMAGE static LIST_HEAD(, gif_softc) gif_softc_list; +#endif void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); @@ -105,6 +107,7 @@ static void gif_start(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); +static int vnet_gif_iattach(const void *); IFC_SIMPLE_DECLARE(gif, 0); @@ -141,14 +144,13 @@ * pair of addresses. Some applications require this functionality so * we allow control over this check here. 
*/ -#ifdef XBONEHACK -static int parallel_tunnels = 1; -#else -static int parallel_tunnels = 0; +#ifndef VIMAGE +static int parallel_tunnels; #endif SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, parallel_tunnels, 0, "Allow parallel tunnels?"); +VNET_MOD_DECLARE(GIF, gif, vnet_gif_iattach, NULL, NET, NULL) /* copy from src/sys/net/if_ethersubr.c */ static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; @@ -241,29 +243,46 @@ } static int +vnet_gif_iattach(unused) + const void *unused; +{ + INIT_VNET_GIF(curvnet); + + LIST_INIT(&V_gif_softc_list); + V_max_gif_nesting = MAX_GIF_NEST; +#ifdef XBONEHACK + V_parallel_tunnels = 1; +#endif + V_ip_gif_ttl = GIF_TTL; +#ifdef INET6 + V_ip6_gif_hlim = GIF_HLIM; +#endif + + return 0; +} + +static int gifmodevent(mod, type, data) module_t mod; int type; void *data; { - switch (type) { case MOD_LOAD: mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); - LIST_INIT(&V_gif_softc_list); - if_clone_attach(&gif_cloner); - -#ifdef INET6 - V_ip6_gif_hlim = GIF_HLIM; +#ifdef VIMAGE + vnet_mod_register(&vnet_gif_modinfo); +#else + vnet_gif_iattach(NULL); #endif - break; case MOD_UNLOAD: if_clone_detach(&gif_cloner); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_gif_modinfo); +#endif mtx_destroy(&gif_mtx); -#ifdef INET6 - V_ip6_gif_hlim = 0; -#endif + break; default: return EOPNOTSUPP; Index: net/if_gre.c =========================================================================== --- net/if_gre.c 2008/08/25 00:28:58 #8 +++ net/if_gre.c 2008/08/25 00:28:58 @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -72,6 +73,7 @@ #include #ifdef INET +#include #include #include #include Index: net/if_loop.c =========================================================================== --- net/if_loop.c 2008/08/25 00:28:58 #8 +++ net/if_loop.c 2008/08/25 00:28:58 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7
@@ #include #include +#include #include #include #include @@ -89,27 +91,55 @@ #define LOMTU 16384 #endif +#define LONAME "lo" + +struct lo_softc { + struct ifnet *sc_ifp; + LIST_ENTRY(lo_softc) sc_next; +}; + int loioctl(struct ifnet *, u_long, caddr_t); static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); +static int vnet_loif_iattach(const void *); +#ifdef VIMAGE +static int vnet_loif_idetach(const void *); +#endif +#ifndef VIMAGE struct ifnet *loif = NULL; /* Used externally */ +static LIST_HEAD(lo_list, lo_softc) lo_list; +#endif /* !VIMAGE */ + +static MALLOC_DEFINE(M_LO, LONAME, "Loopback Interface"); + +static struct mtx lo_mtx; IFC_SIMPLE_DECLARE(lo, 1); static void lo_clone_destroy(struct ifnet *ifp) { + struct lo_softc *sc; +#ifdef INVARIANTS + INIT_VNET_NET(ifp->if_vnet); +#endif + + sc = ifp->if_softc; /* XXX: destroying lo0 will lead to panics. 
*/ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); + mtx_lock(&lo_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&lo_mtx); bpfdetach(ifp); if_detach(ifp); if_free(ifp); + free(sc, M_LO); } static int @@ -117,10 +147,16 @@ { INIT_VNET_NET(curvnet); struct ifnet *ifp; + struct lo_softc *sc; - ifp = if_alloc(IFT_LOOP); - if (ifp == NULL) + MALLOC(sc, struct lo_softc *, sizeof(*sc), M_LO, M_WAITOK | M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_LOOP); + if (ifp == NULL) { + free(sc, M_LO); return (ENOSPC); + } + if (V_loif == NULL) + V_loif = ifp; if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; @@ -128,21 +164,76 @@ ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - if (V_loif == NULL) - V_loif = ifp; + mtx_lock(&lo_mtx); + LIST_INSERT_HEAD(&V_lo_list, sc, sc_next); + mtx_unlock(&lo_mtx); return (0); } +VNET_MOD_DECLARE_STATELESS(LOIF, loif, vnet_loif_iattach, vnet_loif_idetach, + NET) + +static int vnet_loif_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + LIST_INIT(&V_lo_list); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) + if_clone_attach(&lo_cloner); + else + lo_cloner.ifc_attach(&lo_cloner); +#else + if_clone_attach(&lo_cloner); +#endif + return 0; +} + +#ifdef VIMAGE +static int vnet_loif_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + struct lo_softc *sc, *nsc; + + LIST_FOREACH_SAFE(sc, &V_lo_list, sc_next, nsc) { + struct ifnet *ifp = sc->sc_ifp; + + if (ifp == V_loif) { + /* + * A hack to allow lo0 to be detached: + * bump if_dunit from 0 to 1. By + * setting V_loif to NULL we prevent queuing + * of routing messages that would have + * m_pkthdr.rcvif pointing to a nonexistent + * ifnet, i.e. the lo0 we just destroyed. 
+ */ + ifp->if_dunit = 1; + V_loif = NULL; + } + if_clone_destroy(ifp->if_xname); + } + return 0; +} +#endif + static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - if_clone_attach(&lo_cloner); + mtx_init(&lo_mtx, "lo_mtx", NULL, MTX_DEF); +#ifdef VIMAGE + vnet_mod_register(&vnet_loif_modinfo); +#else + vnet_loif_iattach(NULL); +#endif break; case MOD_UNLOAD: @@ -174,7 +265,7 @@ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } ifp->if_opackets++; Index: net/if_mib.c =========================================================================== --- net/if_mib.c 2008/08/25 00:28:58 #8 +++ net/if_mib.c 2008/08/25 00:28:58 @@ -36,6 +36,7 @@ #include #include +#include #include #include Index: net/if_ppp.c =========================================================================== --- net/if_ppp.c 2008/08/25 00:28:58 #2 +++ net/if_ppp.c 2008/08/25 00:28:58 @@ -98,6 +98,7 @@ #include #include #include +#include #include #include @@ -1397,6 +1398,7 @@ struct mbuf *mp, *dmp = NULL; u_char *iphdr; u_int hlen; + CURVNET_SET(ifp->if_vnet); sc->sc_stats.ppp_ipackets++; @@ -1431,7 +1433,7 @@ m_freem(m); if (dmp == NULL) { /* no error, but no decompressed packet produced */ - return; + goto done; } m = dmp; cp = mtod(m, u_char *); @@ -1588,7 +1590,7 @@ ilen, 0) == 0) { /* drop this packet */ m_freem(m); - return; + goto done; } if (sc->sc_active_filt.bf_insns == 0 || bpf_filter(sc->sc_active_filt.bf_insns, (u_char *) m, ilen, 0)) @@ -1617,13 +1619,13 @@ || sc->sc_npmode[NP_IP] != NPMODE_PASS) { /* interface is down - drop the packet. 
*/ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; m->m_len -= PPP_HDRLEN; if ((m = ip_fastforward(m)) == NULL) - return; + goto done; isr = NETISR_IP; break; #endif @@ -1636,7 +1638,7 @@ || sc->sc_npmode[NP_IPV6] != NPMODE_PASS) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1653,7 +1655,7 @@ /* XXX: || sc->sc_npmode[NP_IPX] != NPMODE_PASS*/) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1688,6 +1690,8 @@ if (isr == -1) (*sc->sc_ctlp)(sc); + done: + CURVNET_RESTORE(); return; bad: @@ -1695,6 +1699,7 @@ m_freem(m); PPP2IFP(sc)->if_ierrors++; sc->sc_stats.ppp_ierrors++; + CURVNET_RESTORE(); } #define MAX_DUMP_BYTES 128 Index: net/if_spppsubr.c =========================================================================== --- net/if_spppsubr.c 2008/08/25 00:28:58 #8 +++ net/if_spppsubr.c 2008/08/25 00:28:58 @@ -40,10 +40,13 @@ #include +#include #include #include #include #include + +#include #include #include #include @@ -4875,6 +4878,7 @@ static void sppp_set_ip_addr(struct sppp *sp, u_long src) { + INIT_VNET_INET(curvnet); STDDCL; struct ifaddr *ifa; struct sockaddr_in *si; Index: net/if_stf.c =========================================================================== --- net/if_stf.c 2008/08/25 00:28:58 #6 +++ net/if_stf.c 2008/08/25 00:28:58 @@ -94,6 +94,7 @@ #include #include +#include #include #include #include @@ -101,6 +102,7 @@ #include #include +#include #include #include #include Index: net/if_tap.c =========================================================================== --- net/if_tap.c 2008/08/25 00:28:58 #2 +++ net/if_tap.c 2008/08/25 00:28:58 @@ -58,7 +58,9 @@ #include #include #include +#include +#include #include #include #include Index: net/if_var.h 
=========================================================================== --- net/if_var.h 2008/08/25 00:28:58 #5 +++ net/if_var.h 2008/08/25 00:28:58 @@ -70,6 +70,7 @@ struct ether_header; struct carp_if; struct ifvlantrunk; +struct vnet; #endif #include /* get TAILQ macros */ @@ -160,6 +161,10 @@ (void *); int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); + void (*if_reassign) /* reassign to vnet routine */ + (struct ifnet *, struct vnet *, char *); + struct vnet *if_vnet; /* network stack instance */ + struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_llsoftc; /* link layer softc */ int if_drv_flags; /* driver-managed status flags */ @@ -645,6 +650,7 @@ struct cdev *ife_dev; }; +void ifnet_setbyindex(u_short idx, struct ifnet *ifp); struct ifnet *ifnet_byindex(u_short idx); /* @@ -655,10 +661,12 @@ struct ifaddr *ifaddr_byindex(u_short idx); struct cdev *ifdev_byindex(u_short idx); +extern int ifqmaxlen; +#ifndef VIMAGE extern struct ifnethead ifnet; -extern int ifqmaxlen; +extern int if_index; extern struct ifnet *loif; /* first loopback interface */ -extern int if_index; +#endif /* !VIMAGE */ int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); @@ -666,6 +674,7 @@ int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); +void if_grow(void); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); Index: net/if_vlan.c =========================================================================== --- net/if_vlan.c 2008/08/25 00:28:58 #8 +++ net/if_vlan.c 2008/08/25 00:28:58 @@ -57,6 +57,7 @@ #include #include +#include #include #include #include @@ -1356,6 +1357,12 @@ error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); if (error) break; +#ifdef VIMAGE + if 
(ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; @@ -1383,6 +1390,12 @@ case SIOCGETVLAN: bzero(&vlr, sizeof(vlr)); +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif VLAN_LOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, Index: net/netisr.c =========================================================================== --- net/netisr.c 2008/08/25 00:28:58 #2 +++ net/netisr.c 2008/08/25 00:28:58 @@ -49,7 +49,9 @@ #include #include +#include +#include #include #include #include @@ -142,7 +144,10 @@ IF_DEQUEUE(ni->ni_queue, m); if (m == NULL) break; + VNET_ASSERT(m->m_pkthdr.rcvif != NULL); + CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); ni->ni_handler(m); + CURVNET_RESTORE(); } } @@ -163,6 +168,7 @@ m_freem(m); return; } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) /* * Unless NETISR_FORCEQUEUE is set on the netisr (generally @@ -172,11 +178,31 @@ * by virtue of callers consistently calling one of queued or direct * dispatch, and the forcequeue flag being immutable after * registration. + * + * If the kernel was compiled with options VIMAGE, also defer + * dispatch of netisr handlers for mbufs that have crossed a + * boundary between two vnets. Direct dispatching in such + * cases could lead to various LORs, or in the most extreme + * circumstances cause the kernel stack to overflow. */ +#ifndef VIMAGE if (netisr_direct && !(ni->ni_flags & NETISR_FORCEQUEUE)) { +#else + if (netisr_direct && !(ni->ni_flags & NETISR_FORCEQUEUE) && + !(m->m_flags & M_REMOTE_VNET)) { +#endif isrstat.isrs_directed++; ni->ni_handler(m); } else { +#ifdef VIMAGE + /* + * Once direct netisr dispatching is avoided using the + * M_REMOTE_VNET flag, it should not be observed any + * more, so clear it here in order to avoid further + * deferring of direct netisr dispatching. 
+ */ + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_deferred++; if (IF_HANDOFF(ni->ni_queue, m, NULL)) schednetisr(num); @@ -203,6 +229,10 @@ m_freem(m); return (ENXIO); } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) +#ifdef VIMAGE + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_queued++; if (!IF_HANDOFF(ni->ni_queue, m, NULL)) return (ENOBUFS); /* IF_HANDOFF has free'd the mbuf */ Index: net/raw_cb.c =========================================================================== --- net/raw_cb.c 2008/08/25 00:28:58 #8 +++ net/raw_cb.c 2008/08/25 00:28:58 @@ -44,7 +44,7 @@ #include #include -#include +#include #include /* @@ -57,7 +57,9 @@ */ struct mtx rawcb_mtx; +#ifndef VIMAGE struct rawcb_list_head rawcb_list; +#endif SYSCTL_NODE(_net, OID_AUTO, raw, CTLFLAG_RW, 0, "Raw socket infrastructure"); Index: net/raw_cb.h =========================================================================== --- net/raw_cb.h 2008/08/25 00:28:58 #2 +++ net/raw_cb.h 2008/08/25 00:28:58 @@ -55,7 +55,11 @@ #define RAWRCVQ 8192 #ifdef _KERNEL + +#ifndef VIMAGE extern LIST_HEAD(rawcb_list_head, rawcb) rawcb_list; +#endif + extern struct mtx rawcb_mtx; /* Index: net/raw_usrreq.c =========================================================================== --- net/raw_usrreq.c 2008/08/25 00:28:58 #8 +++ net/raw_usrreq.c 2008/08/25 00:28:58 @@ -46,7 +46,7 @@ #include #include -#include +#include #include MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); @@ -57,9 +57,11 @@ void raw_init(void) { +#ifndef VIMAGE INIT_VNET_NET(curvnet); LIST_INIT(&V_rawcb_list); +#endif } /* Index: net/route.c =========================================================================== --- net/route.c 2008/08/25 00:28:58 #8 +++ net/route.c 2008/08/25 00:28:58 @@ -51,6 +51,7 @@ #include #include +#include #include #include @@ -104,6 +105,7 @@ &rt_add_addr_allfibs, 0, ""); TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); +#ifndef VIMAGE static struct rtstat rtstat; /* by default only the first 
'row' of tables will be accessed. */ @@ -116,6 +118,7 @@ struct radix_node_head *rt_tables[RT_MAXFIBS][AF_MAX+1]; static int rttrash; /* routes not in table but not freed */ +#endif /* !VIMAGE */ static void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); @@ -135,7 +138,7 @@ */ #define RNTORT(p) ((struct rtentry *)(p)) -static uma_zone_t rtzone; /* Routing table UMA zone. */ +static uma_zone_t rtzone; /* Routing table UMA zone. */ #if 0 /* default fib for tunnels to use */ @@ -160,28 +163,20 @@ SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); -static void -route_init(void) +static int +rtable_init(const void *unused) { int table; + int fam; struct domain *dom; - int fam; - - /* whack the tunable ints into line. */ - if (rt_numfibs > RT_MAXFIBS) - rt_numfibs = RT_MAXFIBS; - if (rt_numfibs == 0) - rt_numfibs = 1; - rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); - rn_init(); /* initialize all zeroes, all ones, mask table */ + INIT_VNET_NET(curvnet); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach) { for (table = 0; table < rt_numfibs; table++) { if ( (fam = dom->dom_family) == AF_INET || table == 0) { - /* for now only AF_INET has > 1 table */ + /* for now only AF_INET has > 1 table */ /* XXX MRT * rtattach will be also called * from vfs_export.c but the @@ -198,11 +193,62 @@ } } } + return (0); +} + +#ifdef VIMAGE +static int +rtable_idetach(const void *unused) +{ + int table; + int fam; + struct domain *dom; + INIT_VNET_NET(curvnet); + + for (dom = domains; dom; dom = dom->dom_next) { + if (dom->dom_rtdetach) { + for (table = 0; table < rt_numfibs; table++) { + if ( (fam = dom->dom_family) == AF_INET || + table == 0) { + /* for now only AF_INET has > 1 table */ + dom->dom_rtdetach( + (void **)&V_rt_tables[table][fam], + dom->dom_rtoffset); + } else { + break; + } + } + } + } + return (0); +} + 
+VNET_MOD_DECLARE_STATELESS(RTABLE, rtable, rtable_init, rtable_idetach, NET); +#endif + +static void +route_init(void) +{ + + /* whack the tunable ints into line. */ + if (rt_numfibs > RT_MAXFIBS) + rt_numfibs = RT_MAXFIBS; + if (rt_numfibs == 0) + rt_numfibs = 1; + rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + rn_init(); /* initialize all zeroes, all ones, mask table */ + +#ifdef VIMAGE + vnet_mod_register(&vnet_rtable_modinfo); +#else + rtable_init(NULL); +#endif } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { - int fibnum; + int fibnum; }; #endif int Index: net/rtsock.c =========================================================================== --- net/rtsock.c 2008/08/25 00:28:58 #9 +++ net/rtsock.c 2008/08/25 00:28:58 @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -1095,6 +1096,14 @@ *(unsigned short *)(tag + 1) = sa->sa_family; m_tag_prepend(m, tag); } +#ifdef VIMAGE + if (V_loif) + m->m_pkthdr.rcvif = V_loif; + else { + m_freem(m); + return; + } +#endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. 
*/ } Index: net80211/ieee80211.c =========================================================================== --- net80211/ieee80211.c 2008/08/25 00:28:58 #6 +++ net80211/ieee80211.c 2008/08/25 00:28:58 @@ -36,12 +36,15 @@ #include #include #include +#include #include #include #include #include +#include #include +#include #include #include @@ -218,6 +221,9 @@ struct ifaddr *ifa; KASSERT(ifp->if_type == IFT_IEEE80211, ("if_type %d", ifp->if_type)); +#ifdef VIMAGE + ifp->if_reassign = NULL; /* Override ether_reassign() */ +#endif IEEE80211_LOCK_INIT(ic, ifp->if_xname); TAILQ_INIT(&ic->ic_vaps); @@ -657,6 +663,30 @@ IEEE80211_UNLOCK(ic); } +#ifdef VIMAGE +void +ieee80211_reassign( struct ieee80211vap *vap, struct vnet *vnet, char *dname) +{ + struct ifnet *ifp = vap->iv_ifp; + u_char eaddr[6]; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + bpfdetach(ifp); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + vap->iv_rawbpf = NULL; + if_reassign_common(ifp, vnet, ifp->if_dname); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + bpfattach2(ifp, DLT_IEEE802_11, + sizeof(struct ieee80211_frame_addr4), &vap->iv_rawbpf); + CURVNET_RESTORE(); +} +#endif + static __inline int mapgsm(u_int freq, u_int flags) { Index: net80211/ieee80211_ddb.c =========================================================================== --- net80211/ieee80211_ddb.c 2008/08/25 00:28:58 #2 +++ net80211/ieee80211_ddb.c 2008/08/25 00:28:58 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,8 @@ break; } + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(vnet_iter); TAILQ_FOREACH(ifp, &V_ifnet, if_list) if (ifp->if_type == IFT_IEEE80211) { const struct ieee80211com *ic = ifp->if_l2com; @@ -208,6 +211,7 @@ } else _db_show_com(ic, 1, 1, 1); } + VNET_ITERLOOP_END(); } static void Index: net80211/ieee80211_freebsd.c =========================================================================== --- 
net80211/ieee80211_freebsd.c 2008/08/25 00:28:58 #2 +++ net80211/ieee80211_freebsd.c 2008/08/25 00:28:58 @@ -39,8 +39,8 @@ #include #include #include - #include +#include #include #include @@ -478,9 +478,11 @@ { struct ieee80211_join_event iev; + CURVNET_SET(ifp->if_vnet); memset(&iev, 0, sizeof(iev)); IEEE80211_ADDR_COPY(iev.iev_addr, mac); rt_ieee80211msg(ifp, op, &iev, sizeof(iev)); + CURVNET_RESTORE(); } void @@ -489,6 +491,7 @@ struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; + CURVNET_SET_QUIET(ifp->if_vnet); IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode join", (ni == vap->iv_bss) ? "bss " : ""); @@ -500,6 +503,7 @@ notify_macaddr(ifp, newassoc ? RTM_IEEE80211_JOIN : RTM_IEEE80211_REJOIN, ni->ni_macaddr); } + CURVNET_RESTORE(); } void @@ -508,6 +512,7 @@ struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; + CURVNET_SET_QUIET(ifp->if_vnet); IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode leave", (ni == vap->iv_bss) ? "bss " : ""); @@ -518,6 +523,7 @@ /* fire off wireless event station leaving */ notify_macaddr(ifp, RTM_IEEE80211_LEAVE, ni->ni_macaddr); } + CURVNET_RESTORE(); } void @@ -528,7 +534,9 @@ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n", "notify scan done"); /* dispatch wireless event indicating scan completed */ + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0); + CURVNET_RESTORE(); } void @@ -556,7 +564,9 @@ iev.iev_keyix = k->wk_keyix; iev.iev_keyrsc = k->wk_keyrsc[0]; /* XXX need tid */ iev.iev_rsc = rsc; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } @@ -577,7 +587,9 @@ IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2); iev.iev_cipher = IEEE80211_CIPHER_TKIP; iev.iev_keyix = keyix; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } Index: net80211/ieee80211_ioctl.c 
=========================================================================== --- net80211/ieee80211_ioctl.c 2008/08/25 00:28:58 #2 +++ net80211/ieee80211_ioctl.c 2008/08/25 00:28:58 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include Index: net80211/ieee80211_var.h =========================================================================== --- net80211/ieee80211_var.h 2008/08/25 00:28:58 #1 +++ net80211/ieee80211_var.h 2008/08/25 00:28:58 @@ -544,6 +544,8 @@ int ieee80211_vap_attach(struct ieee80211vap *, ifm_change_cb_t, ifm_stat_cb_t); void ieee80211_vap_detach(struct ieee80211vap *); +void ieee80211_reassign(struct ieee80211vap *, struct vnet *, char *); + const struct ieee80211_rateset *ieee80211_get_suprates(struct ieee80211com *ic, const struct ieee80211_channel *); void ieee80211_announce(struct ieee80211com *); Index: netgraph/atm/ng_atm.c =========================================================================== --- netgraph/atm/ng_atm.c 2008/08/25 00:28:58 #5 +++ netgraph/atm/ng_atm.c 2008/08/25 00:28:58 @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -1401,10 +1402,15 @@ ng_atm_input_orphan_p = ng_atm_input_orphans; ng_atm_event_p = ng_atm_event; - /* Create nodes for existing ATM interfaces */ - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type == IFT_ATM) - ng_atm_attach(ifp); + { + VNET_ITERLOOP_BEGIN_QUIET(); + INIT_VNET_NET(curvnet); + /* Create nodes for existing ATM interfaces */ + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type == IFT_ATM) + ng_atm_attach(ifp); + } + VNET_ITERLOOP_END(); } IFNET_RUNLOCK(); break; @@ -1419,9 +1425,14 @@ ng_atm_input_orphan_p = NULL; ng_atm_event_p = NULL; - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type == IFT_ATM) - ng_atm_detach(ifp); + { + VNET_ITERLOOP_BEGIN_QUIET(); + INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type == IFT_ATM) + ng_atm_detach(ifp); + } + VNET_ITERLOOP_END(); } IFNET_RUNLOCK(); 
break; Index: netgraph/netgraph.h =========================================================================== --- netgraph/netgraph.h 2008/08/25 00:28:58 #3 +++ netgraph/netgraph.h 2008/08/25 00:28:58 @@ -352,6 +352,7 @@ LIST_ENTRY(ng_node) nd_idnodes; /* ID hash collision list */ struct ng_queue nd_input_queue; /* input queue for locking */ int nd_refs; /* # of references to this node */ + struct vnet *nd_vnet; /* network stack instance */ #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ #define ND_MAGIC 0x59264837 int nd_magic; @@ -1123,6 +1124,7 @@ struct ng_type *ng_findtype(const char *type); int ng_make_node_common(struct ng_type *typep, node_p *nodep); int ng_name_node(node_p node, const char *name); +node_p ng_name2noderef(node_p node, const char *name); int ng_newtype(struct ng_type *tp); ng_ID_t ng_node2ID(node_p node); item_p ng_package_data(struct mbuf *m, int flags); @@ -1184,69 +1186,3 @@ #define ng_copy_meta(meta) NULL #endif /* _NETGRAPH_NETGRAPH_H_ */ -/*- - * Copyright (c) 2006-2008 University of Zagreb - * Copyright (c) 2006-2008 FreeBSD Foundation - * - * This software was developed by the University of Zagreb and the - * FreeBSD Foundation under sponsorship by the Stichting NLnet and the - * FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _NETGRAPH_VNETGRPAH_H_ -#define _NETGRAPH_VNETGRAPH_H_ - -#include - -#define INIT_VNET_NETGRAPH(vnet) \ - INIT_FROM_VNET(vnet, VNET_MOD_NETGRAPH, \ - struct vnet_netgraph, vnet_netgraph) - -#define VNET_NETGRAPH(sym) VSYM(vnet_netgraph, sym) - -#define NG_ID_HASH_SIZE 32 /* most systems wont need even this many */ -#define NG_NAME_HASH_SIZE 128 /* most systems wont need even this many */ - -#ifdef VIMAGE -struct vnet_netgraph { - LIST_HEAD(, ng_node) _ng_ID_hash[NG_ID_HASH_SIZE]; - LIST_HEAD(, ng_node) _ng_name_hash[NG_ID_HASH_SIZE]; - LIST_HEAD(, ng_node) _ng_nodelist; - ng_ID_t _nextID; - struct unrhdr *_ng_iface_unit; - struct unrhdr *_ng_eiface_unit; - struct unrhdr *_ng_wormhole_unit; -}; -#endif - -/* Symbol translation macros */ -#define V_ng_ID_hash VNET_NETGRAPH(ng_ID_hash) -#define V_ng_name_hash VNET_NETGRAPH(ng_name_hash) -#define V_ng_nodelist VNET_NETGRAPH(ng_nodelist) -#define V_nextID VNET_NETGRAPH(nextID) -#define V_ng_iface_unit VNET_NETGRAPH(ng_iface_unit) -#define V_ng_eiface_unit VNET_NETGRAPH(ng_eiface_unit) -#define V_ng_wormhole_unit VNET_NETGRAPH(ng_wormhole_unit) - -#endif /* !_NETGRAPH_VNETGRAPH_H_ */ Index: netgraph/ng_base.c 
=========================================================================== --- netgraph/ng_base.c 2008/08/25 00:28:58 #9 +++ netgraph/ng_base.c 2008/08/25 00:28:58 @@ -63,9 +63,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -87,8 +89,8 @@ static void ng_dumpitems(void); static void ng_dumpnodes(void); static void ng_dumphooks(void); +#endif /* NETGRAPH_DEBUG */ -#endif /* NETGRAPH_DEBUG */ /* * DEAD versions of the structures. * In order to avoid races, it is sometimes neccesary to point @@ -128,10 +130,10 @@ }, 1, /* refs */ #ifdef NETGRAPH_DEBUG - ND_MAGIC, - __FILE__, - __LINE__, - {NULL} + .nd_magic = ND_MAGIC, + .lastfile = __FILE__, + .lastline = __LINE__, + .nd_all = {NULL} #endif /* NETGRAPH_DEBUG */ }; @@ -167,8 +169,12 @@ /* Hash related definitions */ /* XXX Don't need to initialise them because it's a LIST */ +#ifndef VIMAGE static LIST_HEAD(, ng_node) ng_ID_hash[NG_ID_HASH_SIZE]; +static LIST_HEAD(, ng_node) ng_name_hash[NG_NAME_HASH_SIZE]; +#endif static struct mtx ng_idhash_mtx; +static struct mtx ng_namehash_mtx; /* Method to find a node.. used twice so do it here */ #define NG_IDHASH_FN(ID) ((ID) % (NG_ID_HASH_SIZE)) #define NG_IDHASH_FIND(ID, node) \ @@ -183,9 +189,6 @@ } \ } while (0) -#define NG_NAME_HASH_SIZE 128 /* most systems wont need even this many */ -static LIST_HEAD(, ng_node) ng_name_hash[NG_NAME_HASH_SIZE]; -static struct mtx ng_namehash_mtx; #define NG_NAMEHASH(NAME, HASH) \ do { \ u_char h = 0; \ @@ -215,7 +218,6 @@ /* Imported, these used to be externally visible, some may go back. 
*/ void ng_destroy_hook(hook_p hook); -node_p ng_name2noderef(node_p node, const char *name); int ng_path2noderef(node_p here, const char *path, node_p *dest, hook_p *lasthook); int ng_make_node(const char *type, node_p *nodepp); @@ -251,6 +253,14 @@ #define NG_WORKLIST_UNLOCK() \ mtx_unlock(&ng_worklist_mtx) +static vnet_attach_fn vnet_netgraph_iattach; +#ifdef VIMAGE +static vnet_detach_fn vnet_netgraph_idetach; +#endif /* VIMAGE */ + +VNET_MOD_DECLARE(NETGRAPH, netgraph, vnet_netgraph_iattach, + vnet_netgraph_idetach, LOIF, NULL) + #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ /* * In debug mode: @@ -349,7 +359,9 @@ #define TRAP_ERROR() #endif -static ng_ID_t nextID = 1; +#ifndef VIMAGE +static ng_ID_t nextID; +#endif #ifdef INVARIANTS #define CHECK_DATA_MBUF(m) do { \ @@ -573,7 +585,8 @@ return (EINVAL); } - /* Locate the node type. If we fail we return. Do not try to load + /* + * Locate the node type. If we fail we return. Do not try to load * module. */ if ((type = ng_findtype(typename)) == NULL) @@ -627,6 +640,9 @@ return (ENOMEM); } node->nd_type = type; +#ifdef VIMAGE + node->nd_vnet = curvnet; +#endif NG_NODE_REF(node); /* note reference */ type->refs++; @@ -3073,6 +3089,11 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(ng_qdzone, maxdata); netisr_register(NETISR_NETGRAPH, (netisr_t *)ngintr, NULL, 0); +#ifdef VIMAGE + vnet_mod_register(&vnet_netgraph_modinfo); +#else + vnet_netgraph_iattach(NULL); +#endif /* !VIMAGE */ break; case MOD_UNLOAD: /* You can't unload it because an interface may be using it. 
*/ @@ -3085,6 +3106,44 @@ return (error); } +static int vnet_netgraph_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + +#ifdef VIMAGE + LIST_INIT(&V_ng_nodelist); /* XXX should go away */ +#endif + V_nextID = 1; + + return 0; +} + +#ifdef VIMAGE +static int vnet_netgraph_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + node_p node, last_killed = NULL; + + while ((node = LIST_FIRST(&V_ng_nodelist)) != NULL) { + if (node == last_killed) { + /* This should never happen */ + node->nd_flags |= NGF_REALLY_DIE; + printf("netgraph node %s needs NGF_REALLY_DIE\n", + node->nd_name); + ng_rmnode(node, NULL, NULL, 0); + /* This must never happen */ + if (node == LIST_FIRST(&V_ng_nodelist)) + panic("netgraph node %s won't die", + node->nd_name); + } + ng_rmnode(node, NULL, NULL, 0); + last_killed = node; + } + + return 0; +} +#endif /* VIMAGE */ + static moduledata_t netgraph_mod = { "netgraph", ngb_mod_event, @@ -3246,6 +3305,7 @@ NG_WORKLIST_UNLOCK(); break; } + CURVNET_SET(node->nd_vnet); STAILQ_REMOVE_HEAD(&ng_worklist, nd_input_queue.q_work); NG_WORKLIST_UNLOCK(); CTR3(KTR_NET, "%20s: node [%x] (%p) taken off worklist", @@ -3277,6 +3337,7 @@ } } NG_NODE_UNREF(node); + CURVNET_RESTORE(); } } @@ -3610,7 +3671,9 @@ { item_p item = arg; + CURVNET_SET(NGI_NODE(item)->nd_vnet); ng_snd_item(item, 0); + CURVNET_RESTORE(); } Index: netgraph/ng_bridge.c =========================================================================== --- netgraph/ng_bridge.c 2008/08/25 00:28:58 #7 +++ netgraph/ng_bridge.c 2008/08/25 00:28:58 @@ -96,13 +96,14 @@ /* Per-node private data */ struct ng_bridge_private { struct ng_bridge_bucket *tab; /* hash table bucket array */ - struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS]; + struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS + 1]; struct ng_bridge_config conf; /* node configuration */ node_p node; /* netgraph node */ u_int numHosts; /* num entries in table */ u_int numBuckets; /* num buckets in table */ u_int hashMask; /* 
numBuckets - 1 */ int numLinks; /* num connected links */ + int persistent; /* can exist w/o any hooks */ struct callout timer; /* one second periodic timer */ }; typedef struct ng_bridge_private *priv_p; @@ -343,13 +344,13 @@ ng_bridge_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); + int linkNum = -1; /* Check for a link hook */ if (strncmp(name, NG_BRIDGE_HOOK_LINK_PREFIX, strlen(NG_BRIDGE_HOOK_LINK_PREFIX)) == 0) { const char *cp; char *eptr; - u_long linkNum; cp = name + strlen(NG_BRIDGE_HOOK_LINK_PREFIX); if (!isdigit(*cp) || (cp[0] == '0' && cp[1] != '\0')) @@ -357,6 +358,14 @@ linkNum = strtoul(cp, &eptr, 10); if (*eptr != '\0' || linkNum >= NG_BRIDGE_MAX_LINKS) return (EINVAL); + } else if (strcmp(name, "anchor") == 0) { + linkNum = NG_BRIDGE_MAX_LINKS; + if (priv->persistent) + return (EISCONN); + priv->persistent = 1; + } + + if (linkNum >= 0 ) { if (priv->links[linkNum] != NULL) return (EISCONN); MALLOC(priv->links[linkNum], struct ng_bridge_link *, @@ -364,7 +373,7 @@ if (priv->links[linkNum] == NULL) return (ENOMEM); priv->links[linkNum]->hook = hook; - NG_HOOK_SET_PRIVATE(hook, (void *)linkNum); + NG_HOOK_SET_PRIVATE(hook, (void *)(intptr_t)linkNum); priv->numLinks++; return (0); } @@ -783,7 +792,7 @@ /* Get link number */ linkNum = (intptr_t)NG_HOOK_PRIVATE(hook); - KASSERT(linkNum >= 0 && linkNum < NG_BRIDGE_MAX_LINKS, + KASSERT(linkNum >= 0 && linkNum <= NG_BRIDGE_MAX_LINKS, ("%s: linkNum=%u", __func__, linkNum)); /* Remove all hosts associated with this link */ @@ -797,7 +806,8 @@ /* If no more hooks, go away */ if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) - && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))) { + && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + && !priv->persistent) { ng_rmnode_self(NG_HOOK_NODE(hook)); } return (0); Index: netgraph/ng_eiface.c =========================================================================== --- netgraph/ng_eiface.c 2008/08/25 00:28:58 #7 +++ netgraph/ng_eiface.c 
2008/08/25 00:28:58 @@ -43,7 +43,9 @@ #include #include #include +#include +#include #include #include #include @@ -112,7 +114,15 @@ }; NETGRAPH_INIT(eiface, &typestruct); +static vnet_attach_fn ng_eiface_iattach; +static vnet_detach_fn ng_eiface_idetach; + +#ifndef VIMAGE static struct unrhdr *ng_eiface_unit; +#endif + +VNET_MOD_DECLARE_STATELESS(NG_EIFACE, ng_eiface, ng_eiface_iattach, + ng_eiface_idetach, NETGRAPH) /************************************************************************ INTERFACE STUFF @@ -245,6 +255,14 @@ * Send packet; if hook is not connected, mbuf will get * freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(node->nd_vnet); + NG_SEND_DATA_ONLY(error, priv->ether, m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, priv->ether, m); /* Update stats */ @@ -369,12 +387,10 @@ ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; ifp->if_flags = (IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST); -#if 0 - /* Give this node name */ - bzero(ifname, sizeof(ifname)); - sprintf(ifname, "if%s", ifp->if_xname); - (void)ng_name_node(node, ifname); -#endif + /* Give this node the same name as the interface (if possible) */ + if (ng_name_node(node, ifp->if_xname) != 0) + log(LOG_WARNING, "%s: can't acquire netgraph name\n", + ifp->if_xname); /* Attach the interface */ ether_ifattach(ifp, eaddr); @@ -447,8 +463,6 @@ caddr_t ptr; int buflen; -#define SA_SIZE(s) ((s)->sa_lensa_len) - /* Determine size of response and allocate it */ buflen = 0; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) @@ -534,6 +548,12 @@ /* Update interface stats */ ifp->if_ipackets++; +#ifdef VIMAGE + /* Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + (*ifp->if_input)(ifp, m); /* Done */ @@ -583,10 +603,18 @@ switch (event) { case MOD_LOAD: - V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE + 
vnet_mod_register(&vnet_ng_eiface_modinfo); +#else + ng_eiface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(V_ng_eiface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_eiface_modinfo); +#else + ng_eiface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -594,3 +622,32 @@ } return (error); } + +static int ng_eiface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_eiface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_eiface_unit); + + return 0; +} Index: netgraph/ng_ether.c =========================================================================== --- netgraph/ng_ether.c 2008/08/25 00:28:58 #6 +++ netgraph/ng_ether.c 2008/08/25 00:28:58 @@ -56,6 +56,7 @@ #include #include +#include #include #include #include @@ -71,6 +72,12 @@ #define IFP2NG(ifp) (IFP2AC((ifp))->ac_netgraph) +static vnet_attach_fn ng_ether_iattach; +static vnet_detach_fn ng_ether_idetach; + +VNET_MOD_DECLARE_STATELESS(NG_ETHER, ng_ether, ng_ether_iattach, + ng_ether_idetach, NETGRAPH) + /* Per-node private data */ struct private { struct ifnet *ifp; /* associated interface */ @@ -283,6 +290,17 @@ priv_p priv; node_p node; + /* + * Do not create / attach an ether node to this ifnet if + * a netgraph node with the same name already exists. + * This should prevent ether nodes to be attached to + * eiface nodes in the same vnet, which is pointless. 
+ */ + if ((node = ng_name2noderef(NULL, ifp->if_xname)) != NULL) { + NG_NODE_UNREF(node); + return; + } + /* Create node */ KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__)); if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) { @@ -731,53 +749,25 @@ static int ng_ether_mod_event(module_t mod, int event, void *data) { - struct ifnet *ifp; int error = 0; int s; s = splnet(); switch (event) { case MOD_LOAD: - - /* Register function hooks */ - if (ng_ether_attach_p != NULL) { - error = EEXIST; - break; - } - ng_ether_attach_p = ng_ether_attach; - ng_ether_detach_p = ng_ether_detach; - ng_ether_output_p = ng_ether_output; - ng_ether_input_p = ng_ether_input; - ng_ether_input_orphan_p = ng_ether_input_orphan; - ng_ether_link_state_p = ng_ether_link_state; - - /* Create nodes for any already-existing Ethernet interfaces */ - IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type == IFT_ETHER - || ifp->if_type == IFT_L2VLAN) - ng_ether_attach(ifp); - } - IFNET_RUNLOCK(); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_ether_modinfo); +#else + error = ng_ether_iattach(NULL); +#endif break; case MOD_UNLOAD: - - /* - * Note that the base code won't try to unload us until - * all nodes have been removed, and that can't happen - * until all Ethernet interfaces are removed. In any - * case, we know there are no nodes left if the action - * is MOD_UNLOAD, so there's no need to detach any nodes. 
- */ - - /* Unregister function hooks */ - ng_ether_attach_p = NULL; - ng_ether_detach_p = NULL; - ng_ether_output_p = NULL; - ng_ether_input_p = NULL; - ng_ether_input_orphan_p = NULL; - ng_ether_link_state_p = NULL; +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_ether_modinfo); +#else + ng_ether_idetach(NULL); +#endif break; default: @@ -788,3 +778,62 @@ return (error); } +static int ng_ether_iattach(const void *unused) +{ + INIT_VNET_NET(curvnet); + struct ifnet *ifp; + +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)){ +#endif + /* Register function hooks */ + if (ng_ether_attach_p != NULL) + return(EEXIST); + ng_ether_attach_p = ng_ether_attach; + ng_ether_detach_p = ng_ether_detach; + ng_ether_output_p = ng_ether_output; + ng_ether_input_p = ng_ether_input; + ng_ether_input_orphan_p = ng_ether_input_orphan; + ng_ether_link_state_p = ng_ether_link_state; +#ifdef VIMAGE + } +#endif + + /* Create nodes for any already-existing Ethernet interfaces */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type == IFT_ETHER + || ifp->if_type == IFT_L2VLAN) + ng_ether_attach(ifp); + } + IFNET_RUNLOCK(); + + return 0; +} + +static int ng_ether_idetach(const void *unused) +{ + /* + * Note that the base code won't try to unload us until + * all nodes have been removed, and that can't happen + * until all Ethernet interfaces are removed. In any + * case, we know there are no nodes left if the action + * is MOD_UNLOAD, so there's no need to detach any nodes. 
+ */ + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return(0); +#endif + + /* Unregister function hooks */ + ng_ether_attach_p = NULL; + ng_ether_detach_p = NULL; + ng_ether_output_p = NULL; + ng_ether_input_p = NULL; + ng_ether_input_orphan_p = NULL; + ng_ether_link_state_p = NULL; + + return 0; +} + Index: netgraph/ng_gif.c =========================================================================== --- netgraph/ng_gif.c 2008/08/25 00:28:58 #6 +++ netgraph/ng_gif.c 2008/08/25 00:28:58 @@ -79,6 +79,7 @@ #include #include +#include #include #include #include @@ -561,10 +562,13 @@ /* Create nodes for any already-existing gif interfaces */ IFNET_RLOCK(); + VNET_ITERLOOP_BEGIN_QUIET(); + INIT_VNET_NET(curvnet); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type == IFT_GIF) ng_gif_attach(ifp); } + VNET_ITERLOOP_END(); IFNET_RUNLOCK(); break; Index: netgraph/ng_hub.c =========================================================================== --- netgraph/ng_hub.c 2008/08/25 00:28:58 #1 +++ netgraph/ng_hub.c 2008/08/25 00:28:58 @@ -37,6 +37,7 @@ #include static ng_constructor_t ng_hub_constructor; +static ng_newhook_t ng_hub_newhook; static ng_rcvdata_t ng_hub_rcvdata; static ng_disconnect_t ng_hub_disconnect; @@ -44,6 +45,7 @@ .version = NG_ABI_VERSION, .name = NG_HUB_NODE_TYPE, .constructor = ng_hub_constructor, + .newhook = ng_hub_newhook, .rcvdata = ng_hub_rcvdata, .disconnect = ng_hub_disconnect, }; @@ -57,6 +59,14 @@ return (0); } +static int +ng_hub_newhook(node_p node, hook_p hook, const char *name) +{ + if (strcmp(name, "anchor") == 0) + node->nd_private = (void *) 1; + return (0); +} + static int ng_hub_rcvdata(hook_p hook, item_p item) { @@ -94,7 +104,7 @@ { if (NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0 && - NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + NG_NODE_IS_VALID(NG_HOOK_NODE(hook)) && !hook->hk_node->nd_private) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } Index: netgraph/ng_iface.c 
=========================================================================== --- netgraph/ng_iface.c 2008/08/25 00:28:58 #7 +++ netgraph/ng_iface.c 2008/08/25 00:28:58 @@ -78,6 +78,7 @@ #include +#include #include #include #include @@ -122,6 +123,10 @@ static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct rtentry *rt0); +#ifdef VIMAGE +static void ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, + char *dname); +#endif static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, @@ -208,8 +213,16 @@ }; NETGRAPH_INIT(iface, &typestruct); +static vnet_attach_fn ng_iface_iattach; +static vnet_detach_fn ng_iface_idetach; + +#ifndef VIMAGE static struct unrhdr *ng_iface_unit; +#endif +VNET_MOD_DECLARE_STATELESS(NG_IFACE, ng_iface, ng_iface_iattach, + ng_iface_idetach, NETGRAPH) + /************************************************************************ HELPER STUFF ************************************************************************/ @@ -450,6 +463,14 @@ /* Send packet. If hook is not connected, mbuf will get freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != priv->node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(priv->node->nd_vnet); + NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); /* Update stats. 
*/ @@ -536,6 +557,9 @@ ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; +#ifdef VIMAGE + ifp->if_reassign = ng_iface_reassign; +#endif ifp->if_watchdog = NULL; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); @@ -560,6 +584,24 @@ return (0); } +#ifdef VIMAGE +static void +ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + bpfdetach(ifp); + if_detach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "ser"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + CURVNET_RESTORE(); +} +#endif + /* * Give our ok for a hook to be added */ @@ -722,6 +764,12 @@ ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; +#ifdef VIMAGE + /* Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; @@ -809,10 +857,18 @@ switch (event) { case MOD_LOAD: - V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_iface_modinfo); +#else + ng_iface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(V_ng_iface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_iface_modinfo); +#else + ng_iface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -820,3 +876,32 @@ } return (error); } + +static int ng_iface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_iface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_iface_unit); 
+ + return 0; +} Index: netgraph/ng_pipe.c =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- netgraph/ng_pipe.c Mon Aug 25 00:29:02 2008 *************** *** 0 **** --- 1,1051 ---- + /* + * Copyright (c) 2004-2008 University of Zagreb + * Copyright (c) 2007-2008 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + /* + * This node permits simple traffic shaping by emulating bandwidth + * and delay, as well as random packet losses. + * The node has two hooks, upper and lower. 
Traffic flowing from upper to + * lower hook is referenced as downstream, and vice versa. Parameters for + * both directions can be set separately, except for delay. + */ + + + #include + #include + #include + #include + #include + #include + #include + #include + + #include + + #include + #include + #include + + #include + #include + #include + #include + + static MALLOC_DEFINE(M_NG_PIPE, "ng_pipe", "ng_pipe"); + + struct mtx ng_pipe_giant; + + /* Packet header struct */ + struct ngp_hdr { + TAILQ_ENTRY(ngp_hdr) ngp_link; /* next pkt in queue */ + struct timeval when; /* this packet's due time */ + struct mbuf *m; /* ptr to the packet data */ + }; + TAILQ_HEAD(p_head, ngp_hdr); + + /* FIFO queue struct */ + struct ngp_fifo { + TAILQ_ENTRY(ngp_fifo) fifo_le; /* list of active queues only */ + struct p_head packet_head; /* FIFO queue head */ + u_int32_t hash; /* flow signature */ + struct timeval vtime; /* virtual time, for WFQ */ + u_int32_t rr_deficit; /* for DRR */ + u_int32_t packets; /* # of packets in this queue */ + }; + + /* Per hook info */ + struct hookinfo { + hook_p hook; + int noqueue; /* bypass any processing */ + TAILQ_HEAD(, ngp_fifo) fifo_head; /* FIFO queues */ + TAILQ_HEAD(, ngp_hdr) qout_head; /* delay queue head */ + LIST_ENTRY(hookinfo) active_le; /* active hooks */ + struct timeval qin_utime; + struct ng_pipe_hookcfg cfg; + struct ng_pipe_hookrun run; + struct ng_pipe_hookstat stats; + uint64_t *ber_p; /* loss_p(BER,psize) map */ + }; + + /* Per node info */ + struct node_priv { + u_int64_t delay; + u_int32_t overhead; + u_int32_t header_offset; + struct hookinfo lower; + struct hookinfo upper; + }; + typedef struct node_priv *priv_p; + + /* Macro for calculating the virtual time for packet dequeueing in WFQ */ + #define FIFO_VTIME_SORT(plen) \ + if (hinfo->cfg.wfq && hinfo->cfg.bandwidth) { \ + ngp_f->vtime.tv_usec = now->tv_usec + ((uint64_t) (plen) \ + + priv->overhead ) * hinfo->run.fifo_queues * \ + 8000000 / hinfo->cfg.bandwidth; \ + 
ngp_f->vtime.tv_sec = now->tv_sec + \ + ngp_f->vtime.tv_usec / 1000000; \ + ngp_f->vtime.tv_usec = ngp_f->vtime.tv_usec % 1000000; \ + TAILQ_FOREACH(ngp_f1, &hinfo->fifo_head, fifo_le) \ + if (ngp_f1->vtime.tv_sec > ngp_f->vtime.tv_sec || \ + (ngp_f1->vtime.tv_sec == ngp_f->vtime.tv_sec && \ + ngp_f1->vtime.tv_usec > ngp_f->vtime.tv_usec)) \ + break; \ + if (ngp_f1 == NULL) \ + TAILQ_INSERT_TAIL(&hinfo->fifo_head, ngp_f, fifo_le); \ + else \ + TAILQ_INSERT_BEFORE(ngp_f1, ngp_f, fifo_le); \ + } else \ + TAILQ_INSERT_TAIL(&hinfo->fifo_head, ngp_f, fifo_le); \ + + + static void parse_cfg(struct ng_pipe_hookcfg *, struct ng_pipe_hookcfg *, + struct hookinfo *, priv_p); + static void pipe_dequeue(struct hookinfo *, struct timeval *); + static void pipe_scheduler(void *); + static void pipe_poll(void); + static int ngp_modevent(module_t, int, void *); + + /* linked list of active "pipe" hooks */ + static LIST_HEAD(, hookinfo) active_head; + static int active_gen_id = 0; + + /* timeout handle for pipe_scheduler */ + static struct callout polling_timer; + + /* zone for storing ngp_hdr-s */ + static uma_zone_t ngp_zone; + + /* Netgraph methods */ + static ng_constructor_t ngp_constructor; + static ng_rcvmsg_t ngp_rcvmsg; + static ng_shutdown_t ngp_shutdown; + static ng_newhook_t ngp_newhook; + static ng_rcvdata_t ngp_rcvdata; + static ng_disconnect_t ngp_disconnect; + + /* Parse type for struct ng_pipe_hookstat */ + static const struct ng_parse_struct_field + ng_pipe_hookstat_type_fields[] = NG_PIPE_HOOKSTAT_INFO; + static const struct ng_parse_type ng_pipe_hookstat_type = { + &ng_parse_struct_type, + &ng_pipe_hookstat_type_fields + }; + + /* Parse type for struct ng_pipe_stats */ + static const struct ng_parse_struct_field ng_pipe_stats_type_fields[] = + NG_PIPE_STATS_INFO(&ng_pipe_hookstat_type); + static const struct ng_parse_type ng_pipe_stats_type = { + &ng_parse_struct_type, + &ng_pipe_stats_type_fields + }; + + /* Parse type for struct ng_pipe_hookrun */ + static 
const struct ng_parse_struct_field + ng_pipe_hookrun_type_fields[] = NG_PIPE_HOOKRUN_INFO; + static const struct ng_parse_type ng_pipe_hookrun_type = { + &ng_parse_struct_type, + &ng_pipe_hookrun_type_fields + }; + + /* Parse type for struct ng_pipe_run */ + static const struct ng_parse_struct_field + ng_pipe_run_type_fields[] = NG_PIPE_RUN_INFO(&ng_pipe_hookrun_type); + static const struct ng_parse_type ng_pipe_run_type = { + &ng_parse_struct_type, + &ng_pipe_run_type_fields + }; + + /* Parse type for struct ng_pipe_hookcfg */ + static const struct ng_parse_struct_field + ng_pipe_hookcfg_type_fields[] = NG_PIPE_HOOKCFG_INFO; + static const struct ng_parse_type ng_pipe_hookcfg_type = { + &ng_parse_struct_type, + &ng_pipe_hookcfg_type_fields + }; + + /* Parse type for struct ng_pipe_cfg */ + static const struct ng_parse_struct_field + ng_pipe_cfg_type_fields[] = NG_PIPE_CFG_INFO(&ng_pipe_hookcfg_type); + static const struct ng_parse_type ng_pipe_cfg_type = { + &ng_parse_struct_type, + &ng_pipe_cfg_type_fields + }; + + /* List of commands and how to convert arguments to/from ASCII */ + static const struct ng_cmdlist ngp_cmds[] = { + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_STATS, + .name = "getstats", + .respType = &ng_pipe_stats_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_CLR_STATS, + .name = "clrstats" + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GETCLR_STATS, + .name = "getclrstats", + .respType = &ng_pipe_stats_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_RUN, + .name = "getrun", + .respType = &ng_pipe_run_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_CFG, + .name = "getcfg", + .respType = &ng_pipe_cfg_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_SET_CFG, + .name = "setcfg", + .mesgType = &ng_pipe_cfg_type, + }, + { 0 } + }; + + /* Netgraph type descriptor */ + static struct ng_type ng_pipe_typestruct = { + .version = NG_ABI_VERSION, + .name = NG_PIPE_NODE_TYPE, + 
.mod_event = ngp_modevent, + .constructor = ngp_constructor, + .shutdown = ngp_shutdown, + .rcvmsg = ngp_rcvmsg, + .newhook = ngp_newhook, + .rcvdata = ngp_rcvdata, + .disconnect = ngp_disconnect, + .cmdlist = ngp_cmds + }; + NETGRAPH_INIT(pipe, &ng_pipe_typestruct); + + /* Node constructor */ + static int + ngp_constructor(node_p node) + { + priv_p priv; + + MALLOC(priv, priv_p, sizeof(*priv), M_NG_PIPE, M_ZERO | M_NOWAIT); + if (priv == NULL) + return (ENOMEM); + NG_NODE_SET_PRIVATE(node, priv); + + return (0); + } + + /* Add a hook */ + static int + ngp_newhook(node_p node, hook_p hook, const char *name) + { + const priv_p priv = NG_NODE_PRIVATE(node); + struct hookinfo *hinfo; + + if (strcmp(name, NG_PIPE_HOOK_UPPER) == 0) { + bzero(&priv->upper, sizeof(priv->upper)); + priv->upper.hook = hook; + NG_HOOK_SET_PRIVATE(hook, &priv->upper); + } else if (strcmp(name, NG_PIPE_HOOK_LOWER) == 0) { + bzero(&priv->lower, sizeof(priv->lower)); + priv->lower.hook = hook; + NG_HOOK_SET_PRIVATE(hook, &priv->lower); + } else + return (EINVAL); + + /* Load non-zero initial cfg values */ + hinfo = NG_HOOK_PRIVATE(hook); + hinfo->cfg.qin_size_limit = 50; + hinfo->cfg.fifo = 1; + hinfo->cfg.droptail = 1; + TAILQ_INIT(&hinfo->fifo_head); + TAILQ_INIT(&hinfo->qout_head); + return (0); + } + + /* Receive a control message */ + static int + ngp_rcvmsg(node_p node, item_p item, hook_p lasthook) + { + const priv_p priv = NG_NODE_PRIVATE(node); + struct ng_mesg *resp = NULL; + struct ng_mesg *msg; + struct ng_pipe_stats *stats; + struct ng_pipe_run *run; + struct ng_pipe_cfg *cfg; + int error = 0; + + mtx_lock(&ng_pipe_giant); + + NGI_GET_MSG(item, msg); + switch (msg->header.typecookie) { + case NGM_PIPE_COOKIE: + switch (msg->header.cmd) { + case NGM_PIPE_GET_STATS: + case NGM_PIPE_CLR_STATS: + case NGM_PIPE_GETCLR_STATS: + if (msg->header.cmd != NGM_PIPE_CLR_STATS) { + NG_MKRESPONSE(resp, msg, + sizeof(*stats), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + stats = 
(struct ng_pipe_stats *)resp->data; + bcopy(&priv->upper.stats, &stats->downstream, + sizeof(stats->downstream)); + bcopy(&priv->lower.stats, &stats->upstream, + sizeof(stats->upstream)); + } + if (msg->header.cmd != NGM_PIPE_GET_STATS) { + bzero(&priv->upper.stats, + sizeof(priv->upper.stats)); + bzero(&priv->lower.stats, + sizeof(priv->lower.stats)); + } + break; + case NGM_PIPE_GET_RUN: + NG_MKRESPONSE(resp, msg, sizeof(*run), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + run = (struct ng_pipe_run *)resp->data; + bcopy(&priv->upper.run, &run->downstream, + sizeof(run->downstream)); + bcopy(&priv->lower.run, &run->upstream, + sizeof(run->upstream)); + break; + case NGM_PIPE_GET_CFG: + NG_MKRESPONSE(resp, msg, sizeof(*cfg), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + cfg = (struct ng_pipe_cfg *)resp->data; + bcopy(&priv->upper.cfg, &cfg->downstream, + sizeof(cfg->downstream)); + bcopy(&priv->lower.cfg, &cfg->upstream, + sizeof(cfg->upstream)); + cfg->delay = priv->delay; + cfg->overhead = priv->overhead; + cfg->header_offset = priv->header_offset; + if (cfg->upstream.bandwidth == + cfg->downstream.bandwidth) { + cfg->bandwidth = cfg->upstream.bandwidth; + cfg->upstream.bandwidth = 0; + cfg->downstream.bandwidth = 0; + } else + cfg->bandwidth = 0; + break; + case NGM_PIPE_SET_CFG: + cfg = (struct ng_pipe_cfg *)msg->data; + if (msg->header.arglen != sizeof(*cfg)) { + error = EINVAL; + break; + } + + if (cfg->delay == -1) + priv->delay = 0; + else if (cfg->delay > 0 && cfg->delay < 10000000) + priv->delay = cfg->delay; + + if (cfg->bandwidth == -1) { + priv->upper.cfg.bandwidth = 0; + priv->lower.cfg.bandwidth = 0; + priv->overhead = 0; + } else if (cfg->bandwidth >= 100 && + cfg->bandwidth <= 1000000000) { + priv->upper.cfg.bandwidth = cfg->bandwidth; + priv->lower.cfg.bandwidth = cfg->bandwidth; + if (cfg->bandwidth >= 10000000) + priv->overhead = 8+4+12; /* Ethernet */ + else + priv->overhead = 10; /* HDLC */ + } + + if 
(cfg->overhead == -1) + priv->overhead = 0; + else if (cfg->overhead > 0 && cfg->overhead < 256) + priv->overhead = cfg->overhead; + + if (cfg->header_offset == -1) + priv->header_offset = 0; + else if (cfg->header_offset > 0 && + cfg->header_offset < 64) + priv->header_offset = cfg->header_offset; + + parse_cfg(&priv->upper.cfg, &cfg->downstream, + &priv->upper, priv); + parse_cfg(&priv->lower.cfg, &cfg->upstream, + &priv->lower, priv); + break; + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } + NG_RESPOND_MSG(error, node, item, resp); + NG_FREE_MSG(msg); + + mtx_unlock(&ng_pipe_giant); + + return (error); + } +
+ /*
+  * Merge a requested per-hook configuration ("new") into the active one
+  * ("current") and recompute whether the hook can bypass queueing.
+  *
+  * For the numeric fields a value of -1 resets the setting to 0, a value
+  * inside the accepted range replaces it, and anything else leaves the
+  * current setting untouched:
+  *   ber              1 .. 10^12 (also (re)builds the hinfo->ber_p table)
+  *   qin_size_limit   >= 5
+  *   qout_size_limit  >= 5
+  *   duplicate        1 .. 50
+  *   bandwidth        100 .. 10^9 (-1 additionally forces FIFO mode)
+  *
+  * fifo / wfq / drr select the queueing discipline and are applied in
+  * that order, so the last one set in "new" wins; droptail / drophead
+  * are handled the same way.
+  *
+  * hinfo->noqueue ends up 1 only when bandwidth, priv->delay, duplicate
+  * and ber are all zero.
+  */
+ static void
+ parse_cfg(struct ng_pipe_hookcfg *current, struct ng_pipe_hookcfg *new,
+ 	struct hookinfo *hinfo, priv_p priv)
+ {
+
+ 	if (new->ber == -1) {
+ 		current->ber = 0;
+ 		if (hinfo->ber_p) {
+ 			FREE(hinfo->ber_p, M_NG_PIPE);
+ 			hinfo->ber_p = NULL;
+ 		}
+ 	} else if (new->ber >= 1 && new->ber <= 1000000000000) {
+ 		static const uint64_t one = 0x1000000000000; /* = 2^48 */
+ 		uint64_t p0, p;
+ 		uint32_t fsize, i;
+
+ 		if (hinfo->ber_p == NULL)
+ 			MALLOC(hinfo->ber_p, uint64_t *,
+ 			    (MAX_FSIZE + MAX_OHSIZE)*sizeof(uint64_t),
+ 			    M_NG_PIPE, M_NOWAIT);
+
+ 		/*
+ 		 * The M_NOWAIT allocation above may fail.  Only enable BER
+ 		 * and fill in the table when the table actually exists;
+ 		 * otherwise leave the current BER setting as it was rather
+ 		 * than writing through a NULL pointer.
+ 		 */
+ 		if (hinfo->ber_p != NULL) {
+ 			current->ber = new->ber;
+
+ 			/*
+ 			 * For given BER and each frame size N (in bytes)
+ 			 * calculate the probability P_OK that the frame is
+ 			 * clean:
+ 			 *
+ 			 * P_OK(BER,N) = (1 - 1/BER)^(N*8)
+ 			 *
+ 			 * We use a 64-bit fixed-point format with decimal
+ 			 * point positioned between bits 47 and 48.
+ 			 */
+ 			p0 = one - one / new->ber;
+ 			p = one;
+ 			for (fsize = 0; fsize < MAX_FSIZE + MAX_OHSIZE;
+ 			    fsize++) {
+ 				hinfo->ber_p[fsize] = p;
+ 				/*
+ 				 * One multiplication by p0 per bit, done in
+ 				 * three 16-bit slices of p0.
+ 				 */
+ 				for (i=0; i<8; i++)
+ 					p = (p*(p0&0xffff)>>48) +
+ 					    (p*((p0>>16)&0xffff)>>32) +
+ 					    (p*(p0>>32)>>16);
+ 			}
+ 		}
+ 	}
+
+ 	if (new->qin_size_limit == -1)
+ 		current->qin_size_limit = 0;
+ 	else if (new->qin_size_limit >= 5)
+ 		current->qin_size_limit = new->qin_size_limit;
+
+ 	if (new->qout_size_limit == -1)
+ 		current->qout_size_limit = 0;
+ 	else if (new->qout_size_limit >= 5)
+ 		current->qout_size_limit = new->qout_size_limit;
+
+ 	if (new->duplicate == -1)
+ 		current->duplicate = 0;
+ 	else if (new->duplicate > 0 && new->duplicate <= 50)
+ 		current->duplicate = new->duplicate;
+
+ 	if (new->fifo) {
+ 		current->fifo = 1;
+ 		current->wfq = 0;
+ 		current->drr = 0;
+ 	}
+
+ 	if (new->wfq) {
+ 		current->fifo = 0;
+ 		current->wfq = 1;
+ 		current->drr = 0;
+ 	}
+
+ 	if (new->drr) {
+ 		current->fifo = 0;
+ 		current->wfq = 0;
+ 		/* DRR quantum */
+ 		if (new->drr >= 32)
+ 			current->drr = new->drr;
+ 		else
+ 			current->drr = 2048;	/* default quantum */
+ 	}
+
+ 	if (new->droptail) {
+ 		current->droptail = 1;
+ 		current->drophead = 0;
+ 	}
+
+ 	if (new->drophead) {
+ 		current->droptail = 0;
+ 		current->drophead = 1;
+ 	}
+
+ 	if (new->bandwidth == -1) {
+ 		current->bandwidth = 0;
+ 		current->fifo = 1;
+ 		current->wfq = 0;
+ 		current->drr = 0;
+ 	} else if (new->bandwidth >= 100 && new->bandwidth <= 1000000000)
+ 		current->bandwidth = new->bandwidth;
+
+ 	/* With nothing left to emulate, frames can skip the queues. */
+ 	if (current->bandwidth | priv->delay |
+ 	    current->duplicate | current->ber)
+ 		hinfo->noqueue = 0;
+ 	else
+ 		hinfo->noqueue = 1;
+ }
+
+ /*
+  * Compute a hash signature for a packet. This function suffers from the
+  * NIH syndrome, so probably it would be wise to look around what other
+  * folks have found out to be a good and efficient IP hash function...
+  */
+ static int
+ ip_hash(struct mbuf *m, int offset)
+ {
+ 	u_int64_t i;
+ 	struct ip *ip = (struct ip *)(mtod(m, u_char *) + offset);
+
+ 	/*
+ 	 * Only an IPv4 header without options that lies entirely in the
+ 	 * first mbuf is hashed; everything else gets signature 0.
+ 	 */
+ 	if (m->m_len < sizeof(struct ip) + offset ||
+ 	    ip->ip_v != 4 || ip->ip_hl << 2 != sizeof(struct ip))
+ 		return 0;
+
+ 	/* Mix shifted copies of both addresses, then fold 64 -> 32 bits. */
+ 	i = ((u_int64_t) ip->ip_src.s_addr ^
+ 	    ((u_int64_t) ip->ip_src.s_addr << 13) ^
+ 	    ((u_int64_t) ip->ip_dst.s_addr << 7) ^
+ 	    ((u_int64_t) ip->ip_dst.s_addr << 19));
+ 	return (i ^ (i >> 32));
+ }
+
+ /*
+  * Receive data on a hook - both in upstream and downstream direction.
+  * We put the frame on the inbound queue, and try to initiate dequeuing
+  * sequence immediately. If inbound queue is full, discard one frame
+  * depending on dropping policy (from the head or from the tail of the
+  * queue).
+  *
+  * When the hook is in "noqueue" mode the item is forwarded straight to
+  * the opposite hook and the result of that forward is returned.  On the
+  * queueing path the function always returns 0.
+  *
+  * NOTE(review): the M_NOWAIT allocations below are guarded only by
+  * KASSERT(), which is presumably compiled out of kernels built without
+  * INVARIANTS - confirm, and consider adding a real drop path for
+  * allocation failure.
+  */
+ static int
+ ngp_rcvdata(hook_p hook, item_p item)
+ {
+ 	struct hookinfo *const hinfo = NG_HOOK_PRIVATE(hook);
+ 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
+ 	struct timeval uuptime;
+ 	struct timeval *now = &uuptime;
+ 	struct ngp_fifo *ngp_f = NULL, *ngp_f1;
+ 	struct ngp_hdr *ngp_h = NULL;
+ 	struct mbuf *m;
+ 	int hash;
+ 	int error = 0;
+
+ 	/* Nothing to emulate: pass the item to the other hook untouched. */
+ 	if (hinfo->noqueue) {
+ 		struct hookinfo *dest;
+ 		if (hinfo == &priv->lower)
+ 			dest = &priv->upper;
+ 		else
+ 			dest = &priv->lower;
+ 		NG_FWD_ITEM_HOOK(error, item, dest->hook);
+ 		return error;
+ 	}
+
+ 	mtx_lock(&ng_pipe_giant);
+ 	microuptime(now);
+
+ 	/*
+ 	 * Attach us to the list of active ng_pipes if this was an empty
+ 	 * one before, and also update the queue service deadline time.
+ 	 */
+ 	if (hinfo->run.qin_frames == 0) {
+ 		struct timeval *when = &hinfo->qin_utime;
+ 		if (when->tv_sec < now->tv_sec || (when->tv_sec == now->tv_sec
+ 		    && when->tv_usec < now->tv_usec)) {
+ 			when->tv_sec = now->tv_sec;
+ 			when->tv_usec = now->tv_usec;
+ 		}
+ 		if (hinfo->run.qout_frames == 0)
+ 			LIST_INSERT_HEAD(&active_head, hinfo, active_le);
+ 	}
+
+ 	/* Populate the packet header */
+ 	ngp_h = uma_zalloc(ngp_zone, M_NOWAIT);
+ 	KASSERT((ngp_h != NULL), ("ngp_h zalloc failed (1)"));
+ 	NGI_GET_M(item, m);
+ 	KASSERT(m != NULL, ("NGI_GET_M failed"));
+ 	ngp_h->m = m;
+ 	NG_FREE_ITEM(item);
+
+ 	if (hinfo->cfg.fifo)
+ 		hash = 0;	/* all packets go into a single FIFO queue */
+ 	else
+ 		hash = ip_hash(m, priv->header_offset);
+
+ 	/* Find the appropriate FIFO queue for the packet and enqueue it*/
+ 	TAILQ_FOREACH(ngp_f, &hinfo->fifo_head, fifo_le)
+ 		if (hash == ngp_f->hash)
+ 			break;
+ 	if (ngp_f == NULL) {
+ 		/* First packet of this flow: create its FIFO queue. */
+ 		ngp_f = uma_zalloc(ngp_zone, M_NOWAIT);
+ 		/* Check the pointer that was just allocated (ngp_f). */
+ 		KASSERT(ngp_f != NULL, ("ngp_f zalloc failed (2)"));
+ 		TAILQ_INIT(&ngp_f->packet_head);
+ 		ngp_f->hash = hash;
+ 		ngp_f->packets = 1;
+ 		ngp_f->rr_deficit = hinfo->cfg.drr;	/* DRR quantum */
+ 		hinfo->run.fifo_queues++;
+ 		TAILQ_INSERT_TAIL(&ngp_f->packet_head, ngp_h, ngp_link);
+ 		FIFO_VTIME_SORT(m->m_pkthdr.len);
+ 	} else {
+ 		TAILQ_INSERT_TAIL(&ngp_f->packet_head, ngp_h, ngp_link);
+ 		ngp_f->packets++;
+ 	}
+ 	hinfo->run.qin_frames++;
+ 	hinfo->run.qin_octets += m->m_pkthdr.len;
+
+ 	/* Discard a frame if inbound queue limit has been reached */
+ 	if (hinfo->run.qin_frames > hinfo->cfg.qin_size_limit) {
+ 		struct mbuf *m1;
+ 		int longest = 0;
+
+ 		/* Find the longest queue */
+ 		TAILQ_FOREACH(ngp_f1, &hinfo->fifo_head, fifo_le)
+ 			if (ngp_f1->packets > longest) {
+ 				longest = ngp_f1->packets;
+ 				ngp_f = ngp_f1;
+ 			}
+
+ 		/* Drop a frame from the queue head/tail, depending on cfg */
+ 		if (hinfo->cfg.drophead)
+ 			ngp_h = TAILQ_FIRST(&ngp_f->packet_head);
+ 		else
+ 			ngp_h = TAILQ_LAST(&ngp_f->packet_head, p_head);
+ 		TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+ 		m1 = ngp_h->m;
+ 		uma_zfree(ngp_zone, ngp_h);
+ 		hinfo->run.qin_octets -= m1->m_pkthdr.len;
+ 		hinfo->stats.in_disc_octets += m1->m_pkthdr.len;
+ 		m_freem(m1);
+ 		/* Release the FIFO itself once its last packet is gone. */
+ 		if (--(ngp_f->packets) == 0) {
+ 			TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+ 			uma_zfree(ngp_zone, ngp_f);
+ 			hinfo->run.fifo_queues--;
+ 		}
+ 		hinfo->run.qin_frames--;
+ 		hinfo->stats.in_disc_frames++;
+ 	}
+
+ 	/*
+ 	 * Try to start the dequeuing process immediately.  We must
+ 	 * hold the ng_pipe_giant lock here and pipe_dequeue() will
+ 	 * release it
+ 	 */
+ 	pipe_dequeue(hinfo, now);
+
+ 	return (0);
+ }
+
+
+ /*
+  * Dequeueing sequence - we basically do the following:
+  *  1) Try to extract the frame from the inbound (bandwidth) queue;
+  *  2) In accordance to BER specified, discard the frame randomly;
+  *  3) If the frame survives BER, prepend it with delay info and move it
+  *     to outbound (delay) queue;
+  *  4) Loop to 2) until bandwidth quota for this timeslice is reached, or
+  *     inbound queue is flushed completely;
+  *  5) Extract the first frame from the outbound queue, if it's time has
+  *     come.
Queue the frame for transmission on the outbound hook;
+  * 6) Loop to 5) until outbound queue is flushed completely, or the next
+  *    frame in the queue is not scheduled to be dequeued yet;
+  * 7) Transmit all frames queued in 5)
+  *
+  * hinfo is the hook whose inbound and delay queues are serviced; now is
+  * the timestamp that queue and packet due-times are compared against.
+  *
+  * Note: the caller must hold the ng_pipe_giant lock; this function
+  * returns with the lock released.
+  */
+ static void
+ pipe_dequeue(struct hookinfo *hinfo, struct timeval *now) {
+ 	static uint64_t rand, oldrand;
+ 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hinfo->hook));
+ 	struct hookinfo *dest;
+ 	struct ngp_fifo *ngp_f, *ngp_f1;
+ 	struct ngp_hdr *ngp_h;
+ 	struct timeval *when;
+ 	struct mbuf *q_head = NULL;
+ 	struct mbuf *q_tail = NULL;
+ 	struct mbuf *m;
+ 	int error = 0;
+ 
+ 	/* Which one is the destination hook? */
+ 	if (hinfo == &priv->lower)
+ 		dest = &priv->upper;
+ 	else
+ 		dest = &priv->lower;
+ 
+ 	/* Bandwidth queue processing */
+ 	while ((ngp_f = TAILQ_FIRST(&hinfo->fifo_head))) {
+ 		when = &hinfo->qin_utime;
+ 		if (when->tv_sec > now->tv_sec || (when->tv_sec == now->tv_sec
+ 		    && when->tv_usec > now->tv_usec))
+ 			break;
+ 
+ 		ngp_h = TAILQ_FIRST(&ngp_f->packet_head);
+ 		m = ngp_h->m;
+ 
+ 		/* Deficit Round Robin (DRR) processing */
+ 		if (hinfo->cfg.drr) {
+ 			if (ngp_f->rr_deficit >= m->m_pkthdr.len) {
+ 				ngp_f->rr_deficit -= m->m_pkthdr.len;
+ 			} else {
+ 				ngp_f->rr_deficit += hinfo->cfg.drr;
+ 				TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+ 				TAILQ_INSERT_TAIL(&hinfo->fifo_head,
+ 				    ngp_f, fifo_le);
+ 				continue;
+ 			}
+ 		}
+ 
+ 		/*
+ 		 * Either create a duplicate and pass it on, or dequeue
+ 		 * the original packet...
+ 		 *
+ 		 * When duplicating, the original stays at the head of its
+ 		 * FIFO, so from here on "m" must refer to the copy: the
+ 		 * discard paths below m_freem(m), and freeing the original
+ 		 * would leave a dangling mbuf in the queue (and leak the
+ 		 * copy).
+ 		 */
+ 		if (hinfo->cfg.duplicate &&
+ 		    random() % 100 <= hinfo->cfg.duplicate) {
+ 			ngp_h = uma_zalloc(ngp_zone, M_NOWAIT);
+ 			KASSERT(ngp_h != NULL, ("ngp_h zalloc failed (3)"));
+ 			m = m_dup(m, M_NOWAIT);
+ 			KASSERT(m != NULL, ("m_dup failed"));
+ 			ngp_h->m = m;
+ 		} else {
+ 			TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+ 			hinfo->run.qin_frames--;
+ 			hinfo->run.qin_octets -= m->m_pkthdr.len;
+ 			ngp_f->packets--;
+ 		}
+ 
+ 		/* Calculate the serialization delay */
+ 		if (hinfo->cfg.bandwidth) {
+ 			hinfo->qin_utime.tv_usec += ((uint64_t) m->m_pkthdr.len
+ 			    + priv->overhead ) *
+ 			    8000000 / hinfo->cfg.bandwidth;
+ 			hinfo->qin_utime.tv_sec +=
+ 			    hinfo->qin_utime.tv_usec / 1000000;
+ 			hinfo->qin_utime.tv_usec =
+ 			    hinfo->qin_utime.tv_usec % 1000000;
+ 		}
+ 		when = &ngp_h->when;
+ 		when->tv_sec = hinfo->qin_utime.tv_sec;
+ 		when->tv_usec = hinfo->qin_utime.tv_usec;
+ 
+ 		/* Sort / rearrange inbound queues */
+ 		if (ngp_f->packets) {
+ 			if (hinfo->cfg.wfq) {
+ 				TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+ 				FIFO_VTIME_SORT(TAILQ_FIRST(
+ 				    &ngp_f->packet_head)->m->m_pkthdr.len)
+ 			}
+ 		} else {
+ 			TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+ 			uma_zfree(ngp_zone, ngp_f);
+ 			hinfo->run.fifo_queues--;
+ 		}
+ 
+ 		/*
+ 		 * Randomly discard the frame, according to BER setting.
+ 		 * The previous and the fresh random value are combined;
+ 		 * the update of "rand" is done in separate statements so
+ 		 * that it is sequenced before the value is used.
+ 		 */
+ 		if (hinfo->cfg.ber) {
+ 			oldrand = rand;
+ 			rand = random();
+ 			if ((oldrand ^ rand << 17) >=
+ 			    hinfo->ber_p[priv->overhead + m->m_pkthdr.len]) {
+ 				hinfo->stats.out_disc_frames++;
+ 				hinfo->stats.out_disc_octets += m->m_pkthdr.len;
+ 				uma_zfree(ngp_zone, ngp_h);
+ 				m_freem(m);
+ 				continue;
+ 			}
+ 		}
+ 
+ 		/* Discard frame if outbound queue size limit exceeded */
+ 		if (hinfo->cfg.qout_size_limit &&
+ 		    hinfo->run.qout_frames>=hinfo->cfg.qout_size_limit) {
+ 			hinfo->stats.out_disc_frames++;
+ 			hinfo->stats.out_disc_octets += m->m_pkthdr.len;
+ 			uma_zfree(ngp_zone, ngp_h);
+ 			m_freem(m);
+ 			continue;
+ 		}
+ 
+ 		/* Calculate the propagation delay */
+ 		when->tv_usec += priv->delay;
+ 		when->tv_sec += when->tv_usec / 1000000;
+ 		when->tv_usec = when->tv_usec % 1000000;
+ 
+ 		/* Put the frame into the delay queue */
+ 		TAILQ_INSERT_TAIL(&hinfo->qout_head, ngp_h, ngp_link);
+ 		hinfo->run.qout_frames++;
+ 		hinfo->run.qout_octets += m->m_pkthdr.len;
+ 	}
+ 
+ 	/* Delay queue processing */
+ 	while ((ngp_h = TAILQ_FIRST(&hinfo->qout_head))) {
+ 		struct mbuf *m = ngp_h->m;
+ 
+ 		when = &ngp_h->when;
+ 		if (when->tv_sec > now->tv_sec ||
+ 		    (when->tv_sec == now->tv_sec &&
+ 		    when->tv_usec > now->tv_usec))
+ 			break;
+ 
+ 		/* Update outbound queue stats */
+ 		hinfo->stats.fwd_frames++;
+ 		hinfo->stats.fwd_octets += m->m_pkthdr.len;
+ 		hinfo->run.qout_frames--;
+ 		hinfo->run.qout_octets -= m->m_pkthdr.len;
+ 
+ 		/* Dequeue the packet from qout */
+ 		TAILQ_REMOVE(&hinfo->qout_head, ngp_h, ngp_link);
+ 		uma_zfree(ngp_zone, ngp_h);
+ 
+ 		/* Enqueue locally for sending downstream */
+ 		if (q_head == NULL)
+ 			q_head = m;
+ 		if (q_tail)
+ 			q_tail->m_nextpkt = m;
+ 		q_tail = m;
+ 		m->m_nextpkt = NULL;
+ 	}
+ 
+ 	/* If both queues are empty detach us from the list of active queues */
+ 	if (hinfo->run.qin_frames + hinfo->run.qout_frames == 0) {
+ 		LIST_REMOVE(hinfo, active_le);
+ 		active_gen_id++;
+ 	}
+ 
+ 	mtx_unlock(&ng_pipe_giant);
+ 
+ 	/* Send the locally collected chain only after the lock is dropped */
+ 	while ((m = q_head) != NULL) {
+ 		q_head = m->m_nextpkt;
+ 		m->m_nextpkt = NULL;
+ 		NG_SEND_DATA(error, dest->hook, m, meta);
+ 	}
+ }
+ 
+ 
+ /*
+  * This routine is called on every clock tick. We poll all nodes/hooks
+  * for queued frames by calling pipe_dequeue().
+  */
+ static void
+ pipe_scheduler(void *arg)
+ {
+ 	pipe_poll();
+ 
+ 	/* Reschedule */
+ 	callout_reset(&polling_timer, 1, &pipe_scheduler, NULL);
+ }
+ 
+ 
+ /*
+  * Traverse the list of all active hooks and attempt to dequeue
+  * some packets. Hooks with empty queues are not traversed since
+  * they are not linked into this list.
+  */
+ static void
+ pipe_poll(void)
+ {
+ 	struct hookinfo *hinfo;
+ 	struct timeval now;
+ 	int old_gen_id;
+ 
+ 	mtx_lock(&ng_pipe_giant);
+ 	/* Sample the generation counter only once the list is locked */
+ 	old_gen_id = active_gen_id;
+ 	microuptime(&now);
+ 
+ 	/*
+ 	 * pipe_dequeue() returns with ng_pipe_giant released, so the lock
+ 	 * is re-taken after every call.  An explicit loop is used instead
+ 	 * of LIST_FOREACH: when the list changed we restart from the first
+ 	 * entry, and the iterator step of LIST_FOREACH would skip it.
+ 	 */
+ 	hinfo = LIST_FIRST(&active_head);
+ 	while (hinfo != NULL) {
+ 		CURVNET_SET(NG_HOOK_NODE(hinfo->hook)->nd_vnet);
+ 		pipe_dequeue(hinfo, &now);
+ 		CURVNET_RESTORE();
+ 		mtx_lock(&ng_pipe_giant);
+ 		if (old_gen_id != active_gen_id) {
+ 			/* the list was updated; restart traversing */
+ 			old_gen_id = active_gen_id;
+ 			hinfo = LIST_FIRST(&active_head);
+ 		} else
+ 			hinfo = LIST_NEXT(hinfo, active_le);
+ 	}
+ 	mtx_unlock(&ng_pipe_giant);
+ }
+ 
+ 
+ /*
+  * Shutdown processing
+  *
+  * This is tricky. If we have both a lower and upper hook, then we
+  * probably want to extricate ourselves and leave the two peers
+  * still linked to each other. Otherwise we should just shut down as
+  * a normal node would.
+  */
+ static int
+ ngp_shutdown(node_p node)
+ {
+ 	const priv_p priv = NG_NODE_PRIVATE(node);
+ 
+ 	if (priv->lower.hook && priv->upper.hook)
+ 		ng_bypass(priv->lower.hook, priv->upper.hook);
+ 	else {
+ 		if (priv->upper.hook != NULL)
+ 			ng_rmhook_self(priv->upper.hook);
+ 		if (priv->lower.hook != NULL)
+ 			ng_rmhook_self(priv->lower.hook);
+ 	}
+ 	NG_NODE_UNREF(node);
+ 	FREE(priv, M_NG_PIPE);
+ 	return (0);
+ }
+ 
+ 
+ /*
+  * Hook disconnection
+  *
+  * Frees every packet still held in the hook's FIFO queues and in its
+  * delay queue, unlinks the hook from the list of active queues and
+  * releases its BER table.  Always returns 0.
+  */
+ static int
+ ngp_disconnect(hook_p hook)
+ {
+ 	struct hookinfo *const hinfo = NG_HOOK_PRIVATE(hook);
+ 	struct ngp_fifo *ngp_f;
+ 	struct ngp_hdr *ngp_h;
+ 	int removed = 0;
+ 
+ 	mtx_lock(&ng_pipe_giant);
+ 
+ 	KASSERT(hinfo != NULL, ("%s: null info", __FUNCTION__));
+ 	hinfo->hook = NULL;
+ 
+ 	/* Flush all fifo queues associated with the hook */
+ 	while ((ngp_f = TAILQ_FIRST(&hinfo->fifo_head))) {
+ 		while ((ngp_h = TAILQ_FIRST(&ngp_f->packet_head))) {
+ 			TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+ 			m_freem(ngp_h->m);
+ 			uma_zfree(ngp_zone, ngp_h);
+ 			removed++;
+ 		}
+ 		TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+ 		uma_zfree(ngp_zone, ngp_f);
+ 	}
+ 
+ 	/* Flush the delay queue */
+ 	while ((ngp_h = TAILQ_FIRST(&hinfo->qout_head))) {
+ 		TAILQ_REMOVE(&hinfo->qout_head, ngp_h, ngp_link);
+ 		m_freem(ngp_h->m);
+ 		uma_zfree(ngp_zone, ngp_h);
+ 		removed++;
+ 	}
+ 
+ 	/*
+ 	 * Both queues should be empty by now, so detach us from
+ 	 * the list of active queues
+ 	 */
+ 	if (removed) {
+ 		LIST_REMOVE(hinfo, active_le);
+ 		active_gen_id++;
+ 	}
+ 	/* The frame counters are unsigned; terminate the diagnostic line */
+ 	if (hinfo->run.qin_frames + hinfo->run.qout_frames != removed)
+ 		printf("Mismatch: queued=%u but removed=%d !?!\n",
+ 		    hinfo->run.qin_frames + hinfo->run.qout_frames, removed);
+ 
+ 	/* Release the packet loss probability table (BER) */
+ 	if (hinfo->ber_p)
+ 		FREE(hinfo->ber_p, M_NG_PIPE);
+ 
+ 	mtx_unlock(&ng_pipe_giant);
+ 
+ 	return (0);
+ }
+ 
+ /*
+  * Module event handler: on load create the descriptor zone, the global
+  * mutex, the active list and the polling callout; on unload tear them
+  * down again.  Any other event yields EOPNOTSUPP.
+  */
+ static int
+ ngp_modevent(module_t mod, int type, void *unused)
+ {
+ 	int error = 0;
+ 
+ 	switch (type) {
+ 	case MOD_LOAD:
+ 		/* One zone serves both descriptor kinds, sized for the larger */
+ 		ngp_zone = uma_zcreate("ng_pipe", max(sizeof(struct ngp_hdr),
+ 		    sizeof (struct ngp_fifo)), NULL, NULL, NULL, NULL,
+ 		    UMA_ALIGN_PTR, 0);
+ 		if (ngp_zone == NULL)
+ 			panic("ng_pipe: couldn't allocate descriptor zone");
+ 
+ 		mtx_init(&ng_pipe_giant, "ng_pipe_giant", NULL, MTX_DEF);
+ 		LIST_INIT(&active_head);
+ 		callout_init(&polling_timer, CALLOUT_MPSAFE);
+ 		callout_reset(&polling_timer, 1, &pipe_scheduler, NULL);
+ 		break;
+ 	case MOD_UNLOAD:
+ 		callout_drain(&polling_timer);
+ 		uma_zdestroy(ngp_zone);
+ 		mtx_destroy(&ng_pipe_giant);
+ 		break;
+ 	default:
+ 		error = EOPNOTSUPP;
+ 		break;
+ 	}
+ 
+ 	return (error);
+ }
Index: netgraph/ng_pipe.h
===========================================================================
*** /dev/null Mon Aug 25 00:22:00 2008
--- netgraph/ng_pipe.h Mon Aug 25 00:29:03 2008
***************
*** 0 ****
--- 1,171 ----
+ /*
+  * Copyright (c) 2004-2008 University of Zagreb
+  * Copyright (c) 2007-2008 FreeBSD Foundation
+  *
+  * This software was developed by the University of Zagreb and the
+  * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+  * FreeBSD Foundation.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + #ifndef _NETGRAPH_PIPE_H_ + #define _NETGRAPH_PIPE_H_ + + /* Node type name and magic cookie */ + #define NG_PIPE_NODE_TYPE "pipe" + #define NGM_PIPE_COOKIE 200708191 + + /* Hook names */ + #define NG_PIPE_HOOK_UPPER "upper" + #define NG_PIPE_HOOK_LOWER "lower" + + #define MAX_FSIZE 16384 /* Largest supported frame size, in bytes, for BER */ + #define MAX_OHSIZE 256 /* Largest supported dummy-framing size, in bytes */ + + /* Statistics structure for one hook */ + struct ng_pipe_hookstat { + u_int64_t fwd_octets; + u_int64_t fwd_frames; + u_int64_t in_disc_octets; + u_int64_t in_disc_frames; + u_int64_t out_disc_octets; + u_int64_t out_disc_frames; + }; + + /* Keep this in sync with the above structure definition */ + #define NG_PIPE_HOOKSTAT_INFO { \ + { "FwdOctets", &ng_parse_uint64_type }, \ + { "FwdFrames", &ng_parse_uint64_type }, \ + { "queueDropOctets", &ng_parse_uint64_type }, \ + { "queueDropFrames", &ng_parse_uint64_type }, \ + { "delayDropOctets", &ng_parse_uint64_type }, \ + { "delayDropFrames", &ng_parse_uint64_type }, \ + { NULL }, \ + } + + /* Statistics structure returned by NGM_PIPE_GET_STATS */ + struct ng_pipe_stats { + struct ng_pipe_hookstat downstream; + struct ng_pipe_hookstat upstream; + }; + + /* Keep this in sync with the above structure definition */ + #define NG_PIPE_STATS_INFO(hstype) { \ + { "downstream", (hstype) }, \ + { "upstream", (hstype) }, \ + { NULL }, \ + } + + /* Runtime structure for one hook */ + struct ng_pipe_hookrun { + u_int32_t fifo_queues; + u_int32_t qin_octets; + u_int32_t qin_frames; + u_int32_t qout_octets; + u_int32_t qout_frames; + }; + + /* Keep this in sync with the above structure definition */ + #define NG_PIPE_HOOKRUN_INFO { \ + { "queues", &ng_parse_uint32_type }, \ + { "queuedOctets", &ng_parse_uint32_type }, \ + { "queuedFrames", &ng_parse_uint32_type }, \ + { "delayedOctets", &ng_parse_uint32_type }, \ + { "delayedFrames", &ng_parse_uint32_type }, \ + { NULL }, \ + } + + /* Runtime structure 
returned by NGM_PIPE_GET_RUN */ + struct ng_pipe_run { + struct ng_pipe_hookrun downstream; + struct ng_pipe_hookrun upstream; + }; + + /* Keep this in sync with the above structure definition */ + #define NG_PIPE_RUN_INFO(hstype) { \ + { "downstream", (hstype) }, \ + { "upstream", (hstype) }, \ + { NULL }, \ + } + + /* Config structure for one hook */ + struct ng_pipe_hookcfg { + u_int64_t bandwidth; + u_int64_t ber; + u_int32_t qin_size_limit; + u_int32_t qout_size_limit; + u_int32_t duplicate; + u_int32_t fifo; + u_int32_t drr; + u_int32_t wfq; + u_int32_t droptail; + u_int32_t drophead; + }; + + /* Keep this in sync with the above structure definition */ + #define NG_PIPE_HOOKCFG_INFO { \ + { "bandwidth", &ng_parse_uint64_type }, \ + { "BER", &ng_parse_uint64_type }, \ + { "queuelen", &ng_parse_uint32_type }, \ + { "delaylen", &ng_parse_uint32_type }, \ + { "duplicate", &ng_parse_uint32_type }, \ + { "fifo", &ng_parse_uint32_type }, \ + { "drr", &ng_parse_uint32_type }, \ + { "wfq", &ng_parse_uint32_type }, \ + { "droptail", &ng_parse_uint32_type }, \ + { "drophead", &ng_parse_uint32_type }, \ + { NULL }, \ + } + + /* Config structure returned by NGM_PIPE_GET_CFG */ + struct ng_pipe_cfg { + u_int64_t bandwidth; + u_int64_t delay; + u_int32_t header_offset; + u_int32_t overhead; + struct ng_pipe_hookcfg downstream; + struct ng_pipe_hookcfg upstream; + }; + + /* Keep this in sync with the above structure definition */ + #define NG_PIPE_CFG_INFO(hstype) { \ + { "bandwidth", &ng_parse_uint64_type }, \ + { "delay", &ng_parse_uint64_type }, \ + { "header_offset", &ng_parse_uint32_type }, \ + { "overhead", &ng_parse_uint32_type }, \ + { "downstream", (hstype) }, \ + { "upstream", (hstype) }, \ + { NULL }, \ + } + + /* Netgraph commands */ + enum { + NGM_PIPE_GET_STATS=1, /* get stats */ + NGM_PIPE_CLR_STATS, /* clear stats */ + NGM_PIPE_GETCLR_STATS, /* atomically get and clear stats */ + NGM_PIPE_GET_RUN, /* get current runtime status */ + NGM_PIPE_GET_CFG, /* get 
configurable parameters */ + NGM_PIPE_SET_CFG, /* set configurable parameters */ + }; + + #endif /* _NETGRAPH_PIPE_H_ */ Index: netgraph/ng_source.c =========================================================================== --- netgraph/ng_source.c 2008/08/25 00:28:58 #6 +++ netgraph/ng_source.c 2008/08/25 00:28:58 @@ -616,7 +616,7 @@ ifp = ifunit(ifname); if (ifp == NULL) { - printf("%s: can't find interface %d\n", __func__, V_if_index); + printf("%s: can't find interface %s\n", __func__, ifname); return (EINVAL); } sc->output_ifp = ifp; Index: netgraph/ng_wormhole.c =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- netgraph/ng_wormhole.c Mon Aug 25 00:29:03 2008 *************** *** 0 **** --- 1,448 ---- + /*- + * Copyright (c) 2007-2008 University of Zagreb + * Copyright (c) 2007-2008 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + /* + * A "worm" node can be used to establish a datapath between independent + * netgraph address spaces, i.e. between two virtual network stacks. A + * wormhole path is defined by a pair of wormhole nodes each residing in + * a different stack instance. Each node accepts only a single + * arbitrarily named hook. Once a wormhole datapath is established, all + * data messages received on the local hook will be forwarded to the + * hook connected to the remote node, and vice versa. + * + * "worm" nodes understand two node-specific messages: "peer" and + * "status". The "peer" message is used to specify the remote + * endpoint in form of "remote_worm_node_name@remote_vnet_name", or + * to fetch the current peering configuration if invoked without + * arguments. Both involved nodes must configure their peerings before + * the datapath will be established. The "status" command can be used + * to check the current state of the wormhole path, which can be one of + * unconfigured, pending or active. + * + * NB while the vnet addressing space is currently flat, it is reasonable + * to expect that this could change in the nearest future, which may be + * reflected in the addressing model for ng_wormhole datapaths. 
+ * + * The following example shows how a netgraph path can be established + * between two network stack instances, "1" and "2": + * + * #!/bin/csh + * + * foreach vi (1 2) + * vimage -c $vi + * vimage $vi ngctl mkpeer eiface ether ether + * vimage $vi ngctl mkpeer ngeth0: worm ether ether + * vimage $vi ifconfig ngeth0 ether 40:0:0:0:0:$vi + * vimage $vi ifconfig ngeth0 10.0.0.$vi/24 + * end + * vimage 1 ngctl msg worm0: peer worm0@2 + * vimage 2 ngctl msg worm0: peer worm0@1 + * + */ + + + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + + struct ng_wormhole; + typedef struct ng_wormhole_priv *priv_p; + + #define NG_WORMHOLE_NODE_TYPE "worm" + #define NGM_WORMHOLE_COOKIE 20070806 + + static int ng_wormhole_mod_event(module_t, int, void *); + static ng_constructor_t ng_wormhole_constructor; + static ng_shutdown_t ng_wormhole_shutdown; + static ng_newhook_t ng_wormhole_newhook; + static ng_disconnect_t ng_wormhole_disconnect; + static ng_rcvdata_t ng_wormhole_rcvdata; + static ng_rcvmsg_t ng_wormhole_rcvmsg; + static vnet_attach_fn ng_wormhole_iattach; + static vnet_detach_fn ng_wormhole_idetach; + static void ng_wormhole_update_status(priv_p); + static ng_parse_t ng_wormhole_peer_parse; + static ng_unparse_t ng_wormhole_peer_unparse; + static ng_unparse_t ng_wormhole_status_unparse; + + /* Node state */ + enum { + NG_WORMHOLE_UNCONFIGURED = 0, + NG_WORMHOLE_PENDING, + NG_WORMHOLE_ACTIVE + }; + + /* Netgraph commands */ + enum { + NGM_WORMHOLE_PEER = 1, + NGM_WORMHOLE_STATUS + }; + + static const struct ng_parse_type ng_wormhole_peer_type = { + .parse = &ng_wormhole_peer_parse, + .unparse = &ng_wormhole_peer_unparse, + }; + + static const struct ng_parse_type ng_wormhole_status_type = { + .unparse = &ng_wormhole_status_unparse, + }; + + static const struct ng_cmdlist ng_wormhole_cmds[] = { + { + .cookie = NGM_WORMHOLE_COOKIE, + .cmd = NGM_WORMHOLE_PEER, + .name = "peer", + .mesgType = 
&ng_wormhole_peer_type,
+ 		.respType = &ng_wormhole_peer_type,
+ 	},
+ 	{
+ 		.cookie = NGM_WORMHOLE_COOKIE,
+ 		.cmd = NGM_WORMHOLE_STATUS,
+ 		.name = "status",
+ 		.respType = &ng_wormhole_status_type,
+ 	},
+ 	{ 0 }
+ };
+ 
+ static struct ng_type typestruct = {
+ 	.version = NG_ABI_VERSION,
+ 	.name = NG_WORMHOLE_NODE_TYPE,
+ 	.mod_event = ng_wormhole_mod_event,
+ 	.constructor = ng_wormhole_constructor,
+ 	.rcvmsg = ng_wormhole_rcvmsg,
+ 	.shutdown = ng_wormhole_shutdown,
+ 	.newhook = ng_wormhole_newhook,
+ 	.rcvdata = ng_wormhole_rcvdata,
+ 	.disconnect = ng_wormhole_disconnect,
+ 	.cmdlist = ng_wormhole_cmds
+ };
+ NETGRAPH_INIT(ng_wormhole, &typestruct);
+ 
+ VNET_MOD_DECLARE_STATELESS(NG_WORMHOLE, ng_wormhole, ng_wormhole_iattach,
+     ng_wormhole_idetach, NETGRAPH)
+ 
+ /* Per-node private state */
+ struct ng_wormhole_priv {
+ 	int		status;		/* one of the NG_WORMHOLE_* states */
+ 	priv_p		remote_priv;	/* configured peer, or NULL */
+ 	struct vnet	*vnet;		/* vnet the node was created in */
+ 	hook_p		hook;		/* the single local hook, or NULL */
+ 	node_p		node;
+ 	LIST_ENTRY(ng_wormhole_priv) all_wormholes_le;
+ 	int		unit;
+ };
+ 
+ LIST_HEAD(, ng_wormhole_priv) all_wormholes_head;
+ /* XXX need a lock around the above list */
+ 
+ /*
+  * Node constructor: allocate the private state, name the node
+  * "<type><unit>" and link it into the list of all wormholes.
+  * Returns ENOMEM if the private state cannot be allocated.
+  */
+ static int
+ ng_wormhole_constructor(node_p node)
+ {
+ 	INIT_VNET_NETGRAPH(curvnet);
+ 	priv_p priv;
+ 	char buf[NG_NODESIZ];
+ 
+ 	MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH, M_ZERO | M_NOWAIT);
+ 	if (priv == NULL)
+ 		return (ENOMEM);
+ 
+ 	NG_NODE_SET_PRIVATE(node, priv);
+ 	priv->unit = alloc_unr(V_ng_wormhole_unit);
+ 	snprintf(buf, NG_NODESIZ, "%s%d", typestruct.name, priv->unit);
+ 	if (ng_name_node(node, buf) != 0)
+ 		log(LOG_WARNING, "%s: can't acquire netgraph name\n", buf);
+ 	priv->vnet = curvnet;
+ 	priv->node = node;
+ 	priv->hook = NULL;
+ 	priv->status = NG_WORMHOLE_UNCONFIGURED;
+ 	LIST_INSERT_HEAD(&all_wormholes_head, priv, all_wormholes_le);
+ 	return (0);
+ }
+ 
+ /* Accept a single hook of any name; a second one is refused with EBUSY. */
+ static int
+ ng_wormhole_newhook(node_p node, hook_p hook, const char *name)
+ {
+ 	priv_p priv = NG_NODE_PRIVATE(node);
+ 
+ 	if (priv->hook)
+ 		return(EBUSY);
+ 	priv->hook = hook;
+ 	ng_wormhole_update_status(priv);
+ 	return (0);
+ }
+ 
+ /* Forget the local hook and recompute the path status. */
+ static int
+ ng_wormhole_disconnect(hook_p hook)
+ {
+ 	priv_p priv = NG_NODE_PRIVATE(hook->hk_node);
+ 
+ 	priv->hook = NULL;
+ 	ng_wormhole_update_status(priv);
+ 	return (0);
+ }
+ 
+ /*
+  * Control message handler for the "peer" and "status" commands.
+  * "peer" with a non-NULL pointer payload sets the peer (EINVAL when a
+  * node names itself); otherwise the current peer is returned.
+  * "status" returns the current state.  Unknown cookies or commands
+  * yield EINVAL.
+  */
+ static int
+ ng_wormhole_rcvmsg(node_p node, item_p item, hook_p lasthook)
+ {
+ 	priv_p priv = NG_NODE_PRIVATE(node);
+ 	priv_p *remote_priv;
+ 	struct ng_mesg *resp = NULL;
+ 	struct ng_mesg *msg;
+ 	int error = 0;
+ 
+ 	NGI_GET_MSG(item, msg);
+ 	switch (msg->header.typecookie) {
+ 	case NGM_WORMHOLE_COOKIE:
+ 		switch (msg->header.cmd) {
+ 		case NGM_WORMHOLE_PEER:
+ 			remote_priv = (priv_p *) &msg->data;
+ 			/*
+ 			 * Only look at the payload if it is large enough to
+ 			 * hold a pointer; a shorter message is treated as a
+ 			 * query, like a NULL pointer.
+ 			 */
+ 			if (msg->header.arglen >= sizeof(*remote_priv) &&
+ 			    *remote_priv) {
+ 				if (*remote_priv == priv)
+ 					error = EINVAL;
+ 				else
+ 					priv->remote_priv = *remote_priv;
+ 				/* XXX drop all wormhole lock */
+ 				ng_wormhole_update_status(priv);
+ 			} else {
+ 				NG_MKRESPONSE(resp, msg,
+ 				    sizeof(priv->remote_priv), M_NOWAIT);
+ 				if (resp == NULL)
+ 					error = ENOMEM;
+ 				else
+ 					bcopy(&priv->remote_priv, resp->data,
+ 					    sizeof(priv->remote_priv));
+ 			}
+ 
+ 			break;
+ 		case NGM_WORMHOLE_STATUS:
+ 			NG_MKRESPONSE(resp, msg,
+ 			    sizeof(priv->status), M_NOWAIT);
+ 			if (resp == NULL)
+ 				error = ENOMEM;
+ 			else
+ 				bcopy(&priv->status, resp->data,
+ 				    sizeof(priv->status));
+ 			break;
+ 		default:
+ 			error = EINVAL;
+ 			break;
+ 		}
+ 		break;
+ 	default:
+ 		error = EINVAL;
+ 		break;
+ 	}
+ 	NG_RESPOND_MSG(error, node, item, resp);
+ 	NG_FREE_MSG(msg);
+ 	return (error);
+ }
+ 
+ /*
+  * Parse "node_name@vnet_name" into a pointer to the matching wormhole's
+  * private state.  An empty string parses to NULL.  Returns EINVAL when
+  * the '@' is missing or the node name is too long, and ENOENT when no
+  * wormhole matches.
+  */
+ static int
+ ng_wormhole_peer_parse(const struct ng_parse_type *type,
+ 	const char *s, int *off, const u_char *const start,
+ 	u_char *const buf, int *buflen)
+ {
+ 	char node_name_buf[NG_NODESIZ];
+ 	char *t;
+ 	int len;
+ 	int error = 0;
+ 	priv_p *remote_priv = (priv_p *)buf;
+ 
+ 	*buflen = sizeof(priv_p);
+ 
+ 	while (isspace(s[*off]))
+ 		(*off)++;
+ 	if (strlen(&s[*off]) == 0) {
+ 		/* XXX to drop or not to drop the lock? */
+ 		*remote_priv = NULL;
+ 		return (error);
+ 	}
+ 	if ((t = index(s + *off, '@')) == NULL)
+ 		return (EINVAL);
+ 	if ((len = t - (s + *off)) > sizeof(node_name_buf) - 1)
+ 		return (EINVAL);
+ 	/* len was bounded above, so the explicit terminator always fits */
+ 	strncpy(node_name_buf, s + *off, len);
+ 	node_name_buf[len] = '\0';
+ 	*off += len + 1; /* vnet name should be in &s[*off] now */
+ 
+ 	/* XXX should lock all wormhole list here */
+ 	LIST_FOREACH(*remote_priv, &all_wormholes_head, all_wormholes_le)
+ 		if (strcmp((*remote_priv)->node->nd_name, node_name_buf) == 0 &&
+ 		    strcmp(vnet_name((*remote_priv)->vnet), &s[*off]) == 0)
+ 			break;
+ 	if (*remote_priv) {
+ 		/* XXX should return with the lock held, drop it in rcvmsg */
+ 	} else {
+ 		error = ENOENT;
+ 		/* XXX should unlock the all wormholes list now */
+ 	}
+ 	return (error);
+ }
+ 
+ /*
+  * Render a peer pointer as "node_name@vnet_name".  A NULL peer is
+  * rendered as the empty string.  The pointer-sized datum is always
+  * consumed from the input.
+  */
+ static int
+ ng_wormhole_peer_unparse(const struct ng_parse_type *type,
+ 	const u_char *data, int *off, char *cbuf, int cbuflen)
+ {
+ 	const priv_p *remote_priv = (const priv_p *)(data + *off);
+ 
+ 	if (*remote_priv) {
+ 		/* XXX lock all wormhole list; check whether remote exists */
+ 		snprintf(cbuf, cbuflen, "%s@%s",
+ 		    (*remote_priv)->node->nd_name,
+ 		    vnet_name((*remote_priv)->vnet));
+ 	} else if (cbuflen > 0)
+ 		cbuf[0] = '\0';
+ 	*off += sizeof(*remote_priv);
+ 	return (0);
+ }
+ 
+ /* Render a status value as "unconfigured", "pending" or "active". */
+ static int
+ ng_wormhole_status_unparse(const struct ng_parse_type *type,
+ 	const u_char *data, int *off, char *cbuf, int cbuflen)
+ {
+ 	const int *status = (const int *)(data + *off);
+ 
+ 	switch (*status) {
+ 	case NG_WORMHOLE_UNCONFIGURED:
+ 		snprintf(cbuf, cbuflen, "unconfigured");
+ 		break;
+ 	case NG_WORMHOLE_PENDING:
+ 		snprintf(cbuf, cbuflen, "pending");
+ 		break;
+ 	case NG_WORMHOLE_ACTIVE:
+ 		snprintf(cbuf, cbuflen, "active");
+ 		break;
+ 	default:
+ 		panic("unknown status %d", *status);
+ 	}
+ 	*off += sizeof(*status);
+ 	return (0);
+ }
+ 
+ /*
+  * Recompute the status of priv: unconfigured without a peer, pending
+  * until the peer points back and both ends have a hook, active
+  * otherwise.  Once the peering is mutual the peer's status is updated
+  * as well.
+  */
+ static void
+ ng_wormhole_update_status(priv_p priv)
+ {
+ 	priv_p remote_priv;
+ 
+ 	/* XXX lock / unlock the all wormhole list while doing this */
+ 	remote_priv = priv->remote_priv;
+ 	if (remote_priv == NULL)
+ 		priv->status = NG_WORMHOLE_UNCONFIGURED;
+ 	else if (remote_priv->remote_priv != priv)
+ 		priv->status = NG_WORMHOLE_PENDING;
+ 	else if (remote_priv->hook == NULL || priv->hook == NULL)
+ 		priv->status = remote_priv->status = NG_WORMHOLE_PENDING;
+ 	else
+ 		priv->status = remote_priv->status = NG_WORMHOLE_ACTIVE;
+ }
+ 
+ /*
+  * Forward data arriving on the local hook to the peer's hook, switching
+  * to the peer's vnet for the duration of the call.  Data is dropped with
+  * ENOTCONN unless the path is active.
+  */
+ static int
+ ng_wormhole_rcvdata(hook_p hook, item_p item)
+ {
+ 	priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
+ 	int error = 0;
+ 	priv_p remote_priv = priv->remote_priv;
+ 	struct mbuf *m;
+ 
+ 	if (priv->status != NG_WORMHOLE_ACTIVE) {
+ 		NG_FREE_ITEM(item);
+ 		error = ENOTCONN;
+ 	} else {
+ 		m = NGI_M(item);
+ 		m->m_flags |= M_REMOTE_VNET;
+ 		CURVNET_SET_QUIET(remote_priv->vnet);
+ 		NG_FWD_ITEM_HOOK(error, item, remote_priv->hook);
+ 		CURVNET_RESTORE();
+ 	}
+ 	return (error);
+ }
+ 
+ /*
+  * Node shutdown: unlink the node from the list of all wormholes, detach
+  * every wormhole that still names it as its peer, and release the unit
+  * number and the private state.
+  */
+ static int
+ ng_wormhole_shutdown(node_p node)
+ {
+ 	priv_p priv = NG_NODE_PRIVATE(node);
+ 	INIT_VNET_NETGRAPH(priv->vnet);
+ 	priv_p peer;
+ 
+ 	LIST_REMOVE(priv, all_wormholes_le);
+ 
+ 	/*
+ 	 * Any wormhole may have been pointed at us, whether or not we
+ 	 * point back, so scan them all.  Leaving such a pointer behind
+ 	 * would let rcvdata / update_status touch freed memory.
+ 	 * XXX the list is still unlocked, see above.
+ 	 */
+ 	LIST_FOREACH(peer, &all_wormholes_head, all_wormholes_le)
+ 		if (peer->remote_priv == priv) {
+ 			peer->remote_priv = NULL;
+ 			ng_wormhole_update_status(peer);
+ 		}
+ 
+ 	free_unr(V_ng_wormhole_unit, priv->unit);
+ 	FREE(priv, M_NETGRAPH);
+ 	NG_NODE_SET_PRIVATE(node, NULL);
+ 	NG_NODE_UNREF(node);
+ 	return (0);
+ }
+ 
+ /* Register / deregister the per-vnet module hooks on load / unload. */
+ static int
+ ng_wormhole_mod_event(module_t mod, int event, void *data)
+ {
+ 	int error = 0;
+ 
+ 	switch (event) {
+ 	case MOD_LOAD:
+ 		vnet_mod_register(&vnet_ng_wormhole_modinfo);
+ 		break;
+ 	case MOD_UNLOAD:
+ 		vnet_mod_deregister(&vnet_ng_wormhole_modinfo);
+ 		break;
+ 	default:
+ 		error = EOPNOTSUPP;
+ 		break;
+ 	}
+ 	return (error);
+ }
+ 
+ /* Per-vnet attach: create the unit number allocator for this vnet. */
+ static int ng_wormhole_iattach(const void *unused)
+ {
+ 	INIT_VNET_NETGRAPH(curvnet);
+ 
+ 	V_ng_wormhole_unit = new_unrhdr(0, 0xffff, NULL);
+ 	return (0);
+ }
+ 
+ /*
+  * Per-vnet detach: remove every node of this type found on the vnet's
+  * node list, then destroy the unit number allocator.
+  */
+ static int ng_wormhole_idetach(const void *unused)
+ {
+ 	INIT_VNET_NETGRAPH(curvnet);
+ 	node_p node;
+ 
+ 	/* Rescan from the start after each removal until none is left */
+ 	do {
+ 		LIST_FOREACH(node, &V_ng_nodelist, nd_nodes)
+ 			if (node->nd_type == &typestruct) {
+ 				ng_rmnode_self(node);
+ 				break;
+ 			}
+ 	} while (node != NULL);
+ 	delete_unrhdr(V_ng_wormhole_unit);
+ 	return (0);
+ }
Index: netinet/accf_http.c
=========================================================================== --- netinet/accf_http.c 2008/08/25 00:28:58 #1 +++ netinet/accf_http.c 2008/08/25 00:28:58 @@ -37,6 +37,7 @@ #include #include #include +#include /* check for GET/HEAD */ static void sohashttpget(struct socket *so, void *arg, int waitflag); @@ -51,6 +52,8 @@ int max, char *cmp); /* socketbuffer is full */ static int sbfull(struct sockbuf *sb); +static int +accept_filt_http_mod_event(module_t mod, int event, void *data); static struct accept_filter accf_http_filter = { "httpready", @@ -61,19 +64,42 @@ static moduledata_t accf_http_mod = { "accf_http", - accept_filt_generic_mod_event, - &accf_http_filter + accept_filt_http_mod_event, + NULL, }; DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); -static int parse_http_version = 1; +#ifndef VIMAGE +static int parse_http_version; +#endif + +/* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +struct vnet_accf_http { + int _parse_http_version; +}; + +#define INIT_VNET_ACCF_HTTP(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_ACCF_HTTP, struct vnet_accf_http, vnet_accf_http) + +#define VNET_ACCF_HTTP(sym) VSYM(vnet_accf_http, sym) + +#define V_parse_http_version VNET_ACCF_HTTP(parse_http_version) + +#define V_MOD_vnet_accf_http VNET_MOD_ACCF_HTTP + +static vnet_attach_fn vnet_accf_http_iattach; + +VNET_MOD_DECLARE(ACCF_HTTP, accf_http, vnet_accf_http_iattach, + NULL, INET, NULL) + +/* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, "HTTP accept filter"); -SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW, -&parse_http_version, 1, -"Parse http version so that non 1.x requests work"); +SYSCTL_V_INT(V_NET, vnet_accf_http, _net_inet_accf_http, OID_AUTO, + parsehttpversion, CTLFLAG_RW, parse_http_version, 1, + "Parse http version so that non 1.x requests work"); #ifdef ACCF_HTTP_DEBUG #define DPRINT(fmt, args...) 
\ @@ -161,6 +187,7 @@ static void sohashttpget(struct socket *so, void *arg, int waitflag) { + INIT_VNET_ACCF_HTTP(so->so_vnet); if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { struct mbuf *m; @@ -192,7 +219,7 @@ } if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) { DPRINT("mbufstrcmp ok"); - if (parse_http_version == 0) + if (V_parse_http_version == 0) soishttpconnected(so, arg, waitflag); else soparsehttpvers(so, arg, waitflag); @@ -360,3 +387,58 @@ soisconnected(so); return; } + +static int +accept_filt_http_mod_event(module_t mod, int event, void *data) +{ + struct accept_filter *p; + int error; + + switch (event) { + case MOD_LOAD: +#ifdef VIMAGE + vnet_mod_register(&vnet_accf_http_modinfo); +#else + vnet_accf_http_iattach(NULL); +#endif /* !VIMAGE */ + + MALLOC(p, struct accept_filter *, sizeof(*p), M_ACCF, + M_WAITOK); + bcopy(&accf_http_filter, p, sizeof(*p)); + error = accept_filt_add(p); + break; + + case MOD_UNLOAD: + /* + * Do not support unloading yet. we don't keep track of + * refcounts and unloading an accept filter callback and then + * having it called is a bad thing. A simple fix would be to + * track the refcount in the struct accept_filter. 
+ */ + if (accf_unloadable != 0) { + error = accept_filt_del(accf_http_filter.accf_name); + } else + error = EOPNOTSUPP; + break; + + case MOD_SHUTDOWN: + error = 0; + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +static int vnet_accf_http_iattach(const void *unused) +{ + INIT_VNET_ACCF_HTTP(curvnet); + + V_parse_http_version = 1; + + return 0; +} + Index: netinet/icmp_var.h =========================================================================== --- netinet/icmp_var.h 2008/08/25 00:28:58 #1 +++ netinet/icmp_var.h 2008/08/25 00:28:58 @@ -74,7 +74,9 @@ #ifdef _KERNEL SYSCTL_DECL(_net_inet_icmp); +#ifndef VIMAGE extern struct icmpstat icmpstat; /* icmp statistics */ +#endif extern int badport_bandlim(int); #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 Index: netinet/if_ether.c =========================================================================== --- netinet/if_ether.c 2008/08/25 00:28:58 #10 +++ netinet/if_ether.c 2008/08/25 00:28:58 @@ -49,10 +49,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -61,6 +63,7 @@ #include #include +#include #include #include #include @@ -81,10 +84,12 @@ SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); /* timer values */ -static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +#ifndef VIMAGE +static int arpt_keep; +#endif -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, "ARP entry lifetime in seconds"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, max_age, + CTLFLAG_RW, arpt_keep, 0, "ARP entry lifetime in seconds"); #define rt_expire rt_rmx.rmx_expire @@ -97,11 +102,12 @@ }; static struct ifqueue arpintrq; -static int arp_allocated; -static int arp_maxtries = 5; -static int useloopback = 1; /* use loopback interface for local traffic */ -static int arp_proxyall = 0; +#ifndef VIMAGE +static int arp_maxtries; +static int useloopback; 
+static int arp_proxyall; +#endif SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, arp_maxtries, 0, @@ -114,6 +120,7 @@ "Enable proxy ARP for all suitable requests"); static void arp_init(void); +static int arp_iattach(const void *); static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, u_char *); @@ -125,6 +132,8 @@ static void in_arpinput(struct mbuf *); #endif +VNET_MOD_DECLARE_STATELESS(ARP, arp, arp_iattach, NULL, INET) + /* * Timeout routine. */ @@ -142,8 +151,10 @@ */ RT_UNLOCK(rt); + CURVNET_SET(rt->rt_ifp->if_vnet); in_rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), 0, NULL, rt->rt_fibnum); + CURVNET_RESTORE(); } /* @@ -220,7 +231,6 @@ log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } - arp_allocated++; /* * We are storing a route entry outside of radix tree. So, * it can be found and accessed by other means than radix @@ -870,10 +880,6 @@ (*ifp->if_output)(ifp, hold, rt_key(rt), rt); } /* end of FIB loop */ reply: - - /* - * Decide if we have to respond to something. 
- */ if (op != ARPOP_REQUEST) goto drop; if (itaddr.s_addr == myaddr.s_addr) { @@ -1065,12 +1071,31 @@ ifa->ifa_flags |= RTF_CLONING; } +static int +arp_iattach(unused) + const void *unused; +{ + INIT_VNET_INET(curvnet); + + V_arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ + V_arp_maxtries = 5; + V_useloopback = 1; /* use loopback interface for local traffic */ + V_arp_proxyall = 0; + + return 0; +} + static void arp_init(void) { - +#ifdef VIMAGE + vnet_mod_register(&vnet_arp_modinfo); +#else + arp_iattach(NULL); +#endif arpintrq.ifq_maxlen = 50; mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); netisr_register(NETISR_ARP, arpintr, &arpintrq, 0); } + SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: netinet/igmp.c =========================================================================== --- netinet/igmp.c 2008/08/25 00:28:58 #7 +++ netinet/igmp.c 2008/08/25 00:28:58 @@ -59,9 +59,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -80,7 +82,9 @@ static struct router_info *find_rti(struct ifnet *ifp); static void igmp_sendpkt(struct in_multi *, int, unsigned long); +#ifndef VIMAGE static struct igmpstat igmpstat; +#endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, igmpstat, igmpstat, ""); @@ -93,7 +97,9 @@ * when accessed via an in_multi read-only. */ static struct mtx igmp_mtx; +#ifndef VIMAGE static SLIST_HEAD(, router_info) router_info_head; +#endif static int igmp_timers_are_running; /* @@ -116,8 +122,12 @@ void igmp_init(void) { + INIT_VNET_INET(curvnet); struct ipoption *ra; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif /* * To avoid byte-swapping the same value over and over again. 
*/ @@ -139,6 +149,10 @@ router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF); +#ifdef VIMAGE + } +#endif + SLIST_INIT(&V_router_info_head); } @@ -425,6 +439,8 @@ IN_MULTI_LOCK(); igmp_timers_are_running = 0; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_timer == 0) { @@ -437,6 +453,7 @@ } IN_NEXT_MULTI(step, inm); } + VNET_ITERLOOP_END(); IN_MULTI_UNLOCK(); } @@ -447,6 +464,7 @@ IGMP_PRINTF("[igmp.c,_slowtimo] -- > entering \n"); mtx_lock(&igmp_mtx); + VNET_ITERLOOP_BEGIN() INIT_VNET_INET(vnet_iter); SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_type == IGMP_V1_ROUTER) { @@ -455,6 +473,7 @@ rti->rti_type = IGMP_V2_ROUTER; } } + VNET_ITERLOOP_END() mtx_unlock(&igmp_mtx); IGMP_PRINTF("[igmp.c,_slowtimo] -- > exiting \n"); } Index: netinet/in.c =========================================================================== --- netinet/in.c 2008/08/25 00:28:58 #8 +++ netinet/in.c 2008/08/25 00:28:58 @@ -45,10 +45,12 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -66,18 +68,20 @@ struct in_ifaddr *, struct sockaddr_in *, int); static void in_purgemaddrs(struct ifnet *); -static int subnetsarelocal = 0; +#ifndef VIMAGE +static int subnetsarelocal; +static int sameprefixcarponly; +extern struct inpcbinfo ripcbinfo; +extern struct inpcbinfo udbinfo; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, subnetsarelocal, 0, "Treat all subnets as directly connected"); -static int sameprefixcarponly = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, sameprefixcarponly, 0, "Refuse to create same prefixes on different interfaces"); -extern struct inpcbinfo ripcbinfo; -extern struct inpcbinfo udbinfo; - /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). 
If subnetsarelocal Index: netinet/in.h =========================================================================== --- netinet/in.h 2008/08/25 00:28:58 #2 +++ netinet/in.h 2008/08/25 00:28:58 @@ -743,9 +743,4 @@ #undef __KAME_NETINET_IN_H_INCLUDED_ #endif - -#ifdef _KERNEL -#include -#endif - #endif /* !_NETINET_IN_H_*/ Index: netinet/in_gif.c =========================================================================== --- netinet/in_gif.c 2008/08/25 00:28:58 #7 +++ netinet/in_gif.c 2008/08/25 00:28:58 @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -85,7 +86,9 @@ .pr_usrreqs = &rip_usrreqs }; -static int ip_gif_ttl = GIF_TTL; +#ifndef VIMAGE +int ip_gif_ttl; +#endif SYSCTL_V_INT(V_NET, vnet_gif, _net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, ip_gif_ttl, 0, ""); Index: netinet/in_gif.h =========================================================================== --- netinet/in_gif.h 2008/08/25 00:28:58 #1 +++ netinet/in_gif.h 2008/08/25 00:28:58 @@ -35,6 +35,9 @@ #define GIF_TTL 30 +#ifndef VIMAGE +extern int ip_gif_ttl; +#endif struct gif_softc; void in_gif_input(struct mbuf *, int); int in_gif_output(struct ifnet *, int, struct mbuf *); Index: netinet/in_mcast.c =========================================================================== --- netinet/in_mcast.c 2008/08/25 00:28:58 #7 +++ netinet/in_mcast.c 2008/08/25 00:28:58 @@ -53,7 +53,9 @@ #include #include #include +#include +#include #include #include #include @@ -86,7 +88,9 @@ * ip_output() to send IGMP packets while holding the lock; this probably is * not quite desirable. 
*/ +#ifndef VIMAGE struct in_multihead in_multihead; /* XXX BSS initialization */ +#endif struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE); @@ -114,6 +118,8 @@ static int inp_leave_group(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); +static struct ifnet * + ip_multicast_if(struct in_addr *a); /* * Resize the ip_moptions vector to the next power-of-two minus 1. @@ -1026,9 +1032,9 @@ * If all of these conditions fail, return EADDRNOTAVAIL, and * reject the IPv4 multicast join. */ - if (mreqs.imr_interface.s_addr != INADDR_ANY) { - INADDR_TO_IFP(mreqs.imr_interface, ifp); - } else { + if (mreqs.imr_interface.s_addr != INADDR_ANY) + ifp = ip_multicast_if(&mreqs.imr_interface); + else { struct route ro; ro.ro_rt = NULL; @@ -1447,7 +1453,7 @@ if (addr.s_addr == INADDR_ANY) { ifp = NULL; } else { - INADDR_TO_IFP(addr, ifp); + ifp = ip_multicast_if(&addr); if (ifp == NULL) return (EADDRNOTAVAIL); } @@ -1833,3 +1839,25 @@ return (error); } + +/* + * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
+ */ +static struct ifnet * +ip_multicast_if(struct in_addr *a) +{ + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); + int ifindex; + struct ifnet *ifp; + + if (ntohl(a->s_addr) >> 24 == 0) { + ifindex = ntohl(a->s_addr) & 0xffffff; + if (ifindex < 0 || V_if_index < ifindex) + return NULL; + ifp = ifnet_byindex(ifindex); + } else + INADDR_TO_IFP(*a, ifp); + return ifp; +} + Index: netinet/in_pcb.c =========================================================================== --- netinet/in_pcb.c 2008/08/25 00:28:58 #11 +++ netinet/in_pcb.c 2008/08/25 00:28:58 @@ -60,10 +60,12 @@ #include +#include #include #include #include +#include #include #include #include @@ -75,8 +77,8 @@ #include #include #endif /* INET6 */ +#include - #ifdef IPSEC #include #include @@ -84,32 +86,34 @@ #include +#ifndef VIMAGE /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ -int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ -int ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ -int ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ -int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ -int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ +int ipport_lowfirstauto; +int ipport_lowlastauto; +int ipport_firstauto; +int ipport_lastauto; +int ipport_hifirstauto; +int ipport_hilastauto; /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ -int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_reservedlow = 0; +int ipport_reservedhigh; +int ipport_reservedlow; /* Variables dealing with random ephemeral port allocation. 
*/ -int ipport_randomized = 1; /* user controlled via sysctl */ -int ipport_randomcps = 10; /* user controlled via sysctl */ -int ipport_randomtime = 45; /* user controlled via sysctl */ -int ipport_stoprandom = 0; /* toggled by ipport_tick */ +int ipport_randomized; +int ipport_randomcps; +int ipport_randomtime; +int ipport_stoprandom; int ipport_tcpallocs; int ipport_tcplastcount; +#endif #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ @@ -118,9 +122,13 @@ static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { +#ifdef VIMAGE + INIT_VNET_INET(curvnet); + SYSCTL_RESOLVE_V_ARG1(); +#endif int error; - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); @@ -1213,6 +1221,7 @@ void ipport_tick(void *xtp) { + VNET_ITERLOOP_BEGIN(); INIT_VNET_INET(curvnet); if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { @@ -1272,10 +1281,11 @@ void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { + INIT_VNET_INET(curvnet); struct inpcb *inp; INP_INFO_RLOCK(&V_tcbinfo); - LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { + LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); Index: netinet/in_pcb.h =========================================================================== --- netinet/in_pcb.h 2008/08/25 00:28:58 #4 +++ netinet/in_pcb.h 2008/08/25 00:28:58 @@ -226,6 +226,8 @@ #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ + +#define inp_vnet inp_pcbinfo->ipi_vnet }; /* * The range of the generation count, as used in this implementation, is 9e19. 
@@ -307,7 +309,8 @@ * vimage 1 * general use 1 */ - void *ipi_pspare[2]; + struct vnet *ipi_vnet; + void *ipi_pspare[1]; }; #define INP_LOCK_INIT(inp, d, t) \ @@ -440,6 +443,7 @@ #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL +#ifndef VIMAGE extern int ipport_reservedhigh; extern int ipport_reservedlow; extern int ipport_lowfirstauto; @@ -448,6 +452,11 @@ extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; +extern int ipport_randomized; +extern int ipport_randomcps; +extern int ipport_randomtime; +extern int ipport_stoprandom; +#endif extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); Index: netinet/in_proto.c =========================================================================== --- netinet/in_proto.c 2008/08/25 00:28:58 #5 +++ netinet/in_proto.c 2008/08/25 00:28:58 @@ -124,6 +124,9 @@ .pr_ctlinput = udp_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = udp_init, +#ifdef VIMAGE + .pr_destroy = udp_destroy, +#endif .pr_usrreqs = &udp_usrreqs }, { @@ -135,6 +138,9 @@ .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, +#ifdef VIMAGE + .pr_destroy = tcp_destroy, +#endif .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs @@ -345,11 +351,15 @@ .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, +#ifdef VIMAGE + .pr_destroy = rip_destroy, +#endif .pr_usrreqs = &rip_usrreqs }, }; extern int in_inithead(void **, int); +extern int in_detachhead(void **, int); struct domain inetdomain = { .dom_family = AF_INET, @@ -361,6 +371,9 @@ #else .dom_rtattach = in_inithead, #endif +#ifdef VIMAGE + .dom_rtdetach = in_detachhead, +#endif .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct sockaddr_in) }; Index: netinet/in_rmx.c =========================================================================== --- netinet/in_rmx.c 2008/08/25 00:28:58 #8 +++ netinet/in_rmx.c 2008/08/25 00:28:58 @@ 
-53,13 +53,18 @@ #include #include +#include #include #include +#include #include #include #include -extern int in_inithead(void **head, int off); +int in_inithead(void **head, int off); +#ifdef VIMAGE +int in_detachhead(void **head, int off); +#endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -151,17 +156,20 @@ return rn; } -static int rtq_reallyold = 60*60; /* one hour is "really old" */ +#ifndef VIMAGE +static int rtq_reallyold; +static int rtq_minreallyold; +static int rtq_toomany; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, rtq_reallyold, 0, "Default expiration time on dynamically learned routes"); -static int rtq_minreallyold = 10; /* never automatically crank down to less */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, rtq_minreallyold, 0, "Minimum time to attempt to hold onto dynamically learned routes"); -static int rtq_toomany = 128; /* 128 cached routes is "too many" */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, rtq_toomany, 0, "Upper limit on dynamically learned routes"); @@ -256,33 +264,15 @@ } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout = RTQ_TIMEOUT; +#ifndef VIMAGE +static int rtq_timeout; static struct callout rtq_timer; - -static void in_rtqtimo_one(void *rock); +#endif static void -in_rtqtimo(void *rock) +in_rtqtimo_one(struct radix_node_head *rnh) { - int fibnum; - void *newrock; - struct timeval atv; - - KASSERT((rock == (void *)V_rt_tables[0][AF_INET]), - ("in_rtqtimo: unexpected arg")); - for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { - if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL) - in_rtqtimo_one(newrock); - } - atv.tv_usec = 0; - atv.tv_sec = V_rtq_timeout; - callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); -} - -static void -in_rtqtimo_one(void *rock) -{ - struct radix_node_head *rnh = rock; + 
INIT_VNET_INET(curvnet); struct rtqk_arg arg; static time_t last_adjusted_timeout = 0; @@ -321,7 +311,26 @@ rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } +} +static void +in_rtqtimo(void *rock) +{ + int fibnum; + void *newrock; + CURVNET_SET_QUIET((struct vnet *) rock); + INIT_VNET_NET((struct vnet *) rock); + INIT_VNET_INET((struct vnet *) rock); + struct timeval atv; + + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL) + in_rtqtimo_one(newrock); + } + atv.tv_usec = 0; + atv.tv_sec = V_rtq_timeout; + callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + CURVNET_RESTORE(); } void @@ -331,6 +340,9 @@ struct rtqk_arg arg; int fibnum; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(vnet_iter); + for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = V_rt_tables[fibnum][AF_INET]; arg.found = arg.killed = 0; @@ -342,6 +354,7 @@ rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } + VNET_ITERLOOP_END(); } static int _in_rt_was_here; @@ -368,18 +381,33 @@ if (off == 0) /* XXX MRT see above */ return 1; /* only do the rest for a real routing table */ + V_rtq_reallyold = 60*60; /* one hour is "really old" */ + V_rtq_minreallyold = 10; /* never automatically crank down to less */ + V_rtq_toomany = 128; /* 128 cached routes is "too many" */ + V_rtq_timeout = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in_addroute; rnh->rnh_matchaddr = in_matroute; rnh->rnh_close = in_clsroute; if (_in_rt_was_here == 0 ) { callout_init(&V_rtq_timer, CALLOUT_MPSAFE); - in_rtqtimo(rnh); /* kick off timeout first time */ + in_rtqtimo(curvnet); /* kick off timeout first time */ _in_rt_was_here = 1; } return 1; } +#ifdef VIMAGE +int +in_detachhead(void **head, int off) +{ + INIT_VNET_INET(curvnet); + + callout_drain(&V_rtq_timer); + return 1; +} +#endif + /* * This zaps old routes when the interface goes down or interface * address is deleted. 
In the latter case, it deletes static routes Index: netinet/in_var.h =========================================================================== --- netinet/in_var.h 2008/08/25 00:28:58 #3 +++ netinet/in_var.h 2008/08/25 00:28:58 @@ -84,20 +84,33 @@ /* * Hash table for IP addresses. */ -extern LIST_HEAD(in_ifaddrhashhead, in_ifaddr) *in_ifaddrhashtbl; -extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; +LIST_HEAD(in_ifaddrhashhead, in_ifaddr); +TAILQ_HEAD(in_ifaddrhead, in_ifaddr); +#ifndef VIMAGE +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; +extern struct in_ifaddrhead in_ifaddrhead; extern u_long in_ifaddrhmask; /* mask for hash table */ +#endif + +/* + * IP datagram reassembly. + */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) -#define INADDR_NHASH_LOG2 9 +/* + * Macro for finding the internet address structure (in_ifaddr) + * corresponding to one of our IP addresses (in_addr). + */ +#define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) -/* - * Macro for finding the internet address structure (in_ifaddr) - * corresponding to one of our IP addresses (in_addr). - */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ @@ -218,7 +231,11 @@ SYSCTL_DECL(_net_inet_raw); #endif -extern LIST_HEAD(in_multihead, in_multi) in_multihead; +LIST_HEAD(in_multihead, in_multi); + +#ifndef VIMAGE +extern struct in_multihead in_multihead; +#endif /* * Lock macros for IPv4 layer multicast address lists. 
IPv4 lock goes Index: netinet/ip6.h =========================================================================== --- netinet/ip6.h 2008/08/25 00:28:58 #4 +++ netinet/ip6.h 2008/08/25 00:28:58 @@ -346,9 +346,6 @@ } \ } \ } while (/*CONSTCOND*/ 0) - -#include - #endif /*_KERNEL*/ #endif /* not _NETINET_IP6_H_ */ Index: netinet/ip_carp.c =========================================================================== --- netinet/ip_carp.c 2008/08/25 00:28:58 #7 +++ netinet/ip_carp.c 2008/08/25 00:28:58 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ #include #ifdef INET +#include #include #include #include @@ -77,6 +79,7 @@ #endif #ifdef INET6 +#include #include #include #include Index: netinet/ip_divert.c =========================================================================== --- netinet/ip_divert.c 2008/08/25 00:28:58 #7 +++ netinet/ip_divert.c 2008/08/25 00:28:58 @@ -62,10 +62,12 @@ #include +#include #include #include #include +#include #include #include #include @@ -111,9 +113,12 @@ * will cause it to be effectively considered as a standard packet). */ +#ifndef VIMAGE /* Internal variables. */ static struct inpcbhead divcb; static struct inpcbinfo divcbinfo; +#endif +static struct uma_zone *divcbzone; static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? 
*/ @@ -125,7 +130,7 @@ div_zone_change(void *tag) { - uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(divcbzone, maxsockets); } static int @@ -148,7 +153,21 @@ void div_init(void) { - + INIT_VNET_INET(curvnet); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + divcbzone = uma_zcreate("divcb", sizeof(struct inpcb), + NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, + UMA_ZONE_NOFREE); + uma_zone_set_max(divcbzone, maxsockets); + EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, + NULL, EVENTHANDLER_PRI_ANY); +#ifdef VIMAGE + } + V_divcbinfo.ipi_vnet = curvnet; +#endif + V_divcbinfo.ipi_zone = divcbzone; /* publish the shared zone */ INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); LIST_INIT(&V_divcb); V_divcbinfo.ipi_listhead = &V_divcb; @@ -160,12 +179,6 @@ V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_porthashmask); - V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), - NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, - UMA_ZONE_NOFREE); - uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); - EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, - NULL, EVENTHANDLER_PRI_ANY); } /* @@ -175,6 +188,7 @@ void div_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); V_ipstat.ips_noproto++; m_freem(m); } @@ -188,6 +202,7 @@ static void divert_packet(struct mbuf *m, int incoming) { + INIT_VNET_INET(curvnet); struct ip *ip; struct inpcb *inp; struct socket *sa; @@ -290,7 +305,7 @@ m_freem(m); V_ipstat.ips_noproto++; V_ipstat.ips_delivered--; - } + } } /* @@ -304,6 +319,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { + INIT_VNET_INET(so->so_vnet); struct m_tag *mtag; struct divert_tag *dt; int error = 0; @@ -456,6 +472,7 @@ static int div_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; @@ -487,6 +504,7 @@ static void div_detach(struct socket *so) 
{ + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); @@ -501,6 +519,7 @@ static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; @@ -541,6 +560,7 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { + INIT_VNET_INET(so->so_vnet); /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { @@ -556,11 +576,11 @@ void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { - struct in_addr faddr; + struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) - return; + return; if (PRC_IS_REDIRECT(cmd)) return; } @@ -568,6 +588,7 @@ static int div_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; @@ -692,6 +713,7 @@ static int div_modevent(module_t mod, int type, void *unused) { + INIT_VNET_INET(curvnet); int err = 0; int n; @@ -736,7 +758,7 @@ err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); INP_INFO_LOCK_DESTROY(&V_divcbinfo); - uma_zdestroy(V_divcbinfo.ipi_zone); + uma_zdestroy(divcbzone); break; default: err = EOPNOTSUPP; @@ -746,9 +768,9 @@ } static moduledata_t ipdivertmod = { - "ipdivert", - div_modevent, - 0 + "ipdivert", + div_modevent, + 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); Index: netinet/ip_fastfwd.c =========================================================================== --- netinet/ip_fastfwd.c 2008/08/25 00:28:58 #7 +++ netinet/ip_fastfwd.c 2008/08/25 00:28:58 @@ -89,6 +89,7 @@ #include #include +#include #include #include #include @@ -96,6 +97,7 @@ #include #include +#include #include #include #include @@ -106,7 +108,9 @@ #include -static int ipfastforward_active = 0; 
+#ifndef VIMAGE +static int ipfastforward_active; +#endif SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, ipfastforward_active, 0, "Enable fast IP forwarding"); Index: netinet/ip_fw.h =========================================================================== --- netinet/ip_fw.h 2008/08/25 00:28:58 #2 +++ netinet/ip_fw.h 2008/08/25 00:28:58 @@ -552,7 +552,6 @@ * Main firewall chains definitions and global var's definitions. */ #ifdef _KERNEL -#ifdef NOTYET /* still in ip_fw2.c */ /* * Data structure to cache our ucred related @@ -566,9 +565,7 @@ uid_t fw_uid; int fw_prid; }; -#endif - #define IPFW_TABLES_MAX 128 struct ip_fw_chain { struct ip_fw *rules; /* list of rules */ @@ -578,13 +575,11 @@ struct rwlock rwmtx; }; -#ifdef NOTYET /* still in ip_fw2.c */ struct table_entry { struct radix_node rn[2]; struct sockaddr_in addr, mask; u_int32_t value; }; -#endif /* Return values from ipfw_chk() */ enum { @@ -652,9 +647,7 @@ int ipfw_init(void); void ipfw_destroy(void); -#ifdef NOTYET void ipfw_nat_destroy(void); -#endif typedef int ip_fw_ctl_t(struct sockopt *); extern ip_fw_ctl_t *ip_fw_ctl_ptr; Index: netinet/ip_fw2.c =========================================================================== --- netinet/ip_fw2.c 2008/08/25 00:28:58 #15 +++ netinet/ip_fw2.c 2008/08/25 00:28:58 @@ -66,6 +66,7 @@ #include #include +#include #include #include #include @@ -73,6 +74,7 @@ #define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ +#include #include #include #include @@ -108,6 +110,11 @@ #include +static int vnet_ipfw_iattach(const void *); +static int vnet_ipfw_idetach(const void *); + +VNET_MOD_DECLARE(IPFW, ipfw, vnet_ipfw_iattach, vnet_ipfw_idetach, INET, NULL) + /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set @@ -116,32 +123,24 @@ * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. 
*/ +#ifndef VIMAGE static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout ipfw_timeout; +#endif + static uma_zone_t ipfw_dyn_rule_zone; #define IPFW_DEFAULT_RULE 65535 /* - * Data structure to cache our ucred related - * information. This structure only gets used if - * the user specified UID/GID based constraints in - * a firewall rule. - */ -struct ip_fw_ugid { - gid_t fw_groups[NGROUPS]; - int fw_ngroups; - uid_t fw_uid; - int fw_prid; -}; - -/* * list of rules for layer 3 */ +#ifndef VIMAGE struct ip_fw_chain layer3_chain; +#endif MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); @@ -152,14 +151,10 @@ ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; - -static int fw_debug = 1; -static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ +#ifndef VIMAGE +static int fw_debug; +static int autoinc_step; +#endif extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); @@ -218,9 +213,11 @@ * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ +#ifndef VIMAGE static ipfw_dyn_rule **ipfw_dyn_v = NULL; -static u_int32_t dyn_buckets = 256; /* must be power of 2 */ -static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ +static u_int32_t dyn_buckets; +static u_int32_t curr_dyn_buckets; +#endif static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ @@ -233,12 +230,14 @@ /* * Timeouts for various events in handing dynamic rules. 
*/ -static u_int32_t dyn_ack_lifetime = 300; -static u_int32_t dyn_syn_lifetime = 20; -static u_int32_t dyn_fin_lifetime = 1; -static u_int32_t dyn_rst_lifetime = 1; -static u_int32_t dyn_udp_lifetime = 10; -static u_int32_t dyn_short_lifetime = 5; +#ifndef VIMAGE +static u_int32_t dyn_ack_lifetime; +static u_int32_t dyn_syn_lifetime; +static u_int32_t dyn_fin_lifetime; +static u_int32_t dyn_rst_lifetime; +static u_int32_t dyn_udp_lifetime; +static u_int32_t dyn_short_lifetime; +#endif /* * Keepalives are sent if dyn_keepalive is set. They are sent every @@ -247,15 +246,16 @@ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ +#ifndef VIMAGE +static u_int32_t dyn_keepalive_interval; +static u_int32_t dyn_keepalive_period; +static u_int32_t dyn_keepalive; -static u_int32_t dyn_keepalive_interval = 20; -static u_int32_t dyn_keepalive_period = 5; -static u_int32_t dyn_keepalive = 1; /* do send keepalives */ - static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ -static u_int32_t dyn_count; /* # of dynamic rules */ -static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ +static u_int32_t dyn_count; /* # of dynamic rules */ +static u_int32_t dyn_max; /* max # of dynamic rules */ +#endif SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, dyn_buckets, 0, "Number of dyn. buckets"); @@ -283,20 +283,28 @@ SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, dyn_keepalive, 0, "Enable keepalives for dyn. 
rules"); +#ifndef VIMAGE +static int fw_deny_unknown_exthdrs; +#endif #ifdef INET6 /* * IPv6 specific variables */ + SYSCTL_DECL(_net_inet6_ip6); - -static struct sysctl_ctx_list ip6_fw_sysctl_ctx; -static struct sysctl_oid *ip6_fw_sysctl_tree; +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_SECURE, + 0, "Firewall"); +SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw6_enable, 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, + deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, + fw_deny_unknown_exthdrs, 0, + "Deny packets with unknown IPv6 Extension Headers"); #endif /* INET6 */ #endif /* SYSCTL_NODE */ -static int fw_deny_unknown_exthdrs = 1; - /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T @@ -745,7 +753,9 @@ #endif /* INET6 */ +#ifndef VIMAGE static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ +#endif #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) @@ -4459,8 +4469,11 @@ * every dyn_keepalive_period */ static void -ipfw_tick(void * __unused unused) +ipfw_tick(void *arg) { +#ifdef VIMAGE + struct vnet_ipfw *vnet_ipfw = arg; +#endif struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; @@ -4509,37 +4522,39 @@ } done: callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz, - ipfw_tick, NULL); + ipfw_tick, arg); } -int -ipfw_init(void) +static int vnet_ipfw_iattach(const void *unused) { INIT_VNET_IPFW(curvnet); struct ip_fw default_rule; int error; -#ifdef INET6 - /* Setup IPv6 fw sysctl tree. 
*/ - sysctl_ctx_init(&ip6_fw_sysctl_ctx); - ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", - CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); - SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, - &V_fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6"); - SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, - &V_fw_deny_unknown_exthdrs, 0, - "Deny packets with unknown IPv6 Extension Headers"); -#endif + V_fw_debug = 1; + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_dyn_buckets = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* must be power of 2 */ + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + V_dyn_max = 4096; /* max # of dynamic rules */ + V_fw_deny_unknown_exthdrs = 1; V_layer3_chain.rules = NULL; IPFW_LOCK_INIT(&V_layer3_chain); +#if 0 /* XXX Marko fix this! */ ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); IPFW_DYN_LOCK_INIT(); +#endif callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); @@ -4567,6 +4582,55 @@ } ip_fw_default_rule = V_layer3_chain.rules; + +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif + + error = init_tables(&V_layer3_chain); + if (error) { + panic("init_tables"); /* XXX Marko fix this ! 
*/ + } +#ifdef VIMAGE + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, (void *) vnet_ipfw); +#else + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); +#endif + +#ifdef IPFIREWALL_NAT + LIST_INIT(&V_layer3_chain.nat); +#endif + + return 0; +} + +int +ipfw_init(void) +{ + ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + IPFW_DYN_LOCK_INIT(); + +#if 0 /* MARKO XXX */ + /* error = init_tables(&V_layer3_chain); moved to _iattach() */ + if (error) { + IPFW_DYN_LOCK_DESTROY(); + IPFW_LOCK_DESTROY(&V_layer3_chain); + uma_zdestroy(ipfw_dyn_rule_zone); + return (error); + } +#endif + +#ifdef VIMAGE + vnet_mod_register(&vnet_ipfw_modinfo); +#else + vnet_ipfw_iattach(NULL); +#endif + printf("ipfw2 " #ifdef INET6 "(+ipv6) " @@ -4584,49 +4648,41 @@ #else "loadable", #endif + #ifdef IPFIREWALL_NAT "enabled", #else - "loadable", + "disabled, ", #endif - default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny"); +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + "accept" +#else + "deny" +#endif + ); #ifdef IPFIREWALL_VERBOSE - V_fw_verbose = 1; -#endif -#ifdef IPFIREWALL_VERBOSE_LIMIT - V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; -#endif - if (V_fw_verbose == 0) printf("disabled\n"); - else if (V_verbose_limit == 0) +#else +# ifndef IPFIREWALL_VERBOSE_LIMIT printf("unlimited\n"); - else +# else printf("limited to %d packets/entry by default\n", - V_verbose_limit); + IPFIREWALL_VERBOSE_LIMIT); +# endif +#endif - error = init_tables(&V_layer3_chain); - if (error) { - IPFW_DYN_LOCK_DESTROY(); - IPFW_LOCK_DESTROY(&V_layer3_chain); - uma_zdestroy(ipfw_dyn_rule_zone); - return (error); - } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; - callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); - LIST_INIT(&V_layer3_chain.nat); return (0); } -void -ipfw_destroy(void) +static int vnet_ipfw_idetach(const void *unused) { + INIT_VNET_IPFW(curvnet); struct ip_fw *reap; - ip_fw_chk_ptr = NULL; - ip_fw_ctl_ptr = NULL; 
callout_drain(&V_ipfw_timeout); IPFW_WLOCK(&V_layer3_chain); flush_tables(&V_layer3_chain); @@ -4636,16 +4692,30 @@ IPFW_WUNLOCK(&V_layer3_chain); if (reap != NULL) reap_rules(reap); - IPFW_DYN_LOCK_DESTROY(); - uma_zdestroy(ipfw_dyn_rule_zone); + IPFW_LOCK_DESTROY(&V_layer3_chain); if (V_ipfw_dyn_v != NULL) free(V_ipfw_dyn_v, M_IPFW); - IPFW_LOCK_DESTROY(&V_layer3_chain); + + return 0; +} + +void +ipfw_destroy(void) +{ + ip_fw_chk_ptr = NULL; + ip_fw_ctl_ptr = NULL; + +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ipfw_modinfo); +#else + vnet_ipfw_idetach(NULL); +#endif -#ifdef INET6 - /* Free IPv6 fw sysctl tree. */ - sysctl_ctx_free(&ip6_fw_sysctl_ctx); +#ifdef IPFIREWALL_NAT + ipfw_nat_destroy(); #endif + IPFW_DYN_LOCK_DESTROY(); + uma_zdestroy(ipfw_dyn_rule_zone); printf("IP firewall unloaded\n"); } Index: netinet/ip_fw_nat.c =========================================================================== --- netinet/ip_fw_nat.c 2008/08/25 00:28:58 #6 +++ netinet/ip_fw_nat.c 2008/08/25 00:28:58 @@ -53,6 +53,7 @@ #define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. 
*/ #include +#include #include #include #include @@ -64,12 +65,15 @@ #include #include #include +#include #include /* XXX for in_cksum */ MALLOC_DECLARE(M_IPFW); -extern struct ip_fw_chain layer3_chain; +#ifndef VIMAGE +extern struct ip_fw_chain V_layer3_chain; +#endif static eventhandler_tag ifaddr_event_tag; @@ -604,7 +608,7 @@ NULL, EVENTHANDLER_PRI_ANY); } -static void +void ipfw_nat_destroy(void) { INIT_VNET_IPFW(curvnet); Index: netinet/ip_fw_pfil.c =========================================================================== --- netinet/ip_fw_pfil.c 2008/08/25 00:28:58 #5 +++ netinet/ip_fw_pfil.c 2008/08/25 00:28:58 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -66,9 +67,11 @@ #include +#ifndef VIMAGE int fw_enable = 1; -#ifdef INET6 +# ifdef INET6 int fw6_enable = 1; +# endif #endif int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); @@ -487,6 +490,10 @@ int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { +#ifdef VIMAGE + INIT_VNET_IPFW(curvnet); + SYSCTL_RESOLVE_V_ARG1(); +#endif int enable = *(int *)arg1; int error; Index: netinet/ip_icmp.c =========================================================================== --- netinet/ip_icmp.c 2008/08/25 00:28:58 #7 +++ netinet/ip_icmp.c 2008/08/25 00:28:58 @@ -45,10 +45,12 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -77,7 +79,9 @@ * host table maintenance routines. 
*/ +#ifndef VIMAGE struct icmpstat icmpstat; +#endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, icmpstat, icmpstat, ""); Index: netinet/ip_input.c =========================================================================== --- netinet/ip_input.c 2008/08/25 00:28:58 #10 +++ netinet/ip_input.c 2008/08/25 00:28:58 @@ -53,6 +53,7 @@ #include #include +#include #include #include #include @@ -61,6 +62,7 @@ #include #include +#include #include #include #include @@ -85,33 +87,36 @@ #include -int rsvp_on = 0; +#ifndef VIMAGE +int rsvp_on; +int ipforwarding; +static int ipsendredirects; +int ip_defttl; +static int ip_keepfaith; +static int ip_sendsourcequench; +int ip_do_randomid; +static int ip_checkinterface; +#endif -int ipforwarding = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, ipforwarding, 0, "Enable IP forwarding between interfaces"); -static int ipsendredirects = 1; /* XXX */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, ipsendredirects, 0, "Enable sending IP redirects"); -int ip_defttl = IPDEFTTL; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets"); -static int ip_keepfaith = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); -static int ip_sendsourcequench = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0, "Enable the transmission of source quench packets"); -int ip_do_randomid = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values"); @@ -128,7 +133,6 @@ * to the loopback interface instead of the interface where the * packets for those addresses are received. 
*/ -static int ip_checkinterface = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, ip_checkinterface, 0, "Verify packet arrives on correct interface"); @@ -141,9 +145,11 @@ extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; +#ifndef VIMAGE struct in_ifaddrhead in_ifaddrhead; /* first inet address */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_long in_ifaddrhmask; /* mask for hash table */ +#endif SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); @@ -151,22 +157,20 @@ &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); +#ifndef VIMAGE struct ipstat ipstat; +#endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); -/* - * IP datagram reassembly. - */ -#define IPREASS_NHASH_LOG2 6 -#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) -#define IPREASS_HMASK (IPREASS_NHASH - 1) -#define IPREASS_HASH(x,y) \ - (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) - +static struct mtx ipqlock; +#ifndef VIMAGE +static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static uma_zone_t ipq_zone; -static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; -static struct mtx ipqlock; +static int nipq; +static int maxnipq; +static int maxfragsperpacket; +#endif #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) @@ -176,13 +180,10 @@ static void maxnipq_update(void); static void ipq_zone_change(void *); -static int maxnipq; /* Administrative limit on # reass queues. 
*/ -static int nipq = 0; /* Total # of reass queues */ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, nipq, 0, "Current number of IPv4 fragment reassembly queue entries"); -static int maxfragsperpacket; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); @@ -195,7 +196,9 @@ #endif #ifdef IPSTEALTH -int ipstealth = 0; +#ifndef VIMAGE +int ipstealth; +#endif SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif @@ -210,6 +213,19 @@ static void ip_freef(struct ipqhead *, struct ipq *); +#ifdef VIMAGE +static void vnet_inet_register(void); + +VNET_MOD_DECLARE(INET, inet, NULL, NULL, NET, NULL) + +static void vnet_inet_register() +{ + vnet_mod_register(&vnet_inet_modinfo); +} + +SYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0); +#endif + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -217,11 +233,58 @@ void ip_init(void) { + INIT_VNET_INET(curvnet); struct protosw *pr; int i; TAILQ_INIT(&V_in_ifaddrhead); - V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); + V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, + &V_in_ifaddrhmask); + + /* Initialize IP reassembly queue. 
*/ + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&V_ipq[i]); + V_nipq = 0; + V_maxnipq = nmbclusters / 32; + V_maxfragsperpacket = 16; + V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, + NULL, UMA_ALIGN_PTR, 0); + maxnipq_update(); + + V_ipsendredirects = 1; /* XXX */ + V_ip_defttl = IPDEFTTL; +#ifdef IPSTEALTH + V_ipstealth = 0; +#endif + + V_ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ + V_ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_reservedlow = 0; + V_ipport_randomized = 1; /* user controlled via sysctl */ + V_ipport_randomcps = 10; /* user controlled via sysctl */ + V_ipport_randomtime = 45; /* user controlled via sysctl */ + V_ipport_stoprandom = 0; /* toggled by ipport_tick */ + + V_rsvp_on = 0; + V_ipforwarding = 0; + V_ipsendredirects = 1; /* XXX */ + V_ip_defttl = IPDEFTTL; + V_ip_keepfaith = 0; + V_ip_sendsourcequench = 0; + V_ip_do_randomid = 0; + V_ip_checkinterface = 0; + +#ifdef VIMAGE + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); @@ -249,25 +312,16 @@ printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); - /* Initialize IP reassembly queue. */ - IPQ_LOCK_INIT(); - for (i = 0; i < IPREASS_NHASH; i++) - TAILQ_INIT(&V_ipq[i]); - V_maxnipq = nmbclusters / 32; - V_maxfragsperpacket = 16; - V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, - NULL, UMA_ALIGN_PTR, 0); - maxnipq_update(); - /* Start ipport_tick. 
*/ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); - ipport_tick(NULL); + callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. */ + IPQ_LOCK_INIT(); V_ip_id = time_second & 0xffff; ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); @@ -1573,8 +1627,10 @@ * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ +#ifndef VIMAGE static int ip_rsvp_on; struct socket *ip_rsvpd; +#endif int ip_rsvp_init(struct socket *so) { Index: netinet/ip_ipsec.c =========================================================================== --- netinet/ip_ipsec.c 2008/08/25 00:28:58 #6 +++ netinet/ip_ipsec.c 2008/08/25 00:28:58 @@ -47,6 +47,7 @@ #include #include +#include #include #include #include @@ -62,6 +63,7 @@ #include #include #include +#include #endif /*IPSEC*/ extern struct protosw inetsw[]; @@ -141,9 +143,9 @@ int ip_ipsec_input(struct mbuf *m) { + INIT_VNET_IPSEC(curvnet); struct ip *ip = mtod(m, struct ip *); #ifdef IPSEC - INIT_VNET_IPSEC(curvnet); struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; Index: netinet/ip_mroute.c =========================================================================== --- netinet/ip_mroute.c 2008/08/25 00:28:58 #7 +++ netinet/ip_mroute.c 2008/08/25 00:28:58 @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include Index: netinet/ip_options.c =========================================================================== --- netinet/ip_options.c 2008/08/25 00:28:58 #6 +++ netinet/ip_options.c 2008/08/25 00:28:58 @@ -47,6 +47,7 @@ #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include #include +#include #include #include #include Index: netinet/ip_output.c 
=========================================================================== --- netinet/ip_output.c 2008/08/25 00:28:58 #7 +++ netinet/ip_output.c 2008/08/25 00:28:58 @@ -52,6 +52,7 @@ #include #include +#include #include #include #include @@ -60,6 +61,7 @@ #include #endif +#include #include #include #include @@ -83,7 +85,9 @@ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); +#ifndef VIMAGE u_short ip_id; +#endif #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; Index: netinet/ip_var.h =========================================================================== --- netinet/ip_var.h 2008/08/25 00:28:58 #7 +++ netinet/ip_var.h 2008/08/25 00:28:58 @@ -175,19 +175,22 @@ struct route; struct sockopt; +#ifndef VIMAGE extern struct ipstat ipstat; extern u_short ip_id; /* ip packet ctr, for ids */ extern int ip_defttl; /* default IP ttl */ extern int ipforwarding; /* ip forwarding */ +extern int ip_do_randomid; #ifdef IPSTEALTH extern int ipstealth; /* stealth forwarding */ #endif -extern u_char ip_protox[]; +extern int rsvp_on; extern struct socket *ip_rsvpd; /* reservation protocol daemon */ extern struct socket *ip_mrouter; /* multicast routing daemon */ +#endif +extern u_char ip_protox[]; extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); -extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; void inp_freemoptions(struct ip_moptions *); @@ -220,6 +223,9 @@ int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); +#ifdef VIMAGE +void rip_destroy(void); +#endif void rip_input(struct mbuf *, int); int rip_output(struct mbuf *, struct socket *, u_long); void ipip_input(struct mbuf *, int); @@ -234,9 +240,7 @@ void in_delayed_cksum(struct mbuf *m); -static __inline uint16_t ip_newid(void); -extern int ip_do_randomid; - +#if 0 static __inline uint16_t ip_newid(void) { @@ -245,6 +249,9 @@ return htons(V_ip_id++); } +#else +#define ip_newid() (V_ip_do_randomid ? 
ip_randomid() : htons(V_ip_id++)) /* counter path keeps htons(), matching the #if 0'd inline version above */ +#endif #endif /* _KERNEL */ Index: netinet/ipprotosw.h =========================================================================== --- netinet/ipprotosw.h 2008/08/25 00:28:58 #1 +++ netinet/ipprotosw.h 2008/08/25 00:28:58 @@ -87,6 +87,7 @@ void *pr_ousrreq; /* utility hooks */ pr_init_t *pr_init; + pr_destroy_t *pr_destroy; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ Index: netinet/raw_ip.c =========================================================================== --- netinet/raw_ip.c 2008/08/25 00:28:58 #11 +++ netinet/raw_ip.c 2008/08/25 00:28:58 @@ -56,9 +56,11 @@ #include +#include #include #include +#include #include #include #include @@ -76,8 +78,11 @@ #include +#ifndef VIMAGE struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; +#endif +static struct uma_zone *ripcb_zone; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; @@ -91,7 +96,9 @@ /* * The socket used to communicate with the multicast routing daemon. */ +#ifndef VIMAGE struct socket *ip_mrouter; +#endif /* * The various mrouter and rsvp functions. 
@@ -177,6 +184,16 @@ { INIT_VNET_INET(curvnet); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + ripcb_zone = uma_zcreate("ripcb", sizeof(struct inpcb), + NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +#ifdef VIMAGE + } + V_ripcbinfo.ipi_vnet = curvnet; +#endif + INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); LIST_INIT(&V_ripcb); V_ripcbinfo.ipi_listhead = &V_ripcb; @@ -184,13 +201,25 @@ hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask); V_ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); - V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), - NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + V_ripcbinfo.ipi_zone = ripcb_zone; uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +rip_destroy(void) +{ + INIT_VNET_INET(curvnet); + + hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB, + V_ripcbinfo.ipi_hashmask); + hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB, + V_ripcbinfo.ipi_porthashmask); +} +#endif + static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) Index: netinet/sctp_output.c =========================================================================== --- netinet/sctp_output.c 2008/08/25 00:28:58 #9 +++ netinet/sctp_output.c 2008/08/25 00:28:58 @@ -33,8 +33,11 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/sctp_output.c,v 1.75 2008/08/24 18:29:22 bz Exp $"); + #include #include +#include +#include #include #include #include Index: netinet/tcp_hostcache.c =========================================================================== --- netinet/tcp_hostcache.c 2008/08/25 00:28:58 #9 +++ netinet/tcp_hostcache.c 2008/08/25 00:28:58 @@ -57,11 +57,6 @@ * of bucket limit memory constrains. */ -/* - * Many thanks to jlemon for basic structure of tcp_syncache which is being - * followed here. 
- */ - #include __FBSDID("$FreeBSD: src/sys/netinet/tcp_hostcache.c,v 1.20 2008/08/20 01:24:55 julian Exp $"); @@ -78,8 +73,10 @@ #include #include +#include #include +#include #include #include #include @@ -95,60 +92,21 @@ #ifdef INET6 #include #endif +#include #include -TAILQ_HEAD(hc_qhead, hc_metrics); - -struct hc_head { - struct hc_qhead hch_bucket; - u_int hch_length; - struct mtx hch_mtx; -}; - -struct hc_metrics { - /* housekeeping */ - TAILQ_ENTRY(hc_metrics) rmx_q; - struct hc_head *rmx_head; /* head of bucket tail queue */ - struct in_addr ip4; /* IP address */ - struct in6_addr ip6; /* IP6 address */ - /* endpoint specific values for TCP */ - u_long rmx_mtu; /* MTU for this path */ - u_long rmx_ssthresh; /* outbound gateway buffer limit */ - u_long rmx_rtt; /* estimated round trip time */ - u_long rmx_rttvar; /* estimated rtt variance */ - u_long rmx_bandwidth; /* estimated bandwidth */ - u_long rmx_cwnd; /* congestion window */ - u_long rmx_sendpipe; /* outbound delay-bandwidth product */ - u_long rmx_recvpipe; /* inbound delay-bandwidth product */ - /* TCP hostcache internal data */ - int rmx_expire; /* lifetime for object */ - u_long rmx_hits; /* number of hits */ - u_long rmx_updates; /* number of updates */ -}; - /* Arbitrary values */ #define TCP_HOSTCACHE_HASHSIZE 512 #define TCP_HOSTCACHE_BUCKETLIMIT 30 #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ -struct tcp_hostcache { - struct hc_head *hashbase; - uma_zone_t zone; - u_int hashsize; - u_int hashmask; - u_int bucket_limit; - u_int cache_count; - u_int cache_limit; - int expire; - int prune; - int purgeall; -}; +#ifndef VIMAGE static struct tcp_hostcache tcp_hostcache; - static struct callout tcp_hc_callout; +#endif static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); @@ -220,16 +178,16 @@ V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; 
V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; V_tcp_hostcache.cache_limit = - V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; + V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", - &V_tcp_hostcache.hashsize); + &V_tcp_hostcache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", - &V_tcp_hostcache.cache_limit); + &V_tcp_hostcache.cache_limit); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", - &V_tcp_hostcache.bucket_limit); + &V_tcp_hostcache.bucket_limit); if (!powerof2(V_tcp_hostcache.hashsize)) { printf("WARNING: hostcache hash size is not a power of 2.\n"); V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ @@ -255,6 +213,8 @@ /* * Allocate the hostcache entries. + * + * XXX don't need a separate zone for each hc instance - revisit!!! */ V_tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), @@ -269,6 +229,16 @@ tcp_hc_purge, 0); } +void +tcp_hc_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX TODO walk the hashtable and free all entries */ + + callout_drain(&V_tcp_hc_callout); +} + /* * Internal function: look up an entry in the hostcache or return NULL. 
* @@ -673,9 +643,10 @@ static void tcp_hc_purge(void *arg) { + CURVNET_SET((struct vnet *) arg); INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry, *hc_next; - int all = (intptr_t)arg; + int all = 0; int i; if (V_tcp_hostcache.purgeall) { @@ -701,4 +672,6 @@ callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, arg); + + CURVNET_RESTORE(); } Index: netinet/tcp_hostcache.h =========================================================================== *** /dev/null Mon Aug 25 00:22:00 2008 --- netinet/tcp_hostcache.h Mon Aug 25 00:29:03 2008 *************** *** 0 **** --- 1,82 ---- + /*- + * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS ID + */ + + /* + * Many thanks to jlemon for basic structure of tcp_syncache which is being + * followed here. + */ + + #ifndef _NETINET_TCP_HOSTCACHE_H_ + #define _NETINET_TCP_HOSTCACHE_H_ + + TAILQ_HEAD(hc_qhead, hc_metrics); + + struct hc_head { + struct hc_qhead hch_bucket; + u_int hch_length; + struct mtx hch_mtx; + }; + + struct hc_metrics { + /* housekeeping */ + TAILQ_ENTRY(hc_metrics) rmx_q; + struct hc_head *rmx_head; /* head of bucket tail queue */ + struct in_addr ip4; /* IP address */ + struct in6_addr ip6; /* IP6 address */ + /* endpoint specific values for tcp */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ + /* TCP hostcache internal data */ + int rmx_expire; /* lifetime for object */ + u_long rmx_hits; /* number of hits */ + u_long rmx_updates; /* number of updates */ + }; + + struct tcp_hostcache { + struct hc_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; + u_int cache_limit; + int expire; + int prune; + int purgeall; + }; + + #endif /* !_NETINET_TCP_HOSTCACHE_H_*/ Index: 
netinet/tcp_input.c =========================================================================== --- netinet/tcp_input.c 2008/08/25 00:28:58 #11 +++ netinet/tcp_input.c 2008/08/25 00:28:58 @@ -57,11 +57,13 @@ #include +#include #include #include #define TCPSTATES /* for logging */ +#include #include #include #include @@ -73,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +94,7 @@ #ifdef IPSEC #include #include +#include #endif /*IPSEC*/ #include @@ -99,7 +103,23 @@ static const int tcprexmtthresh = 3; -struct tcpstat tcpstat; +#ifndef VIMAGE +struct inpcbhead tcb; +struct inpcbinfo tcbinfo; +struct tcpstat tcpstat; +int blackhole; +int tcp_delack_enabled; +int drop_synfin; +int tcp_do_rfc3042; +int tcp_do_rfc3390; +int tcp_do_ecn; +int tcp_ecn_maxretries; +int tcp_insecure_rst; +int tcp_do_autorcvbuf; +int tcp_autorcvbuf_inc; +int tcp_autorcvbuf_max; +#endif + SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); @@ -108,61 +128,47 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); -static int blackhole = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, blackhole, 0, "Do not send RST on segments to closed ports"); -int tcp_delack_enabled = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); -static int drop_synfin = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); -static int tcp_do_rfc3042 = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); -static int tcp_do_rfc3390 = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, 
rfc3390, CTLFLAG_RW, tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); -int tcp_do_ecn = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support"); -int tcp_ecn_maxretries = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN"); -static int tcp_insecure_rst = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets"); -int tcp_do_autorcvbuf = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); -int tcp_autorcvbuf_inc = 16*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer"); -int tcp_autorcvbuf_max = 256*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); -struct inpcbhead tcb; -#define tcb6 tcb /* for KAME src sync over BSD*'s */ -struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, @@ -258,6 +264,13 @@ void tcp_input(struct mbuf *m, int off0) { + INIT_VNET_INET(curvnet); +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif +#ifdef IPSEC + INIT_VNET_IPSEC(curvnet); +#endif struct tcphdr *th; struct ip *ip = NULL; struct ipovly *ipov; Index: netinet/tcp_offload.c =========================================================================== --- netinet/tcp_offload.c 2008/08/25 00:28:58 #5 +++ netinet/tcp_offload.c 2008/08/25 00:28:58 @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -102,6 
+103,7 @@ void tcp_offload_twstart(struct tcpcb *tp) { + INIT_VNET_INET(curvnet); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(tp->t_inpcb); @@ -112,6 +114,7 @@ struct tcpcb * tcp_offload_close(struct tcpcb *tp) { + INIT_VNET_INET(curvnet); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(tp->t_inpcb); @@ -126,6 +129,7 @@ struct tcpcb * tcp_offload_drop(struct tcpcb *tp, int error) { + INIT_VNET_INET(curvnet); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(tp->t_inpcb); Index: netinet/tcp_output.c =========================================================================== --- netinet/tcp_output.c 2008/08/25 00:28:58 #11 +++ netinet/tcp_output.c 2008/08/25 00:28:58 @@ -51,8 +51,10 @@ #include #include +#include #include +#include #include #include #include @@ -87,40 +89,42 @@ extern struct mbuf *m_copypack(); #endif -int path_mtu_discovery = 1; +#ifndef VIMAGE +int path_mtu_discovery; +int ss_fltsz; +int ss_fltsz_local; +int tcp_do_newreno; +int tcp_do_tso; +int tcp_do_autosndbuf; +int tcp_autosndbuf_inc; +int tcp_autosndbuf_max; +#endif SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, path_mtu_discovery, 1, "Enable Path MTU Discovery"); -int ss_fltsz = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, ss_fltsz, 1, "Slow start flight size"); -int ss_fltsz_local = 4; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_newreno = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, tcp_do_newreno, 0, "Enable NewReno Algorithms"); -int tcp_do_tso = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, tcp_do_tso, 0, "Enable TCP Segmentation Offload"); -int tcp_do_autosndbuf = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); -int tcp_autosndbuf_inc = 
8*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); -int tcp_autosndbuf_max = 256*1024; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); Index: netinet/tcp_reass.c =========================================================================== --- netinet/tcp_reass.c 2008/08/25 00:28:58 #7 +++ netinet/tcp_reass.c 2008/08/25 00:28:58 @@ -52,6 +52,7 @@ #include #include +#include #include #include #include @@ -77,22 +78,25 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); +#ifndef VIMAGE static int tcp_reass_maxseg = 0; +int tcp_reass_qsize = 0; +static int tcp_reass_maxqlen = 48; +static int tcp_reass_overflows = 0; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); -int tcp_reass_qsize = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); -static int tcp_reass_maxqlen = 48; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); -static int tcp_reass_overflows = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); Index: netinet/tcp_sack.c =========================================================================== --- netinet/tcp_sack.c 2008/08/25 00:28:58 #7 +++ netinet/tcp_sack.c 2008/08/25 00:28:58 @@ -95,9 +95,11 @@ #include +#include #include #include +#include #include #include #include @@ -124,23 +126,25 @@ extern struct uma_zone 
*sack_hole_zone; +#ifndef VIMAGE +int tcp_do_sack; +int tcp_sack_maxholes; +int tcp_sack_globalmaxholes; +int tcp_sack_globalholes; +#endif + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); -int tcp_do_sack = 1; -TUNABLE_INT("net.inet.tcp.sack.enable", &tcp_do_sack); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, tcp_do_sack, 0, "Enable/Disable TCP SACK support"); -int tcp_sack_maxholes = 128; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); -int tcp_sack_globalmaxholes = 65536; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); -int tcp_sack_globalholes = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); Index: netinet/tcp_subr.c =========================================================================== --- netinet/tcp_subr.c 2008/08/25 00:28:58 #8 +++ netinet/tcp_subr.c 2008/08/25 00:28:58 @@ -59,9 +59,11 @@ #include +#include #include #include +#include #include #include #include @@ -95,6 +97,7 @@ #include #endif #include +#include #ifdef IPSEC #include @@ -110,12 +113,31 @@ #include -int tcp_mssdflt = TCP_MSS; +static int tcp_tcbhashsize = 0; +static int do_tcpdrain = 1; +static int tcp_inflight_debug = 0; + +#ifndef VIMAGE +int tcp_mssdflt; +int tcp_minmss; +int tcp_do_rfc1323; +static int icmp_may_rst; +static int tcp_isn_reseed_interval; +static int tcp_inflight_enable; +static int tcp_inflight_rttthresh; +static int tcp_inflight_min; +static int tcp_inflight_max; +static int tcp_inflight_stab; +static int nolocaltimewait; +#endif + SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, tcp_mssdflt, 0, "Default TCP Maximum Segment 
Size"); #ifdef INET6 -int tcp_v6mssdflt = TCP6_MSS; +#ifndef VIMAGE +int tcp_v6mssdflt; +#endif SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6"); @@ -129,11 +151,9 @@ * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ -int tcp_minmss = TCP_MINMSS; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); -int tcp_do_rfc1323 = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); @@ -142,23 +162,19 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); -static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); -static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs"); -static int icmp_may_rst = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); -static int tcp_isn_reseed_interval = 0; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); @@ -171,29 +187,30 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); -static int tcp_inflight_enable = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, 
tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); -static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); -static int tcp_inflight_rttthresh; -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", +static int sysctl_tcp_inflight_rttthresh(SYSCTL_HANDLER_ARGS); + +static int +sysctl_tcp_inflight_rttthresh(SYSCTL_HANDLER_ARGS) +{ + return (0); /* XXX MARKO REVISIT */ +} +SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_tcp_inflight_rttthresh, "I", "RTT threshold below which inflight will deactivate itself"); -static int tcp_inflight_min = 6144; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); -static int tcp_inflight_stab = 20; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); @@ -233,14 +250,15 @@ #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) +static struct uma_zone *tcp_ipi_zone; + /* * TCP initialization. 
*/ static void tcp_zone_change(void *tag) { - - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(tcp_ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_zone_change(); } @@ -259,6 +277,26 @@ { INIT_VNET_INET(curvnet); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + tcp_ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcp_ipi_zone, maxsockets); + /* + * These have to be type stable for the benefit of the timers. + */ + tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcpcb_zone, maxsockets); + sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +#ifdef VIMAGE + } +#endif + + tcp_tw_init(); + int hashsize = TCBHASHSIZE; tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; @@ -272,6 +310,44 @@ tcp_rexmit_slop = TCPTV_CPU_VAR; V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; + V_path_mtu_discovery = 1; + V_ss_fltsz = 1; + V_ss_fltsz_local = 4; + V_tcp_do_newreno = 1; + V_tcp_do_tso = 1; + V_tcp_do_autosndbuf = 1; + V_tcp_autosndbuf_inc = 8*1024; + V_tcp_autosndbuf_max = 256*1024; + V_blackhole = 0; + V_tcp_delack_enabled = 1; + V_drop_synfin = 0; + V_tcp_do_rfc3042 = 1; + V_tcp_do_rfc3390 = 1; + V_tcp_insecure_rst = 0; + V_tcp_do_ecn = 0; + V_tcp_ecn_maxretries = 1; + V_tcp_do_autorcvbuf = 1; + V_tcp_autorcvbuf_inc = 16*1024; + V_tcp_autorcvbuf_max = 256*1024; + V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + V_tcp_mssdflt = TCP_MSS; +#ifdef INET6 + V_tcp_v6mssdflt = TCP6_MSS; +#endif + V_tcp_minmss = TCP_MINMSS; + V_tcp_do_rfc1323 = 1; + V_icmp_may_rst = 1; + V_tcp_isn_reseed_interval = 0; + V_tcp_inflight_enable = 1; + V_tcp_inflight_min = 6144; + V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; 
+ V_tcp_inflight_stab = 20; + V_nolocaltimewait = 0; + V_tcp_do_sack = 1; + V_tcp_sack_maxholes = 128; + V_tcp_sack_globalmaxholes = 65536; + V_tcp_sack_globalholes = 0; + TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); LIST_INIT(&V_tcb); @@ -286,9 +362,8 @@ &V_tcbinfo.ipi_hashmask); V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, &V_tcbinfo.ipi_porthashmask); - V_tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), - NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); + V_tcbinfo.ipi_zone = tcp_ipi_zone; + V_tcbinfo.ipi_vnet = curvnet; #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ @@ -299,28 +374,45 @@ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR - /* - * These have to be type stable for the benefit of the timers. - */ - tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcpcb_zone, maxsockets); - tcp_tw_init(); + syncache_init(); tcp_hc_init(); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); - tcp_isn_tick(NULL); + callout_reset(&isn_callout, 1, tcp_isn_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); - sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE void +tcp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + tcp_tw_destroy(); + tcp_hc_destroy(); + syncache_destroy(); + + /* XXX check that hashes are empty! 
*/ + hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, + V_tcbinfo.ipi_hashmask); + hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, + V_tcbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_tcbinfo); +} +#endif + +void tcp_fini(void *xtp) { @@ -601,6 +693,9 @@ return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; +#ifdef VIMAGE + tp->t_vnet = inp->inp_vnet; +#endif /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 @@ -858,6 +953,9 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; +#ifdef INVARIANTS + INIT_VNET_INET(inp->inp_vnet); +#endif INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); @@ -1387,10 +1485,12 @@ #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) +#ifndef VIMAGE static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; +#endif tcp_seq tcp_new_isn(struct tcpcb *tp) @@ -1471,6 +1571,9 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; +#ifdef INVARIANTS + INIT_VNET_INET(inp->inp_vnet); +#endif INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); Index: netinet/tcp_syncache.c =========================================================================== --- netinet/tcp_syncache.c 2008/08/25 00:28:58 #17 +++ netinet/tcp_syncache.c 2008/08/25 00:28:58 @@ -58,9 +58,11 @@ #include +#include #include #include +#include #include #include #include @@ -98,6 +100,12 @@ #include +#ifdef TCP_OFFLOAD_DISABLE +#define TOEPCB_ISSET(sc) (0) +#else +#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) +#endif + static int tcp_syncookies = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, &tcp_syncookies, 0, @@ -108,12 +116,6 @@ &tcp_syncookiesonly, 0, "Use only TCP SYN cookies"); -#ifdef TCP_OFFLOAD_DISABLE -#define TOEPCB_ISSET(sc) (0) -#else -#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) -#endif - static void syncache_drop(struct syncache *, struct syncache_head *); static 
void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); @@ -142,7 +144,10 @@ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 +#ifndef VIMAGE static struct tcp_syncache tcp_syncache; +int tcp_sc_rst_sock_fail; +#endif SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); @@ -166,7 +171,6 @@ rexmtlimit, CTLFLAG_RW, tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); -int tcp_sc_rst_sock_fail = 1; SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_RW, tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure"); @@ -228,6 +232,7 @@ V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); + V_tcp_sc_rst_sock_fail = 1; TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); @@ -253,6 +258,9 @@ /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { +#ifdef VIMAGE + V_tcp_syncache.hashbase[i].sch_vnet = curvnet; +#endif TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); @@ -262,11 +270,25 @@ } /* Create the syncache entry zone. */ + /* XXX one zone for all vnets should do fine - revisit!!! */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); } +#ifdef VIMAGE +void +syncache_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX walk the cache, free remaining objects, stop timers */ + + uma_zdestroy(V_tcp_syncache.zone); + FREE(V_tcp_syncache.hashbase, M_SYNCACHE); +} +#endif + /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. 
@@ -357,6 +379,7 @@ struct syncache *sc, *nsc; int tick = ticks; char *s; + CURVNET_SET(sch->sch_vnet); INIT_VNET_INET(sch->sch_vnet); /* NB: syncache_head has already been locked by the callout. */ @@ -407,6 +430,7 @@ if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); + CURVNET_RESTORE(); } /* @@ -924,6 +948,7 @@ tcp_offload_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { + INIT_VNET_INET(curvnet); int rc; INP_INFO_WLOCK(&V_tcbinfo); Index: netinet/tcp_syncache.h =========================================================================== --- netinet/tcp_syncache.h 2008/08/25 00:28:58 #3 +++ netinet/tcp_syncache.h 2008/08/25 00:28:58 @@ -1,6 +1,12 @@ /*- - * Copyright (c) 1982, 1986, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 2001 McAfee, Inc. + * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jonathan Lemon + * and McAfee Research, the Security Research Division of McAfee, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -10,14 +16,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
* - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,6 +38,9 @@ #ifdef _KERNEL void syncache_init(void); +#ifdef VIMAGE +void syncache_destroy(void); +#endif void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); @@ -49,7 +55,8 @@ void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); int syncache_pcbcount(void); -int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported); +int syncache_pcblist(struct sysctl_req *req, int max_pcbs, + int *pcbs_exported); struct syncache { TAILQ_ENTRY(syncache) sc_hash; @@ -118,4 +125,4 @@ }; #endif /* _KERNEL */ -#endif /* !_NETINET_TCP_SYNCACHE_H_ */ +#endif /* _NETINET_TCP_SYNCACHE_H_ */ Index: netinet/tcp_timer.c =========================================================================== --- netinet/tcp_timer.c 2008/08/25 00:28:58 #7 +++ netinet/tcp_timer.c 2008/08/25 00:28:58 @@ -47,8 +47,10 @@ #include #include +#include #include +#include #include #include #include Index: netinet/tcp_timewait.c =========================================================================== --- netinet/tcp_timewait.c 2008/08/25 00:28:58 #7 +++ netinet/tcp_timewait.c 2008/08/25 00:28:58 @@ -57,6 +57,7 @@ #include #include +#include #include 
#include #include @@ -102,7 +103,9 @@ * queue pointers in each tcptw structure, are protected using the global * tcbinfo lock, which must be held over queue iteration and modification. */ +#ifndef VIMAGE static TAILQ_HEAD(, tcptw) twq_2msl; +#endif static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *); @@ -163,7 +166,15 @@ void tcp_tw_init(void) { + INIT_VNET_INET(curvnet); + TAILQ_INIT(&V_twq_2msl); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); @@ -171,8 +182,21 @@ uma_zone_set_max(tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(tcptw_zone, maxtcptw); - TAILQ_INIT(&V_twq_2msl); +} + +#ifdef VIMAGE +void +tcp_tw_destroy(void) +{ + INIT_VNET_INET(curvnet); + struct tcptw *tw; + + INP_INFO_WLOCK(&V_tcbinfo); + while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + tcp_twclose(tw, 0); + INP_INFO_WUNLOCK(&V_tcbinfo); } +#endif /* * Move a TCP connection into TIME_WAIT state. Index: netinet/tcp_usrreq.c =========================================================================== --- netinet/tcp_usrreq.c 2008/08/25 00:28:58 #7 +++ netinet/tcp_usrreq.c 2008/08/25 00:28:58 @@ -59,9 +59,11 @@ #include #endif +#include #include #include +#include #include #include #ifdef INET6 Index: netinet/tcp_var.h =========================================================================== --- netinet/tcp_var.h 2008/08/25 00:28:58 #2 +++ netinet/tcp_var.h 2008/08/25 00:28:58 @@ -35,6 +35,8 @@ #include +struct vnet; + /* * Kernel variables for tcp. 
*/ @@ -48,7 +50,6 @@ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); -extern int tcp_reass_qsize; extern struct uma_zone *tcp_reass_zone; struct sackblk { @@ -186,6 +187,7 @@ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ + u_char snd_limited; /* segments limited transmitted */ u_int32_t ts_recent; /* timestamp echo data */ u_long ts_recent_age; /* when last updated */ u_int32_t ts_offset; /* our timestamp offset */ @@ -196,7 +198,7 @@ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ - u_char snd_limited; /* segments limited transmitted */ + struct vnet *t_vnet; /* back pointer to parent vnet */ /* SACK related state */ int snd_numholes; /* number of holes seen by sender */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; @@ -505,10 +507,10 @@ MALLOC_DECLARE(M_TCPLOG); #endif +#ifndef VIMAGE extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ -extern int tcp_log_in_vain; extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_delack_enabled; @@ -516,11 +518,30 @@ extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local; - extern int tcp_do_sack; /* SACK enabled/disabled */ extern int tcp_sc_rst_sock_fail; /* RST on sock alloc failure */ extern int tcp_do_ecn; /* TCP ECN enabled/disabled */ extern int tcp_ecn_maxretries; +extern int tcp_sack_maxholes; +extern int tcp_sack_globalmaxholes; +extern int tcp_sack_globalholes; +extern int tcp_do_tso; +extern int tcp_do_autosndbuf; +extern int tcp_autosndbuf_max; +extern int tcp_autosndbuf_inc; +extern int tcp_autorcvbuf; +extern int tcp_do_autorcvbuf; +extern int tcp_autorcvbuf_inc; +extern int 
tcp_autorcvbuf_max; +extern int blackhole; +extern int drop_synfin; +extern int tcp_do_rfc3042; +extern int tcp_do_rfc3390; +extern int tcp_insecure_rst; +extern int tcp_reass_qsize; +#endif + +extern int tcp_log_in_vain; int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * @@ -538,6 +559,7 @@ void tcp_drain(void); void tcp_fasttimo(void); void tcp_init(void); +void tcp_destroy(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); @@ -558,6 +580,9 @@ void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); +#ifdef VIMAGE +void tcp_tw_destroy(void); +#endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); @@ -578,6 +603,7 @@ * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); +void tcp_hc_destroy(void); void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); u_long tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, u_long); Index: netinet/udp_usrreq.c =========================================================================== --- netinet/udp_usrreq.c 2008/08/25 00:28:58 #16 +++ netinet/udp_usrreq.c 2008/08/25 00:28:58 @@ -61,9 +61,11 @@ #include +#include #include #include +#include #include #include #include @@ -84,6 +86,7 @@ #ifdef IPSEC #include +#include #endif #include @@ -129,14 +132,18 @@ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); +#ifndef VIMAGE struct inpcbhead udb; /* from udp_var.h */ struct inpcbinfo udbinfo; +#endif #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif +#ifndef VIMAGE struct udpstat udpstat; /* from udp_var.h */ +#endif SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); @@ 
-145,11 +152,12 @@ static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); +static struct uma_zone *udp_ipi_zone; + static void udp_zone_change(void *tag) { - - uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); + uma_zone_set_max(udp_ipi_zone, maxsockets); } static int @@ -167,6 +175,19 @@ { INIT_VNET_INET(curvnet); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + udp_ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, + NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(udp_ipi_zone, maxsockets); + EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, + EVENTHANDLER_PRI_ANY); +#ifdef VIMAGE + } + V_udbinfo.ipi_vnet = curvnet; +#endif + INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); LIST_INIT(&V_udb); V_udbinfo.ipi_listhead = &V_udb; @@ -174,12 +195,22 @@ &V_udbinfo.ipi_hashmask); V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, &V_udbinfo.ipi_porthashmask); - V_udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, - NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); - EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, - EVENTHANDLER_PRI_ANY); + V_udbinfo.ipi_zone = udp_ipi_zone; +} + +#ifdef VIMAGE +void +udp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + hashdestroy(V_udbinfo.ipi_hashbase, M_PCB, + V_udbinfo.ipi_hashmask); + hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB, + V_udbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_udbinfo); } +#endif /* * Subroutine of udp_input(), which appends the provided mbuf chain to the Index: netinet/udp_var.h =========================================================================== --- netinet/udp_var.h 2008/08/25 00:28:58 #2 +++ netinet/udp_var.h 2008/08/25 00:28:58 @@ -94,16 +94,22 @@ SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; + +#ifndef VIMAGE extern struct inpcbhead udb; extern struct inpcbinfo udbinfo; 
+extern struct udpstat udpstat; +#endif extern u_long udp_sendspace; extern u_long udp_recvspace; -extern struct udpstat udpstat; extern int udp_blackhole; extern int udp_log_in_vain; void udp_ctlinput(int, struct sockaddr *, void *); void udp_init(void); +#ifdef VIMAGE +void udp_destroy(void); +#endif void udp_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); int udp_shutdown(struct socket *so); Index: netinet/vinet.h =========================================================================== --- netinet/vinet.h 2008/08/25 00:28:58 #1 +++ netinet/vinet.h 2008/08/25 00:28:58 @@ -270,6 +270,7 @@ #define V_tcp_reass_maxqlen VNET_INET(tcp_reass_maxqlen) #define V_tcp_reass_overflows VNET_INET(tcp_reass_overflows) +/* pf needs to get to these (!?) */ #define V_isn_secret VNET_INET(isn_secret) #define V_isn_last_reseed VNET_INET(isn_last_reseed) #define V_isn_offset VNET_INET(isn_offset) Index: netinet6/dest6.c =========================================================================== --- netinet6/dest6.c 2008/08/25 00:28:58 #6 +++ netinet6/dest6.c 2008/08/25 00:28:58 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include #include Index: netinet6/frag6.c =========================================================================== --- netinet6/frag6.c 2008/08/25 00:28:58 #8 +++ netinet6/frag6.c 2008/08/25 00:28:58 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include #include /* for ECN definitions */ @@ -73,9 +75,11 @@ /* * These fields all protected by ip6qlock. 
*/ +#ifndef VIMAGE static u_int frag6_nfragpackets; static u_int frag6_nfrags; static struct ip6q ip6q; /* ip6 reassemble queue */ +#endif #define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF); #define IP6Q_LOCK() mtx_lock(&ip6qlock) @@ -102,14 +106,17 @@ { INIT_VNET_INET6(curvnet); + V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); IP6Q_LOCK_INIT(); - - V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; } /* @@ -689,6 +696,7 @@ struct ip6q *q6; IP6Q_LOCK(); + VNET_ITERLOOP_BEGIN() INIT_VNET_INET6(curvnet); q6 = V_ip6q.ip6q_next; if (q6) @@ -712,6 +720,7 @@ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(V_ip6q.ip6q_prev); } + VNET_ITERLOOP_END() IP6Q_UNLOCK(); #if 0 @@ -740,11 +749,13 @@ if (IP6Q_TRYLOCK() == 0) return; + VNET_ITERLOOP_BEGIN() INIT_VNET_INET6(curvnet); while (V_ip6q.ip6q_next != &V_ip6q) { V_ip6stat.ip6s_fragdropped++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(V_ip6q.ip6q_next); } + VNET_ITERLOOP_END() IP6Q_UNLOCK(); } Index: netinet6/icmp6.c =========================================================================== --- netinet6/icmp6.c 2008/08/25 00:28:58 #11 +++ netinet6/icmp6.c 2008/08/25 00:28:58 @@ -83,17 +83,20 @@ #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -111,12 +114,16 @@ struct icmp6stat icmp6stat; +#ifndef VIMAGE extern struct inpcbinfo ripcbinfo; extern struct inpcbhead ripcb; extern int icmp6errppslim; -static int icmp6errpps_count = 0; +static int icmp6errpps_count; +#endif /* !VIMAGE */ static struct timeval icmp6errppslim_last; +#ifndef VIMAGE extern int icmp6_nodeinfo; +#endif /* !VIMAGE */ static void icmp6_errcount(struct icmp6errstat *, int, 
int); static int icmp6_rip6_input(struct mbuf **, int); @@ -137,6 +144,7 @@ icmp6_init(void) { INIT_VNET_INET6(curvnet); + V_icmp6errpps_count = 0; mld6_init(); } @@ -393,7 +401,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) { INIT_VNET_INET6(curvnet); - /* XXX this below is WRONG - MARKO */ + /* XXX this below is WRONG - MARKO */ INIT_VPROCG(TD_TO_VPROCG(curthread)); struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; @@ -2802,7 +2810,7 @@ ret = 0; /* okay to send */ /* PPS limit */ - if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, + if (!ppsratecheck(&icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; Index: netinet6/in6.c =========================================================================== --- netinet6/in6.c 2008/08/25 00:28:58 #8 +++ netinet6/in6.c 2008/08/25 00:28:58 @@ -80,6 +80,7 @@ #include #include +#include #include #include #include @@ -93,6 +94,7 @@ #include #include +#include #include #include #include Index: netinet6/in6_gif.c =========================================================================== --- netinet6/in6_gif.c 2008/08/25 00:28:58 #7 +++ netinet6/in6_gif.c 2008/08/25 00:28:58 @@ -61,6 +61,7 @@ #include #include #include +#include #endif #include #include @@ -74,12 +75,15 @@ struct ifnet *); extern struct domain inet6domain; -struct ip6protosw in6_gif_protosw = -{ SOCK_RAW, &inet6domain, 0/* IPPROTO_IPV[46] */, PR_ATOMIC|PR_ADDR, - in6_gif_input, rip6_output, 0, rip6_ctloutput, - 0, - 0, 0, 0, 0, - &rip6_usrreqs +struct ip6protosw in6_gif_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inet6domain, + .pr_protocol = 0/* IPPROTO_IPV[46] */, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = in6_gif_input, + .pr_output = rip6_output, + .pr_ctloutput = rip6_ctloutput, + .pr_usrreqs = &rip6_usrreqs }; int Index: netinet6/in6_ifattach.c =========================================================================== --- netinet6/in6_ifattach.c 2008/08/25
00:28:58 #10 +++ netinet6/in6_ifattach.c 2008/08/25 00:28:58 @@ -42,17 +42,20 @@ #include #include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -61,18 +64,13 @@ #include #include -unsigned long in6_maxmtu = 0; - -#ifdef IP6_AUTO_LINKLOCAL -int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; -#else -int ip6_auto_linklocal = 1; /* enable by default */ -#endif - +#ifndef VIMAGE +unsigned long in6_maxmtu; +int ip6_auto_linklocal; struct callout in6_tmpaddrtimer_ch; - +extern struct inpcbinfo ripcbinfo; extern struct inpcbinfo udbinfo; -extern struct inpcbinfo ripcbinfo; +#endif static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); @@ -873,8 +871,9 @@ } void -in6_tmpaddrtimer(void *ignored_arg) +in6_tmpaddrtimer(void *arg) { + CURVNET_SET((struct vnet *) arg); INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct nd_ifinfo *ndi; @@ -883,7 +882,7 @@ callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); + V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, arg); bzero(nullbuf, sizeof(nullbuf)); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; @@ -899,6 +898,7 @@ } } + CURVNET_RESTORE(); } static void Index: netinet6/in6_pcb.c =========================================================================== --- netinet6/in6_pcb.c 2008/08/25 00:28:58 #8 +++ netinet6/in6_pcb.c 2008/08/25 00:28:58 @@ -86,10 +86,12 @@ #include +#include #include #include #include +#include #include #include #include @@ -97,6 +99,7 @@ #include #include +#include #include #include #include Index: netinet6/in6_proto.c =========================================================================== --- netinet6/in6_proto.c 2008/08/25 00:28:58 #7 +++ netinet6/in6_proto.c 2008/08/25 00:28:58 @@ -88,6 +88,7 @@ #ifdef RADIX_MPATH #include #endif +#include 
#include #include @@ -96,6 +97,7 @@ #include #include #include +#include #include #include @@ -145,6 +147,9 @@ .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, +#ifdef VIMAGE + .pr_destroy = ip6_destroy, +#endif .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, @@ -345,25 +350,9 @@ }; extern int in6_inithead(void **, int); - -struct domain inet6domain = { - .dom_family = AF_INET6, - .dom_name = "internet6", - .dom_protosw = (struct protosw *)inet6sw, - .dom_protoswNPROTOSW = (struct protosw *) - &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], -#ifdef RADIX_MPATH - .dom_rtattach = rn6_mpath_inithead, -#else - .dom_rtattach = in6_inithead, +#ifdef VIMAGE +extern int in6_detachhead(void **, int); #endif - .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, - .dom_maxrtkey = sizeof(struct sockaddr_in6), - .dom_ifattach = in6_domifattach, - .dom_ifdetach = in6_domifdetach -}; - -DOMAIN_SET(inet6); /* * Internet configuration info @@ -380,29 +369,31 @@ #define IPV6_SENDREDIRECTS 1 #endif -int ip6_forwarding = IPV6FORWARDING; /* act as router? */ -int ip6_sendredirects = IPV6_SENDREDIRECTS; -int ip6_defhlim = IPV6_DEFHLIM; -int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; -int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ +#ifndef VIMAGE +int ip6_forwarding; /* act as router? */ +int ip6_sendredirects; +int ip6_defhlim; +int ip6_defmcasthlim; +int ip6_accept_rtadv; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ -int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ -int ip6_log_interval = 5; -int ip6_hdrnestlimit = 15; /* How many header options will we process? 
*/ -int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ -int ip6_auto_flowlabel = 1; -int ip6_gif_hlim = 0; -int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ -int ip6_rr_prune = 5; /* router renumbering prefix +int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ +int ip6_log_interval; +int ip6_hdrnestlimit; /* How many header options will we process? */ +int ip6_dad_count; /* DupAddrDetectionTransmits */ +int ip6_auto_flowlabel; +int ip6_gif_hlim = 0; +int ip6_use_deprecated; /* allow deprecated addr (RFC2462 5.5.4) */ +int ip6_rr_prune; /* router renumbering prefix * walk list every 5 sec. */ -int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? */ -int ip6_v6only = 1; +int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */ +int ip6_v6only; -int ip6_keepfaith = 0; -time_t ip6_log_time = (time_t)0L; +int ip6_keepfaith; +time_t ip6_log_time; #ifdef IPSTEALTH -int ip6stealth = 0; +int ip6stealth; #endif +#endif /* !VIMAGE */ /* icmp6 */ /* @@ -410,8 +401,10 @@ * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? 
(jinmei@kame.net 19990310) */ -int pmtu_expire = 60*10; -int pmtu_probe = 60*2; +#ifndef VIMAGE +int pmtu_expire; +int pmtu_probe; +#endif /* raw IP6 parameters */ /* @@ -420,20 +413,21 @@ #define RIPV6SNDQ 8192 #define RIPV6RCVQ 8192 -u_long rip6_sendspace = RIPV6SNDQ; -u_long rip6_recvspace = RIPV6RCVQ; +#ifndef VIMAGE +u_long rip6_sendspace; +u_long rip6_recvspace; /* ICMPV6 parameters */ -int icmp6_rediraccept = 1; /* accept and process redirects */ -int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ -int icmp6errppslim = 100; /* 100pps */ +int icmp6_rediraccept; /* accept and process redirects */ +int icmp6_redirtimeout; +int icmp6errppslim; /* control how to respond to NI queries */ -int icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); +int icmp6_nodeinfo; /* UDP on IP6 parameters */ -int udp6_sendspace = 9216; /* really max datagram size */ -int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); - /* 40 1K datagrams */ +int udp6_sendspace; /* really max datagram size */ +int udp6_recvspace; +#endif /* !VIMAGE */ /* * sysctl related items. 
@@ -458,6 +452,9 @@ sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); +#ifdef VIMAGE + SYSCTL_RESOLVE_V_ARG1(); +#endif int error = 0; int old; @@ -478,6 +475,9 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); +#ifdef VIMAGE + SYSCTL_RESOLVE_V_ARG1(); +#endif int error = 0; int old; @@ -541,16 +541,16 @@ SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, rip6stat, rip6stat, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, - prefer_tempaddr, CTLFLAG_RW, ip6_prefer_tempaddr, 0, ""); + prefer_tempaddr, CTLFLAG_RW, ip6_prefer_tempaddr, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, - use_defaultzone, CTLFLAG_RW, ip6_use_defzone, 0,""); + use_defaultzone, CTLFLAG_RW, ip6_use_defzone, 0,""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGS, - maxfrags, CTLFLAG_RW, ip6_maxfrags, 0, ""); + maxfrags, CTLFLAG_RW, ip6_maxfrags, 0, ""); SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MCAST_PMTU, - mcast_pmtu, CTLFLAG_RW, ip6_mcast_pmtu, 0, ""); + mcast_pmtu, CTLFLAG_RW, ip6_mcast_pmtu, 0, ""); #ifdef IPSTEALTH -SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STEALTH, - stealth, CTLFLAG_RW, ip6stealth, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, +ip6stealth, 0, ""); #endif /* net.inet6.icmp6 */ @@ -579,3 +579,63 @@ SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_RW, nd6_debug, 0, ""); +static void +ip6_dom_init(void) +{ + INIT_VNET_INET6(curvnet); + + V_ip6_forwarding = IPV6FORWARDING; + V_ip6_sendredirects = IPV6_SENDREDIRECTS; + V_ip6_defhlim = IPV6_DEFHLIM; + V_ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; + V_ip6_accept_rtadv = 0; + V_ip6_log_interval = 5; + V_ip6_hdrnestlimit = 15; + V_ip6_dad_count = 1; + V_ip6_auto_flowlabel = 1; + V_ip6_use_deprecated = 1; + V_ip6_rr_prune = 5; + V_ip6_mcast_pmtu 
= 0; + V_ip6_v6only = 1; + V_ip6_keepfaith = 0; + V_ip6_log_time = (time_t)0L; +#ifdef IPSTEALTH + V_ip6stealth = 0; +#endif + V_pmtu_expire = 60*10; + V_pmtu_probe = 60*2; + V_rip6_sendspace = RIPV6SNDQ; + V_rip6_recvspace = RIPV6RCVQ; + + /* ICMPV6 parameters */ + V_icmp6_rediraccept = 1; + V_icmp6_redirtimeout = 10 * 60; /* 10 minutes */ + V_icmp6errppslim = 100; /* 100pps */ + /* control how to respond to NI queries */ + V_icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); + + /* UDP on IP6 parameters */ + V_udp6_sendspace = 9216; /* really max datagram size */ + V_udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); + /* 40 1K datagrams */ + +} + +struct domain inet6domain = { + .dom_family = AF_INET6, + .dom_name = "internet6", + .dom_protosw = (struct protosw *)inet6sw, + .dom_protoswNPROTOSW = (struct protosw *) + &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], + .dom_rtattach = in6_inithead, +#ifdef VIMAGE + .dom_rtdetach = in6_detachhead, +#endif + .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, + .dom_maxrtkey = sizeof(struct sockaddr_in6), + .dom_ifattach = in6_domifattach, + .dom_ifdetach = in6_domifdetach, + .dom_init = ip6_dom_init +}; + +DOMAIN_SET(inet6); Index: netinet6/in6_rmx.c =========================================================================== --- netinet6/in6_rmx.c 2008/08/25 00:28:58 #9 +++ netinet6/in6_rmx.c 2008/08/25 00:28:58 @@ -87,12 +87,15 @@ #include #include +#include #include #include #include #include #include +#include + #include #include @@ -105,6 +108,9 @@ #include extern int in6_inithead(void **head, int off); +#ifdef VIMAGE +extern int in6_detachhead(void **head, int off); +#endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -219,20 +225,20 @@ SYSCTL_DECL(_net_inet6_ip6); -static int rtq_reallyold6 = 60*60; - /* one hour is ``really old'' */ -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire, - CTLFLAG_RW, &rtq_reallyold6 , 0, ""); +#ifndef VIMAGE +static int 
rtq_reallyold6 = 60*60; /* one hour is ``really old'' */ +static int rtq_minreallyold6 = 10; /* never automatically crank down to less */ +static int rtq_toomany6 = 128; /* 128 cached routes is ``too many'' */ +#endif /* VIMAGE */ + +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RTEXPIRE, + rtexpire, CTLFLAG_RW, rtq_reallyold6 , 0, ""); -static int rtq_minreallyold6 = 10; - /* never automatically crank down to less */ -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire, - CTLFLAG_RW, &rtq_minreallyold6 , 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RTMINEXPIRE, + rtminexpire, CTLFLAG_RW, rtq_minreallyold6 , 0, ""); -static int rtq_toomany6 = 128; - /* 128 cached routes is ``too many'' */ -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, - CTLFLAG_RW, &rtq_toomany6 , 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RTMAXCACHE, + rtmaxcache, CTLFLAG_RW, rtq_toomany6 , 0, ""); /* @@ -242,6 +248,7 @@ static void in6_clsroute(struct radix_node *rn, struct radix_node_head *head) { + INIT_VNET_INET6(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); @@ -286,6 +293,7 @@ static int in6_rtqkill(struct radix_node *rn, void *rock) { + INIT_VNET_INET6(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; @@ -322,16 +330,18 @@ } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout6 = RTQ_TIMEOUT; +#ifndef VIMAGE +static int rtq_timeout6; static struct callout rtq_timer6; +#endif static void in6_rtqtimo(void *rock) { - struct radix_node_head *rnh = rock; CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); + struct radix_node_head *rnh = V_rt_tables[0][AF_INET6]; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; @@ -385,7 +395,9 @@ struct radix_node_head *rnh; time_t nextstop; }; +#ifndef VIMAGE static struct callout 
rtq_mtutimer; +#endif static int in6_mtuexpire(struct radix_node *rn, void *rock) @@ -414,10 +426,10 @@ static void in6_mtutimo(void *rock) { - struct radix_node_head *rnh = rock; CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); + struct radix_node_head *rnh = V_rt_tables[0][AF_INET6]; struct mtuex_arg arg; struct timeval atv; @@ -476,13 +488,26 @@ if (off == 0) /* See above */ return 1; /* only do the rest for the real thing */ + V_rtq_timeout6 = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in6_addroute; rnh->rnh_matchaddr = in6_matroute; rnh->rnh_close = in6_clsroute; callout_init(&V_rtq_timer6, CALLOUT_MPSAFE); - in6_rtqtimo(rnh); /* kick off timeout first time */ callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE); - in6_mtutimo(rnh); /* kick off timeout first time */ + in6_rtqtimo(curvnet); /* kick off timeout first time */ + in6_mtutimo(curvnet); /* kick off timeout first time */ + return 1; +} + +#ifdef VIMAGE +int +in6_detachhead(void **head, int off) +{ + INIT_VNET_INET6(curvnet); + + callout_drain(&V_rtq_timer6); + callout_drain(&V_rtq_mtutimer); return 1; } +#endif Index: netinet6/in6_src.c =========================================================================== --- netinet6/in6_src.c 2008/08/25 00:28:58 #7 +++ netinet6/in6_src.c 2008/08/25 00:28:58 @@ -84,18 +84,21 @@ #include #include +#include #include #include #ifdef RADIX_MPATH #include #endif +#include #include #include #include #include #include +#include #include #include #include @@ -117,9 +120,11 @@ #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) +#ifndef VIMAGE struct in6_addrpolicy defaultaddrpolicy; int ip6_prefer_tempaddr = 0; +#endif static int selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, @@ -458,7 +463,6 @@ struct ifnet **retifp, struct rtentry **retrt, int clone, int norouteok) { - INIT_VNET_NET(curvnet); 
INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = NULL; @@ -867,8 +871,6 @@ void addrsel_policy_init(void) { - ADDRSEL_LOCK_INIT(); - ADDRSEL_SXLOCK_INIT(); INIT_VNET_INET6(curvnet); init_policy_queue(); @@ -876,6 +878,14 @@ /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + + ADDRSEL_LOCK_INIT(); + ADDRSEL_SXLOCK_INIT(); } static struct in6_addrpolicy * @@ -966,9 +976,9 @@ struct in6_addrpolicy ape_policy; }; -TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); - -struct addrsel_policyhead addrsel_policytab; +#ifndef VIMAGE +TAILQ_HEAD(, addrsel_policyent) addrsel_policytab; +#endif static void init_policy_queue(void) Index: netinet6/in6_var.h =========================================================================== --- netinet6/in6_var.h 2008/08/25 00:28:58 #1 +++ netinet6/in6_var.h 2008/08/25 00:28:58 @@ -470,9 +470,11 @@ #endif #ifdef _KERNEL +#ifndef VIMAGE extern struct in6_ifaddr *in6_ifaddr; extern struct icmp6stat icmp6stat; +#endif #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ Index: netinet6/ip6_forward.c =========================================================================== --- netinet6/ip6_forward.c 2008/08/25 00:28:58 #7 +++ netinet6/ip6_forward.c 2008/08/25 00:28:58 @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -72,11 +73,14 @@ #include #include #include +#include #endif /* IPSEC */ #include +#ifndef VIMAGE struct route_in6 ip6_forward_rt; +#endif /* * Forward a packet. 
If some error occurs return the sender Index: netinet6/ip6_input.c =========================================================================== --- netinet6/ip6_input.c 2008/08/25 00:28:58 #8 +++ netinet6/ip6_input.c 2008/08/25 00:28:58 @@ -82,12 +82,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include @@ -96,10 +98,13 @@ #include #endif /* INET */ #include +#include #include #include #include #include + +#include #include #include #include @@ -116,6 +121,7 @@ u_char ip6_protox[IPPROTO_MAX]; static struct ifqueue ip6intrq; +#ifndef VIMAGE static int ip6qmaxlen = IFQ_MAXLEN; struct in6_ifaddr *in6_ifaddr; @@ -126,10 +132,13 @@ int ip6_sourcecheck_interval; /* XXX */ int ip6_ours_check_algorithm; +#endif /* !VIMAGE */ struct pfil_head inet6_pfil_hook; +#ifndef VIMAGE struct ip6stat ip6stat; +#endif static void ip6_init2(void *); static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); @@ -138,6 +147,20 @@ static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif +#ifdef VIMAGE +static void vnet_inet6_register(void); + +VNET_MOD_DECLARE(INET6, inet6, NULL, NULL, INET, NULL) + +static void +vnet_inet6_register(void) +{ + vnet_mod_register(&vnet_inet6_modinfo); +} + +SYSINIT(inet6, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet6_register, 0); +#endif /* VIMAGE */ + /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. 
@@ -145,9 +168,36 @@ void ip6_init(void) { + INIT_VNET_INET6(curvnet); struct ip6protosw *pr; int i; + V_ip6_prefer_tempaddr = 0; + + V_ip6qmaxlen = IFQ_MAXLEN; + V_ip6_forward_srcrt = 0; /* XXX */ + V_ip6_sourcecheck = 0; /* XXX */ + V_ip6_sourcecheck_interval = 0; /* XXX */ + + V_ip6_ours_check_algorithm = 0; + +#ifdef IP6_AUTO_LINKLOCAL + V_ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; +#else + V_ip6_auto_linklocal = 1; /* enable by default */ +#endif + + scope6_init(); + addrsel_policy_init(); + nd6_init(); + frag6_init(); + +#ifdef VIMAGE + /* Skip global initialization stuff for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); @@ -182,13 +232,20 @@ ip6intrq.ifq_maxlen = V_ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0); - scope6_init(); - addrsel_policy_init(); - nd6_init(); - frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; } +#ifdef VIMAGE +void +ip6_destroy() +{ + INIT_VNET_INET6(curvnet); + + nd6_destroy(); + callout_drain(&V_in6_tmpaddrtimer_ch); +} +#endif + static void ip6_init2(void *dummy) { @@ -196,21 +253,23 @@ /* nd6_timer_init */ callout_init(&V_nd6_timer_ch, 0); - callout_reset(&V_nd6_timer_ch, hz, nd6_timer, NULL); + callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); /* timer for regeneranation of temporary addresses randomize ID */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, - in6_tmpaddrtimer, NULL); + in6_tmpaddrtimer, curvnet); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); +#ifndef VIMAGE extern struct route_in6 ip6_forward_rt; +#endif void ip6_input(struct mbuf 
*m) @@ -255,7 +314,7 @@ #define M2MMAX (sizeof(V_ip6stat.ip6s_m2m)/sizeof(V_ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { - V_ip6stat.ip6s_m2m[V_loif[0].if_index]++; /* XXX */ + V_ip6stat.ip6s_m2m[V_loif->if_index]++; } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) V_ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else Index: netinet6/ip6_ipsec.c =========================================================================== --- netinet6/ip6_ipsec.c 2008/08/25 00:28:58 #7 +++ netinet6/ip6_ipsec.c 2008/08/25 00:28:58 @@ -63,6 +63,7 @@ #include #include #include +#include #ifdef IPSEC_DEBUG #include #else @@ -70,6 +71,7 @@ #endif #endif /*IPSEC*/ +#include #include #include Index: netinet6/ip6_mroute.c =========================================================================== --- netinet6/ip6_mroute.c 2008/08/25 00:28:58 #7 +++ netinet6/ip6_mroute.c 2008/08/25 00:28:58 @@ -103,6 +103,7 @@ #include #include +#include #include #include #include @@ -113,6 +114,7 @@ #include #include +#include #include #include #include @@ -149,7 +151,9 @@ .pr_usrreqs = &rip6_usrreqs }; +#ifndef VIMAGE static int ip6_mrouter_ver = 0; +#endif /* !VIMAGE */ SYSCTL_DECL(_net_inet6); SYSCTL_DECL(_net_inet6_ip6); @@ -177,7 +181,9 @@ "Multicast Interfaces (struct mif[MAXMIFS], netinet6/ip6_mroute.h)"); #ifdef MRT6DEBUG -static u_int mrt6debug = 0; /* debug level */ +#ifndef VIMAGE +static u_int mrt6debug; /* debug level */ +#endif /* !VIMAGE */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 @@ -222,7 +228,9 @@ &pim6stat, pim6stat, "PIM Statistics (struct pim6stat, netinet6/pim_var.h)"); +#ifndef VIMAGE static int pim6; +#endif /* * Hash function for a source, group entry @@ -470,7 +478,11 @@ { INIT_VNET_INET6(curvnet); + V_ip6_mrouter_ver = 0; + #ifdef MRT6DEBUG + V_mrt6debug = 0; + if (V_mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n", Index: netinet6/ip6_output.c 
=========================================================================== --- netinet6/ip6_output.c 2008/08/25 00:28:58 #7 +++ netinet6/ip6_output.c 2008/08/25 00:28:58 @@ -80,6 +80,7 @@ #include #include +#include #include #include #include @@ -87,6 +88,7 @@ #include #include +#include #include #include #include Index: netinet6/ip6_var.h =========================================================================== --- netinet6/ip6_var.h 2008/08/25 00:28:58 #3 +++ netinet6/ip6_var.h 2008/08/25 00:28:58 @@ -278,6 +278,7 @@ #define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif +#ifndef VIMAGE extern struct ip6stat ip6stat; /* statistics */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ @@ -289,8 +290,10 @@ * walk list every 5 sec. */ extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */ extern int ip6_v6only; +#endif extern struct socket *ip6_mrouter; /* multicast routing daemon */ +#ifndef VIMAGE extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ @@ -304,6 +307,7 @@ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ extern int ip6_auto_flowlabel; +#endif extern int ip6_auto_linklocal; extern int ip6_anonportmin; /* minimum ephemeral port */ @@ -312,8 +316,10 @@ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_use_tempaddr; /* whether to use temporary addresses. 
*/ +#ifndef VIMAGE extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses in the source address selection */ +#endif extern int ip6_use_defzone; /* whether to use the default scope zone when unspecified */ @@ -332,6 +338,9 @@ struct in6_ifaddr; void ip6_init __P((void)); +#ifdef VIMAGE +void ip6_destroy __P((void)); +#endif void ip6_input __P((struct mbuf *)); struct in6_ifaddr *ip6_getdstifaddr __P((struct mbuf *)); void ip6_freepcbopts __P((struct ip6_pktopts *)); Index: netinet6/ip6protosw.h =========================================================================== --- netinet6/ip6protosw.h 2008/08/25 00:28:58 #1 +++ netinet6/ip6protosw.h 2008/08/25 00:28:58 @@ -134,6 +134,8 @@ /* utility hooks */ void (*pr_init) /* initialization hook */ __P((void)); + void (*pr_destroy) /* cleanup hook */ + __P((void)); void (*pr_fasttimo) /* fast timeout (200ms) */ __P((void)); Index: netinet6/mld6.c =========================================================================== --- netinet6/mld6.c 2008/08/25 00:28:58 #6 +++ netinet6/mld6.c 2008/08/25 00:28:58 @@ -81,10 +81,12 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -104,7 +106,9 @@ */ #define MLD_UNSOLICITED_REPORT_INTERVAL 10 +#ifndef VIMAGE static struct ip6_pktopts ip6_opts; +#endif static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *); static void mld_starttimer(struct in6_multi *); @@ -172,6 +176,7 @@ callout_stop(in6m->in6m_timer_ch); + CURVNET_SET(in6m->in6m_ifp->if_vnet); switch (in6m->in6m_state) { case MLD_REPORTPENDING: mld6_start_listening(in6m); @@ -180,6 +185,7 @@ mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); break; } + CURVNET_RESTORE(); splx(s); } Index: netinet6/nd6.c =========================================================================== --- netinet6/nd6.c 2008/08/25 00:28:58 #10 +++ netinet6/nd6.c 2008/08/25 00:28:58 @@ -50,7 +50,9 @@ #include #include #include +#include +#include #include #include #include 
@@ -61,6 +63,7 @@ #include #include +#include #include #include #include @@ -80,18 +83,19 @@ #define SDL(s) ((struct sockaddr_dl *)s) /* timer values */ -int nd6_prune = 1; /* walk list every 1 seconds */ -int nd6_delay = 5; /* delay first probe time 5 second */ -int nd6_umaxtries = 3; /* maximum unicast query */ -int nd6_mmaxtries = 3; /* maximum multicast query */ -int nd6_useloopback = 1; /* use loopback interface for local traffic */ -int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ +#ifndef VIMAGE +int nd6_prune; /* walk list every 1 seconds */ +int nd6_delay; /* delay first probe time 5 second */ +int nd6_umaxtries; /* maximum unicast query */ +int nd6_mmaxtries; /* maximum multicast query */ +int nd6_useloopback; /* use loopback interface for local traffic */ +int nd6_gctimer; /* 1 day: garbage collection timer */ /* preventing too many loops in ND option parsing */ -int nd6_maxndopt = 10; /* max # of ND options allowed */ +int nd6_maxndopt; /* max # of ND options allowed */ -int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ -int nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ +int nd6_maxnudhint; /* max # of subsequent upper layer hints */ +int nd6_maxqueuelen; /* max # of packets cached in unresolved ND entries */ #ifdef ND6_DEBUG int nd6_debug = 1; @@ -102,11 +106,15 @@ /* for debugging? 
*/ static int nd6_inuse, nd6_allocated; -struct llinfo_nd6 llinfo_nd6 = {&llinfo_nd6, &llinfo_nd6}; +struct llinfo_nd6 llinfo_nd6; struct nd_drhead nd_defrouter; -struct nd_prhead nd_prefix = { 0 }; +struct nd_prhead nd_prefix; + +int nd6_recalc_reachtm_interval; -int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; +extern int dad_ignore_ns; /* ignore NS in DAD - specwise incorrect*/ +extern int dad_maxtry; /* max # of *tries* to transmit DAD packet */ +#endif /* !VIMAGE */ static struct sockaddr_in6 all1_sa; static int nd6_is_new_addr_neighbor __P((struct sockaddr_in6 *, @@ -118,20 +126,59 @@ static void nd6_llinfo_timer(void *); static void clear_llinfo_pqueue(struct llinfo_nd6 *); +#ifndef VIMAGE struct callout nd6_slowtimo_ch; struct callout nd6_timer_ch; extern struct callout in6_tmpaddrtimer_ch; +#endif void nd6_init(void) { - static int nd6_init_done = 0; + INIT_VNET_INET6(curvnet); int i; - if (nd6_init_done) { - log(LOG_NOTICE, "nd6_init called more than once(ignored)\n"); - return; - } + V_nd6_prune = 1; /* walk list every 1 seconds */ + V_nd6_delay = 5; /* delay first probe time 5 second */ + V_nd6_umaxtries = 3; /* maximum unicast query */ + V_nd6_mmaxtries = 3; /* maximum multicast query */ + V_nd6_useloopback = 1; /* use loopback interface for local traffic */ + V_nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ + + /* preventing too many loops in ND option parsing */ + V_nd6_maxndopt = 10; /* max # of ND options allowed */ + + V_nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ + V_nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ + +#ifdef ND6_DEBUG + V_nd6_debug = 1; +#else + V_nd6_debug = 0; +#endif + + V_nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; + +#ifdef INET6 + V_dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ +#endif + V_dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ + + V_ip6_use_tempaddr = 0; + + V_ip6_desync_factor = 0; + 
V_ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; + V_ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; + /* + * shorter lifetimes for debugging purposes. + V_ip6_temp_preferred_lifetime = 800; + V_ip6_temp_valid_lifetime = 1800; + */ + + V_ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; + + V_llinfo_nd6.ln_next = V_llinfo_nd6.ln_prev = &V_llinfo_nd6; + LIST_INIT(&V_nd_prefix); all1_sa.sin6_family = AF_INET6; all1_sa.sin6_len = sizeof(struct sockaddr_in6); @@ -143,11 +190,19 @@ /* start timer */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, - nd6_slowtimo, NULL); + nd6_slowtimo, curvnet); +} - nd6_init_done = 1; +#ifdef VIMAGE +void +nd6_destroy() +{ + INIT_VNET_INET6(curvnet); + callout_drain(&V_nd6_slowtimo_ch); + callout_drain(&V_nd6_timer_ch); } +#endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) @@ -233,7 +288,6 @@ if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ -#undef MIN } void @@ -527,6 +581,7 @@ } break; } + CURVNET_RESTORE(); } @@ -534,8 +589,9 @@ * ND6 timer routine to expire default route list and prefix list */ void -nd6_timer(void *ignored_arg) +nd6_timer(void *arg) { + CURVNET_SET_QUIET((struct vnet *) arg); INIT_VNET_INET6((struct vnet *) arg); int s; struct nd_defrouter *dr; @@ -544,7 +600,7 @@ struct in6_addrlifetime *lt6; callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, - nd6_timer, NULL); + nd6_timer, arg); /* expire default router list */ s = splnet(); @@ -656,6 +712,7 @@ pr = pr->ndpr_next; } splx(s); + CURVNET_RESTORE(); } /* @@ -1347,7 +1404,7 @@ SDL(gate)->sdl_alen = ifp->if_addrlen; } if (V_nd6_useloopback) { - rt->rt_ifp = &V_loif[0]; /* XXX */ + rt->rt_ifp = V_loif; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr * corresponding to the address. 
@@ -1898,7 +1955,7 @@ } static void -nd6_slowtimo(void *ignored_arg) +nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); INIT_VNET_NET((struct vnet *) arg); @@ -1907,7 +1964,7 @@ struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, - nd6_slowtimo, NULL); + nd6_slowtimo, arg); IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { @@ -1925,6 +1982,7 @@ } } IFNET_RUNLOCK(); + CURVNET_RESTORE(); } #define senderr(e) { error = (e); goto bad;} @@ -2283,8 +2341,8 @@ CTLFLAG_RD, nd6_sysctl_drlist, ""); SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLFLAG_RD, nd6_sysctl_prlist, ""); -SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, - nd6_maxqueuelen, CTLFLAG_RW, nd6_maxqueuelen, 1, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, + CTLFLAG_RW, nd6_maxqueuelen, 1, ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) Index: netinet6/nd6.h =========================================================================== --- netinet6/nd6.h 2008/08/25 00:28:58 #3 +++ netinet6/nd6.h 2008/08/25 00:28:58 @@ -328,6 +328,7 @@ LIST_HEAD(nd_prhead, nd_prefix); /* nd6.c */ +#ifndef VIMAGE extern int nd6_prune; extern int nd6_delay; extern int nd6_umaxtries; @@ -339,17 +340,22 @@ extern struct nd_drhead nd_defrouter; extern struct nd_prhead nd_prefix; extern int nd6_debug; +#endif #define nd6log(x) do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0) +#ifndef VIMAGE extern struct callout nd6_timer_ch; +#endif /* nd6_rtr.c */ +#ifndef VIMAGE extern int nd6_defifindex; extern int ip6_desync_factor; /* seconds */ extern u_int32_t ip6_temp_preferred_lifetime; /* seconds */ extern u_int32_t ip6_temp_valid_lifetime; /* seconds */ extern int ip6_temp_regen_advance; /* seconds */ +#endif union nd_opts { struct nd_opt_hdr *nd_opt_array[8]; /* max = target address list */ @@ -379,6 +385,9 @@ /* XXX: need nd6_var.h?? 
*/ /* nd6.c */ void nd6_init __P((void)); +#ifdef VIMAGE +void nd6_destroy __P((void)); +#endif struct nd_ifinfo *nd6_ifattach __P((struct ifnet *)); void nd6_ifdetach __P((struct nd_ifinfo *)); int nd6_is_addr_neighbor __P((struct sockaddr_in6 *, struct ifnet *)); Index: netinet6/nd6_nbr.c =========================================================================== --- netinet6/nd6_nbr.c 2008/08/25 00:28:58 #9 +++ netinet6/nd6_nbr.c 2008/08/25 00:28:58 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,7 @@ #include #include +#include #include #include #include @@ -81,13 +83,15 @@ static struct dadq *nd6_dad_find(struct ifaddr *); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_stoptimer(struct dadq *); -static void nd6_dad_timer(struct ifaddr *); +static void nd6_dad_timer(struct dadq *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); static void nd6_dad_na_input(struct ifaddr *); -static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ -static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ +#ifndef VIMAGE +int dad_ignore_ns; /* ignore NS in DAD - specwise incorrect*/ +int dad_maxtry; /* max # of *tries* to transmit DAD packet */ +#endif /* !VIMAGE */ /* * Input a Neighbor Solicitation Message. 
@@ -1066,7 +1070,6 @@ } } -TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; @@ -1076,10 +1079,13 @@ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; + struct vnet *dad_vnet; }; -static struct dadq_head dadq; +#ifndef VIMAGE +TAILQ_HEAD(, dadq) dadq; static int dad_init = 0; +#endif static struct dadq * nd6_dad_find(struct ifaddr *ifa) @@ -1099,7 +1105,7 @@ { callout_reset(&dp->dad_timer_ch, ticks, - (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); + (void (*)(void *))nd6_dad_timer, (void *)dp); } static void @@ -1167,6 +1173,9 @@ } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); +#ifdef VIMAGE + dp->dad_vnet = curvnet; +#endif TAILQ_INSERT_TAIL(&V_dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), @@ -1218,27 +1227,19 @@ } static void -nd6_dad_timer(struct ifaddr *ifa) +nd6_dad_timer(struct dadq *dp) { - int s; CURVNET_SET(dp->dad_vnet); INIT_VNET_INET6(curvnet); + struct ifaddr *ifa = dp->dad_ifa; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; - struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; - s = splnet(); /* XXX */ - /* Sanity check */ if (ia == NULL) { log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } - dp = nd6_dad_find(ifa); - if (dp == NULL) { - log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); - goto done; - } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", @@ -1320,7 +1321,6 @@ } done: - splx(s); CURVNET_RESTORE(); } Index: netinet6/nd6_rtr.c =========================================================================== --- netinet6/nd6_rtr.c 2008/08/25 00:28:58 #9 +++ netinet6/nd6_rtr.c 2008/08/25 00:28:58 @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -55,6 +56,7 @@ #include #include +#include #include #include #include @@ -85,22 +87,24 @@ static int rt6_deleteroute(struct radix_node *, void 
*); +#ifndef VIMAGE extern int nd6_recalc_reachtm_interval; static struct ifnet *nd6_defifp; int nd6_defifindex; -int ip6_use_tempaddr = 0; +int ip6_use_tempaddr; int ip6_desync_factor; -u_int32_t ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; -u_int32_t ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; +u_int32_t ip6_temp_preferred_lifetime; +u_int32_t ip6_temp_valid_lifetime; /* * shorter lifetimes for debugging purposes. -int ip6_temp_preferred_lifetime = 800; -static int ip6_temp_valid_lifetime = 1800; +int ip6_temp_preferred_lifetime; +static int ip6_temp_valid_lifetime; */ -int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; +int ip6_temp_regen_advance; +#endif /* !VIMAGE */ /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 Index: netinet6/raw_ip6.c =========================================================================== --- netinet6/raw_ip6.c 2008/08/25 00:28:58 #9 +++ netinet6/raw_ip6.c 2008/08/25 00:28:58 @@ -80,16 +80,19 @@ #include #include +#include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -101,6 +104,7 @@ #ifdef IPSEC #include #include +#include #endif /* IPSEC */ #include @@ -112,13 +116,15 @@ * Raw interface to IP6 protocol. */ +#ifndef VIMAGE extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; + +struct rip6stat rip6stat; +#endif extern u_long rip_sendspace; extern u_long rip_recvspace; -struct rip6stat rip6stat; - /* * Hooks for multicast forwarding. 
*/ @@ -764,7 +770,6 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { - INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; Index: netinet6/route6.c =========================================================================== --- netinet6/route6.c 2008/08/25 00:28:58 #6 +++ netinet6/route6.c 2008/08/25 00:28:58 @@ -45,6 +45,7 @@ #include #include +#include #include #include #include Index: netinet6/scope6.c =========================================================================== --- netinet6/scope6.c 2008/08/25 00:28:58 #7 +++ netinet6/scope6.c 2008/08/25 00:28:58 @@ -41,20 +41,23 @@ #include #include +#include #include #include #include -#include +#include #include #include +#ifndef VIMAGE #ifdef ENABLE_DEFAULT_SCOPE int ip6_use_defzone = 1; #else int ip6_use_defzone = 0; #endif +#endif /* !VIMAGE */ /* * The scope6_lock protects the global sid default stored in @@ -66,7 +69,10 @@ #define SCOPE6_UNLOCK() mtx_unlock(&scope6_lock) #define SCOPE6_LOCK_ASSERT() mtx_assert(&scope6_lock, MA_OWNED) +#ifndef VIMAGE static struct scope6_id sid_default; +#endif + #define SID(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id) @@ -75,8 +81,20 @@ { INIT_VNET_INET6(curvnet); +#ifdef ENABLE_DEFAULT_SCOPE + V_ip6_use_defzone = 1; +#else + V_ip6_use_defzone = 0; +#endif + + bzero(&V_sid_default, sizeof(V_sid_default)); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + SCOPE6_LOCK_INIT(); - bzero(&V_sid_default, sizeof(V_sid_default)); } struct scope6_id * Index: netinet6/sctp6_usrreq.c =========================================================================== --- netinet6/sctp6_usrreq.c 2008/08/25 00:28:58 #8 +++ netinet6/sctp6_usrreq.c 2008/08/25 00:28:58 @@ -34,11 +34,13 @@ #include #include +#include #include #include #include #if defined(INET6) #include +#include #endif #include #include @@ -787,6 +789,7 @@ 
sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { + INIT_VNET_INET6(curvnet); struct sctp_inpcb *inp; struct inpcb *in_inp; struct in6pcb *inp6; @@ -899,6 +902,7 @@ static int sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) { + INIT_VNET_INET6(curvnet); uint32_t vrf_id; int error = 0; struct sctp_inpcb *inp; Index: netinet6/udp6_usrreq.c =========================================================================== --- netinet6/udp6_usrreq.c 2008/08/25 00:28:58 #9 +++ netinet6/udp6_usrreq.c 2008/08/25 00:28:58 @@ -86,10 +86,12 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -102,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +114,7 @@ #ifdef IPSEC #include #include +#include #endif /* IPSEC */ #include Index: netinet6/vinet6.h =========================================================================== --- netinet6/vinet6.h 2008/08/25 00:28:58 #1 +++ netinet6/vinet6.h 2008/08/25 00:28:58 @@ -31,17 +31,15 @@ #ifndef _NETINET6_VINET6_H_ #define _NETINET6_VINET6_H_ -#include #include -#include #include #include +#include #include #include #include #include #include -#include #define INIT_VNET_INET6(vnet) \ INIT_FROM_VNET(vnet, VNET_MOD_INET6, \ @@ -93,7 +91,8 @@ int _dad_init; int _icmp6errpps_count; - int _icmp6errppslim_last; + //int _icmp6errppslim_last; + //int _icmp6_nodeinfo; int _ip6_forwarding; int _ip6_sendredirects; @@ -238,7 +237,7 @@ #define V_icmp6_nodeinfo VNET_INET6(icmp6_nodeinfo) #define V_udp6_sendspace VNET_INET6(udp6_sendspace) #define V_udp6_recvspace VNET_INET6(udp6_recvspace) -#define V_icmp6errppslim_last VNET_INET6(icmp6errppslim_last) +//#define V_icmp6errppslim_last VNET_INET6(icmp6errppslim_last) #define V_ip6_prefer_tempaddr VNET_INET6(ip6_prefer_tempaddr) #define V_ip6qmaxlen VNET_INET6(ip6qmaxlen) #define V_ip6_forward_srcrt VNET_INET6(ip6_forward_srcrt) 
Index: netipsec/ipsec.c =========================================================================== --- netipsec/ipsec.c 2008/08/25 00:28:58 #11 +++ netipsec/ipsec.c 2008/08/25 00:28:58 @@ -67,6 +67,7 @@ #include #include #include +#include #include #ifdef INET6 @@ -77,6 +78,8 @@ #include #endif +#include + #include #include #ifdef INET6 @@ -92,18 +95,27 @@ #include #include +#include #include #include +#ifndef VIMAGE #ifdef IPSEC_DEBUG int ipsec_debug = 1; #else int ipsec_debug = 0; #endif +#endif + +static int vnet_ipsec_iattach(const void *); +#ifdef VIMAGE +static int vnet_ipsec_idetach(const void *); +#endif /* NB: name changed so netstat doesn't use it */ +#ifndef VIMAGE struct ipsecstat ipsec4stat; int ip4_ah_offsetmask = 0; /* maybe IP_DF? */ int ip4_ipsec_dfbit = 0; /* DF bit on encap. 0: clear 1: set 2: copy */ @@ -113,7 +125,6 @@ int ip4_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip4_def_policy; int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ -int ip4_esp_randpad = -1; /* * Crypto support requirements: * @@ -122,6 +133,7 @@ * 0 take anything */ int crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; +#endif SYSCTL_DECL(_net_inet_ipsec); @@ -129,16 +141,16 @@ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, "IPsec default policy."); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, CTLFLAG_RW, ip4_esp_trans_deflev, 0, "Default ESP transport mode level"); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, CTLFLAG_RW, ip4_esp_net_deflev, 0, "Default ESP tunnel mode level."); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, 
IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, CTLFLAG_RW, ip4_ah_trans_deflev, 0, "AH transfer mode default level."); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, CTLFLAG_RW, ip4_ah_net_deflev, 0, "AH tunnel mode default level."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_CLEARTOS, @@ -147,20 +159,20 @@ SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, CTLFLAG_RW, ip4_ah_offsetmask, 0, "If not set clear offset field mask when doing AH computation."); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DFBIT, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DFBIT, dfbit, CTLFLAG_RW, ip4_ipsec_dfbit, 0, "Do not fragment bit on encap."); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_ECN, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_ECN, ecn, CTLFLAG_RW, ip4_ipsec_ecn, 0, "Explicit Congestion Notification handling."); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEBUG, debug, CTLFLAG_RW, ipsec_debug, 0, "Enable IPsec debugging output when set."); -SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, crypto_support, CTLFLAG_RW, crypto_support,0, "Crypto driver selection."); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, - ipsecstats, CTLFLAG_RD, ipsec4stat, ipsecstat, + ipsecstats, CTLFLAG_RD, ipsec4stat, ipsecstat, "IPsec IPv4 statistics."); #ifdef REGRESSION @@ -168,26 +180,34 @@ * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ +#ifndef VIMAGE int ipsec_replay = 0; +#endif SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_replay, CTLFLAG_RW, ipsec_replay, 0, "Emulate replay attack"); /* * When set 1, IPsec will send packets with corrupted HMAC. 
* This allows to verify if the other side properly detects modified packets. */ +#ifndef VIMAGE int ipsec_integrity = 0; +#endif SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_integrity, CTLFLAG_RW, ipsec_integrity, 0, "Emulate man-in-the-middle attack"); #endif +#ifndef VIMAGE #ifdef INET6 struct ipsecstat ipsec6stat; -int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; -int ip6_esp_net_deflev = IPSEC_LEVEL_USE; -int ip6_ah_trans_deflev = IPSEC_LEVEL_USE; -int ip6_ah_net_deflev = IPSEC_LEVEL_USE; -int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ +int ip6_esp_trans_deflev; +int ip6_esp_net_deflev; +int ip6_ah_trans_deflev; +int ip6_ah_net_deflev; +int ip6_ipsec_ecn; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ +#endif +#endif /* !VIMAGE */ +#ifdef INET6 SYSCTL_DECL(_net_inet6_ipsec6); /* net.inet6.ipsec6 */ @@ -221,6 +241,9 @@ "IPsec IPv6 statistics."); #endif /* INET6 */ +VNET_MOD_DECLARE(IPSEC, ipsec, vnet_ipsec_iattach, vnet_ipsec_idetach, + INET, NULL) + static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); #ifdef INET6 static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); @@ -1971,9 +1994,62 @@ static void ipsec_attach(void) { +#ifdef VIMAGE + vnet_mod_register(&vnet_ipsec_modinfo); +#else + vnet_ipsec_iattach(NULL); +#endif +} + +static int +vnet_ipsec_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + #ifdef IPSEC_DEBUG + V_ipsec_debug = 1; + #else + V_ipsec_debug = 0; + #endif + SECPOLICY_LOCK_INIT(&V_ip4_def_policy); - ip4_def_policy.refcnt = 1; /* NB: disallow free */ + V_ip4_def_policy.refcnt = 1; /* NB: disallow free */ + + V_ip4_ah_offsetmask = 0; /* maybe IP_DF? */ + V_ip4_ipsec_dfbit = 0; /* DF bit on encap. 
0: clear 1: set 2: copy */ + V_ip4_esp_trans_deflev = IPSEC_LEVEL_USE; + V_ip4_esp_net_deflev = IPSEC_LEVEL_USE; + V_ip4_ah_trans_deflev = IPSEC_LEVEL_USE; + V_ip4_ah_net_deflev = IPSEC_LEVEL_USE; + V_ip4_ipsec_ecn = 0; + + V_crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; +#ifdef REGRESSION + V_ipsec_replay = 0; + V_ipsec_integrity = 0; +#endif + +#ifdef INET6 + V_ip6_esp_trans_deflev = IPSEC_LEVEL_USE; + V_ip6_esp_net_deflev = IPSEC_LEVEL_USE; + V_ip6_ah_trans_deflev = IPSEC_LEVEL_USE; + V_ip6_ah_net_deflev = IPSEC_LEVEL_USE; + V_ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ +#endif + + return 0; +} + +/* XXX finish this! */ +#ifdef VIMAGE +static int +vnet_ipsec_idetach(unused) + const void *unused; +{ + return 0; } +#endif SYSINIT(ipsec, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, ipsec_attach, NULL); Index: netipsec/ipsec.h =========================================================================== --- netipsec/ipsec.h 2008/08/25 00:28:58 #4 +++ netipsec/ipsec.h 2008/08/25 00:28:58 @@ -433,9 +433,6 @@ extern char *ipsec_dump_policy __P((caddr_t, char *)); extern const char *ipsec_strerror __P((void)); - -#else -#include -#endif /* ! 
KERNEL */ +#endif /* !_KERNEL */ #endif /* _NETIPSEC_IPSEC_H_ */ Index: netipsec/ipsec_input.c =========================================================================== --- netipsec/ipsec_input.c 2008/08/25 00:28:58 #7 +++ netipsec/ipsec_input.c 2008/08/25 00:28:58 @@ -69,6 +69,7 @@ #include #ifdef INET6 +#include #include #endif #include @@ -89,6 +90,7 @@ #include #include +#include #include #include Index: netipsec/ipsec_mbuf.c =========================================================================== --- netipsec/ipsec_mbuf.c 2008/08/25 00:28:58 #6 +++ netipsec/ipsec_mbuf.c 2008/08/25 00:28:58 @@ -42,6 +42,7 @@ #include #include +#include /* * Make space for a new header of length hlen at skip bytes Index: netipsec/ipsec_output.c =========================================================================== --- netipsec/ipsec_output.c 2008/08/25 00:28:58 #7 +++ netipsec/ipsec_output.c 2008/08/25 00:28:58 @@ -55,6 +55,7 @@ #include #include #ifdef INET6 +#include #include #endif @@ -80,6 +81,7 @@ #include #include #include +#include #include Index: netipsec/key.c =========================================================================== --- netipsec/key.c 2008/08/25 00:28:58 #10 +++ netipsec/key.c 2008/08/25 00:28:58 @@ -62,12 +62,14 @@ #include #include +#include #include #include #include #include #ifdef INET6 +#include #include #include #include @@ -85,6 +87,7 @@ #include #include #include +#include #include #ifdef INET6 @@ -113,6 +116,7 @@ * field hits 0 (= no external reference other than from SA header. 
*/ +#ifndef VIMAGE u_int32_t key_debug_level = 0; static u_int key_spi_trycnt = 1000; static u_int32_t key_spi_minval = 0x100; @@ -125,8 +129,11 @@ static int key_preferred_oldsa = 1; /* preferred old sa rather than new sa.*/ static u_int32_t acq_seq = 0; +#endif +#ifndef VIMAGE static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ +#endif static struct mtx sptree_lock; #define SPTREE_LOCK_INIT() \ mtx_init(&sptree_lock, "sptree", \ @@ -136,7 +143,9 @@ #define SPTREE_UNLOCK() mtx_unlock(&sptree_lock) #define SPTREE_LOCK_ASSERT() mtx_assert(&sptree_lock, MA_OWNED) +#ifndef VIMAGE static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ +#endif static struct mtx sahtree_lock; #define SAHTREE_LOCK_INIT() \ mtx_init(&sahtree_lock, "sahtree", \ @@ -145,9 +154,10 @@ #define SAHTREE_LOCK() mtx_lock(&sahtree_lock) #define SAHTREE_UNLOCK() mtx_unlock(&sahtree_lock) #define SAHTREE_LOCK_ASSERT() mtx_assert(&sahtree_lock, MA_OWNED) - /* registed list */ +#ifndef VIMAGE static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; +#endif static struct mtx regtree_lock; #define REGTREE_LOCK_INIT() \ mtx_init(®tree_lock, "regtree", "fast ipsec regtree", MTX_DEF) @@ -156,7 +166,9 @@ #define REGTREE_UNLOCK() mtx_unlock(®tree_lock) #define REGTREE_LOCK_ASSERT() mtx_assert(®tree_lock, MA_OWNED) +#ifndef VIMAGE static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */ +#endif static struct mtx acq_lock; #define ACQ_LOCK_INIT() \ mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF) @@ -165,7 +177,9 @@ #define ACQ_UNLOCK() mtx_unlock(&acq_lock) #define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED) +#ifndef VIMAGE static LIST_HEAD(_spacqtree, secspacq) spacqtree; /* SP acquiring list */ +#endif static struct mtx spacq_lock; #define SPACQ_LOCK_INIT() \ mtx_init(&spacq_lock, "spacqtree", \ @@ -182,6 +196,7 @@ static const u_int saorder_state_valid_prefer_new[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, }; +#ifndef VIMAGE static u_int 
saorder_state_alive[] = { /* except DEAD */ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL @@ -190,7 +205,7 @@ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD }; - +#endif static const int minsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ @@ -235,11 +250,11 @@ 0, /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ }; - +#ifndef VIMAGE static int ipsec_esp_keymin = 256; static int ipsec_esp_auth = 0; static int ipsec_ah_keymin = 128; - +#endif #ifdef SYSCTL_DECL SYSCTL_DECL(_net_key); #endif @@ -2339,6 +2354,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + printf("\n---> key_spdflush()..\n"); INIT_VNET_IPSEC(curvnet); struct sadb_msg *newmsg; struct secpolicy *sp; @@ -4329,7 +4345,7 @@ key_timehandler(void) { time_t now = time_second; - + VNET_ITERLOOP_BEGIN(); key_flush_spd(now); key_flush_sad(now); @@ -7178,12 +7194,40 @@ INIT_VNET_IPSEC(curvnet); int i; + V_key_debug_level = 0; + V_key_spi_trycnt = 1000; + V_key_spi_minval = 0x100; + V_key_spi_maxval = 0x0fffffff; /* XXX */ + V_policy_id = 0; + V_key_int_random = 60; /*interval to initialize randseed,1(m)*/ + V_key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ + V_key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ + V_key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ + V_key_preferred_oldsa = 1; /* preferred old sa rather than new sa.*/ + V_acq_seq = 0; + + V_saorder_state_alive[0] = SADB_SASTATE_MATURE; + V_saorder_state_alive[1] = SADB_SASTATE_DYING; + V_saorder_state_alive[2] = SADB_SASTATE_LARVAL; + V_saorder_state_any[0] = SADB_SASTATE_MATURE; + V_saorder_state_any[1] = SADB_SASTATE_DYING; + V_saorder_state_any[2] = SADB_SASTATE_LARVAL; + V_saorder_state_any[3] = SADB_SASTATE_DEAD; + + V_ipsec_esp_keymin = 256; + V_ipsec_esp_auth = 0; + V_ipsec_ah_keymin = 128; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif SPTREE_LOCK_INIT(); 
REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); ACQ_LOCK_INIT(); SPACQ_LOCK_INIT(); - +#ifdef VIMAGE + } +#endif for (i = 0; i < IPSEC_DIR_MAX; i++) LIST_INIT(&V_sptree[i]); @@ -7199,6 +7243,11 @@ V_ip4_def_policy.policy = IPSEC_POLICY_NONE; V_ip4_def_policy.refcnt++; /*never reclaim this*/ +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + #ifndef IPSEC_DEBUG2 timeout((void *)key_timehandler, (void *)0, hz); #endif /*IPSEC_DEBUG2*/ @@ -7207,9 +7256,82 @@ keystat.getspi_count = 1; printf("IPsec: Initialized Security Association Processing.\n"); +} + +#ifdef VIMAGE +/* + * Per-vnet PF_KEY teardown: unlink and free every entry still chained on + * this vnet's sptree, sahtree, regtree, acqtree and spacqtree lists. The + * successor is saved before an entry is freed so each walk can continue. + */ +void key_destroy(void) +{ + INIT_VNET_IPSEC(curvnet); + struct secpolicy *sp, *nextsp; + struct secacq *acq, *nextacq; + struct secspacq *spacq, *nextspacq; + struct secashead *sah, *nextsah; + struct secreg *reg, *nextreg; + int i; + + SPTREE_LOCK(); + for (i = 0; i < IPSEC_DIR_MAX; i++) { + for (sp = LIST_FIRST(&V_sptree[i]); + sp != NULL; sp = nextsp) { + nextsp = LIST_NEXT(sp, chain); + if (__LIST_CHAINED(sp)) { + LIST_REMOVE(sp, chain); + free(sp, M_IPSEC_SP); + } + } + } + SPTREE_UNLOCK(); + + SAHTREE_LOCK(); + for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { + nextsah = LIST_NEXT(sah, chain); + if (__LIST_CHAINED(sah)) { + LIST_REMOVE(sah, chain); + free(sah, M_IPSEC_SAH); + } + } + SAHTREE_UNLOCK(); + + REGTREE_LOCK(); + for (i = 0; i <= SADB_SATYPE_MAX; i++) { + for (reg = LIST_FIRST(&V_regtree[i]); + reg != NULL; reg = nextreg) { + nextreg = LIST_NEXT(reg, chain); /* release every registration, not just the first */ + if (__LIST_CHAINED(reg)) { + LIST_REMOVE(reg, chain); + free(reg, M_IPSEC_SAR); + } + } + } + REGTREE_UNLOCK(); + + ACQ_LOCK(); /* acquire list; the SP-acquire list is handled below under SPACQ_LOCK */ + for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { + nextacq = LIST_NEXT(acq, chain); + if (__LIST_CHAINED(acq)) { + LIST_REMOVE(acq, chain); + free(acq, M_IPSEC_SAQ); + } + } + ACQ_UNLOCK(); - return; + SPACQ_LOCK(); + for (spacq = LIST_FIRST(&V_spacqtree); spacq != NULL; + spacq = nextspacq) { + nextspacq = LIST_NEXT(spacq, chain); + if (__LIST_CHAINED(spacq)) { + LIST_REMOVE(spacq, chain); + free(spacq, M_IPSEC_SAQ); + } + } + SPACQ_UNLOCK(); } +#endif /* * XXX: maybe This
function is called after INBOUND IPsec processing. Index: netipsec/key.h =========================================================================== --- netipsec/key.h 2008/08/25 00:28:58 #1 +++ netipsec/key.h 2008/08/25 00:28:58 @@ -96,6 +96,9 @@ extern void key_freereg __P((struct socket *)); extern int key_parse __P((struct mbuf *, struct socket *)); extern void key_init __P((void)); +#ifdef VIMAGE +extern void key_destroy(void); +#endif extern void key_sa_recordxfer __P((struct secasvar *, struct mbuf *)); extern void key_sa_routechange __P((struct sockaddr *)); extern void key_sa_stir_iv __P((struct secasvar *)); Index: netipsec/keysock.c =========================================================================== --- netipsec/keysock.c 2008/08/25 00:28:58 #7 +++ netipsec/keysock.c 2008/08/25 00:28:58 @@ -52,20 +52,25 @@ #include #include -#include #include #include +#include #include +#include #include #include #include #include -#include - +#include +#include +#ifdef VIMAGE +#include +#endif #include +#ifndef VIMAGE struct key_cb { int key_count; int any_count; @@ -73,10 +78,13 @@ static struct key_cb key_cb; static struct sockaddr key_src = { 2, PF_KEY, }; +#endif static int key_sendup0 __P((struct rawcb *, struct mbuf *, int)); +#ifndef VIMAGE struct pfkeystat pfkeystat; +#endif /* * key_output() @@ -570,6 +578,10 @@ key_init0(void) { INIT_VNET_IPSEC(curvnet); + + V_key_src.sa_len = 2; + V_key_src.sa_family = PF_KEY; + bzero((caddr_t)&V_key_cb, sizeof(V_key_cb)); key_init(); } @@ -578,6 +590,9 @@ .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init0, +#ifdef VIMAGE + .dom_destroy = key_destroy, +#endif .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[sizeof(keysw)/sizeof(keysw[0])] }; Index: netipsec/keysock.h =========================================================================== --- netipsec/keysock.h 2008/08/25 00:28:58 #1 +++ netipsec/keysock.h 2008/08/25 00:28:58 @@ -57,7 +57,12 @@ /* others */ u_quad_t sockerr; /* # of socket related 
errors */ }; - +#ifdef VIMAGE +struct key_cb { + int key_count; + int any_count; +}; +#endif #define KEY_SENDUP_ONE 0 #define KEY_SENDUP_ALL 1 #define KEY_SENDUP_REGISTERED 2 Index: netipsec/vipsec.h =========================================================================== --- netipsec/vipsec.h 2008/08/25 00:28:58 #1 +++ netipsec/vipsec.h 2008/08/25 00:28:58 @@ -108,7 +108,6 @@ struct pfkeystat _pfkeystat; struct key_cb _key_cb; - struct sockaddr _key_dst; struct sockaddr _key_src; LIST_HEAD(, secpolicy) _sptree[IPSEC_DIR_MAX]; @@ -176,7 +175,6 @@ #define V_ipcompstat VNET_IPSEC(ipcompstat) #define V_pfkeystat VNET_IPSEC(pfkeystat) #define V_key_cb VNET_IPSEC(key_cb) -#define V_key_dst VNET_IPSEC(key_dst) #define V_key_src VNET_IPSEC(key_src) #define V_sptree VNET_IPSEC(sptree) #define V_sahtree VNET_IPSEC(sahtree) Index: netipsec/xform_ah.c =========================================================================== --- netipsec/xform_ah.c 2008/08/25 00:28:58 #8 +++ netipsec/xform_ah.c 2008/08/25 00:28:58 @@ -61,6 +61,7 @@ #include #include #include +#include #ifdef INET6 #include @@ -88,9 +89,11 @@ #define AUTHSIZE(sav) \ ((sav->flags & SADB_X_EXT_OLD) ? 
16 : AH_HMAC_HASHLEN) +#ifndef VIMAGE int ah_enable = 1; /* control flow of packets with AH */ int ah_cleartos = 1; /* clear ip_tos when doing AH calc */ struct ahstat ahstat; +#endif SYSCTL_DECL(_net_inet_ah); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO, @@ -100,6 +103,10 @@ SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ah, IPSECCTL_STATS, stats, CTLFLAG_RD, ahstat, ahstat, ""); +static int ah_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(AH, ah, ah_iattach, NULL, IPSEC) + static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */ static int ah_input_cb(struct cryptop*); @@ -1214,9 +1221,26 @@ ah_init, ah_zeroize, ah_input, ah_output, }; +static int +ah_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + V_ah_enable = 1; /* control flow of packets with AH */ + V_ah_cleartos = 1; /* clear ip_tos when doing AH calc */ + + return 0; +} + static void ah_attach(void) { +#ifdef VIMAGE + vnet_mod_register(&vnet_ah_modinfo); +#else + ah_iattach(NULL); +#endif xform_register(&ah_xformsw); } SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL); Index: netipsec/xform_esp.c =========================================================================== --- netipsec/xform_esp.c 2008/08/25 00:28:58 #8 +++ netipsec/xform_esp.c 2008/08/25 00:28:58 @@ -63,6 +63,7 @@ #include #include #include +#include #ifdef INET6 #include @@ -76,19 +77,26 @@ #include #include +#ifndef VIMAGE int esp_enable = 1; struct espstat espstat; +#endif SYSCTL_DECL(_net_inet_esp); SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_esp, OID_AUTO, esp_enable, CTLFLAG_RW, esp_enable, 0, ""); SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_esp, IPSECCTL_STATS, stats, CTLFLAG_RD, espstat, espstat, ""); +#ifndef VIMAGE static int esp_max_ivlen; /* max iv length over all algorithms */ - +#endif static int esp_input_cb(struct cryptop *op); static int esp_output_cb(struct cryptop *crp); +static int esp_iattach(const void *); + 
+VNET_MOD_DECLARE_STATELESS(ESP, esp, esp_iattach, NULL, IPSEC) + /* * NB: this is public for use by the PF_KEY support. * NB: if you add support here; be sure to add code to esp_attach below! @@ -985,14 +993,19 @@ esp_output }; -static void -esp_attach(void) +static int +esp_iattach(unused) + const void *unused; { + INIT_VNET_IPSEC(curvnet); + + V_esp_enable = 1; + V_esp_max_ivlen = 0; + #define MAXIV(xform) \ if (xform.blocksize > V_esp_max_ivlen) \ V_esp_max_ivlen = xform.blocksize \ - - V_esp_max_ivlen = 0; + MAXIV(enc_xform_des); /* SADB_EALG_DESCBC */ MAXIV(enc_xform_3des); /* SADB_EALG_3DESCBC */ MAXIV(enc_xform_rijndael128); /* SADB_X_EALG_AES */ @@ -1002,7 +1015,19 @@ MAXIV(enc_xform_null); /* SADB_EALG_NULL */ MAXIV(enc_xform_camellia); /* SADB_X_EALG_CAMELLIACBC */ +#undef MAXIV + + return 0; +} + +static void +esp_attach(void) +{ +#ifdef VIMAGE + vnet_mod_register(&vnet_esp_modinfo); +#else + esp_iattach(NULL); +#endif xform_register(&esp_xformsw); -#undef MAXIV } SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL); Index: netipsec/xform_ipcomp.c =========================================================================== --- netipsec/xform_ipcomp.c 2008/08/25 00:28:58 #7 +++ netipsec/xform_ipcomp.c 2008/08/25 00:28:58 @@ -51,6 +51,7 @@ #include #include #include +#include #ifdef INET6 #include @@ -67,8 +68,10 @@ #include #include +#ifndef VIMAGE int ipcomp_enable = 0; struct ipcompstat ipcompstat; +#endif SYSCTL_DECL(_net_inet_ipcomp); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipcomp, OID_AUTO, @@ -76,6 +79,10 @@ SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipcomp, IPSECCTL_STATS, stats, CTLFLAG_RD, ipcompstat, ipcompstat, ""); +static int ipcomp_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(IPCOMP, ipcomp, ipcomp_iattach, NULL, IPSEC) + static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); @@ -594,9 +601,25 @@ ipcomp_output }; +static int +ipcomp_iattach(unused) + const 
void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + V_ipcomp_enable = 0; + + return 0; +} + static void ipcomp_attach(void) { +#ifdef VIMAGE + vnet_mod_register(&vnet_ipcomp_modinfo); +#else + ipcomp_iattach(NULL); +#endif xform_register(&ipcomp_xformsw); } SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); Index: netipsec/xform_ipip.c =========================================================================== --- netipsec/xform_ipip.c 2008/08/25 00:28:58 #7 +++ netipsec/xform_ipip.c 2008/08/25 00:28:58 @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -64,8 +65,10 @@ #include #include #include +#include #include +#include #include #include @@ -91,8 +94,10 @@ * We can control the acceptance of IP4 packets by altering the sysctl * net.inet.ipip.allow value. Zero means drop them, all else is acceptance. */ +#ifndef VIMAGE int ipip_allow = 0; struct ipipstat ipipstat; +#endif SYSCTL_DECL(_net_inet_ipip); SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipip, OID_AUTO, @@ -105,6 +110,10 @@ static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp); +static int ipip_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(IPIP, ipip, ipip_iattach, NULL, IPSEC) + #ifdef INET6 /* * Really only a wrapper for ipip_input(), for use with IPv6. 
@@ -658,21 +667,25 @@ extern struct domain inetdomain; static struct protosw ipe4_protosw = -{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - ip4_input, - 0, 0, rip_ctloutput, - 0, - 0, 0, 0, 0, - &rip_usrreqs +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPV4, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = ip4_input, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs }; #ifdef INET6 static struct ip6protosw ipe6_protosw = -{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - ip4_input6, - 0, 0, rip_ctloutput, - 0, - 0, 0, 0, 0, - &rip_usrreqs +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPV6, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = ip4_input6, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs }; #endif @@ -691,17 +704,39 @@ return ((m->m_flags & M_IPSEC) != 0 ? 1 : 0); } -static void -ipe4_attach(void) +static int +ipip_iattach(unused) + const void *unused; { + INIT_VNET_IPSEC(curvnet); + + V_ipip_allow = 0; + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return 0; +#endif + xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ (void) encap_attach_func(AF_INET, -1, - ipe4_encapcheck, &ipe4_protosw, NULL); + ipe4_encapcheck, &ipe4_protosw, NULL); #ifdef INET6 (void) encap_attach_func(AF_INET6, -1, - ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); + ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); +#endif + + return 0; +} + +static void +ipe4_attach(void) +{ +#ifdef VIMAGE + vnet_mod_register(&vnet_ipip_modinfo); +#else + ipip_iattach(NULL); #endif } SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL); Index: nfsclient/bootp_subr.c =========================================================================== --- nfsclient/bootp_subr.c 2008/08/25 00:28:58 #7 +++ nfsclient/bootp_subr.c 
2008/08/25 00:28:58 @@ -65,6 +65,7 @@ #include #include #include +#include #include @@ -394,9 +395,11 @@ printf("\n"); } +/* XXX we are only going to look at interfaces in the base vimage */ void bootpboot_p_iflist(void) { + INIT_VNET_NET(basevnet); struct ifnet *ifp; struct ifaddr *ifa; @@ -1606,6 +1609,7 @@ void bootpc_init(void) { + INIT_VNET_NET(basevnet); /* XXX only look at base vnet interfaces? */ struct bootpc_ifcontext *ifctx, *nctx; /* Interface BOOTP contexts */ struct bootpc_globalcontext *gctx; /* Global BOOTP context */ struct ifnet *ifp; Index: nfsclient/nfs_diskless.c =========================================================================== --- nfsclient/nfs_diskless.c 2008/08/25 00:28:58 #6 +++ nfsclient/nfs_diskless.c 2008/08/25 00:28:58 @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include Index: nfsclient/nfs_socket.c =========================================================================== --- nfsclient/nfs_socket.c 2008/08/25 00:28:58 #2 +++ nfsclient/nfs_socket.c 2008/08/25 00:28:58 @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1507,6 +1508,7 @@ mtx_unlock(&nmp->nm_mtx); continue; } + CURVNET_SET(so->so_vnet); /* * If there is enough space and the window allows.. 
* Resend it @@ -1572,6 +1574,7 @@ mtx_unlock(&rep->r_mtx); mtx_unlock(&nmp->nm_mtx); } + CURVNET_RESTORE(); } mtx_unlock(&nfs_reqq_mtx); callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL); Index: nfsclient/nfs_vfsops.c =========================================================================== --- nfsclient/nfs_vfsops.c 2008/08/25 00:28:58 #10 +++ nfsclient/nfs_vfsops.c 2008/08/25 00:28:58 @@ -411,14 +411,17 @@ char buf[128]; char *cp; + CURVNET_SET(TD_TO_VNET(td)); #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif - if (nfs_diskless_valid == 0) + if (nfs_diskless_valid == 0) { + CURVNET_RESTORE(); return (-1); + } if (nfs_diskless_valid == 1) nfs_convert_diskless(); @@ -502,6 +505,7 @@ nd->root_args.hostname = buf; if ((error = nfs_mountdiskless(buf, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { + CURVNET_RESTORE(); return (error); } @@ -518,6 +522,8 @@ break; mtx_unlock(&hostname_mtx); inittodr(ntohl(nd->root_time)); + + CURVNET_RESTORE(); return (0); } Index: nfsclient/nfs_vnops.c =========================================================================== --- nfsclient/nfs_vnops.c 2008/08/25 00:28:58 #7 +++ nfsclient/nfs_vnops.c 2008/08/25 00:28:58 @@ -81,6 +81,8 @@ #include #include + +#include #include #include Index: nlm/nlm_advlock.c =========================================================================== --- nlm/nlm_advlock.c 2008/08/25 00:28:58 #2 +++ nlm/nlm_advlock.c 2008/08/25 00:28:58 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1221,12 +1222,12 @@ } mtx_lock(&hostname_mtx); - snprintf(oh_space, 32, "%d@%s", svid, hostname); + snprintf(oh_space, 32, "%d@%s", svid, G_hostname); mtx_unlock(&hostname_mtx); oh_len = strlen(oh_space); memset(lock, 0, sizeof(*lock)); - lock->caller_name = hostname; + lock->caller_name = G_hostname; lock->fh.n_len = fhlen; lock->fh.n_bytes = fh; lock->oh.n_len = oh_len; 
Index: rpc/rpc_generic.c =========================================================================== --- rpc/rpc_generic.c 2008/08/25 00:28:58 #1 +++ rpc/rpc_generic.c 2008/08/25 00:28:58 @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -183,9 +184,12 @@ struct sockopt opt; int error; + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); - if (error) + if (error) { + CURVNET_RESTORE(); return 0; + } sip->si_alen = sa->sa_len; family = sa->sa_family; @@ -198,6 +202,7 @@ opt.sopt_valsize = sizeof type; opt.sopt_td = NULL; error = sogetopt(so, &opt); + CURVNET_RESTORE(); if (error) return 0; @@ -694,7 +699,9 @@ struct sockaddr *sa; int error, bound; + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); + CURVNET_RESTORE(); if (error) return (0); Index: rpc/svc_dg.c =========================================================================== --- rpc/svc_dg.c 2008/08/25 00:28:58 #1 +++ rpc/svc_dg.c 2008/08/25 00:28:58 @@ -55,6 +55,7 @@ #include #include #include +#include #include @@ -125,7 +126,9 @@ xprt->xp_p2 = NULL; xprt->xp_ops = &svc_dg_ops; + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); + CURVNET_RESTORE(); if (error) goto freedata; Index: rpc/svc_generic.c =========================================================================== --- rpc/svc_generic.c 2008/08/25 00:28:58 #1 +++ rpc/svc_generic.c 2008/08/25 00:28:58 @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -203,7 +204,9 @@ socklen_t salen; if (sa == NULL) { + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); + CURVNET_RESTORE(); if (error) return (error); freesa = TRUE; @@ -324,6 +327,7 @@ /* * If the socket is unbound, try to bind it. 
*/ + CURVNET_SET(so->so_vnet); if (madeso || !__rpc_sockisbound(so)) { if (bindaddr == NULL) { if (bindresvport(so, NULL)) { @@ -393,9 +397,11 @@ if (nconf) { xprt->xp_netid = strdup(nconf->nc_netid, M_RPC); } + CURVNET_RESTORE(); return (xprt); freedata: + CURVNET_RESTORE(); if (madeso) (void)soclose(so); if (xprt) { Index: sys/domain.h =========================================================================== --- sys/domain.h 2008/08/25 00:28:58 #1 +++ sys/domain.h 2008/08/25 00:28:58 @@ -48,6 +48,8 @@ char *dom_name; void (*dom_init) /* initialize domain data structures */ (void); + void (*dom_destroy) /* cleanup structures / state */ + (void); int (*dom_externalize) /* externalize access rights */ (struct mbuf *, struct mbuf **); void (*dom_dispose) /* dispose of internalized rights */ @@ -56,6 +58,8 @@ struct domain *dom_next; int (*dom_rtattach) /* initialize routing table */ (void **, int); + int (*dom_rtdetach) /* clean up routing table */ + (void **, int); int dom_rtoffset; /* an arg to rtattach, in bits */ /* XXX MRT. * rtoffset May be 0 if the domain supplies its own rtattach(), Index: sys/kernel.h =========================================================================== --- sys/kernel.h 2008/08/25 00:28:58 #4 +++ sys/kernel.h 2008/08/25 00:28:58 @@ -58,8 +58,10 @@ extern struct mtx hostname_mtx; extern unsigned long hostid; extern char hostuuid[64]; +#ifndef VIMAGE extern char hostname[MAXHOSTNAMELEN]; extern char domainname[MAXHOSTNAMELEN]; +#endif extern char kernelname[MAXPATHLEN]; extern int tick; /* usec per tick (1000000 / hz) */ @@ -117,6 +119,7 @@ SI_SUB_MAC = 0x2180000, /* TrustedBSD MAC subsystem */ SI_SUB_MAC_POLICY = 0x21C0000, /* TrustedBSD MAC policies */ SI_SUB_MAC_LATE = 0x21D0000, /* TrustedBSD MAC subsystem */ + SI_SUB_VIMAGE = 0x21E0000, /* vimage 0 */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_VM_CONF = 0x2300000, /* config VM, set limits*/ SI_SUB_DDB_SERVICES = 0x2380000, /* capture, scripting, etc. 
*/ @@ -168,6 +171,7 @@ SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ + SI_SUB_VIMAGE_DONE = 0xef00000, /* clear curvnet*/ SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler*/ }; Index: sys/mbuf.h =========================================================================== --- sys/mbuf.h 2008/08/25 00:28:58 #1 +++ sys/mbuf.h 2008/08/25 00:28:58 @@ -192,6 +192,7 @@ #define M_PROTO6 0x00080000 /* protocol-specific */ #define M_PROTO7 0x00100000 /* protocol-specific */ #define M_PROTO8 0x00200000 /* protocol-specific */ +#define M_REMOTE_VNET 0x00400000 /* mbuf crossed boundary between two vnets */ /* * For RELENG_{6,7} steal these flags for limited multiple routing table * support. In RELENG_8 and beyond, use just one flag and a tag. Index: sys/proc.h =========================================================================== --- sys/proc.h 2008/08/25 00:28:58 #4 +++ sys/proc.h 2008/08/25 00:28:58 @@ -273,6 +273,8 @@ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. 
*/ + struct vnet *td_vnet; /* (*) Effective vnet */ + const char *td_vnet_lpush; /* (*) Debugging vnet push / pop */ }; struct mtx *thread_lock_block(struct thread *); Index: sys/protosw.h =========================================================================== --- sys/protosw.h 2008/08/25 00:28:58 #1 +++ sys/protosw.h 2008/08/25 00:28:58 @@ -72,6 +72,7 @@ typedef void pr_ctlinput_t (int, struct sockaddr *, void *); typedef int pr_ctloutput_t (struct socket *, struct sockopt *); typedef void pr_init_t (void); +typedef void pr_destroy_t (void); typedef void pr_fasttimo_t (void); typedef void pr_slowtimo_t (void); typedef void pr_drain_t (void); @@ -93,6 +94,7 @@ pr_usrreq_t *pr_ousrreq; /* utility hooks */ pr_init_t *pr_init; + pr_destroy_t *pr_destroy; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ Index: sys/sched.h =========================================================================== --- sys/sched.h 2008/08/25 00:28:58 #1 +++ sys/sched.h 2008/08/25 00:28:58 @@ -63,6 +63,7 @@ #define _SCHED_H_ #ifdef _KERNEL + /* * General scheduling info. * Index: sys/socketvar.h =========================================================================== --- sys/socketvar.h 2008/08/25 00:28:58 #3 +++ sys/socketvar.h 2008/08/25 00:28:58 @@ -45,6 +45,8 @@ #include #endif +struct vnet; + /* * Kernel structure per socket. * Contains send and receive buffer queues, @@ -72,6 +74,7 @@ short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ + struct vnet *so_vnet; /* network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. 
@@ -292,6 +295,7 @@ MALLOC_DECLARE(M_SONAME); #endif +extern int accf_unloadable; extern int maxsockets; extern u_long sb_max; extern struct uma_zone *socket_zone; Index: sys/sockio.h =========================================================================== --- sys/sockio.h 2008/08/25 00:28:58 #2 +++ sys/sockio.h 2008/08/25 00:28:58 @@ -108,6 +108,10 @@ #define SIOCGPRIVATE_0 _IOWR('i', 80, struct ifreq) /* device private 0 */ #define SIOCGPRIVATE_1 _IOWR('i', 81, struct ifreq) /* device private 1 */ +#define SIOCSPVIMAGE _IOW('i', 101, struct vi_req) /* set proc vimage */ +#define SIOCGPVIMAGE _IOWR('i', 102, struct vi_req) /* get proc vimage */ +#define SIOCSIFVIMAGE _IOWR('i', 103, struct vi_req) /* set ifc vi/net */ + #define SIOCSDRVSPEC _IOW('i', 123, struct ifdrv) /* set driver-specific parameters */ #define SIOCGDRVSPEC _IOWR('i', 123, struct ifdrv) /* get driver-specific Index: sys/ucred.h =========================================================================== --- sys/ucred.h 2008/08/25 00:28:58 #1 +++ sys/ucred.h 2008/08/25 00:28:58 @@ -35,6 +35,8 @@ #include +struct vimage; + /* * Credentials. * @@ -55,7 +57,9 @@ struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(2) */ - void *cr_pspare[3]; /* vimage 2; general use 1 */ + struct vimage *cr_vimage; /* effective vimage */ + struct vimage *cr_rvimage; /* real vimage */ + void *cr_pspare[1]; /* vimage 2; general use 1 */ #define cr_endcopy cr_label struct label *cr_label; /* MAC label */ struct auditinfo_addr cr_audit; /* Audit properties. 
*/ Index: sys/vimage.h =========================================================================== --- sys/vimage.h 2008/08/25 00:28:58 #14 +++ sys/vimage.h 2008/08/25 00:28:58 @@ -26,12 +26,11 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * * $FreeBSD: src/sys/sys/vimage.h,v 1.4 2008/08/25 05:49:16 julian Exp $ */ -#ifndef _SYS_VIMAGE_H_ -#define _SYS_VIMAGE_H_ +#ifndef _SYS_VIMAGE_H_ +#define _SYS_VIMAGE_H_ #include #include @@ -80,6 +79,7 @@ #define VNET_MOD_ALTQ 8 #define VNET_MOD_IPX 9 #define VNET_MOD_ATALK 10 +#define VNET_MOD_ACCF_HTTP 11 /* stateless modules */ #define VNET_MOD_NG_WORMHOLE 19 #define VNET_MOD_NG_ETHER 20