Index: share/man/man4/Makefile =========================================================================== --- share/man/man4/Makefile 2009/02/22 13:41:20 #309 +++ share/man/man4/Makefile 2009/02/22 13:41:20 @@ -86,6 +86,7 @@ em.4 \ en.4 \ enc.4 \ + epair.4 \ esp.4 \ et.4 \ exca.4 \ @@ -477,6 +478,7 @@ MLINKS+=em.4 if_em.4 MLINKS+=en.4 if_en.4 MLINKS+=enc.4 if_enc.4 +MLINKS+=epair.4 if_epair.4 MLINKS+=et.4 if_et.4 MLINKS+=faith.4 if_faith.4 MLINKS+=fatm.4 if_fatm.4 Index: share/man/man4/altq.4 =========================================================================== --- share/man/man4/altq.4 2009/02/22 13:41:20 #34 +++ share/man/man4/altq.4 2009/02/22 13:41:20 @@ -129,6 +129,7 @@ .Xr ed 4 , .Xr em 4 , .Xr ep 4 , +.Xr epair 4 , .Xr fxp 4 , .Xr gem 4 , .Xr hme 4 , Index: share/man/man4/epair.4 =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- share/man/man4/epair.4 Sun Feb 22 13:41:21 2009 *************** *** 0 **** --- 1,120 ---- + .\"- + .\" Copyright (c) 2008 The FreeBSD Foundation + .\" All rights reserved. + .\" + .\" This software was developed by CK Software GmbH under sponsorship + .\" from the FreeBSD Foundation. + .\" + .\" Redistribution and use in source and binary forms, with or without + .\" modification, are permitted provided that the following conditions + .\" are met: + .\" 1. Redistributions of source code must retain the above copyright + .\" notice, this list of conditions and the following disclaimer. + .\" 2. Redistributions in binary form must reproduce the above copyright + .\" notice, this list of conditions and the following disclaimer in the + .\" documentation and/or other materials provided with the distribution. 
+ .\" + .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + .\" SUCH DAMAGE. + .\" + .\" $FreeBSD$ + .\" + .Dd December 15, 2008 + .Dt EPAIR 4 + .Os + .Sh NAME + .Nm epair + .Nd Virtual cross-over Ethernet-like interface pair + .Sh SYNOPSIS + To compile this driver into the kernel, + place the following line in your + kernel configuration file: + .Bd -ragged -offset indent + .Cd "device epair" + .Ed + .Pp + Alternatively, to load the driver as a + module at boot time, place the following line in + .Xr loader.conf 5 : + .Bd -literal -offset indent + if_epair_load="YES" + .Ed + .Sh DESCRIPTION + The + .Nm + is a pair of Ethernet-like software interfaces, + which are directly connected by a virtual cross-over cable. + .Pp + Each + .Nm + interface pair is created at runtime using interface cloning. + This is most easily done with the + .Xr ifconfig 8 + .Cm create + command or using the + .Va cloned_interfaces + variable in + .Xr rc.conf 5 . + While for cloning you only give either + .Pa epair + or + .Pa epair<n> + the + .Nm + pair will be named like + .Pa epair<n>[ab] . + This means the names of the first + .Nm + interfaces will be + .Pa epair0a + and + .Pa epair0b . + .Pp + Like any other Ethernet interface, an + .Nm + needs to have a network address. 
+ Each + .Nm + will be assigned a locally administered address by default, + that is only guaranteed to be unique within one network stack. + To change the default addresses one may use the SIOCSIFADDR ioctl(2) or + ifconfig(8) utility. + .Pp + The basic intent is to provide connectivity between two virtual + network stack instances. + When connected to a + .Xr if_bridge 4 + one end of the interface pair can also be part of another (virtual) LAN. + As with any other Ethernet interface one can configure + .Xr vlan 4 + support on top of it. + .Pp + .Sh SEE ALSO + .Xr ioctl 2 , + .Xr altq 4 , + .Xr bpf 4 , + .Xr if_bridge 4 , + .Xr vlan 4 , + .Xr loader.conf 5 , + .Xr rc.conf 5 , + .Xr ifconfig 8 + .Sh HISTORY + The + .Nm + interface first appeared in + .Fx 8.0 . + .Sh AUTHORS + The + .Nm + interface was written by + .An Bjoern A. Zeeb, CK Software GmbH, + under sponsorship from the FreeBSD Foundation. Index: sys/amd64/conf/GENERIC_NODEBUG =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/amd64/conf/GENERIC_NODEBUG Sun Feb 22 13:41:21 2009 *************** *** 0 **** --- 1,9 ---- + include GENERIC + ident GENERIC_NODEBUG + + # Disable expensive debugging options + nooptions INVARIANTS + nooptions INVARIANT_SUPPORT + nooptions WITNESS + nooptions WITNESS_SKIPSPIN + Index: sys/amd64/conf/VIMAGE =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/amd64/conf/VIMAGE Sun Feb 22 13:41:21 2009 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC + ident VIMAGE + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. 
+ # + nooptions SCTP Index: sys/amd64/conf/VIMAGE_NODEBUG =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/amd64/conf/VIMAGE_NODEBUG Sun Feb 22 13:41:21 2009 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC_NODEBUG + ident VIMAGE_NODEBUG + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: sys/amd64/conf/VLINT =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/amd64/conf/VLINT Sun Feb 22 13:41:22 2009 *************** *** 0 **** --- 1,15 ---- + # + # VLINT = LINT + options vimage + nooptions SCTP + # + # $FreeBSD$ + # + include LINT + ident VLINT + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: sys/cddl/compat/opensolaris/kern/opensolaris_misc.c =========================================================================== --- sys/cddl/compat/opensolaris/kern/opensolaris_misc.c 2009/02/22 13:41:20 #3 +++ sys/cddl/compat/opensolaris/kern/opensolaris_misc.c 2009/02/22 13:41:20 @@ -37,7 +37,11 @@ char hw_serial[11] = "0"; struct opensolaris_utsname utsname = { +#ifdef VIMAGE + .nodename = "XXX" /* XXX FIXME!!! 
*/ +#else .nodename = hostname +#endif }; int Index: sys/compat/linux/linux_socket.c =========================================================================== --- sys/compat/linux/linux_socket.c 2009/02/22 13:41:20 #54 +++ sys/compat/linux/linux_socket.c 2009/02/22 13:41:20 @@ -583,7 +583,7 @@ static int linux_socket(struct thread *td, struct linux_socket_args *args) { -#ifdef INET6 +#if defined(INET6) && !defined(KLD_MODULE) INIT_VNET_INET6(curvnet); #endif struct socket_args /* { Index: sys/compat/svr4/svr4_stat.c =========================================================================== --- sys/compat/svr4/svr4_stat.c 2009/02/22 13:41:20 #19 +++ sys/compat/svr4/svr4_stat.c 2009/02/22 13:41:20 @@ -412,6 +412,8 @@ struct thread *td; struct svr4_sys_systeminfo_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); + char *str = NULL; int error = 0; register_t *retval = td->td_retval; Index: sys/conf/NOTES =========================================================================== --- sys/conf/NOTES 2009/02/22 13:41:20 #509 +++ sys/conf/NOTES 2009/02/22 13:41:20 @@ -843,6 +843,7 @@ # events for resetting the demand dial activity timer - requires bpf. # See pppd(8) for more details. 
# +device epair #Virtual cross-over Ethernet # Index: sys/conf/files =========================================================================== --- sys/conf/files 2009/02/22 13:41:20 #813 +++ sys/conf/files 2009/02/22 13:41:20 @@ -1959,8 +1959,8 @@ kern/kern_timeout.c standard kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vimage.c standard kern/kern_xxx.c standard -kern/kern_vimage.c standard kern/link_elf.c standard kern/linker_if.m standard kern/md4c.c optional netsmb @@ -2169,6 +2169,7 @@ net/if_edsc.c optional edsc net/if_ef.c optional ef net/if_enc.c optional enc +net/if_epair.c optional epair net/if_ethersubr.c optional ether \ compile-with "${NORMAL_C} -I$S/contrib/pf" net/if_faith.c optional faith @@ -2310,6 +2311,7 @@ netgraph/ng_nat.c optional netgraph_nat netgraph/ng_one2many.c optional netgraph_one2many netgraph/ng_parse.c optional netgraph +netgraph/ng_pipe.c optional netgraph_pipe netgraph/ng_ppp.c optional netgraph_ppp netgraph/ng_pppoe.c optional netgraph_pppoe netgraph/ng_pptpgre.c optional netgraph_pptpgre Index: sys/conf/options =========================================================================== --- sys/conf/options 2009/02/22 13:41:20 #377 +++ sys/conf/options 2009/02/22 13:41:20 @@ -480,6 +480,7 @@ NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h +NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h Index: sys/contrib/altq/altq/altq_subr.c =========================================================================== --- sys/contrib/altq/altq/altq_subr.c 2009/02/22 13:41:20 #16 +++ sys/contrib/altq/altq/altq_subr.c 2009/02/22 13:41:20 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -448,6 +449,7 @@ tbr_timeout(arg) void *arg; { + VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; int active, s; @@ -460,14 +462,23 @@ #if defined(__FreeBSD__) && (__FreeBSD_version >= 500000) IFNET_RLOCK(); 
#endif - for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { - /* read from if_snd unlocked */ - if (!TBR_IS_ENABLED(&ifp->if_snd)) - continue; - active++; - if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) - (*ifp->if_start)(ifp); + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + INIT_VNET_NET(vnet_iter); + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; + ifp = TAILQ_NEXT(ifp, if_list)) { + /* read from if_snd unlocked */ + if (!TBR_IS_ENABLED(&ifp->if_snd)) + continue; + active++; + if (!IFQ_IS_EMPTY(&ifp->if_snd) && + ifp->if_start != NULL) + (*ifp->if_start)(ifp); + } + CURVNET_RESTORE(); } + VNET_LIST_RUNLOCK(); #if defined(__FreeBSD__) && (__FreeBSD_version >= 500000) IFNET_RUNLOCK(); #endif Index: sys/contrib/ipfilter/netinet/ip_fil_freebsd.c =========================================================================== --- sys/contrib/ipfilter/netinet/ip_fil_freebsd.c 2009/02/22 13:41:20 #14 +++ sys/contrib/ipfilter/netinet/ip_fil_freebsd.c 2009/02/22 13:41:20 @@ -244,8 +244,10 @@ bzero((char *)frcache, sizeof(frcache)); fr_running = 1; - if (fr_control_forwarding & 1) + if (fr_control_forwarding & 1) { + INIT_VNET_INET(curvnet); V_ipforwarding = 1; + } SPL_X(s); #if (__FreeBSD_version >= 300000) @@ -267,8 +269,10 @@ #ifdef USE_SPL int s; #endif - if (fr_control_forwarding & 2) + if (fr_control_forwarding & 2) { + INIT_VNET_INET(curvnet); V_ipforwarding = 0; + } SPL_NET(s); @@ -646,6 +650,7 @@ fr_info_t *fin; mb_t *m, **mpp; { + INIT_VNET_INET(curvnet); fr_info_t fnew; ip_t *ip, *oip; int hlen; Index: sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c =========================================================================== --- sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 2009/02/22 13:41:20 #28 +++ sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 2009/02/22 13:41:20 @@ -1217,7 +1217,7 @@ * receive window. 
*/ static __inline int -select_rcv_wscale(int space) +select_rcv_wscale(int space, struct socket *so) { INIT_VNET_INET(so->so_vnet); int wscale = 0; @@ -1326,7 +1326,7 @@ calc_opt0h(struct socket *so, int mtu_idx) { struct tcpcb *tp = so_sototcpcb(so); - int wscale = select_rcv_wscale(tp->rcv_wnd); + int wscale = select_rcv_wscale(tp->rcv_wnd, so); return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | Index: sys/i386/conf/.cvsignore =========================================================================== --- sys/i386/conf/.cvsignore 2009/02/22 13:41:20 #1 +++ sys/i386/conf/.cvsignore 2009/02/22 13:41:20 @@ -1,1 +1,0 @@ -[A-Za-z0-9]* Index: sys/i386/conf/GENERIC_NODEBUG =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/i386/conf/GENERIC_NODEBUG Sun Feb 22 13:41:22 2009 *************** *** 0 **** --- 1,9 ---- + include GENERIC + ident GENERIC_NODEBUG + + # Disable expensive debugging options + nooptions INVARIANTS + nooptions INVARIANT_SUPPORT + nooptions WITNESS + nooptions WITNESS_SKIPSPIN + Index: sys/i386/conf/VIMAGE =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/i386/conf/VIMAGE Sun Feb 22 13:41:22 2009 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC + ident VIMAGE + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. 
+ # + nooptions SCTP Index: sys/i386/conf/VIMAGE_NODEBUG =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/i386/conf/VIMAGE_NODEBUG Sun Feb 22 13:41:22 2009 *************** *** 0 **** --- 1,16 ---- + # + # VIMAGE - sample kernel configuration file with a virtualized network stack + # configure. + # + # $FreeBSD$ + # + include GENERIC_NODEBUG + ident VIMAGE_NODEBUG + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: sys/i386/conf/VLINT =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/i386/conf/VLINT Sun Feb 22 13:41:22 2009 *************** *** 0 **** --- 1,15 ---- + # + # VLINT = LINT + options vimage + nooptions SCTP + # + # $FreeBSD$ + # + include LINT + ident VLINT + + options VIMAGE + + # + # Some kernel subsystems and functions don't yet compile with VIMAGE. Remove + # from the configuration for now. + # + nooptions SCTP Index: sys/i386/ibcs2/ibcs2_socksys.c =========================================================================== --- sys/i386/ibcs2/ibcs2_socksys.c 2009/02/22 13:41:20 #13 +++ sys/i386/ibcs2/ibcs2_socksys.c 2009/02/22 13:41:20 @@ -174,6 +174,7 @@ struct thread *td; struct setipdomainname_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); char hname[MAXHOSTNAMELEN], *ptr; int error, sctl[2], hlen; Index: sys/kern/init_main.c =========================================================================== --- sys/kern/init_main.c 2009/02/22 13:41:20 #124 +++ sys/kern/init_main.c 2009/02/22 13:41:20 @@ -74,6 +74,7 @@ #include #include #include +#include #include @@ -452,6 +453,11 @@ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. 
*/ +#ifdef VIMAGE + P_TO_VIMAGE(p) = LIST_FIRST(&vimage_head); + refcount_acquire(&P_TO_VIMAGE(p)->vi_ucredrefc); + LIST_FIRST(&vprocg_head)->nprocs++; +#endif #ifdef AUDIT audit_cred_kproc0(p->p_ucred); #endif Index: sys/kern/kern_exit.c =========================================================================== --- sys/kern/kern_exit.c 2009/02/22 13:41:20 #186 +++ sys/kern/kern_exit.c 2009/02/22 13:41:20 @@ -70,6 +70,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -739,6 +740,7 @@ nfound++; PROC_SLOCK(p); if (p->p_state == PRS_ZOMBIE) { + INIT_VPROCG(P_TO_VPROCG(p)); if (rusage) { *rusage = p->p_ru; calcru(p, &rusage->ru_utime, &rusage->ru_stime); @@ -839,6 +841,9 @@ uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; +#ifdef VIMAGE + vprocg->nprocs--; +#endif sx_xunlock(&allproc_lock); return (0); } Index: sys/kern/kern_fork.c =========================================================================== --- sys/kern/kern_fork.c 2009/02/22 13:41:20 #182 +++ sys/kern/kern_fork.c 2009/02/22 13:41:20 @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -349,6 +350,9 @@ * are hard-limits as to the number of processes that can run. */ nprocs++; +#ifdef VIMAGE + P_TO_VPROCG(p1)->nprocs++; +#endif /* * Find an unused process ID. We remember a range of unused IDs @@ -523,6 +527,11 @@ td2->td_sigmask = td->td_sigmask; td2->td_flags = TDF_INMEM; +#ifdef VIMAGE + td2->td_vnet = NULL; + td2->td_vnet_lpush = NULL; +#endif + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. 
Index: sys/kern/kern_jail.c =========================================================================== --- sys/kern/kern_jail.c 2009/02/22 13:41:20 #86 +++ sys/kern/kern_jail.c 2009/02/22 13:41:20 @@ -1197,6 +1197,10 @@ if (cred2->cr_prison != cred1->cr_prison) return (ESRCH); } +#ifdef VIMAGE + if (cred2->cr_vimage->v_procg != cred1->cr_vimage->v_procg) + return (ESRCH); +#endif return (0); } Index: sys/kern/kern_linker.c =========================================================================== --- sys/kern/kern_linker.c 2009/02/22 13:41:20 #102 +++ sys/kern/kern_linker.c 2009/02/22 13:41:20 @@ -992,7 +992,19 @@ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); +#ifdef VIMAGE + /* For now permit only the default vimage to kldload modules */ + if (!IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))) + return (EPERM); + /* + * It's possible that a kldloaded module will attach a new ifnet, + * so vnet context must be set when this occurs. + */ + CURVNET_SET(TD_TO_VNET(td)); +#endif + + /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. @@ -1019,6 +1031,7 @@ *fileid = lf->id; unlock: KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1056,6 +1069,11 @@ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); + /* XXX should suser catch this for us? 
*/ + VNET_ASSERT(IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))); + + CURVNET_SET(TD_TO_VNET(td)); + KLD_LOCK(); lf = linker_find_file_by_id(fileid); if (lf) { @@ -1092,6 +1110,7 @@ PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm); #endif KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } Index: sys/kern/kern_mib.c =========================================================================== --- sys/kern/kern_mib.c 2009/02/22 13:41:20 #50 +++ sys/kern/kern_mib.c 2009/02/22 13:41:20 @@ -356,6 +356,7 @@ static int sysctl_domainname(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); char tmpdomainname[MAXHOSTNAMELEN]; int error; Index: sys/kern/kern_prot.c =========================================================================== --- sys/kern/kern_prot.c 2009/02/22 13:41:20 #114 +++ sys/kern/kern_prot.c 2009/02/22 13:41:20 @@ -69,6 +69,7 @@ #include #include #include +#include #if defined(INET) || defined(INET6) #include @@ -1755,6 +1756,9 @@ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); +#ifdef VIMAGE + if (!vi_child_of(TD_TO_VIMAGE(td), P_TO_VIMAGE(p))) +#endif if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC @@ -1824,6 +1828,10 @@ */ if (jailed(cr)) prison_free(cr->cr_prison); +#ifdef VIMAGE + if (cr->cr_vimage != NULL) + refcount_release(&cr->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_destroy(cr); #endif @@ -1859,6 +1867,10 @@ uihold(dest->cr_ruidinfo); if (jailed(dest)) prison_hold(dest->cr_prison); +#ifdef VIMAGE + KASSERT(src->cr_vimage != NULL, ("cr_vimage == NULL")); + refcount_acquire(&dest->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_copy(src, dest); #endif Index: sys/kern/kern_sysctl.c =========================================================================== --- sys/kern/kern_sysctl.c 2009/02/22 13:41:20 #78 +++ sys/kern/kern_sysctl.c 2009/02/22 13:41:20 @@ -931,6 +931,32 @@ } +#ifdef VIMAGE +int +sysctl_handle_v_int(SYSCTL_HANDLER_ARGS) +{ + 
int tmpout, error = 0; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by making a copy of the data. + */ + tmpout = *(int *)arg1; + error = SYSCTL_OUT(req, &tmpout, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} +#endif + + /* * Based on on sysctl_handle_int() convert milliseconds into ticks. */ @@ -940,7 +966,9 @@ { int error, s, tt; - tt = *(int *)oidp->oid_arg1; + SYSCTL_RESOLVE_V_ARG1(); + + tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); @@ -951,7 +979,7 @@ if (tt < 1) return (EINVAL); - *(int *)oidp->oid_arg1 = tt; + *(int *)arg1 = tt; return (0); } @@ -1065,6 +1093,48 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_string(SYSCTL_HANDLER_ARGS) +{ + int error=0; + char *tmparg; + size_t outlen; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by copying to a + * temporary kernel buffer. + */ +retry: + outlen = strlen((char *)arg1)+1; + tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); + + if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { + free(tmparg, M_SYSCTLTMP); + goto retry; + } + + error = SYSCTL_OUT(req, tmparg, outlen); + free(tmparg, M_SYSCTLTMP); + + if (error || !req->newptr) + return (error); + + if ((req->newlen - req->newidx) >= arg2) { + error = EINVAL; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} +#endif + + /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. 
@@ -1102,6 +1172,35 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_opaque(SYSCTL_HANDLER_ARGS) +{ + int error, tries; + u_int generation; + struct sysctl_req req2; + + SYSCTL_RESOLVE_V_ARG1(); + + tries = 0; + req2 = *req; +retry: + generation = curthread->td_generation; + error = SYSCTL_OUT(req, arg1, arg2); + if (error) + return (error); + tries++; + if (generation != curthread->td_generation && tries < 3) { + *req = req2; + goto retry; + } + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} +#endif + /* * Transfer functions to/from kernel space. * XXX: rather untested at this point Index: sys/kern/kern_vimage.c =========================================================================== --- sys/kern/kern_vimage.c 2009/02/22 13:41:20 #1 +++ sys/kern/kern_vimage.c 2009/02/22 13:41:20 @@ -31,68 +31,890 @@ #include __FBSDID("$FreeBSD: src/sys/kern/kern_vimage.c,v 1.1 2008/12/10 23:12:39 zec Exp $"); +#include "opt_ddb.h" + #include #include #include #include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#ifdef DDB +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +struct vnet_modlink; + +//#define DEBUG_ORDERING + +MALLOC_DEFINE(M_VIMAGE, "vimage", "vimage resource container"); +MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); +MALLOC_DEFINE(M_VPROCG, "vprocg", "process group control block"); +MALLOC_DEFINE(M_VCPU, "vcpu", "cpu resource control block"); + #ifndef VIMAGE_GLOBALS +static int vnet_mod_constructor(struct vnet_modlink *); +static int vnet_mod_destructor(struct vnet_modlink *); +#endif + +#ifdef VIMAGE +static struct vimage *vi_alloc(struct vimage *, char *); +static int vi_destroy(struct vimage *); +#endif -MALLOC_DEFINE(M_VIMAGE, "vimage", "vimage resource container"); +#ifndef VIMAGE +#ifndef VIMAGE_GLOBALS +struct vprocg vprocg_0; +#endif +#endif + +#define vi_malloc(addr, type, flags) 
malloc((addr), (type), (flags)) +#define vi_free(addr, type) free((addr), (type)) +#ifndef VIMAGE_GLOBALS static TAILQ_HEAD(vnet_modlink_head, vnet_modlink) vnet_modlink_head; +static TAILQ_HEAD(vnet_modpending_head, vnet_modlink) vnet_modpending_head; +static void vnet_mod_complete_registration(struct vnet_modlink *); +int +vi_symlookup(struct kld_sym_lookup *lookup, char *symstr) +{ + struct vnet_modlink *vml; + struct vnet_symmap *mapentry; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) { + + if (vml->vml_modinfo->vmi_symmap == NULL) + continue; + + for (mapentry = vml->vml_modinfo->vmi_symmap; + mapentry->name != NULL; mapentry++) { + if (strcmp(symstr, mapentry->name) == 0) { +#ifdef VIMAGE + lookup->symvalue = + (u_long) curvnet->mod_data[vml->vml_modinfo->vmi_id]; + lookup->symvalue += mapentry->offset; +#else + lookup->symvalue = (u_long) mapentry->offset; +#endif + lookup->symsize = mapentry->size; + return (0); + } + } + } + return (ENOENT); +} + void vnet_mod_register(const struct vnet_modinfo *vmi) { + vnet_mod_register_multi(vmi, NULL, NULL); +} + +void +vnet_mod_register_multi(const struct vnet_modinfo *vmi, void *iarg, + char *iname) +{ struct vnet_modlink *vml, *vml_iter; /* Do not register the same module instance more than once. */ TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) - if (vml_iter->vml_modinfo == vmi) - panic("%s: %s", __func__, vmi->vmi_name); - vml = malloc(sizeof(struct vnet_modlink), M_VIMAGE, M_NOWAIT); + if (vml_iter->vml_modinfo == vmi && vml_iter->vml_iarg == iarg) + break; + if (vml_iter != NULL) + panic("attempt to register an already registered vnet module"); + vml = vi_malloc(sizeof(struct vnet_modlink), M_VIMAGE, M_NOWAIT); + + /* + * XXX we support only statically assigned module IDs at the time. + * In principle modules should be able to get a dynamically + * assigned ID at registration time. 
+ */ + VNET_ASSERT(vmi->vmi_id > 0 || vmi->vmi_id < VNET_MOD_MAX); + VNET_ASSERT(!((iarg == NULL) ^ (iname == NULL))); + vml->vml_modinfo = vmi; + vml->vml_iarg = iarg; + vml->vml_iname = iname; + + /* Check whether the module we depend on is already registered */ + if (vmi->vmi_dependson != VNET_MOD_NONE) { + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_id == + vmi->vmi_dependson) + break; /* Depencency found, we are done */ + if (vml_iter == NULL) { +#ifdef DEBUG_ORDERING + printf("dependency %d missing for vnet mod %s," + "postponing registration\n", + vmi->vmi_dependson, vmi->vmi_name); +#endif /* DEBUG_ORDERING */ + TAILQ_INSERT_TAIL(&vnet_modpending_head, vml, + vml_mod_le); + return; + } + } + + vnet_mod_complete_registration(vml); +} + +void +vnet_mod_complete_registration(struct vnet_modlink *vml) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct vnet_modlink *vml_iter; + TAILQ_INSERT_TAIL(&vnet_modlink_head, vml, vml_mod_le); + + VNET_FOREACH(vnet_iter) { + CURVNET_SET_QUIET(vnet_iter); + vnet_mod_constructor(vml); + CURVNET_RESTORE(); + } + + /* Check for pending modules depending on us */ + do { + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_dependson == + vml->vml_modinfo->vmi_id) + break; + if (vml_iter != NULL) { +#ifdef DEBUG_ORDERING + printf("vnet mod %s now registering," + "dependency %d loaded\n", + vml_iter->vml_modinfo->vmi_name, + vml->vml_modinfo->vmi_id); +#endif /* DEBUG_ORDERING */ + TAILQ_REMOVE(&vnet_modpending_head, vml_iter, + vml_mod_le); + vnet_mod_complete_registration(vml_iter); + } + } while (vml_iter != NULL); } +void +vnet_mod_deregister(const struct vnet_modinfo *vmi) +{ + vnet_mod_deregister_multi(vmi, NULL, NULL); +} + +void +vnet_mod_deregister_multi(const struct vnet_modinfo *vmi, void *iarg, + char *iname) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + if 
(vml->vml_modinfo == vmi && vml->vml_iarg == iarg) + break; + if (vml == NULL) + panic("cannot deregister unregistered vnet module %s", + vmi->vmi_name); + + VNET_FOREACH(vnet_iter) { + CURVNET_SET_QUIET(vnet_iter); + vnet_mod_destructor(vml); + CURVNET_RESTORE(); + } + + TAILQ_REMOVE(&vnet_modlink_head, vml, vml_mod_le); + vi_free(vml, M_VIMAGE); +} + +static int vnet_mod_constructor(struct vnet_modlink *vml) +{ + const struct vnet_modinfo *vmi = vml->vml_modinfo; + +#ifdef DEBUG_ORDERING + printf("instatiating vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_struct_size) + printf("malloc(%d); ", vmi->vmi_struct_size); + if (vmi->vmi_iattach != NULL) + printf("iattach()"); + printf("\n"); +#endif + +#ifdef VIMAGE + if (vmi->vmi_struct_size) { + void *mem = vi_malloc(vmi->vmi_struct_size, M_VNET, + M_NOWAIT | M_ZERO); + if (mem == NULL) /* XXX should return error, not panic */ + panic("vi_alloc: malloc for %s\n", vmi->vmi_name); + curvnet->mod_data[vmi->vmi_id] = mem; + } +#endif + + if (vmi->vmi_iattach != NULL) + vmi->vmi_iattach(vml->vml_iarg); + + return (0); +} + +static int vnet_mod_destructor(struct vnet_modlink *vml) +{ + const struct vnet_modinfo *vmi = vml->vml_modinfo; + +#ifdef DEBUG_ORDERING + printf("destroying vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_idetach != NULL) + printf("idetach(); "); + if (vmi->vmi_struct_size) + printf("free()"); + printf("\n"); +#endif + + if (vmi->vmi_idetach) + vmi->vmi_idetach(vml->vml_iarg); + +#ifdef VIMAGE + if (vmi->vmi_struct_size) { + if (curvnet->mod_data[vmi->vmi_id] == NULL) + panic("vi_destroy: %s\n", vmi->vmi_name); + vi_free(curvnet->mod_data[vmi->vmi_id], M_VNET); + curvnet->mod_data[vmi->vmi_id] = NULL; + } +#endif + + return (0); +} +#endif /* !VIMAGE_GLOBALS */ + +#ifdef VIMAGE +struct vimage_list_head vimage_head; +struct vnet_list_head vnet_head; +struct vprocg_list_head 
vprocg_head; +struct vcpu_list_head vcpu_head; + +struct cv vnet_list_condvar; +struct mtx vnet_list_refc_mtx; +int vnet_list_refc = 0; + +struct mtx vcpu_list_mtx; + +#define VNET_LIST_LOCK() \ + mtx_lock(&vnet_list_refc_mtx); \ + while (vnet_list_refc != 0) \ + cv_wait(&vnet_list_condvar, &vnet_list_refc_mtx); + +#define VNET_LIST_UNLOCK() \ + mtx_unlock(&vnet_list_refc_mtx); + +static u_int last_vi_id = 0; +static u_int last_vnet_id = 0; +static u_int last_vprocg_id = 0; +static u_int last_vcpu_id = 0; + +struct vimage * +vnet2vimage(struct vnet *vnet) +{ + struct vimage *vip; + + LIST_FOREACH(vip, &vimage_head, vi_le) + if (vip->v_net == vnet) + return(vip); + + panic("vnet2vimage"); /* must never happen */ +} + +char * +vnet_name(struct vnet *vnet) +{ + return(vnet2vimage(vnet)->vi_name); +} + + +int +vi_child_of(struct vimage *parent, struct vimage *child) +{ + if (child == parent) + return (0); + for (; child; child = child->vi_parent) + if (child == parent) + return (1); + return (0); +} + /* - * vi_symlookup() attempts to resolve name to address queries for - * variables which have been moved from global namespace to virtualization - * container structures, but are still directly accessed from legacy - * userspace processes via kldsym(2) and kmem(4) interfaces. + * if_reassign_common() should be called by all device specific + * ifnet reassignment routines after the interface is detached from + * current vnet and before the interface gets attached to the target + * vnet. This routine attempts to shrink if_index in current vnet, + * find an unused if_index in target vnet and calls if_grow() if + * necessary, and finally finds an unused if_xname for the target + * vnet. + * + * XXX this routine should hold a lock over if_index and return with + * such a lock held, and the caller should release that lock + * after ifattach completes! 
+ */ +void +if_reassign_common(struct ifnet *ifp, struct vnet *new_vnet, const char *dname) +{ + /* do/while construct needed to confine scope of INIT_VNET_NET() */ + do { + INIT_VNET_NET(curvnet); + + IFNET_WLOCK(); + ifnet_setbyindex(ifp->if_index, NULL); + while (V_if_index > 0 && + ifnet_byindex_locked(V_if_index) == NULL) + V_if_index--; + IFNET_WUNLOCK(); + } while (0); + + CURVNET_SET_QUIET(new_vnet); + INIT_VNET_NET(new_vnet); + /* + * Try to find an empty slot below if_index. If we fail, take + * the next slot. + */ + IFNET_WLOCK(); + for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { + if (ifnet_byindex_locked(ifp->if_index) == NULL) + break; + } + /* Catch if_index overflow. */ + if (ifp->if_index < 1) + panic("vi_if_move: if_index overflow"); + + if (ifp->if_index > V_if_index) + V_if_index = ifp->if_index; + if (V_if_index >= V_if_indexlim) + if_grow(); + ifnet_setbyindex(ifp->if_index, ifp); + IFNET_WUNLOCK(); + + /* Rename the ifnet */ + if (new_vnet == ifp->if_home_vnet) { + /* always restore the original name on return to home vnet */ + if_initname(ifp, ifp->if_dname, ifp->if_dunit); + } else { + int unit = 0; + struct ifnet *iter; + + do { + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", dname, unit); + TAILQ_FOREACH(iter, &V_ifnet, if_link) + if (strcmp(ifp->if_xname, iter->if_xname) == 0) + break; + unit++; + } while (iter); + } + CURVNET_RESTORE(); +} + +/* + * Move the interface to another vnet. The interface can be specified either + * by ifp argument, or by name contained in vi_req->vi_chroot if NULL is + * passed as ifp. The interface will be renamed to vi_req->vi_if_xname + * if vi_req->vi_if_xname is not an empty string (uff ugly ugly)... + * Similarly, the target vnet is specified by vi_req->vi_name, resolved + * relative to vip. If that name is ".." or vi_req is NULL, the + * interface is moved to the parent vnet.
*/ int -vi_symlookup(struct kld_sym_lookup *lookup, char *symstr) +vi_if_move(struct vi_req *vi_req, struct ifnet *ifp, struct vimage *vip) +{ + struct vimage *new_vip; + struct vnet *new_vnet = NULL; + + if (vi_req == NULL || strcmp(vi_req->vi_name, "..") == 0) { + if (IS_DEFAULT_VIMAGE(vip)) + return (ENXIO); + new_vnet = vip->vi_parent->v_net; + } else { + new_vip = vimage_by_name(vip, vi_req->vi_name); + if (new_vip == NULL) + return (ENXIO); + new_vnet = new_vip->v_net; + } + + if (ifp == NULL) + ifp = ifunit(vi_req->vi_chroot); + if (ifp == NULL) + return (ENXIO); + + /* Abort if driver did not provide a if_reassign() method */ + if (ifp->if_reassign == NULL) + return (ENODEV); + + if (vi_req != NULL) { + struct ifnet *t_ifp; + + CURVNET_SET_QUIET(new_vnet); + t_ifp = ifunit(vi_req->vi_if_xname); + CURVNET_RESTORE(); + if (t_ifp != NULL) + return (EEXIST); + } + + if (vi_req && strlen(vi_req->vi_if_xname) > 0) + ifp->if_reassign(ifp, new_vnet, vi_req->vi_if_xname); + else + ifp->if_reassign(ifp, new_vnet, NULL); + getmicrotime(&ifp->if_lastchange); + + /* Report the new if_xname back to the userland */ + if (vi_req != NULL) + sprintf(vi_req->vi_chroot, "%s", ifp->if_xname); + + return (0); +} + + +struct vimage * +vimage_by_name(struct vimage *top, char *name) +{ + struct vimage *vip; + char *next_name; + int namelen; + + next_name = strchr(name, '.'); + if (next_name != NULL) { + namelen = next_name - name; + next_name++; + if (namelen == 0) { + if (strlen(next_name) == 0) + return(top); /* '.' 
== this vimage */ + else + return(NULL); + } + } else + namelen = strlen(name); + if (namelen == 0) + return(NULL); + LIST_FOREACH(vip, &top->vi_child_head, vi_sibling) + if (strlen(vip->vi_name) == namelen && + strncmp(name, vip->vi_name, namelen) == 0) { + if (next_name != NULL) + return(vimage_by_name(vip, next_name)); + else + return(vip); + } + return(NULL); +} + + +static void +vimage_relative_name(struct vimage *top, struct vimage *where, + char *buffer, int bufflen) +{ + int used = 1; + + if (where == top) { + sprintf(buffer, "."); + return; + } else + *buffer = 0; + + do { + int namelen = strlen(where->vi_name); + + if (namelen + used + 1 >= bufflen) + panic("buffer overflow"); + + if (used > 1) { + bcopy(buffer, &buffer[namelen + 1], used); + buffer[namelen] = '.'; + used++; + } else + bcopy(buffer, &buffer[namelen], used); + bcopy(where->vi_name, buffer, namelen); + used += namelen; + where = where->vi_parent; + } while (where != top); +} + + +static struct vimage * +vimage_get_next(struct vimage *top, struct vimage *where, int recurse) +{ + struct vimage *next; + + if (recurse) { + /* Try to go deeper in the hierarchy */ + next = LIST_FIRST(&where->vi_child_head); + if (next != NULL) + return(next); + } + + do { + /* Try to find next sibling */ + next = LIST_NEXT(where, vi_sibling); + if (!recurse || next != NULL) + return(next); + + /* Nothing left on this level, go one level up */ + where = where->vi_parent; + } while (where != top->vi_parent); + + /* Nothing left to be visited, we are done */ + return(NULL); +} + + +int +vi_td_ioctl(u_long cmd, struct vi_req *vi_req, struct thread *td) +{ + int error; + struct vimage *vip = TD_TO_VIMAGE(td); + struct vimage *vip_r = NULL; + + error = priv_check(td, PRIV_ROOT); + if (error) + return (error); + + vip_r = vimage_by_name(vip, vi_req->vi_name); + if (vip_r == NULL && !(vi_req->req_action & VI_CREATE)) + return (ESRCH); + if (vip_r != NULL && vi_req->req_action & VI_CREATE) + return (EADDRINUSE); + if 
(vi_req->req_action == VI_GETNEXT) { + vip_r = vimage_get_next(vip, vip_r, 0); + if (vip_r == NULL) + return (ESRCH); + } + if (vi_req->req_action == VI_GETNEXT_RECURSE) { + vip_r = vimage_get_next(vip, vip_r, 1); + if (vip_r == NULL) + return (ESRCH); + } + + if (vip_r && !vi_child_of(vip, vip_r) && /* XXX delete the rest? */ + vi_req->req_action != VI_GET && vi_req->req_action != VI_GETNEXT) + return (EPERM); + + switch (cmd) { + + case SIOCGPVIMAGE: + vimage_relative_name(vip, vip_r, vi_req->vi_name, + sizeof (vi_req->vi_name)); + vi_req->vi_proc_count = vip_r->v_procg->nprocs; + vi_req->vi_if_count = vip_r->v_net->ifccnt; + vi_req->vi_sock_count = vip_r->v_net->sockcnt; + vi_req->cp_time_avg = vip_r->v_cpu->_avg2_fixp; + break; + + case SIOCSPVIMAGE: + if (vi_req->req_action == VI_DESTROY) { + error = vi_destroy(vip_r); + break; + } + + if (vi_req->req_action == VI_SWITCHTO) { + struct proc *p = td->td_proc; + struct ucred *oldcred, *newcred; + + /* + * XXX priv_check()? + * XXX allow only a single td per proc here? 
+ */ + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + setsugid(p); + crcopy(newcred, oldcred); + refcount_release(&newcred->cr_vimage->vi_ucredrefc); + newcred->cr_vimage = vip_r; + refcount_acquire(&newcred->cr_vimage->vi_ucredrefc); + p->p_ucred = newcred; + PROC_UNLOCK(p); + sx_xlock(&allproc_lock); + oldcred->cr_vimage->v_procg->nprocs--; + refcount_release(&oldcred->cr_vimage->vi_ucredrefc); + P_TO_VPROCG(p)->nprocs++; +#if 0 + sched_load_reassign(oldcred->cr_vimage->v_procg, + newcred->cr_vimage->v_procg); +#endif + sx_xunlock(&allproc_lock); + crfree(oldcred); + break; + } + + if (vi_req->req_action & VI_CREATE) { + char *dotpos; + + dotpos = strrchr(vi_req->vi_name, '.'); + if (dotpos != NULL) { + *dotpos = 0; + vip = vimage_by_name(vip, vi_req->vi_name); + if (vip == NULL) + return (ESRCH); + dotpos++; + vip_r = vi_alloc(vip, dotpos); + } else + vip_r = vi_alloc(vip, vi_req->vi_name); + if (vip_r == NULL) + return (ENOMEM); + } + + /* XXX What the hell is this doing here? */ + if (vip == vip_r && !IS_DEFAULT_VIMAGE(vip)) + return (EPERM); + } + + return (error); +} + + +struct vimage * +vi_alloc(struct vimage *parent, char *name) +{ + struct vimage *vip; + struct vnet *vnet; + struct vprocg *vprocg; + struct vcpu *vcpu; + struct vnet_modlink *vml; + + /* + * XXX don't forget the locking + */ + + /* A brute force check whether there's enough mem for a new vimage */ + vip = malloc(512*1024, M_VIMAGE, M_NOWAIT); /* XXX aaaargh... 
*/ + if (vip == NULL) + goto vi_alloc_done; + free(vip, M_VIMAGE); + + vip = vi_malloc(sizeof(struct vimage), M_VIMAGE, M_NOWAIT | M_ZERO); + if (vip == NULL) + panic("vi_alloc: malloc failed for vimage \"%s\"\n", name); + vip->vi_id = last_vi_id++; + LIST_INIT(&vip->vi_child_head); + sprintf(vip->vi_name, "%s", name); + vip->vi_parent = parent; + /* XXX locking */ + if (parent != NULL) + LIST_INSERT_HEAD(&parent->vi_child_head, vip, vi_sibling); + else if (!LIST_EMPTY(&vimage_head)) + panic("there can be only one default vimage!"); + LIST_INSERT_HEAD(&vimage_head, vip, vi_le); + + vnet = vi_malloc(sizeof(struct vnet), M_VNET, M_NOWAIT | M_ZERO); + if (vnet == NULL) + panic("vi_alloc: malloc failed for vnet \"%s\"\n", name); + vip->v_net = vnet; + vnet->vnet_id = last_vnet_id++; + vnet->vnet_magic_n = VNET_MAGIC_N; + + vprocg = vi_malloc(sizeof(struct vprocg), M_VPROCG, M_NOWAIT | M_ZERO); + if (vprocg == NULL) + panic("vi_alloc: malloc failed for vprocg \"%s\"\n", name); + vip->v_procg = vprocg; + vprocg->vprocg_id = last_vprocg_id++; + + vcpu = vi_malloc(sizeof(struct vcpu), M_VCPU, M_NOWAIT | M_ZERO); + if (vcpu == NULL) + panic ("vi_alloc: malloc failed for vcpu \"%s\"\n", name); + vip->v_cpu = vcpu; + vcpu->vcpu_id = last_vcpu_id++; + + /* Initialize / attach vnet module instances. */ + CURVNET_SET_QUIET(vnet); + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + vnet_mod_constructor(vml); + CURVNET_RESTORE(); + + VNET_LIST_LOCK(); + LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); + VNET_LIST_UNLOCK(); + + /* XXX locking */ + LIST_INSERT_HEAD(&vprocg_head, vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_INSERT_HEAD(&vcpu_head, vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + +vi_alloc_done: + return (vip); +} + + +/* + * Destroy a vnet - unlink all linked lists, free all the memory, stop all + * the timers... How can one ever be sure to have done *all* the necessary + * steps? 
+ */ +static int +vi_destroy(struct vimage *vip) { + struct vnet *vnet = vip->v_net; + struct vprocg *vprocg = vip->v_procg; + struct vcpu *vcpu = vip->v_cpu; + struct ifnet *ifp, *nifp; struct vnet_modlink *vml; - struct vnet_symmap *mapentry; + + /* XXX Beware of races -> more locking to be done... */ + if (!LIST_EMPTY(&vip->vi_child_head)) + return (EBUSY); + + if (vprocg->nprocs != 0) + return (EBUSY); + + if (vnet->sockcnt != 0) + return (EBUSY); + + if (vip->vi_ucredrefc != 0) + printf("vi_destroy: %s ucredrefc %d\n", + vip->vi_name, vip->vi_ucredrefc); + + /* Point of no return - cleanup MUST succeed! */ + /* XXX locking */ + LIST_REMOVE(vip, vi_le); + LIST_REMOVE(vip, vi_sibling); + + /* XXX locking */ + LIST_REMOVE(vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_REMOVE(vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + + VNET_LIST_LOCK(); + LIST_REMOVE(vnet, vnet_le); + VNET_LIST_UNLOCK(); + + CURVNET_SET_QUIET(vnet); + INIT_VNET_NET(vnet); - TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) { - if (vml->vml_modinfo->vmi_symmap == NULL) - continue; - for (mapentry = vml->vml_modinfo->vmi_symmap; - mapentry->name != NULL; mapentry++) { - if (strcmp(symstr, mapentry->name) == 0) { - lookup->symvalue = (u_long) mapentry->base; - lookup->symsize = mapentry->size; - return (0); - } - } + /* + * Return all inherited interfaces to the parent vnet; for + * interfaces native to this vnet, attempt to destroy the clone. + */ + TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + if (ifp->if_home_vnet != ifp->if_vnet) + vi_if_move(NULL, ifp, vip); + else + if_clone_destroy(ifp->if_xname); } - return (ENOENT); + + /* Detach / free per-module state instances.
*/ + TAILQ_FOREACH_REVERSE(vml, &vnet_modlink_head, + vnet_modlink_head, vml_mod_le) + vnet_mod_destructor(vml); + +#if 0 + free((caddr_t)vnet->ifnet_addrs, M_IFADDR); + free((caddr_t)vnet->ifindex2ifnet, M_IFADDR); +#endif + + CURVNET_RESTORE(); + + /* hopefully, we are finally OK to free the vnet container itself! */ + vnet->vnet_magic_n = 0xdeadbeef; + vi_free(vnet, M_VNET); + vi_free(vprocg, M_VPROCG); + vi_free(vcpu, M_VCPU); + vi_free(vip, M_VIMAGE); + + return (0); } +#endif /* VIMAGE */ static void vi_init(void *unused) { + /* vnet module list is both forward and reverse traversable */ TAILQ_INIT(&vnet_modlink_head); + TAILQ_INIT(&vnet_modpending_head); + +#ifdef VIMAGE + LIST_INIT(&vimage_head); + LIST_INIT(&vnet_head); + LIST_INIT(&vprocg_head); + LIST_INIT(&vcpu_head); + + mtx_init(&vnet_list_refc_mtx, "vnet_list_refc_mtx", NULL, MTX_DEF); + cv_init(&vnet_list_condvar, "vnet_list_condvar"); + + mtx_init(&vcpu_list_mtx, "vcpu_list_mtx", NULL, MTX_SPIN); + + vi_alloc(NULL, ""); /* Default vimage has no name */ + + /* We MUST clear curvnet in vi_init_done before going SMP. 
*/ + curvnet = LIST_FIRST(&vnet_head); +#endif } +static void +vi_init_done(void *unused) +{ + struct vnet_modlink *vml_iter; + +#ifdef VIMAGE + curvnet = NULL; +#endif + + if (TAILQ_EMPTY(&vnet_modpending_head)) + return; + + printf("vnet modules with unresolved dependencies:\n"); + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + printf(" %s depending on %d:\n", + vml_iter->vml_modinfo->vmi_name, + vml_iter->vml_modinfo->vmi_dependson); + panic("going nowhere without my vnet modules!"); +} + SYSINIT(vimage, SI_SUB_VIMAGE, SI_ORDER_FIRST, vi_init, NULL); +SYSINIT(vimage_done, SI_SUB_VIMAGE_DONE, SI_ORDER_FIRST, vi_init_done, NULL); + +#ifdef VIMAGE +#ifdef DDB +static void +db_vnet_ptr(void *arg) +{ + if (arg) + db_printf(" %p", arg); + else + db_printf(" 0"); +} -#endif /* !VIMAGE_GLOBALS */ +DB_SHOW_COMMAND(vnets, db_show_vnets) +{ + VNET_ITERATOR_DECL(vnet_iter); + + db_printf(" vnet ifs socks"); + db_printf(" net inet inet6 ipsec netgraph\n"); + VNET_FOREACH(vnet_iter) { + db_printf("%p %3d %5d", + vnet_iter, vnet_iter->ifccnt, vnet_iter->sockcnt); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET6]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_IPSEC]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NETGRAPH]); + db_printf("\n"); + } +} +#endif +#endif /* VIMAGE */ Index: sys/kern/subr_pcpu.c =========================================================================== --- sys/kern/subr_pcpu.c 2009/02/22 13:41:20 #12 +++ sys/kern/subr_pcpu.c 2009/02/22 13:41:20 @@ -135,6 +135,10 @@ db_printf("none\n"); db_show_mdpcpu(pc); +#ifdef VIMAGE + db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); +#endif + #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks); Index: sys/kern/sys_socket.c =========================================================================== --- sys/kern/sys_socket.c 2009/02/22 13:41:20 #45 +++ 
sys/kern/sys_socket.c 2009/02/22 13:41:20 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -74,16 +75,19 @@ int flags, struct thread *td) { struct socket *so = fp->f_data; -#ifdef MAC int error; +#ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(active_cred, so); SOCK_UNLOCK(so); if (error) return (error); #endif - return (soreceive(so, 0, uio, 0, 0, 0)); + CURVNET_SET(so->so_vnet); + error = soreceive(so, 0, uio, 0, 0, 0); + CURVNET_RESTORE(); + return (error); } /* ARGSUSED */ @@ -125,6 +129,7 @@ struct socket *so = fp->f_data; int error = 0; + CURVNET_SET(so->so_vnet); switch (cmd) { case FIONBIO: SOCK_LOCK(so); @@ -205,6 +210,7 @@ (so, cmd, data, 0, td)); break; } + CURVNET_RESTORE(); return (error); } Index: sys/kern/uipc_accf.c =========================================================================== --- sys/kern/uipc_accf.c 2009/02/22 13:41:20 #15 +++ sys/kern/uipc_accf.c 2009/02/22 13:41:20 @@ -58,11 +58,12 @@ MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); -static int unloadable = 0; +int accf_unloadable = 0; SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */ SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); -SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, +SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, + &accf_unloadable, 0, "Allow unload of accept filters (not recommended)"); /* @@ -144,7 +145,7 @@ * having it called is a bad thing. A simple fix would be to * track the refcount in the struct accept_filter. 
*/ - if (unloadable != 0) { + if (accf_unloadable != 0) { error = accept_filt_del(accfp->accf_name); } else error = EOPNOTSUPP; Index: sys/kern/uipc_domain.c =========================================================================== --- sys/kern/uipc_domain.c 2009/02/22 13:41:20 #32 +++ sys/kern/uipc_domain.c 2009/02/22 13:41:20 @@ -43,6 +43,7 @@ #include #include #include +#include #include /* @@ -64,6 +65,11 @@ SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL); +static vnet_attach_fn net_init_domain; +#ifdef VIMAGE +static vnet_detach_fn net_detach_domain; +#endif + static struct callout pffast_callout; static struct callout pfslow_callout; @@ -100,6 +106,9 @@ .pru_sopoll = pru_sopoll_notsupp, }; +VNET_MOD_DECLARE_STATELESS(DOMAIN, domain, net_init_domain, net_detach_domain, + NET) + static void protosw_init(struct protosw *pr) { @@ -155,13 +164,12 @@ } /* - * Add a new protocol domain to the list of supported domains - * Note: you cant unload it again because a socket may be using it. - * XXX can't fail at this time. + * Initialize a domain instance. */ -static void -net_init_domain(struct domain *dp) +static int +net_init_domain(const void *arg) { + const struct domain *dp = arg; struct protosw *pr; if (dp->dom_init) @@ -175,9 +183,30 @@ max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); + return 0; } +#ifdef VIMAGE /* + * Detach / free a domain instance. + */ +static int +net_detach_domain(const void *arg) +{ + const struct domain *dp = arg; + struct protosw *pr; + + if (dp->dom_destroy) + (*dp->dom_destroy)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_destroy) + (*pr->pr_destroy)(); + + return 0; +} +#endif + +/* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. * XXX can't fail at this time. 
@@ -210,7 +239,11 @@ "domainfinalize()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); +#ifdef VIMAGE + vnet_mod_register_multi(&vnet_domain_modinfo, dp, dp->dom_name); +#else net_init_domain(dp); +#endif } static void Index: sys/kern/uipc_socket.c =========================================================================== --- sys/kern/uipc_socket.c 2009/02/22 13:41:20 #227 +++ sys/kern/uipc_socket.c 2009/02/22 13:41:20 @@ -130,6 +130,7 @@ #include #include #include +#include #include @@ -263,7 +264,7 @@ * soalloc() returns a socket with a ref count of 0. */ static struct socket * -soalloc(void) +soalloc(struct vnet *vnet) { struct socket *so; @@ -284,6 +285,10 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; +#ifdef VIMAGE + so->so_vnet = vnet; + vnet->sockcnt++; +#endif mtx_unlock(&so_global_mtx); return (so); } @@ -303,6 +308,9 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ +#ifdef VIMAGE + so->so_vnet->sockcnt--; +#endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, @@ -352,7 +360,11 @@ if (prp->pr_type != type) return (EPROTOTYPE); - so = soalloc(); +#ifdef VIMAGE + so = soalloc(TD_TO_VNET(td)); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (ENOBUFS); @@ -378,7 +390,9 @@ * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. 
*/ + CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); + CURVNET_RESTORE(); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", so->so_count)); @@ -420,7 +434,12 @@ if (over) #endif return (NULL); - so = soalloc(); +#ifdef VIMAGE + VNET_ASSERT(head->so_vnet); + so = soalloc(head->so_vnet); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (NULL); if ((head->so_options & SO_ACCEPTFILTER) != 0) @@ -492,8 +511,12 @@ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { + int error; - return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); + CURVNET_SET(so->so_vnet); + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); + CURVNET_RESTORE(); + return error; } /* @@ -641,6 +664,7 @@ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); + CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { @@ -692,6 +716,7 @@ KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); + CURVNET_RESTORE(); return (error); } @@ -767,7 +792,9 @@ * biting us. 
*/ so->so_error = 0; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); + CURVNET_RESTORE(); } return (error); @@ -1283,9 +1310,13 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { + int error; - return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, - control, flags, td)); + CURVNET_SET(so->so_vnet); + error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, + control, flags, td); + CURVNET_RESTORE(); + return (error); } /* @@ -2041,8 +2072,13 @@ } if (how != SHUT_WR) sorflush(so); - if (how != SHUT_RD) - return ((*pr->pr_usrreqs->pru_shutdown)(so)); + if (how != SHUT_RD) { + int error; + CURVNET_SET(so->so_vnet); + error = (*pr->pr_usrreqs->pru_shutdown)(so); + CURVNET_RESTORE(); + return (error); + } return (0); } @@ -2066,6 +2102,7 @@ * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. 
*/ + CURVNET_SET(so->so_vnet); socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); @@ -2089,6 +2126,7 @@ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease_internal(&asb, so); + CURVNET_RESTORE(); } /* Index: sys/kern/uipc_syscalls.c =========================================================================== --- sys/kern/uipc_syscalls.c 2009/02/22 13:41:20 #178 +++ sys/kern/uipc_syscalls.c 2009/02/22 13:41:20 @@ -64,6 +64,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -264,7 +265,9 @@ if (error) goto done; #endif + CURVNET_SET(so->so_vnet); error = solisten(so, uap->backlog, td); + CURVNET_RESTORE(); #ifdef MAC done: #endif @@ -429,7 +432,9 @@ tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; + CURVNET_SET(so->so_vnet); error = soaccept(so, &sa); + CURVNET_RESTORE(); if (error) { /* * return a namelen of zero for older code which might @@ -976,9 +981,11 @@ ktruio = cloneuio(&auio); #endif len = auio.uio_resid; + CURVNET_SET(so->so_vnet); error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? 
&control : (struct mbuf **)0, &mp->msg_flags); + CURVNET_RESTORE(); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) @@ -1322,7 +1329,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sosetopt(so, &sopt); + CURVNET_RESTORE(); fdrop(fp, td); } return(error); @@ -1400,7 +1409,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sogetopt(so, &sopt); + CURVNET_RESTORE(); *valsize = sopt.sopt_valsize; fdrop(fp, td); } @@ -1463,7 +1474,9 @@ return (error); so = fp->f_data; *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -1564,7 +1577,9 @@ goto done; } *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -2176,9 +2191,11 @@ goto done; } SOCKBUF_UNLOCK(&so->so_snd); + CURVNET_SET(so->so_vnet); /* Avoid error aliasing. 
*/ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); + CURVNET_RESTORE(); if (err == 0) { /* * We need two counters to get the Index: sys/kern/uipc_usrreq.c =========================================================================== --- sys/kern/uipc_usrreq.c 2009/02/22 13:41:20 #159 +++ sys/kern/uipc_usrreq.c 2009/02/22 13:41:20 @@ -90,6 +90,7 @@ #include #include #include +#include #ifdef DDB #include @@ -1656,6 +1657,10 @@ unp_init(void) { +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) Index: sys/kern/vfs_export.c =========================================================================== --- sys/kern/vfs_export.c 2009/02/22 13:41:20 #37 +++ sys/kern/vfs_export.c 2009/02/22 13:41:20 @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -141,6 +142,7 @@ } #endif + CURVNET_SET(TD_TO_VNET(curthread)); /* XXX MARKO */ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); @@ -213,8 +215,10 @@ bcopy(argp->ex_secflavors, np->netc_secflavors, sizeof(np->netc_secflavors)); refcount_init(&np->netc_anon.cr_ref, 1); + CURVNET_RESTORE(); return (0); out: + CURVNET_RESTORE(); free(np, M_NETADDR); return (error); } Index: sys/kern/vfs_lookup.c =========================================================================== --- sys/kern/vfs_lookup.c 2009/02/22 13:41:20 #71 +++ sys/kern/vfs_lookup.c 2009/02/22 13:41:20 @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -65,6 +66,15 @@ #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC +#ifdef VIMAGE +#define IMUNES_SYMLINK_HACK +#endif + +#ifdef IMUNES_SYMLINK_HACK +SYSCTL_V_INT(V_PROCG, vprocg, _vfs, OID_AUTO, morphing_symlinks, CTLFLAG_RW, + morphing_symlinks, 0, "Resolve @ to vimage name in symlinks"); +#endif + 
/* * Allocation zone for namei */ @@ -126,6 +136,9 @@ struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; int vfslocked; +#ifdef IMUNES_SYMLINK_HACK + INIT_VPROCG(TD_TO_VPROCG(td)); +#endif KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0, ("NOT MPSAFE and Giant not held")); @@ -313,6 +326,25 @@ error = ENOENT; break; } +#ifdef IMUNES_SYMLINK_HACK + if (V_morphing_symlinks) { + char *sp = strchr(cp, '@'); + int vnamelen = strlen(TD_TO_VIMAGE(td)->vi_name); + + if (sp) { + if (vnamelen >= auio.uio_resid) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + bcopy(sp + 1, sp + vnamelen, + linklen - (sp - cp)); + bcopy(TD_TO_VIMAGE(td)->vi_name, sp, vnamelen); + linklen += (vnamelen - 1); + } + } +#endif if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); Index: sys/modules/Makefile =========================================================================== --- sys/modules/Makefile 2009/02/22 13:41:20 #411 +++ sys/modules/Makefile 2009/02/22 13:41:20 @@ -111,6 +111,7 @@ if_disc \ if_edsc \ if_ef \ + if_epair \ if_faith \ if_gif \ if_gre \ Index: sys/modules/if_epair/Makefile =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/modules/if_epair/Makefile Sun Feb 22 13:41:22 2009 *************** *** 0 **** --- 1,8 ---- + # $FreeBSD$ + + .PATH: ${.CURDIR}/../../net + + KMOD= if_epair + SRCS= if_epair.c + + .include Index: sys/modules/netgraph/Makefile =========================================================================== --- sys/modules/netgraph/Makefile 2009/02/22 13:41:20 #35 +++ sys/modules/netgraph/Makefile 2009/02/22 13:41:20 @@ -35,6 +35,7 @@ netflow \ netgraph \ one2many \ + pipe \ ppp \ pppoe \ pptpgre \ Index: sys/modules/netgraph/pipe/Makefile =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- 
sys/modules/netgraph/pipe/Makefile Sun Feb 22 13:41:23 2009 *************** *** 0 **** --- 1,6 ---- + # $FreeBSD: $ + + KMOD= ng_pipe + SRCS= ng_pipe.c + + .include Index: sys/net/bpf.c =========================================================================== --- sys/net/bpf.c 2009/02/22 13:41:20 #123 +++ sys/net/bpf.c 2009/02/22 13:41:20 @@ -873,11 +873,10 @@ m->m_len -= hlen; m->m_data += hlen; /* XXX */ + CURVNET_SET(ifp->if_vnet); #ifdef MAC BPFD_LOCK(d); - CURVNET_SET(ifp->if_vnet); mac_bpfdesc_create_mbuf(d, m); - CURVNET_RESTORE(); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); BPFD_UNLOCK(d); @@ -893,6 +892,7 @@ else m_freem(mc); } + CURVNET_RESTORE(); return (error); } @@ -1425,9 +1425,33 @@ struct bpf_if *bp; struct ifnet *theywant; +#define IMUNES_BPF_HACK +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + struct vnet *target_vnet = curvnet; + char *c; + + /* Hack to support tapping in foreign vnets */ + c = rindex(ifr->ifr_name, '@'); + if ( c != NULL ) { +printf("bpf_setif: %s\n", c); + struct vimage *target_vimage; + + *c++ = 0; + target_vimage = vimage_by_name(TD_TO_VIMAGE(curthread), c); + if (target_vimage == NULL) + return ENXIO; + target_vnet = target_vimage->v_net; + } + CURVNET_SET_QUIET(target_vnet); +#endif + theywant = ifunit(ifr->ifr_name); - if (theywant == NULL || theywant->if_bpf == NULL) + if (theywant == NULL || theywant->if_bpf == NULL) { +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (ENXIO); + } bp = theywant->if_bpf; @@ -1467,6 +1491,9 @@ BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (0); } Index: sys/net/if.c =========================================================================== --- sys/net/if.c 2009/02/22 13:41:20 #195 +++ sys/net/if.c 2009/02/22 13:41:20 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -124,7 +125,6 @@ static void if_attachdomain1(struct ifnet *); 
static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); -static void if_grow(void); static void if_init(void *); static void if_check(void *); static void if_qflush(struct ifnet *); @@ -150,6 +150,11 @@ extern void nd6_setmtu(struct ifnet *); #endif +static int vnet_net_iattach(const void *); +#ifdef VIMAGE +static int vnet_net_idetach(const void *); +#endif + #ifdef VIMAGE_GLOBALS struct ifnethead ifnet; /* depend on static init XXX */ struct ifgrouphead ifg_head; @@ -181,7 +186,24 @@ }; VNET_MOD_DECLARE(NET, net, vnet_net_iattach, vnet_net_idetach, - NONE, vnet_net_symmap) + NONE, vnet_net_symmap); + +static int foo_handler(module_t mod, int /*modeventtype_t*/ what, void *arg); + +static int +foo_handler(module_t mod, int /*modeventtype_t*/ what, void *arg) +{ + return (0); +} + +static moduledata_t mod_data= { + "vnet_net", + foo_handler, + 0 +}; + +MODULE_VERSION(vnet_net, 1); +DECLARE_MODULE(vnet_net, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); #endif /* @@ -194,7 +216,7 @@ MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); -static struct ifnet * +struct ifnet * ifnet_byindex_locked(u_short idx) { INIT_VNET_NET(curvnet); @@ -215,7 +237,7 @@ return (ifp); } -static void +void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { INIT_VNET_NET(curvnet); @@ -391,27 +413,57 @@ static void if_init(void *dummy __unused) { - INIT_VNET_NET(curvnet); #ifndef VIMAGE_GLOBALS vnet_mod_register(&vnet_net_modinfo); +#else + vnet_net_iattach(NULL); #endif + IFNET_LOCK_INIT(); + ifdev_setbyindex(0, make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, + 0600, "network")); + if_clone_init(); +} + +static int +vnet_net_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + V_if_index = 0; V_ifindex_table = NULL; V_if_indexlim = 8; - IFNET_LOCK_INIT(); TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); - if_grow(); /* create initial 
table */ - ifdev_setbyindex(0, make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, - 0600, "network")); - if_clone_init(); + if_grow(); /* create initial table */ + + return 0; +} + +#ifdef VIMAGE +static int +vnet_net_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + VNET_ASSERT(TAILQ_EMPTY(&V_ifnet)); +#ifdef NOTYET + VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head)); +#endif + VNET_ASSERT(SLIST_EMPTY(&V_ifklist.kl_list)); + + free((caddr_t)V_ifindex_table, M_IFNET); + + return 0; } +#endif -static void +void if_grow(void) { INIT_VNET_NET(curvnet); @@ -581,6 +633,11 @@ panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); +#ifdef VIMAGE + ifp->if_vnet = curvnet; + if (ifp->if_home_vnet == NULL) + ifp->if_home_vnet = curvnet; +#endif TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); @@ -604,11 +661,13 @@ mac_ifnet_create(ifp); #endif - ifdev_setbyindex(ifp->if_index, make_dev(&net_cdevsw, - ifp->if_index, UID_ROOT, GID_WHEEL, 0600, "%s/%s", - net_cdevsw.d_name, ifp->if_xname)); - make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", - net_cdevsw.d_name, ifp->if_index); + if (IS_DEFAULT_VNET(curvnet)) { + ifdev_setbyindex(ifp->if_index, make_dev(&net_cdevsw, + ifp->if_index, UID_ROOT, GID_WHEEL, 0600, "%s/%s", + net_cdevsw.d_name, ifp->if_xname)); + make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", + net_cdevsw.d_name, ifp->if_index); + } ifq_attach(&ifp->if_snd, ifp); @@ -651,13 +710,17 @@ IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); +#ifdef VIMAGE + curvnet->ifccnt++; +#endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); - devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); + if (IS_DEFAULT_VNET(curvnet)) + devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. 
*/ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); @@ -800,6 +863,14 @@ struct ifnet *iter; int found = 0; + /* + * Detach from any vlan, bridge or lagg ifnets linked to us. + * A small though unlikely window for a race from here to ifp + * unlinking from ifnet list is possible, hence we repeat the + * procedure once again further bellow. XXX. + */ + EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); + IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { @@ -807,6 +878,10 @@ found = 1; break; } +#ifdef VIMAGE + if (found) + curvnet->ifccnt--; +#endif IFNET_WUNLOCK(); if (!found) return; @@ -850,7 +925,8 @@ * Clean up all addresses. */ ifp->if_addr = NULL; - destroy_dev(ifdev_byindex(ifp->if_index)); + if (IS_DEFAULT_VNET(curvnet)) + destroy_dev(ifdev_byindex(ifp->if_index)); ifdev_setbyindex(ifp->if_index, NULL); /* We can now free link ifaddr. */ @@ -879,7 +955,8 @@ /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); - devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); + if (IS_DEFAULT_VNET(curvnet)) + devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); IF_AFDATA_LOCK(ifp); for (dp = domains; dp; dp = dp->dom_next) { @@ -897,6 +974,9 @@ knlist_destroy(&ifp->if_klist); ifq_detach(&ifp->if_snd); IF_AFDATA_DESTROY(ifp); +#ifdef VIMAGE + ifp->if_vnet = NULL; +#endif splx(s); } @@ -1540,8 +1620,10 @@ (*lagg_linkstate_p)(ifp, link_state); } - devctl_notify("IFNET", ifp->if_xname, - (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); + if (IS_DEFAULT_VNET(curvnet)) + devctl_notify("IFNET", ifp->if_xname, + (link_state == LINK_STATE_UP) ? 
"LINK_UP" : "LINK_DOWN", + NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) @@ -2008,6 +2090,24 @@ ifr = (struct ifreq *)data; switch (cmd) { +#ifdef VIMAGE + case SIOCSIFVIMAGE: + error = priv_check(td, PRIV_ROOT); + if (error == 0) + error = vi_if_move((struct vi_req *) data, NULL, + TD_TO_VIMAGE(td)); + return (error); + + /* + * XXX Should be implemented as separate system calls. This is + * just a temporary hack! + */ + case SIOCSPVIMAGE: + case SIOCGPVIMAGE: + error = vi_td_ioctl(cmd, (struct vi_req *) data, td); + return (error); +#endif + case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); Index: sys/net/if_clone.c =========================================================================== --- sys/net/if_clone.c 2009/02/22 13:41:20 #12 +++ sys/net/if_clone.c 2009/02/22 13:41:20 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include static void if_clone_free(struct if_clone *ifc); static int if_clone_createif(struct if_clone *ifc, char *name, size_t len, @@ -203,15 +205,14 @@ { int err; - if (ifc->ifc_destroy == NULL) { - err = EOPNOTSUPP; - goto done; - } + if (ifc->ifc_destroy == NULL) + return(EOPNOTSUPP); IF_CLONE_LOCK(ifc); IFC_IFLIST_REMOVE(ifc, ifp); IF_CLONE_UNLOCK(ifc); + CURVNET_SET_QUIET(ifp->if_vnet); if_delgroup(ifp, ifc->ifc_name); err = (*ifc->ifc_destroy)(ifc, ifp); @@ -223,8 +224,7 @@ IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } - -done: + CURVNET_RESTORE(); return (err); } @@ -401,6 +401,24 @@ * Find a free unit if none was given. 
*/ if (wildcard) { +#ifdef VIMAGE + INIT_VNET_NET(curvnet); + char name[IFNAMSIZ]; + struct ifnet *ifp; + int i = 0; + + IFNET_RLOCK(); +again: + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + snprintf(name, sizeof(name), "%s%d", ifc->ifc_name, i); + if (strcmp(name, ifp->if_xname) == 0) { + i++; + goto again; + } + } + IFNET_RUNLOCK(); + *unit = i; +#else while ((bytoff < ifc->ifc_bmlen) && (ifc->ifc_units[bytoff] == 0xff)) bytoff++; @@ -411,6 +429,7 @@ while ((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) bitoff++; *unit = (bytoff << 3) + bitoff; +#endif } if (*unit > ifc->ifc_maxunit) { @@ -418,6 +437,7 @@ goto done; } +#ifndef VIMAGE if (!wildcard) { bytoff = *unit >> 3; bitoff = *unit - (bytoff << 3); @@ -433,6 +453,7 @@ KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) == 0, ("%s: bit is already set", __func__)); ifc->ifc_units[bytoff] |= (1 << bitoff); +#endif IF_CLONE_ADDREF_LOCKED(ifc); done: @@ -443,6 +464,7 @@ void ifc_free_unit(struct if_clone *ifc, int unit) { +#ifndef VIMAGE int bytoff, bitoff; @@ -457,6 +479,7 @@ ("%s: bit is already cleared", __func__)); ifc->ifc_units[bytoff] &= ~(1 << bitoff); IF_CLONE_REMREF_LOCKED(ifc); /* releases lock */ +#endif } void Index: sys/net/if_epair.c =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- sys/net/if_epair.c Sun Feb 22 13:41:23 2009 *************** *** 0 **** --- 1,728 ---- + /*- + * Copyright (c) 2008 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by CK Software GmbH under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + /* + * A pair of virtual ethernet interfaces directly connected with + * a virtual cross-over cable. + * This is mostly intended to be used to provide connectivity between + * different virtual network stack instances. + */ + /* + * Things to re-think once we have more experience: + * - ifp->if_reassign function once we can test with vimage. + * - Real random etheraddrs that are checked to be uniquish; + * in case we bridge we may need this or let the user handle that case? + * - netisr and callback logic. + * - netisr queue lengths. 
+ */ + + #include + __FBSDID("$FreeBSD$"); + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include + #include + + #define EPAIRNAME "epair" + + #ifdef DEBUG_EPAIR + static int epair_debug = 0; + SYSCTL_DECL(_net_link); + SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl"); + SYSCTL_XINT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW, + &epair_debug, 0, "if_epair(4) debugging."); + #define DPRINTF(fmt, arg...) do { if (epair_debug) \ + printf("[%s:%d] " fmt, __func__, __LINE__, ##arg); } while (0) + #else + #define DPRINTF(fmt, arg...) + #endif + + struct epair_softc { + struct ifnet *ifp; + struct ifnet *oifp; + u_int refcount; + void (*if_qflush)(struct ifnet *); + }; + + struct epair_ifp_drain { + STAILQ_ENTRY(epair_ifp_drain) ifp_next; + struct ifnet *ifp; + }; + + static STAILQ_HEAD(, epair_ifp_drain) epair_ifp_drain_list = + STAILQ_HEAD_INITIALIZER(epair_ifp_drain_list); + + #define ADD_IFQ_FOR_DRAINING(ifp) \ + do { \ + struct epair_ifp_drain *elm = NULL; \ + \ + STAILQ_FOREACH(elm, &epair_ifp_drain_list, ifp_next) { \ + if (elm->ifp == (ifp)) \ + break; \ + } \ + if (elm == NULL) { \ + elm = malloc(sizeof(struct epair_ifp_drain), \ + M_EPAIR, M_NOWAIT | M_ZERO); /* mutex held */ \ + if (elm != NULL) { \ + elm->ifp = (ifp); \ + STAILQ_INSERT_TAIL( \ + &epair_ifp_drain_list, \ + elm, ifp_next); \ + } \ + } \ + } while(0) + + /* Our "hw" tx queue. 
*/ + static struct ifqueue epairinq; + static int epair_drv_flags; + + static struct mtx if_epair_mtx; + #define EPAIR_LOCK_INIT() mtx_init(&if_epair_mtx, "if_epair", \ + NULL, MTX_DEF) + #define EPAIR_LOCK_DESTROY() mtx_destroy(&if_epair_mtx) + #define EPAIR_LOCK_ASSERT() mtx_assert(&if_epair_mtx, MA_OWNED) + #define EPAIR_LOCK() mtx_lock(&if_epair_mtx) + #define EPAIR_UNLOCK() mtx_unlock(&if_epair_mtx) + + static MALLOC_DEFINE(M_EPAIR, EPAIRNAME, + "Pair of virtual cross-over connected Ethernet-like interfaces"); + + static int epair_clone_match(struct if_clone *, const char *); + static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); + static int epair_clone_destroy(struct if_clone *, struct ifnet *); + + static void epair_start_locked(struct ifnet *); + + static struct if_clone epair_cloner = IFC_CLONE_INITIALIZER( + EPAIRNAME, NULL, IF_MAXUNIT, + NULL, epair_clone_match, epair_clone_create, epair_clone_destroy); + + + /* + * Netisr handler functions. + */ + static void + epair_sintr(struct mbuf *m) + { + struct ifnet *ifp; + struct epair_softc *sc; + + ifp = m->m_pkthdr.rcvif; + (*ifp->if_input)(ifp, m); + sc = ifp->if_softc; + refcount_release(&sc->refcount); + DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount); + } + + static void + epair_sintr_drained(void) + { + struct epair_ifp_drain *elm, *tvar; + struct ifnet *ifp; + + EPAIR_LOCK(); + /* + * Assume our "hw" queue and possibly ifq will be emptied + * again. In case we will overflow the "hw" queue while + * draining, epair_start_locked will set IFF_DRV_OACTIVE + * again and we will stop and return. 
+ */ + STAILQ_FOREACH_SAFE(elm, &epair_ifp_drain_list, ifp_next, tvar) { + ifp = elm->ifp; + epair_drv_flags &= ~IFF_DRV_OACTIVE; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + epair_start_locked(ifp); + + IFQ_LOCK(&ifp->if_snd); + if (IFQ_IS_EMPTY(&ifp->if_snd)) { + STAILQ_REMOVE(&epair_ifp_drain_list, elm, + epair_ifp_drain, ifp_next); + free(elm, M_EPAIR); + } + IFQ_UNLOCK(&ifp->if_snd); + + if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) { + /* Our "hw"q overflew again. */ + epair_drv_flags |= IFF_DRV_OACTIVE + DPRINTF("hw queue length overflow at %u\n", + epairinq.ifq_maxlen); + #if 0 + /* ``Auto-tuning.'' */ + epairinq.ifq_maxlen += ifqmaxlen; + #endif + break; + } + } + EPAIR_UNLOCK(); + } + + /* + * Network interface (`if') related functions. + */ + static void + epair_start_locked(struct ifnet *ifp) + { + struct mbuf *m; + struct epair_softc *sc; + struct ifnet *oifp; + int error; + + EPAIR_LOCK_ASSERT(); + DPRINTF("ifp=%p\n", ifp); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + if ((ifp->if_flags & IFF_UP) == 0) + return; + + /* + * We get patckets here from ether_output via if_handoff() + * and ned to put them into the input queue of the oifp + * and call oifp->if_input() via netisr/epair_sintr(). + */ + sc = ifp->if_softc; + oifp = sc->oifp; + sc = oifp->if_softc; + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + BPF_MTAP(ifp, m); + + /* + * In case the outgoing interface is not usable, + * drop the packet. + */ + if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + (oifp->if_flags & IFF_UP) ==0) { + ifp->if_oerrors++; + m_freem(m); + continue; + } + DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); + + /* + * Add a reference so the interface cannot go while the + * packet is in transit as we rely on rcvif to stay valid. 
+ */ + refcount_acquire(&sc->refcount); + m->m_pkthdr.rcvif = oifp; + CURVNET_SET_QUIET(oifp->if_vnet); + error = netisr_queue(NETISR_EPAIR, m); + CURVNET_RESTORE(); + if (!error) { + ifp->if_opackets++; + /* Someone else received the packet. */ + oifp->if_ipackets++; + } else { + epair_drv_flags |= IFF_DRV_OACTIVE; + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + ADD_IFQ_FOR_DRAINING(ifp); + refcount_release(&sc->refcount); + } + } + } + + static void + epair_start(struct ifnet *ifp) + { + + EPAIR_LOCK(); + epair_start_locked(ifp); + EPAIR_UNLOCK(); + } + + static int + epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) + { + struct epair_softc *sc; + struct ifnet *oifp; + int error, len; + short mflags; + + EPAIR_LOCK_ASSERT(); + DPRINTF("ifp=%p m=%p\n", ifp, m); + + if (m == NULL) + return (0); + + /* + * We are not going to use the interface en/dequeue mechanism + * on the TX side. We are called from ether_output_frame() + * and will put the packet into the incoming queue of the + * other interface of our pair via the netisr. + */ + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + m_freem(m); + return (ENXIO); + } + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return (ENETDOWN); + } + + BPF_MTAP(ifp, m); + + /* + * In case the outgoing interface is not usable, + * drop the packet. + */ + sc = ifp->if_softc; + oifp = sc->oifp; + if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + (oifp->if_flags & IFF_UP) ==0) { + ifp->if_oerrors++; + m_freem(m); + return (0); + } + len = m->m_pkthdr.len; + mflags = m->m_flags; + DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); + + #ifdef ALTQ + /* Support ALTQ via the clasic if_start() path. 
*/ + IF_LOCK(&ifp->if_snd); + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error); + if (error) + ifp->if_snd.ifq_drops++; + IF_UNLOCK(&ifp->if_snd); + if (!error) { + ifp->if_obytes += len; + if (mflags & (M_BCAST|M_MCAST)) + ifp->if_omcasts++; + + if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) + epair_start_locked(ifp); + else + ADD_IFQ_FOR_DRAINING(ifp); + } + return (error); + } + IF_UNLOCK(&ifp->if_snd); + #endif + + if ((epair_drv_flags & IFF_DRV_OACTIVE) != 0) { + /* + * Our hardware queue is full, try to fall back + * queuing to the ifq but do not call ifp->if_start. + * Either we are lucky or the packet is gone. + */ + IFQ_ENQUEUE(&ifp->if_snd, m, error); + if (!error) + ADD_IFQ_FOR_DRAINING(ifp); + return (error); + } + sc = oifp->if_softc; + /* + * Add a reference so the interface cannot go while the + * packet is in transit as we rely on rcvif to stay valid. + */ + refcount_acquire(&sc->refcount); + m->m_pkthdr.rcvif = oifp; + CURVNET_SET_QUIET(oifp->if_vnet); + error = netisr_queue(NETISR_EPAIR, m); + CURVNET_RESTORE(); + if (!error) { + ifp->if_opackets++; + /* + * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, + * but as we bypass all this we have to duplicate + * the logic another time. + */ + ifp->if_obytes += len; + if (mflags & (M_BCAST|M_MCAST)) + ifp->if_omcasts++; + /* Someone else received the packet. */ + oifp->if_ipackets++; + } else { + /* The packet was freed already. 
*/ + refcount_release(&sc->refcount); + epair_drv_flags |= IFF_DRV_OACTIVE; + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + } + + return (error); + } + + static int + epair_transmit(struct ifnet *ifp, struct mbuf *m) + { + int error; + + EPAIR_LOCK(); + error = epair_transmit_locked(ifp, m); + EPAIR_UNLOCK(); + return (error); + } + + static void + epair_qflush(struct ifnet *ifp) + { + struct epair_softc *sc; + struct ifaltq *ifq; + + EPAIR_LOCK(); + sc = ifp->if_softc; + ifq = &ifp->if_snd; + DPRINTF("ifp=%p sc refcnt=%u ifq_len=%u\n", + ifp, sc->refcount, ifq->ifq_len); + /* + * Instead of calling refcount_release(&sc->refcount); + * n times, just subtract for the cleanup. + */ + sc->refcount -= ifq->ifq_len; + EPAIR_UNLOCK(); + if (sc->if_qflush) + sc->if_qflush(ifp); + } + + static int + epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) + { + struct ifreq *ifr; + int error; + + ifr = (struct ifreq *)data; + switch (cmd) { + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + error = 0; + break; + + default: + /* Let the common ethernet handler process this. */ + error = ether_ioctl(ifp, cmd, data); + break; + } + + return (error); + } + + static void + epair_init(void *dummy __unused) + { + } + + + /* + * Interface cloning functions. + * We use our private ones so that we can create/destroy our secondary + * device along with the primary one. + */ + static int + epair_clone_match(struct if_clone *ifc, const char *name) + { + const char *cp; + + DPRINTF("name='%s'\n", name); + + /* + * Our base name is epair. + * Our interfaces will be named epair[ab]. + * So accept anything of the following list: + * - epair + * - epair + * but not the epair[ab] versions. 
+ */ + if (strncmp(EPAIRNAME, name, sizeof(EPAIRNAME)-1) != 0) + return (0); + + for (cp = name + sizeof(EPAIRNAME) - 1; *cp != '\0'; cp++) { + if (*cp < '0' || *cp > '9') + return (0); + } + + return (1); + } + + static int + epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) + { + struct epair_softc *sca, *scb; + struct ifnet *ifp; + char *dp; + int error, unit, wildcard; + uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + /* + * We are abusing params to create our second interface. + * Actually we already created it and called if_clone_createif() + * for it to do the official insertion procedure the moment we knew + * it cannot fail anymore. So just do attach it here. + */ + if (params) { + scb = (struct epair_softc *)params; + ifp = scb->ifp; + /* Assign a hopefully unique, locally administered etheraddr. */ + eaddr[0] = 0x02; + eaddr[3] = (ifp->if_index >> 8) & 0xff; + eaddr[4] = ifp->if_index & 0xff; + eaddr[5] = 0x0b; + ether_ifattach(ifp, eaddr); + /* Correctly set the name for the cloner list. */ + strlcpy(name, scb->ifp->if_xname, len); + return (0); + } + + /* Try to see if a special unit was requested. */ + error = ifc_name2unit(name, &unit); + if (error != 0) + return (error); + wildcard = (unit < 0); + + error = ifc_alloc_unit(ifc, &unit); + if (error != 0) + return (error); + + /* + * If no unit had been given, we need to adjust the ifName. + * Also make sure there is space for our extra [ab] suffix. + */ + for (dp = name; *dp != '\0'; dp++); + if (wildcard) { + error = snprintf(dp, len - (dp - name), "%d", unit); + if (error > len - (dp - name) - 1) { + /* ifName too long. */ + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + dp += error; + } + if (len - (dp - name) - 1 < 1) { + /* No space left for our [ab] suffix. */ + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + *dp = 'a'; + /* Must not change dp so we can replace 'a' by 'b' later. 
*/ + *(dp+1) = '\0'; + + /* Allocate memory for both [ab] interfaces */ + sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); + refcount_init(&sca->refcount, 1); + sca->ifp = if_alloc(IFT_ETHER); + if (sca->ifp == NULL) { + free(sca, M_EPAIR); + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + + scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); + refcount_init(&scb->refcount, 1); + scb->ifp = if_alloc(IFT_ETHER); + if (scb->ifp == NULL) { + free(scb, M_EPAIR); + if_free(sca->ifp); + free(sca, M_EPAIR); + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + + /* + * Cross-reference the interfaces so we will be able to free both. + */ + sca->oifp = scb->ifp; + scb->oifp = sca->ifp; + + /* Finish initialization of interface a. */ + ifp = sca->ifp; + ifp->if_softc = sca; + strlcpy(ifp->if_xname, name, IFNAMSIZ); + ifp->if_dname = ifc->ifc_name; + ifp->if_dunit = unit; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_start = epair_start; + ifp->if_ioctl = epair_ioctl; + ifp->if_init = epair_init; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + /* Assign a hopefully unique, locally administered etheraddr. */ + eaddr[0] = 0x02; + eaddr[3] = (ifp->if_index >> 8) & 0xff; + eaddr[4] = ifp->if_index & 0xff; + eaddr[5] = 0x0a; + ether_ifattach(ifp, eaddr); + sca->if_qflush = ifp->if_qflush; + ifp->if_qflush = epair_qflush; + ifp->if_transmit = epair_transmit; + ifp->if_baudrate = IF_Gbps(10UL); /* arbitrary maximum */ + + /* Swap the name and finish initialization of interface b. */ + *dp = 'b'; + + ifp = scb->ifp; + ifp->if_softc = scb; + strlcpy(ifp->if_xname, name, IFNAMSIZ); + ifp->if_dname = ifc->ifc_name; + ifp->if_dunit = unit; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_start = epair_start; + ifp->if_ioctl = epair_ioctl; + ifp->if_init = epair_init; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + /* We need to play some tricks here for the second interface. 
*/ + strlcpy(name, EPAIRNAME, len); + error = if_clone_create(name, len, (caddr_t)scb); + if (error) + panic("%s: if_clone_createif() for our 2nd iface failed: %d", + __func__, error); + scb->if_qflush = ifp->if_qflush; + ifp->if_qflush = epair_qflush; + ifp->if_transmit = epair_transmit; + ifp->if_baudrate = IF_Gbps(10UL); /* arbitrary maximum */ + + /* + * Restore name to a as the ifp for this will go into the + * cloner list for the initial call. + */ + strlcpy(name, sca->ifp->if_xname, len); + DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); + + /* Tell the world, that we are ready to rock. */ + sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; + scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; + + return (0); + } + + static int + epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) + { + struct ifnet *oifp; + struct epair_softc *sca, *scb; + int unit, error; + + DPRINTF("ifp=%p\n", ifp); + + /* + * In case we called into if_clone_destroyif() ourselves + * again to remove the second interface, the softc will be + * NULL. In that case so not do anything but return success. + */ + if (ifp->if_softc == NULL) + return (0); + + unit = ifp->if_dunit; + sca = ifp->if_softc; + oifp = sca->oifp; + scb = oifp->if_softc; + + DPRINTF("ifp=%p oifp=%p\n", ifp, oifp); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + oifp->if_drv_flags &= ~IFF_DRV_RUNNING; + ether_ifdetach(oifp); + ether_ifdetach(ifp); + /* + * Wait for all packets to be dispatched to if_input. + * The numbers can only go down as the interfaces are + * detached so there is no need to use atomics. + */ + DPRINTF("sca refcnt=%u scb refcnt=%u\n", sca->refcount, scb->refcount); + KASSERT(sca->refcount == 1 && scb->refcount == 1, + ("%s: sca->refcount!=1: %d || scb->refcount!=1: %d", + __func__, sca->refcount, scb->refcount)); + + /* + * Get rid of our second half. 
+ */ + oifp->if_softc = NULL; + error = if_clone_destroyif(ifc, oifp); + if (error) + panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", + __func__, error); + + /* Finish cleaning up. Free them and release the unit. */ + if_free_type(oifp, IFT_ETHER); + if_free_type(ifp, IFT_ETHER); + free(scb, M_EPAIR); + free(sca, M_EPAIR); + ifc_free_unit(ifc, unit); + + return (0); + } + + static int + epair_modevent(module_t mod, int type, void *data) + { + int tmp; + + switch (type) { + case MOD_LOAD: + /* For now limit us to one global mutex and one inq. */ + EPAIR_LOCK_INIT(); + epair_drv_flags = 0; + epairinq.ifq_maxlen = 16 * ifqmaxlen; /* What is a good 16? */ + if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &tmp)) + epairinq.ifq_maxlen = tmp; + mtx_init(&epairinq.ifq_mtx, "epair_inq", NULL, MTX_DEF); + netisr_register2(NETISR_EPAIR, (netisr_t *)epair_sintr, + epair_sintr_drained, &epairinq, 0); + if_clone_attach(&epair_cloner); + if (bootverbose) + printf("%s initialized.\n", EPAIRNAME); + break; + case MOD_UNLOAD: + if_clone_detach(&epair_cloner); + netisr_unregister(NETISR_EPAIR); + mtx_destroy(&epairinq.ifq_mtx); + EPAIR_LOCK_DESTROY(); + if (bootverbose) + printf("%s unloaded.\n", EPAIRNAME); + break; + default: + return (EOPNOTSUPP); + } + return (0); + } + + static moduledata_t epair_mod = { + "if_epair", + epair_modevent, + 0 + }; + + DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); + MODULE_VERSION(if_epair, 1); Index: sys/net/if_ethersubr.c =========================================================================== --- sys/net/if_ethersubr.c 2009/02/22 13:41:20 #163 +++ sys/net/if_ethersubr.c 2009/02/22 13:41:20 @@ -590,6 +590,8 @@ } #endif + CURVNET_SET_QUIET(ifp->if_vnet); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; @@ -626,6 +628,7 @@ /* Allow monitor mode to claim this frame, after stats are updated. 
*/ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); + CURVNET_RESTORE(); return; } @@ -674,8 +677,10 @@ ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } /* @@ -686,8 +691,10 @@ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } #ifdef DEV_CARP @@ -723,6 +730,7 @@ random_harvest(m, 16, 3, 0, RANDOM_NET); ether_demux(ifp, m); + CURVNET_RESTORE(); } /* @@ -904,6 +912,25 @@ return (etherbuf); } +#ifdef VIMAGE +static void +ether_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + u_char eaddr[6]; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "eth"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + CURVNET_RESTORE(); +} +#endif + /* * Perform common duties while attaching to interface list */ @@ -913,6 +940,9 @@ int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; +#ifdef VIMAGE + struct vnet *home_vnet_0 = ifp->if_home_vnet; +#endif ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; @@ -921,6 +951,9 @@ ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; +#ifdef VIMAGE + ifp->if_reassign = ether_reassign; +#endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; @@ -940,7 +973,11 @@ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; +#ifdef VIMAGE + if (i != ifp->if_addrlen && home_vnet_0 != ifp->if_home_vnet) +#else if (i != ifp->if_addrlen) +#endif if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); } Index: sys/net/if_gif.c =========================================================================== --- sys/net/if_gif.c 2009/02/22 13:41:20 #63 +++ 
sys/net/if_gif.c 2009/02/22 13:41:20 @@ -121,6 +121,7 @@ static void gif_start(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); +static int vnet_gif_iattach(const void *); IFC_SIMPLE_DECLARE(gif, 0); @@ -157,6 +158,7 @@ SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, parallel_tunnels, 0, "Allow parallel tunnels?"); +VNET_MOD_DECLARE(GIF, gif, vnet_gif_iattach, NULL, NET, NULL) /* copy from src/sys/net/if_ethersubr.c */ static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; @@ -251,6 +253,27 @@ } static int +vnet_gif_iattach(unused) + const void *unused; +{ + INIT_VNET_GIF(curvnet); + + LIST_INIT(&V_gif_softc_list); + V_max_gif_nesting = MAX_GIF_NEST; +#ifdef XBONEHACK + V_parallel_tunnels = 1; +#else + V_parallel_tunnels = 0; +#endif + V_ip_gif_ttl = GIF_TTL; +#ifdef INET6 + V_ip6_gif_hlim = GIF_HLIM; +#endif + + return 0; +} + +static int gifmodevent(mod, type, data) module_t mod; int type; @@ -261,28 +284,20 @@ case MOD_LOAD: mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); - LIST_INIT(&V_gif_softc_list); - V_max_gif_nesting = MAX_GIF_NEST; -#ifdef XBONEHACK - V_parallel_tunnels = 1; +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_gif_modinfo); #else - V_parallel_tunnels = 0; + vnet_gif_iattach(NULL); #endif -#ifdef INET - V_ip_gif_ttl = GIF_TTL; -#endif -#ifdef INET6 - V_ip6_gif_hlim = GIF_HLIM; -#endif if_clone_attach(&gif_cloner); break; case MOD_UNLOAD: if_clone_detach(&gif_cloner); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_gif_modinfo); +#endif mtx_destroy(&gif_mtx); -#ifdef INET6 - V_ip6_gif_hlim = 0; -#endif break; default: return EOPNOTSUPP; Index: sys/net/if_loop.c =========================================================================== --- sys/net/if_loop.c 2009/02/22 13:41:20 #64 +++ sys/net/if_loop.c 2009/02/22 13:41:20 @@ -93,32 +93,55 @@ #define LOMTU 16384 #endif +#define LONAME "lo" + 
+struct lo_softc { + struct ifnet *sc_ifp; + LIST_ENTRY(lo_softc) sc_next; +}; + int loioctl(struct ifnet *, u_long, caddr_t); static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); +static int vnet_loif_iattach(const void *); +#ifdef VIMAGE +static int vnet_loif_idetach(const void *); +#endif #ifdef VIMAGE_GLOBALS struct ifnet *loif; /* Used externally */ +static LIST_HEAD(lo_list, lo_softc) lo_list; #endif +static MALLOC_DEFINE(M_LO, LONAME, "Loopback Interface"); + +static struct mtx lo_mtx; + IFC_SIMPLE_DECLARE(lo, 1); static void lo_clone_destroy(struct ifnet *ifp) { + struct lo_softc *sc; #ifdef INVARIANTS INIT_VNET_NET(ifp->if_vnet); #endif + + sc = ifp->if_softc; /* XXX: destroying lo0 will lead to panics. */ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); + mtx_lock(&lo_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&lo_mtx); bpfdetach(ifp); if_detach(ifp); if_free(ifp); + free(sc, M_LO); } static int @@ -126,10 +149,16 @@ { INIT_VNET_NET(curvnet); struct ifnet *ifp; + struct lo_softc *sc; - ifp = if_alloc(IFT_LOOP); - if (ifp == NULL) + MALLOC(sc, struct lo_softc *, sizeof(*sc), M_LO, M_WAITOK | M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_LOOP); + if (ifp == NULL) { + free(sc, M_LO); return (ENOSPC); + } + if (V_loif == NULL) + V_loif = ifp; if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; @@ -137,23 +166,72 @@ ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - if (V_loif == NULL) - V_loif = ifp; + mtx_lock(&lo_mtx); + LIST_INSERT_HEAD(&V_lo_list, sc, sc_next); + mtx_unlock(&lo_mtx); return (0); } +VNET_MOD_DECLARE_STATELESS(LOIF, loif, vnet_loif_iattach, vnet_loif_idetach, + NET) + +static int 
vnet_loif_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + LIST_INIT(&V_lo_list); + if (IS_DEFAULT_VNET(curvnet)) + if_clone_attach(&lo_cloner); + else + lo_cloner.ifc_attach(&lo_cloner); + return 0; +} + +#ifdef VIMAGE +static int vnet_loif_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + struct lo_softc *sc, *nsc; + + LIST_FOREACH_SAFE(sc, &V_lo_list, sc_next, nsc) { + struct ifnet *ifp = sc->sc_ifp; + + if (ifp == V_loif) { + /* + * A hack to allow lo0 to be detached: + * bump if_unit number from 0 to 1. By + * setting V_loif to NULL we prevent queuing + * of routing messages that would have + * m_pkthdr.rcvif pointing to a nonexisting + * ifnet, i.e. the lo0 we just destroyed. + */ + ifp->if_dunit = 1; + V_loif = NULL; + } + if_clone_destroy(ifp->if_xname); + } + return 0; +} +#endif + static int loop_modevent(module_t mod, int type, void *data) { - INIT_VNET_NET(curvnet); switch (type) { case MOD_LOAD: - V_loif = NULL; - if_clone_attach(&lo_cloner); + mtx_init(&lo_mtx, "lo_mtx", NULL, MTX_DEF); +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_loif_modinfo); +#else + vnet_loif_iattach(NULL); +#endif break; case MOD_UNLOAD: Index: sys/net/if_ppp.c =========================================================================== --- sys/net/if_ppp.c 2009/02/22 13:41:20 #55 +++ sys/net/if_ppp.c 2009/02/22 13:41:20 @@ -1397,6 +1397,7 @@ struct mbuf *mp, *dmp = NULL; u_char *iphdr; u_int hlen; + CURVNET_SET(ifp->if_vnet); sc->sc_stats.ppp_ipackets++; @@ -1431,7 +1432,7 @@ m_freem(m); if (dmp == NULL) { /* no error, but no decompressed packet produced */ - return; + goto done; } m = dmp; cp = mtod(m, u_char *); @@ -1588,7 +1589,7 @@ ilen, 0) == 0) { /* drop this packet */ m_freem(m); - return; + goto done; } if (sc->sc_active_filt.bf_insns == 0 || bpf_filter(sc->sc_active_filt.bf_insns, (u_char *) m, ilen, 0)) @@ -1617,13 +1618,13 @@ || sc->sc_npmode[NP_IP] != NPMODE_PASS) { /* interface is down - drop the packet. 
*/ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; m->m_len -= PPP_HDRLEN; if ((m = ip_fastforward(m)) == NULL) - return; + goto done; isr = NETISR_IP; break; #endif @@ -1636,7 +1637,7 @@ || sc->sc_npmode[NP_IPV6] != NPMODE_PASS) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1653,7 +1654,7 @@ /* XXX: || sc->sc_npmode[NP_IPX] != NPMODE_PASS*/) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1688,6 +1689,8 @@ if (isr == -1) (*sc->sc_ctlp)(sc); + done: + CURVNET_RESTORE(); return; bad: @@ -1695,6 +1698,7 @@ m_freem(m); PPP2IFP(sc)->if_ierrors++; sc->sc_stats.ppp_ierrors++; + CURVNET_RESTORE(); } #define MAX_DUMP_BYTES 128 Index: sys/net/if_var.h =========================================================================== --- sys/net/if_var.h 2009/02/22 13:41:20 #96 +++ sys/net/if_var.h 2009/02/22 13:41:20 @@ -70,6 +70,7 @@ struct ether_header; struct carp_if; struct ifvlantrunk; +struct vnet; #endif #include /* get TAILQ macros */ @@ -162,6 +163,10 @@ (void *); int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); + void (*if_reassign) /* reassign to vnet routine */ + (struct ifnet *, struct vnet *, char *); + struct vnet *if_vnet; /* network stack instance */ + struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_llsoftc; /* link layer softc */ int if_drv_flags; /* driver-managed status flags */ @@ -697,7 +702,9 @@ struct cdev *ife_dev; }; +void ifnet_setbyindex(u_short idx, struct ifnet *ifp); struct ifnet *ifnet_byindex(u_short idx); +struct ifnet *ifnet_byindex_locked(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only @@ -720,6 +727,7 @@ int if_allmulti(struct ifnet *, 
int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); +void if_grow(void); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); Index: sys/net/if_vlan.c =========================================================================== --- sys/net/if_vlan.c 2009/02/22 13:41:20 #106 +++ sys/net/if_vlan.c 2009/02/22 13:41:20 @@ -1362,6 +1362,12 @@ error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); if (error) break; +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; @@ -1389,6 +1395,12 @@ case SIOCGETVLAN: bzero(&vlr, sizeof(vlr)); +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif VLAN_LOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, Index: sys/net/netisr.c =========================================================================== --- sys/net/netisr.c 2009/02/22 13:41:20 #22 +++ sys/net/netisr.c 2009/02/22 13:41:20 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ netisr_t *ni_handler; struct ifqueue *ni_queue; int ni_flags; + void (*ni_handler_drained)(void); } netisrs[32]; static void *net_ih; @@ -72,7 +74,8 @@ } void -netisr_register(int num, netisr_t *handler, struct ifqueue *inq, int flags) +netisr_register2(int num, netisr_t *handler, void (*handler_drained)(void), + struct ifqueue *inq, int flags) { KASSERT(!(num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs))), @@ -82,9 +85,16 @@ netisrs[num].ni_handler = handler; netisrs[num].ni_queue = inq; netisrs[num].ni_flags = flags; + netisrs[num].ni_handler_drained = handler_drained; } void +netisr_register(int num, netisr_t *handler, struct ifqueue *inq, int flags) +{ + netisr_register2(num, handler, NULL, inq, flags); +} + +void netisr_unregister(int num) { struct netisr *ni; @@ -142,8 +152,13 @@ 
IF_DEQUEUE(ni->ni_queue, m); if (m == NULL) break; + VNET_ASSERT(m->m_pkthdr.rcvif != NULL); + CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); ni->ni_handler(m); + CURVNET_RESTORE(); } + if (ni->ni_handler_drained) + ni->ni_handler_drained(); } /* @@ -163,6 +178,7 @@ m_freem(m); return; } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) /* * Unless NETISR_FORCEQUEUE is set on the netisr (generally @@ -172,11 +188,31 @@ * by virtue of callers consistently calling one of queued or direct * dispatch, and the forcequeue flag being immutable after * registration. + * + * If the kernel was compiled with options VIMAGE, also defer + * dispatch of netisr handlers for mbufs that have crossed a + * boundary between two vnets. Direct dispatching in such + * cases could lead to various LORs, or in most extreme + * circumstances cause the kernel stack to overflow. */ +#ifndef VIMAGE if (netisr_direct && !(ni->ni_flags & NETISR_FORCEQUEUE)) { +#else + if (netisr_direct && !(ni->ni_flags & NETISR_FORCEQUEUE) && + !(m->m_flags & M_REMOTE_VNET)) { +#endif isrstat.isrs_directed++; ni->ni_handler(m); } else { +#ifdef VIMAGE + /* + * Once direct netisr dispatching is avoided using the + * M_REMOTE_VNET flag, it should not be observed any + * more, so clear it here in order to avoid further + * deferring of direct netisr dispatching. + */ + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_deferred++; if (IF_HANDOFF(ni->ni_queue, m, NULL)) schednetisr(num); @@ -203,6 +239,10 @@ m_freem(m); return (ENXIO); } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) +#ifdef VIMAGE + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_queued++; if (!IF_HANDOFF(ni->ni_queue, m, NULL)) return (ENOBUFS); /* IF_HANDOFF has free'd the mbuf */ Index: sys/net/netisr.h =========================================================================== --- sys/net/netisr.h 2009/02/22 13:41:20 #12 +++ sys/net/netisr.h 2009/02/22 13:41:20 @@ -45,6 +45,7 @@ * now implemented via a software ithread (SWI).
*/ #define NETISR_POLL 0 /* polling callback, must be first */ +#define NETISR_EPAIR 1 /* if_epair(4) soft interrupt */ #define NETISR_IP 2 /* same as AF_INET */ #define NETISR_ROUTE 14 /* routing socket */ #define NETISR_AARP 15 /* Appletalk ARP */ @@ -84,6 +85,7 @@ void netisr_dispatch(int, struct mbuf *); int netisr_queue(int, struct mbuf *); #define NETISR_FORCEQUEUE 0x0002 /* Force queued dispatch. */ +void netisr_register2(int, netisr_t *, void (*)(void), struct ifqueue *, int); void netisr_register(int, netisr_t *, struct ifqueue *, int); void netisr_unregister(int); Index: sys/net/route.c =========================================================================== --- sys/net/route.c 2009/02/22 13:41:20 #88 +++ sys/net/route.c 2009/02/22 13:41:20 @@ -147,29 +147,20 @@ SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); -static void -route_init(void) +static int +rtable_init(const void *unused) { - INIT_VNET_INET(curvnet); + INIT_VNET_NET(curvnet); int table; struct domain *dom; int fam; - /* whack the tunable ints into line. 
*/ - if (rt_numfibs > RT_MAXFIBS) - rt_numfibs = RT_MAXFIBS; - if (rt_numfibs == 0) - rt_numfibs = 1; - rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); - rn_init(); /* initialize all zeroes, all ones, mask table */ - for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach) { for (table = 0; table < rt_numfibs; table++) { if ( (fam = dom->dom_family) == AF_INET || table == 0) { - /* for now only AF_INET has > 1 table */ + /* for now only AF_INET has > 1 table */ /* XXX MRT * rtattach will be also called * from vfs_export.c but the @@ -186,6 +177,57 @@ } } } + return (0); +} + +#ifdef VIMAGE +static int +rtable_idetach(const void *unused) +{ + int table; + int fam; + struct domain *dom; + INIT_VNET_NET(curvnet); + + for (dom = domains; dom; dom = dom->dom_next) { + if (dom->dom_rtdetach) { + for (table = 0; table < rt_numfibs; table++) { + if ( (fam = dom->dom_family) == AF_INET || + table == 0) { + /* for now only AF_INET has > 1 table */ + dom->dom_rtdetach( + (void **)&V_rt_tables[table][fam], + dom->dom_rtoffset); + } else { + break; + } + } + } + } + return (0); +} +#endif + +VNET_MOD_DECLARE_STATELESS(RTABLE, rtable, rtable_init, rtable_idetach, NET); + +static void +route_init(void) +{ + + /* whack the tunable ints into line. 
*/ + if (rt_numfibs > RT_MAXFIBS) + rt_numfibs = RT_MAXFIBS; + if (rt_numfibs == 0) + rt_numfibs = 1; + rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + rn_init(); /* initialize all zeroes, all ones, mask table */ + +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_rtable_modinfo); +#else + rtable_init(NULL); +#endif } #ifndef _SYS_SYSPROTO_H_ Index: sys/net/rtsock.c =========================================================================== --- sys/net/rtsock.c 2009/02/22 13:41:20 #112 +++ sys/net/rtsock.c 2009/02/22 13:41:20 @@ -1221,6 +1221,14 @@ *(unsigned short *)(tag + 1) = sa->sa_family; m_tag_prepend(m, tag); } +#ifdef VIMAGE + if (V_loif) + m->m_pkthdr.rcvif = V_loif; + else { + m_freem(m); + return; + } +#endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } Index: sys/net80211/ieee80211.c =========================================================================== --- sys/net80211/ieee80211.c 2009/02/22 13:41:20 #65 +++ sys/net80211/ieee80211.c 2009/02/22 13:41:20 @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -243,6 +244,9 @@ struct ifaddr *ifa; KASSERT(ifp->if_type == IFT_IEEE80211, ("if_type %d", ifp->if_type)); +#ifdef VIMAGE + ifp->if_reassign = NULL; /* Override ether_reassign() */ +#endif IEEE80211_LOCK_INIT(ic, ifp->if_xname); TAILQ_INIT(&ic->ic_vaps); @@ -697,6 +701,30 @@ IEEE80211_UNLOCK(ic); } +#ifdef VIMAGE +void +ieee80211_reassign( struct ieee80211vap *vap, struct vnet *vnet, char *dname) +{ + struct ifnet *ifp = vap->iv_ifp; + u_char eaddr[6]; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + bpfdetach(ifp); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + vap->iv_rawbpf = NULL; + if_reassign_common(ifp, vnet, ifp->if_dname); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + bpfattach2(ifp, DLT_IEEE802_11, + sizeof(struct ieee80211_frame_addr4), &vap->iv_rawbpf); + CURVNET_RESTORE(); +} 
+#endif + static __inline int mapgsm(u_int freq, u_int flags) { Index: sys/net80211/ieee80211_ddb.c =========================================================================== --- sys/net80211/ieee80211_ddb.c 2009/02/22 13:41:20 #22 +++ sys/net80211/ieee80211_ddb.c 2009/02/22 13:41:20 @@ -153,6 +153,7 @@ break; } + /* XXX to lock or not to lock the vnet list - we are in DDB here? */ VNET_FOREACH(vnet_iter) { INIT_VNET_NET(vnet_iter); TAILQ_FOREACH(ifp, &V_ifnet, if_list) Index: sys/net80211/ieee80211_freebsd.c =========================================================================== --- sys/net80211/ieee80211_freebsd.c 2009/02/22 13:41:20 #25 +++ sys/net80211/ieee80211_freebsd.c 2009/02/22 13:41:20 @@ -41,6 +41,7 @@ #include #include +#include #include #include @@ -488,9 +489,11 @@ { struct ieee80211_join_event iev; + CURVNET_SET(ifp->if_vnet); memset(&iev, 0, sizeof(iev)); IEEE80211_ADDR_COPY(iev.iev_addr, mac); rt_ieee80211msg(ifp, op, &iev, sizeof(iev)); + CURVNET_RESTORE(); } void @@ -499,6 +502,7 @@ struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; + CURVNET_SET_QUIET(ifp->if_vnet); IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode join", (ni == vap->iv_bss) ? "bss " : ""); @@ -510,6 +514,7 @@ notify_macaddr(ifp, newassoc ? RTM_IEEE80211_JOIN : RTM_IEEE80211_REJOIN, ni->ni_macaddr); } + CURVNET_RESTORE(); } void @@ -518,6 +523,7 @@ struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; + CURVNET_SET_QUIET(ifp->if_vnet); IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode leave", (ni == vap->iv_bss) ? 
"bss " : ""); @@ -528,6 +534,7 @@ /* fire off wireless event station leaving */ notify_macaddr(ifp, RTM_IEEE80211_LEAVE, ni->ni_macaddr); } + CURVNET_RESTORE(); } void @@ -538,7 +545,9 @@ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n", "notify scan done"); /* dispatch wireless event indicating scan completed */ + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0); + CURVNET_RESTORE(); } void @@ -566,7 +575,9 @@ iev.iev_keyix = k->wk_keyix; iev.iev_keyrsc = k->wk_keyrsc[0]; /* XXX need tid */ iev.iev_rsc = rsc; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } @@ -587,7 +598,9 @@ IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2); iev.iev_cipher = IEEE80211_CIPHER_TKIP; iev.iev_keyix = keyix; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } Index: sys/net80211/ieee80211_var.h =========================================================================== --- sys/net80211/ieee80211_var.h 2009/02/22 13:41:20 #72 +++ sys/net80211/ieee80211_var.h 2009/02/22 13:41:20 @@ -584,6 +584,8 @@ int ieee80211_vap_attach(struct ieee80211vap *, ifm_change_cb_t, ifm_stat_cb_t); void ieee80211_vap_detach(struct ieee80211vap *); +void ieee80211_reassign(struct ieee80211vap *, struct vnet *, char *); + const struct ieee80211_rateset *ieee80211_get_suprates(struct ieee80211com *ic, const struct ieee80211_channel *); void ieee80211_announce(struct ieee80211com *); Index: sys/netgraph/netgraph.h =========================================================================== --- sys/netgraph/netgraph.h 2009/02/22 13:41:20 #49 +++ sys/netgraph/netgraph.h 2009/02/22 13:41:20 @@ -352,6 +352,7 @@ LIST_ENTRY(ng_node) nd_idnodes; /* ID hash collision list */ struct ng_queue nd_input_queue; /* input queue for locking */ int nd_refs; /* # of references to this node */ + struct vnet *nd_vnet; /* network stack instance */ #ifdef NETGRAPH_DEBUG 
/*----------------------------------------------*/ #define ND_MAGIC 0x59264837 int nd_magic; @@ -1123,6 +1124,7 @@ struct ng_type *ng_findtype(const char *type); int ng_make_node_common(struct ng_type *typep, node_p *nodep); int ng_name_node(node_p node, const char *name); +node_p ng_name2noderef(node_p node, const char *name); int ng_newtype(struct ng_type *tp); ng_ID_t ng_node2ID(node_p node); item_p ng_package_data(struct mbuf *m, int flags); Index: sys/netgraph/ng_base.c =========================================================================== --- sys/netgraph/ng_base.c 2009/02/22 13:41:20 #117 +++ sys/netgraph/ng_base.c 2009/02/22 13:41:20 @@ -137,10 +137,10 @@ }, 1, /* refs */ #ifdef NETGRAPH_DEBUG - ND_MAGIC, - __FILE__, - __LINE__, - {NULL} + .nd_magic = ND_MAGIC, + .lastfile = __FILE__, + .lastline = __LINE__, + .nd_all = {NULL} #endif /* NETGRAPH_DEBUG */ }; @@ -227,7 +227,6 @@ /* Imported, these used to be externally visible, some may go back. */ void ng_destroy_hook(hook_p hook); -node_p ng_name2noderef(node_p node, const char *name); int ng_path2noderef(node_p here, const char *path, node_p *dest, hook_p *lasthook); int ng_make_node(const char *type, node_p *nodepp); @@ -267,6 +266,14 @@ #define NG_WORKLIST_WAKEUP() \ wakeup_one(&ng_worklist) +static vnet_attach_fn vnet_netgraph_iattach; +#ifdef VIMAGE +static vnet_detach_fn vnet_netgraph_idetach; +#endif /* VIMAGE */ + +VNET_MOD_DECLARE(NETGRAPH, netgraph, vnet_netgraph_iattach, + vnet_netgraph_idetach, LOIF, NULL) + #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ /* * In debug mode: @@ -645,6 +652,9 @@ return (ENOMEM); } node->nd_type = type; +#ifdef VIMAGE + node->nd_vnet = curvnet; +#endif NG_NODE_REF(node); /* note reference */ type->refs++; @@ -3082,7 +3092,6 @@ switch (event) { case MOD_LOAD: /* Initialize everything. 
*/ - V_nextID = 1; NG_WORKLIST_LOCK_INIT(); mtx_init(&ng_typelist_mtx, "netgraph types mutex", NULL, MTX_DEF); @@ -3116,6 +3125,11 @@ break; } } +#ifdef VIMAGE + vnet_mod_register(&vnet_netgraph_modinfo); +#else + vnet_netgraph_iattach(NULL); +#endif /* !VIMAGE */ break; case MOD_UNLOAD: /* You can't unload it because an interface may be using it. */ @@ -3128,6 +3142,44 @@ return (error); } +static int vnet_netgraph_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + +#ifdef VIMAGE + LIST_INIT(&V_ng_nodelist); /* XXX should go away */ +#endif + V_nextID = 1; + + return 0; +} + +#ifdef VIMAGE +static int vnet_netgraph_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + node_p node, last_killed = NULL; + + while ((node = LIST_FIRST(&V_ng_nodelist)) != NULL) { + if (node == last_killed) { + /* This should never happen */ + node->nd_flags |= NGF_REALLY_DIE; + printf("netgraph node %s needs NGF_REALLY_DIE\n", + node->nd_name); + ng_rmnode(node, NULL, NULL, 0); + /* This must never happen */ + if (node == LIST_FIRST(&V_ng_nodelist)) + panic("netgraph node %s won't die", + node->nd_name); + } + ng_rmnode(node, NULL, NULL, 0); + last_killed = node; + } + + return 0; +} +#endif /* VIMAGE */ + static moduledata_t netgraph_mod = { "netgraph", ngb_mod_event, @@ -3286,6 +3338,7 @@ NG_WORKLIST_SLEEP(); STAILQ_REMOVE_HEAD(&ng_worklist, nd_input_queue.q_work); NG_WORKLIST_UNLOCK(); + CURVNET_SET(node->nd_vnet); CTR3(KTR_NET, "%20s: node [%x] (%p) taken off worklist", __func__, node->nd_ID, node); /* @@ -3315,6 +3368,7 @@ } } NG_NODE_UNREF(node); + CURVNET_RESTORE(); } } @@ -3648,7 +3702,9 @@ { item_p item = arg; + CURVNET_SET(NGI_NODE(item)->nd_vnet); ng_snd_item(item, 0); + CURVNET_RESTORE(); } Index: sys/netgraph/ng_bridge.c =========================================================================== --- sys/netgraph/ng_bridge.c 2009/02/22 13:41:20 #25 +++ sys/netgraph/ng_bridge.c 2009/02/22 13:41:20 @@ -98,13 +98,14 @@ /* Per-node private data */ struct 
ng_bridge_private { struct ng_bridge_bucket *tab; /* hash table bucket array */ - struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS]; + struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS + 1]; struct ng_bridge_config conf; /* node configuration */ node_p node; /* netgraph node */ u_int numHosts; /* num entries in table */ u_int numBuckets; /* num buckets in table */ u_int hashMask; /* numBuckets - 1 */ int numLinks; /* num connected links */ + int persistent; /* can exist w/o any hooks */ struct callout timer; /* one second periodic timer */ }; typedef struct ng_bridge_private *priv_p; @@ -345,13 +346,13 @@ ng_bridge_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); + int linkNum = -1; /* Check for a link hook */ if (strncmp(name, NG_BRIDGE_HOOK_LINK_PREFIX, strlen(NG_BRIDGE_HOOK_LINK_PREFIX)) == 0) { const char *cp; char *eptr; - u_long linkNum; cp = name + strlen(NG_BRIDGE_HOOK_LINK_PREFIX); if (!isdigit(*cp) || (cp[0] == '0' && cp[1] != '\0')) @@ -359,6 +360,14 @@ linkNum = strtoul(cp, &eptr, 10); if (*eptr != '\0' || linkNum >= NG_BRIDGE_MAX_LINKS) return (EINVAL); + } else if (strcmp(name, "anchor") == 0) { + linkNum = NG_BRIDGE_MAX_LINKS; + if (priv->persistent) + return (EISCONN); + priv->persistent = 1; + } + + if (linkNum >= 0 ) { if (priv->links[linkNum] != NULL) return (EISCONN); priv->links[linkNum] = malloc(sizeof(*priv->links[linkNum]), @@ -366,7 +375,7 @@ if (priv->links[linkNum] == NULL) return (ENOMEM); priv->links[linkNum]->hook = hook; - NG_HOOK_SET_PRIVATE(hook, (void *)linkNum); + NG_HOOK_SET_PRIVATE(hook, (void *)(intptr_t)linkNum); priv->numLinks++; return (0); } @@ -785,7 +794,7 @@ /* Get link number */ linkNum = (intptr_t)NG_HOOK_PRIVATE(hook); - KASSERT(linkNum >= 0 && linkNum < NG_BRIDGE_MAX_LINKS, + KASSERT(linkNum >= 0 && linkNum <= NG_BRIDGE_MAX_LINKS, ("%s: linkNum=%u", __func__, linkNum)); /* Remove all hosts associated with this link */ @@ -799,7 +808,8 @@ /* If no more hooks, go away */ 
if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) - && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))) { + && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + && !priv->persistent) { ng_rmnode_self(NG_HOOK_NODE(hook)); } return (0); Index: sys/netgraph/ng_eiface.c =========================================================================== --- sys/netgraph/ng_eiface.c 2009/02/22 13:41:20 #43 +++ sys/netgraph/ng_eiface.c 2009/02/22 13:41:20 @@ -113,10 +113,16 @@ }; NETGRAPH_INIT(eiface, &typestruct); +static vnet_attach_fn ng_eiface_iattach; +static vnet_detach_fn ng_eiface_idetach; + #ifdef VIMAGE_GLOBALS static struct unrhdr *ng_eiface_unit; #endif +VNET_MOD_DECLARE_STATELESS(NG_EIFACE, ng_eiface, ng_eiface_iattach, + ng_eiface_idetach, NETGRAPH) + /************************************************************************ INTERFACE STUFF ************************************************************************/ @@ -248,6 +254,14 @@ * Send packet; if hook is not connected, mbuf will get * freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(node->nd_vnet); + NG_SEND_DATA_ONLY(error, priv->ether, m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, priv->ether, m); /* Update stats */ @@ -372,12 +386,10 @@ ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; ifp->if_flags = (IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST); -#if 0 - /* Give this node name */ - bzero(ifname, sizeof(ifname)); - sprintf(ifname, "if%s", ifp->if_xname); - (void)ng_name_node(node, ifname); -#endif + /* Give this node the same name as the interface (if possible) */ + if (ng_name_node(node, ifp->if_xname) != 0) + log(LOG_WARNING, "%s: can't acquire netgraph name\n", + ifp->if_xname); /* Attach the interface */ ether_ifattach(ifp, eaddr); @@ -534,6 +546,12 @@ /* Update interface stats */ ifp->if_ipackets++; +#ifdef VIMAGE + /* Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; 
+#endif + (*ifp->if_input)(ifp, m); /* Done */ @@ -587,10 +605,18 @@ switch (event) { case MOD_LOAD: - V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_eiface_modinfo); +#else + ng_eiface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(V_ng_eiface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_eiface_modinfo); +#else + ng_eiface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -598,3 +624,32 @@ } return (error); } + +static int ng_eiface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_eiface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_eiface_unit); + + return 0; +} Index: sys/netgraph/ng_ether.c =========================================================================== --- sys/netgraph/ng_ether.c 2009/02/22 13:41:20 #49 +++ sys/netgraph/ng_ether.c 2009/02/22 13:41:20 @@ -72,6 +72,11 @@ #define IFP2NG(ifp) (IFP2AC((ifp))->ac_netgraph) +static vnet_attach_fn ng_ether_iattach; + +VNET_MOD_DECLARE_STATELESS(NG_ETHER, ng_ether, ng_ether_iattach, + NULL, NETGRAPH) + /* Per-node private data */ struct private { struct ifnet *ifp; /* associated interface */ @@ -284,6 +289,17 @@ priv_p priv; node_p node; + /* + * Do not create / attach an ether node to this ifnet if + * a netgraph node with the same name already exists. + * This should prevent ether nodes to be attached to + * eiface nodes in the same vnet, which is pointless. 
+ */ + if ((node = ng_name2noderef(NULL, ifp->if_xname)) != NULL) { + NG_NODE_UNREF(node); + return; + } + /* Create node */ KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__)); if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) { @@ -738,7 +754,6 @@ static int ng_ether_mod_event(module_t mod, int event, void *data) { - struct ifnet *ifp; int error = 0; int s; @@ -757,15 +772,11 @@ ng_ether_input_p = ng_ether_input; ng_ether_input_orphan_p = ng_ether_input_orphan; ng_ether_link_state_p = ng_ether_link_state; - - /* Create nodes for any already-existing Ethernet interfaces */ - IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type == IFT_ETHER - || ifp->if_type == IFT_L2VLAN) - ng_ether_attach(ifp); - } - IFNET_RUNLOCK(); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_ether_modinfo); +#else + error = ng_ether_iattach(NULL); +#endif break; case MOD_UNLOAD: @@ -778,6 +789,10 @@ * is MOD_UNLOAD, so there's no need to detach any nodes. */ +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_ether_modinfo); +#endif + /* Unregister function hooks */ ng_ether_attach_p = NULL; ng_ether_detach_p = NULL; @@ -795,3 +810,19 @@ return (error); } +static int ng_ether_iattach(const void *unused) +{ + INIT_VNET_NET(curvnet); + struct ifnet *ifp; + + /* Create nodes for any already-existing Ethernet interfaces */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type == IFT_ETHER + || ifp->if_type == IFT_L2VLAN) + ng_ether_attach(ifp); + } + IFNET_RUNLOCK(); + + return 0; +} Index: sys/netgraph/ng_hub.c =========================================================================== --- sys/netgraph/ng_hub.c 2009/02/22 13:41:20 #3 +++ sys/netgraph/ng_hub.c 2009/02/22 13:41:20 @@ -37,6 +37,7 @@ #include static ng_constructor_t ng_hub_constructor; +static ng_newhook_t ng_hub_newhook; static ng_rcvdata_t ng_hub_rcvdata; static ng_disconnect_t ng_hub_disconnect; @@ -44,6 +45,7 @@ .version = NG_ABI_VERSION, .name = NG_HUB_NODE_TYPE, 
.constructor = ng_hub_constructor, + .newhook = ng_hub_newhook, .rcvdata = ng_hub_rcvdata, .disconnect = ng_hub_disconnect, }; @@ -57,6 +59,14 @@ return (0); } +static int +ng_hub_newhook(node_p node, hook_p hook, const char *name) +{ + if (strcmp(name, "anchor") == 0) + node->nd_private = (void *) 1; + return (0); +} + static int ng_hub_rcvdata(hook_p hook, item_p item) { @@ -94,7 +104,7 @@ { if (NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0 && - NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + NG_NODE_IS_VALID(NG_HOOK_NODE(hook)) && !hook->hk_node->nd_private) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } Index: sys/netgraph/ng_iface.c =========================================================================== --- sys/netgraph/ng_iface.c 2009/02/22 13:41:20 #39 +++ sys/netgraph/ng_iface.c 2009/02/22 13:41:20 @@ -122,6 +122,10 @@ static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct rtentry *rt0); +#ifdef VIMAGE +static void ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, + char *dname); +#endif static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, @@ -208,10 +212,16 @@ }; NETGRAPH_INIT(iface, &typestruct); +static vnet_attach_fn ng_iface_iattach; +static vnet_detach_fn ng_iface_idetach; + #ifdef VIMAGE_GLOBALS static struct unrhdr *ng_iface_unit; #endif +VNET_MOD_DECLARE_STATELESS(NG_IFACE, ng_iface, ng_iface_iattach, + ng_iface_idetach, NETGRAPH) + /************************************************************************ HELPER STUFF ************************************************************************/ @@ -470,6 +480,14 @@ /* Send packet. If hook is not connected, mbuf will get freed. 
*/ +#ifdef VIMAGE + if (ifp->if_vnet != priv->node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(priv->node->nd_vnet); + NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); /* Update stats. */ @@ -556,6 +574,9 @@ ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; +#ifdef VIMAGE + ifp->if_reassign = ng_iface_reassign; +#endif ifp->if_watchdog = NULL; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); @@ -580,6 +601,24 @@ return (0); } +#ifdef VIMAGE +static void +ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + bpfdetach(ifp); + if_detach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "ser"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + CURVNET_RESTORE(); +} +#endif + /* * Give our ok for a hook to be added */ @@ -742,6 +781,12 @@ ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; +#ifdef VIMAGE + /* Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; @@ -833,10 +878,18 @@ switch (event) { case MOD_LOAD: - V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_iface_modinfo); +#else + ng_iface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(V_ng_iface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_iface_modinfo); +#else + ng_iface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -844,3 +897,32 @@ } return (error); } + +static int ng_iface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static 
int ng_iface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_iface_unit); + + return 0; +} Index: sys/netinet/accf_http.c =========================================================================== --- sys/netinet/accf_http.c 2009/02/22 13:41:20 #13 +++ sys/netinet/accf_http.c 2009/02/22 13:41:20 @@ -37,6 +37,7 @@ #include #include #include +#include /* check for GET/HEAD */ static void sohashttpget(struct socket *so, void *arg, int waitflag); @@ -51,6 +52,8 @@ int max, char *cmp); /* socketbuffer is full */ static int sbfull(struct sockbuf *sb); +static int +accept_filt_http_mod_event(module_t mod, int event, void *data); static struct accept_filter accf_http_filter = { "httpready", @@ -61,19 +64,45 @@ static moduledata_t accf_http_mod = { "accf_http", - accept_filt_generic_mod_event, - &accf_http_filter + accept_filt_http_mod_event, + NULL, }; DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); -static int parse_http_version = 1; +#ifdef VIMAGE_GLOBALS +static int parse_http_version; +#endif + +struct vnet_accf_http { + int _parse_http_version; +}; + +#ifndef VIMAGE +#ifndef VIMAGE_GLOBALS +struct vnet_accf_http vnet_accf_http_0; +#endif +#endif + +#define INIT_VNET_ACCF_HTTP(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_ACCF_HTTP, struct vnet_accf_http, vnet_accf_http) + +#define VNET_ACCF_HTTP(sym) VSYM(vnet_accf_http, sym) + +#define V_parse_http_version VNET_ACCF_HTTP(parse_http_version) + +#define V_MOD_vnet_accf_http VNET_MOD_ACCF_HTTP + +static vnet_attach_fn vnet_accf_http_iattach; + +VNET_MOD_DECLARE(ACCF_HTTP, accf_http, vnet_accf_http_iattach, + NULL, INET, NULL) SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, "HTTP accept filter"); -SYSCTL_INT(_net_inet_accf_http, OID_AUTO, 
parsehttpversion, CTLFLAG_RW, -&parse_http_version, 1, -"Parse http version so that non 1.x requests work"); +SYSCTL_V_INT(V_NET, vnet_accf_http, _net_inet_accf_http, OID_AUTO, + parsehttpversion, CTLFLAG_RW, parse_http_version, 1, + "Parse http version so that non 1.x requests work"); #ifdef ACCF_HTTP_DEBUG #define DPRINT(fmt, args...) \ @@ -161,6 +190,7 @@ static void sohashttpget(struct socket *so, void *arg, int waitflag) { + INIT_VNET_ACCF_HTTP(so->so_vnet); if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { struct mbuf *m; @@ -192,7 +222,7 @@ } if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) { DPRINT("mbufstrcmp ok"); - if (parse_http_version == 0) + if (V_parse_http_version == 0) soishttpconnected(so, arg, waitflag); else soparsehttpvers(so, arg, waitflag); @@ -360,3 +390,58 @@ soisconnected(so); return; } + +static int +accept_filt_http_mod_event(module_t mod, int event, void *data) +{ + struct accept_filter *p; + int error; + + switch (event) { + case MOD_LOAD: +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_accf_http_modinfo); +#else + vnet_accf_http_iattach(NULL); +#endif /* !VIMAGE_GLOBALS */ + + MALLOC(p, struct accept_filter *, sizeof(*p), M_ACCF, + M_WAITOK); + bcopy(&accf_http_filter, p, sizeof(*p)); + error = accept_filt_add(p); + break; + + case MOD_UNLOAD: + /* + * Do not support unloading yet. We don't keep track of + * refcounts and unloading an accept filter callback and then + * having it called is a bad thing. A simple fix would be to + * track the refcount in the struct accept_filter.
+ */ + if (accf_unloadable != 0) { + error = accept_filt_del(accf_http_filter.accf_name); + } else + error = EOPNOTSUPP; + break; + + case MOD_SHUTDOWN: + error = 0; + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +static int vnet_accf_http_iattach(const void *unused) +{ + INIT_VNET_ACCF_HTTP(curvnet); + + V_parse_http_version = 1; + + return (0); +} + Index: sys/netinet/if_ether.c =========================================================================== --- sys/netinet/if_ether.c 2009/02/22 13:41:20 #109 +++ sys/netinet/if_ether.c 2009/02/22 13:41:20 @@ -111,6 +111,7 @@ static void arp_init(void); void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, u_char *); +static int arp_iattach(const void *); static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET @@ -119,6 +120,7 @@ #ifdef AF_INET void arp_ifscrub(struct ifnet *ifp, uint32_t addr); +VNET_MOD_DECLARE_STATELESS(ARP, arp, arp_iattach, NULL, INET) /* * called by in_ifscrub to remove entry from the table when @@ -133,10 +135,12 @@ addr4.sin_len = sizeof(addr4); addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = addr; + CURVNET_SET(ifp->if_vnet); IF_AFDATA_LOCK(ifp); lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR), (struct sockaddr *)&addr4); IF_AFDATA_UNLOCK(ifp); + CURVNET_RESTORE(); } #endif @@ -788,8 +792,9 @@ ifa->ifa_rtrequest = NULL; } -static void -arp_init(void) +static int +arp_iattach(unused) + const void *unused; { INIT_VNET_INET(curvnet); @@ -798,8 +803,21 @@ V_useloopback = 1; /* use loopback interface for local traffic */ V_arp_proxyall = 0; + return 0; +} + +static void +arp_init(void) +{ + +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_arp_modinfo); +#else + arp_iattach(NULL); +#endif arpintrq.ifq_maxlen = 50; mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); netisr_register(NETISR_ARP, arpintr, &arpintrq, 0); } + SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: sys/netinet/igmp.c 
=========================================================================== --- sys/netinet/igmp.c 2009/02/22 13:41:20 #30 +++ sys/netinet/igmp.c 2009/02/22 13:41:20 @@ -125,6 +125,11 @@ INIT_VNET_INET(curvnet); struct ipoption *ra; + SLIST_INIT(&V_router_info_head); + + if (!IS_DEFAULT_VNET(curvnet)) + return; + /* * To avoid byte-swapping the same value over and over again. */ @@ -146,7 +151,6 @@ router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF); - SLIST_INIT(&V_router_info_head); } static struct router_info * Index: sys/netinet/in_mcast.c =========================================================================== --- sys/netinet/in_mcast.c 2009/02/22 13:41:20 #12 +++ sys/netinet/in_mcast.c 2009/02/22 13:41:20 @@ -118,6 +118,8 @@ static int inp_leave_group(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); +static struct ifnet * + ip_multicast_if(struct in_addr *a); /* * Resize the ip_moptions vector to the next power-of-two minus 1. @@ -1029,9 +1031,9 @@ * If all of these conditions fail, return EADDRNOTAVAIL, and * reject the IPv4 multicast join. */ - if (mreqs.imr_interface.s_addr != INADDR_ANY) { - INADDR_TO_IFP(mreqs.imr_interface, ifp); - } else { + if (mreqs.imr_interface.s_addr != INADDR_ANY) + ifp = ip_multicast_if(&mreqs.imr_interface); + else { struct route ro; ro.ro_rt = NULL; @@ -1450,7 +1452,7 @@ if (addr.s_addr == INADDR_ANY) { ifp = NULL; } else { - INADDR_TO_IFP(addr, ifp); + ifp = ip_multicast_if(&addr); if (ifp == NULL) return (EADDRNOTAVAIL); } @@ -1833,3 +1835,25 @@ return (error); } + +/* + * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
+ */ +static struct ifnet * +ip_multicast_if(struct in_addr *a) +{ + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); + int ifindex; + struct ifnet *ifp; + + if (ntohl(a->s_addr) >> 24 == 0) { + ifindex = ntohl(a->s_addr) & 0xffffff; + if (ifindex < 0 || V_if_index < ifindex) + return NULL; + ifp = ifnet_byindex(ifindex); + } else + INADDR_TO_IFP(*a, ifp); + return ifp; +} + Index: sys/netinet/in_pcb.c =========================================================================== --- sys/netinet/in_pcb.c 2009/02/22 13:41:20 #155 +++ sys/netinet/in_pcb.c 2009/02/22 13:41:20 @@ -126,7 +126,9 @@ INIT_VNET_INET(curvnet); int error; - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + SYSCTL_RESOLVE_V_ARG1(); + + error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); Index: sys/netinet/in_pcb.h =========================================================================== --- sys/netinet/in_pcb.h 2009/02/22 13:41:20 #87 +++ sys/netinet/in_pcb.h 2009/02/22 13:41:20 @@ -228,6 +228,8 @@ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ +#define inp_vnet inp_pcbinfo->ipi_vnet + /* * The range of the generation count, as used in this implementation, is 9e19. 
* We would have to create 300 billion connections per second for this number @@ -308,7 +310,8 @@ * vimage 1 * general use 1 */ - void *ipi_pspare[2]; + struct vnet *ipi_vnet; + void *ipi_pspare[1]; }; #define INP_LOCK_INIT(inp, d, t) \ Index: sys/netinet/in_proto.c =========================================================================== --- sys/netinet/in_proto.c 2009/02/22 13:41:20 #35 +++ sys/netinet/in_proto.c 2009/02/22 13:41:20 @@ -126,6 +126,9 @@ .pr_ctlinput = udp_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = udp_init, +#ifdef VIMAGE + .pr_destroy = udp_destroy, +#endif .pr_usrreqs = &udp_usrreqs }, { @@ -137,6 +140,9 @@ .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, +#ifdef VIMAGE + .pr_destroy = tcp_destroy, +#endif .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs @@ -348,11 +354,15 @@ .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, +#ifdef VIMAGE + .pr_destroy = rip_destroy, +#endif .pr_usrreqs = &rip_usrreqs }, }; extern int in_inithead(void **, int); +extern int in_detachhead(void **, int); struct domain inetdomain = { .dom_family = AF_INET, @@ -364,6 +374,9 @@ #else .dom_rtattach = in_inithead, #endif +#ifdef VIMAGE + .dom_rtdetach = in_detachhead, +#endif .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct sockaddr_in), .dom_ifattach = in_domifattach, Index: sys/netinet/in_rmx.c =========================================================================== --- sys/netinet/in_rmx.c 2009/02/22 13:41:20 #28 +++ sys/netinet/in_rmx.c 2009/02/22 13:41:20 @@ -62,7 +62,10 @@ #include #include -extern int in_inithead(void **head, int off); +int in_inithead(void **head, int off); +#ifdef VIMAGE +int in_detachhead(void **head, int off); +#endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -240,31 +243,10 @@ static struct callout rtq_timer; #endif -static void in_rtqtimo_one(void *rock); - static void -in_rtqtimo(void *rock) -{ - int fibnum; - void 
*newrock; - struct timeval atv; - - KASSERT((rock == (void *)V_rt_tables[0][AF_INET]), - ("in_rtqtimo: unexpected arg")); - for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { - if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL) - in_rtqtimo_one(newrock); - } - atv.tv_usec = 0; - atv.tv_sec = V_rtq_timeout; - callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); -} - -static void -in_rtqtimo_one(void *rock) +in_rtqtimo_one(struct radix_node_head *rnh) { INIT_VNET_INET(curvnet); - struct radix_node_head *rnh = rock; struct rtqk_arg arg; static time_t last_adjusted_timeout = 0; @@ -303,7 +285,26 @@ rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } +} + +static void +in_rtqtimo(void *rock) +{ + int fibnum; + void *newrock; + CURVNET_SET_QUIET((struct vnet *) rock); + INIT_VNET_NET((struct vnet *) rock); + INIT_VNET_INET((struct vnet *) rock); + struct timeval atv; + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL) + in_rtqtimo_one(newrock); + } + atv.tv_usec = 0; + atv.tv_sec = V_rtq_timeout; + callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + CURVNET_RESTORE(); } void @@ -370,12 +371,23 @@ rnh->rnh_close = in_clsroute; if (_in_rt_was_here == 0 ) { callout_init(&V_rtq_timer, CALLOUT_MPSAFE); - in_rtqtimo(rnh); /* kick off timeout first time */ + in_rtqtimo(curvnet); /* kick off timeout first time */ _in_rt_was_here = 1; } return 1; } +#ifdef VIMAGE +int +in_detachhead(void **head, int off) +{ + INIT_VNET_INET(curvnet); + + callout_drain(&V_rtq_timer); + return 1; +} +#endif + /* * This zaps old routes when the interface goes down or interface * address is deleted. 
In the latter case, it deletes static routes Index: sys/netinet/in_var.h =========================================================================== --- sys/netinet/in_var.h 2009/02/22 13:41:20 #30 +++ sys/netinet/in_var.h 2009/02/22 13:41:20 @@ -92,6 +92,15 @@ extern u_long in_ifaddrhmask; /* mask for hash table */ #endif +/* + * IP datagram reassembly. + */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) Index: sys/netinet/ip_divert.c =========================================================================== --- sys/netinet/ip_divert.c 2009/02/22 13:41:20 #94 +++ sys/netinet/ip_divert.c 2009/02/22 13:41:20 @@ -126,38 +126,12 @@ static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ -/* - * Initialize divert connection block queue. 
- */ -static void -div_zone_change(void *tag) -{ - - uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); -} - -static int -div_inpcb_init(void *mem, int size, int flags) -{ - struct inpcb *inp = mem; - - INP_LOCK_INIT(inp, "inp", "divinp"); - return (0); -} - -static void -div_inpcb_fini(void *mem, int size) -{ - struct inpcb *inp = mem; - - INP_LOCK_DESTROY(inp); -} - void div_init(void) { INIT_VNET_INET(curvnet); + V_divcbinfo.ipi_vnet = curvnet; INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); LIST_INIT(&V_divcb); V_divcbinfo.ipi_listhead = &V_divcb; @@ -169,12 +143,6 @@ V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_porthashmask); - V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), - NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, - UMA_ZONE_NOFREE); - uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); - EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, - NULL, EVENTHANDLER_PRI_ANY); } /* @@ -719,6 +687,7 @@ static int div_modevent(module_t mod, int type, void *unused) { + INIT_VNET_INET(curvnet); /* XXX fixme! MARKO */ int err = 0; int n; @@ -763,7 +732,6 @@ err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); INP_INFO_LOCK_DESTROY(&V_divcbinfo); - uma_zdestroy(V_divcbinfo.ipi_zone); break; default: err = EOPNOTSUPP; Index: sys/netinet/ip_fw.h =========================================================================== --- sys/netinet/ip_fw.h 2009/02/22 13:41:20 #66 +++ sys/netinet/ip_fw.h 2009/02/22 13:41:20 @@ -28,6 +28,9 @@ #ifndef _IPFW2_H #define _IPFW2_H +#include +#include + /* * The default rule number. By the design of ip_fw, the default rule * is the last one, so its number can also serve as the highest number @@ -564,6 +567,34 @@ */ #ifdef _KERNEL +/* + * Data structure to cache our ucred related + * information. 
This structure only gets used if + * the user specified UID/GID based constraints in + * a firewall rule. + */ +struct ip_fw_ugid { + gid_t fw_groups[NGROUPS]; + int fw_ngroups; + uid_t fw_uid; + int fw_prid; +}; + +#define IPFW_TABLES_MAX 128 +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; + struct rwlock rwmtx; +}; + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, @@ -630,9 +661,7 @@ int ipfw_init(void); void ipfw_destroy(void); -#ifdef NOTYET void ipfw_nat_destroy(void); -#endif typedef int ip_fw_ctl_t(struct sockopt *); extern ip_fw_ctl_t *ip_fw_ctl_ptr; @@ -650,13 +679,6 @@ extern ip_fw_chk_t *ip_fw_chk_ptr; #define IPFW_LOADED (ip_fw_chk_ptr != NULL) -struct ip_fw_chain { - struct ip_fw *rules; /* list of rules */ - struct ip_fw *reap; /* list of rules to reap */ - LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ - struct radix_node_head *tables[IPFW_TABLES_MAX]; - struct rwlock rwmtx; -}; #ifdef IPFW_INTERNAL @@ -696,6 +718,7 @@ int _fw_debug; int _autoinc_step; ipfw_dyn_rule **_ipfw_dyn_v; + uma_zone_t _ipfw_dyn_rule_zone; struct ip_fw_chain _layer3_chain; u_int32_t _dyn_buckets; u_int32_t _curr_dyn_buckets; @@ -740,6 +763,7 @@ #define V_fw_debug VNET_IPFW(fw_debug) #define V_autoinc_step VNET_IPFW(autoinc_step) #define V_ipfw_dyn_v VNET_IPFW(ipfw_dyn_v) +#define V_ipfw_dyn_rule_zone VNET_IPFW(ipfw_dyn_rule_zone) #define V_layer3_chain VNET_IPFW(layer3_chain) #define V_dyn_buckets VNET_IPFW(dyn_buckets) #define V_curr_dyn_buckets VNET_IPFW(curr_dyn_buckets) @@ -758,6 +782,7 @@ #define V_dyn_max VNET_IPFW(dyn_max) #define V_norule_counter VNET_IPFW(norule_counter) #define V_ipfw_timeout VNET_IPFW(ipfw_timeout) +#define V_ipfw_timeout VNET_IPFW(ipfw_timeout) #define 
V_ifaddr_event_tag VNET_IPFW(ifaddr_event_tag) #endif /* _KERNEL */ Index: sys/netinet/ip_fw2.c =========================================================================== --- sys/netinet/ip_fw2.c 2009/02/22 13:41:20 #210 +++ sys/netinet/ip_fw2.c 2009/02/22 13:41:20 @@ -111,6 +111,11 @@ #endif #endif +static int vnet_ipfw_iattach(const void *); +static int vnet_ipfw_idetach(const void *); + +VNET_MOD_DECLARE(IPFW, ipfw, vnet_ipfw_iattach, vnet_ipfw_idetach, INET, NULL) + /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set @@ -124,27 +129,7 @@ static int fw_verbose; static struct callout ipfw_timeout; static int verbose_limit; -#endif - static uma_zone_t ipfw_dyn_rule_zone; - -/* - * Data structure to cache our ucred related - * information. This structure only gets used if - * the user specified UID/GID based constraints in - * a firewall rule. - */ -struct ip_fw_ugid { - gid_t fw_groups[NGROUPS]; - int fw_ngroups; - uid_t fw_uid; - int fw_prid; -}; - -/* - * list of rules for layer 3 - */ -#ifdef VIMAGE_GLOBALS struct ip_fw_chain layer3_chain; #endif @@ -157,12 +142,6 @@ ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; - #ifdef VIMAGE_GLOBALS static int fw_debug; static int autoinc_step; @@ -299,24 +278,34 @@ "Lifetime of dyn. rules for other situations"); SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, dyn_keepalive, 0, "Enable keepalives for dyn. 
rules"); -#endif /* SYSCTL_NODE */ #ifdef INET6 /* * IPv6 specific variables */ -#ifdef SYSCTL_NODE -SYSCTL_DECL(_net_inet6_ip6); -#endif /* SYSCTL_NODE */ +#if 0 static struct sysctl_ctx_list ip6_fw_sysctl_ctx; static struct sysctl_oid *ip6_fw_sysctl_tree; -#endif /* INET6 */ +#endif #ifdef VIMAGE_GLOBALS static int fw_deny_unknown_exthdrs; #endif +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_SECURE, + 0, "Firewall"); +SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw6_enable, 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, + deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, + fw_deny_unknown_exthdrs, 0, + "Deny packets with unknown IPv6 Extension Headers"); +#endif /* INET6 */ +#endif /* SYSCTL_NODE */ + /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type @@ -1083,7 +1072,7 @@ else \ head = q = q->next; \ V_dyn_count--; \ - uma_zfree(ipfw_dyn_rule_zone, old_q); } + uma_zfree(V_ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) @@ -1384,7 +1373,7 @@ } i = hash_packet(id); - r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); + r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; @@ -1809,7 +1798,6 @@ add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { - INIT_VNET_IPFW(curvnet); struct radix_node_head *rnh; struct table_entry *ent; struct radix_node *rn; @@ -2622,7 +2610,8 @@ (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, - src_ip, src_port, &fw_ugid_cache, + src_ip, src_port, + &fw_ugid_cache, &ugid_lookup, args->inp); break; @@ -4474,8 +4463,11 @@ * every dyn_keepalive_period */ static void -ipfw_tick(void * __unused unused) +ipfw_tick(void *arg) { +#ifdef 
VIMAGE + struct vnet_ipfw *vnet_ipfw = arg; +#endif struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; @@ -4524,11 +4516,10 @@ } done: callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz, - ipfw_tick, NULL); + ipfw_tick, arg); } -int -ipfw_init(void) +static int vnet_ipfw_iattach(const void *unused) { INIT_VNET_IPFW(curvnet); struct ip_fw default_rule; @@ -4556,27 +4547,12 @@ V_fw_deny_unknown_exthdrs = 1; -#ifdef INET6 - /* Setup IPv6 fw sysctl tree. */ - sysctl_ctx_init(&ip6_fw_sysctl_ctx); - ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", - CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); - SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, - &V_fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6"); - SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, - &V_fw_deny_unknown_exthdrs, 0, - "Deny packets with unknown IPv6 Extension Headers"); -#endif V_layer3_chain.rules = NULL; IPFW_LOCK_INIT(&V_layer3_chain); - ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - IPFW_DYN_LOCK_INIT(); callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); @@ -4597,13 +4573,49 @@ if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); - IPFW_DYN_LOCK_DESTROY(); + IPFW_DYN_LOCK_DESTROY(); /* XXX MARKO REVISIT */ IPFW_LOCK_DESTROY(&V_layer3_chain); - uma_zdestroy(ipfw_dyn_rule_zone); + uma_zdestroy(V_ipfw_dyn_rule_zone); return (error); } - ip_fw_default_rule = V_layer3_chain.rules; + ip_fw_default_rule = V_layer3_chain.rules; /* XXX V_ this ? 
MARKO */ + +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif + + error = init_tables(&V_layer3_chain); + if (error) { + panic("init_tables"); /* XXX Marko fix this ! */ + } +#ifdef VIMAGE + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, (void *) vnet_ipfw); +#else + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); +#endif + +#ifdef IPFIREWALL_NAT + LIST_INIT(&V_layer3_chain.nat); +#endif + + return 0; +} + +int +ipfw_init(void) +{ + IPFW_DYN_LOCK_INIT(); + +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_ipfw_modinfo); +#else + vnet_ipfw_iattach(NULL); +#endif + printf("ipfw2 " #ifdef INET6 "(+ipv6) " @@ -4621,49 +4633,41 @@ #else "loadable", #endif + #ifdef IPFIREWALL_NAT "enabled", #else - "loadable", + "disabled, ", #endif - default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny"); +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + "accept" +#else + "deny" +#endif + ); #ifdef IPFIREWALL_VERBOSE - V_fw_verbose = 1; -#endif -#ifdef IPFIREWALL_VERBOSE_LIMIT - V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; -#endif - if (V_fw_verbose == 0) printf("disabled\n"); - else if (V_verbose_limit == 0) +#else +# ifndef IPFIREWALL_VERBOSE_LIMIT printf("unlimited\n"); - else +# else printf("limited to %d packets/entry by default\n", - V_verbose_limit); + IPFIREWALL_VERBOSE_LIMIT); +# endif +#endif - error = init_tables(&V_layer3_chain); - if (error) { - IPFW_DYN_LOCK_DESTROY(); - IPFW_LOCK_DESTROY(&V_layer3_chain); - uma_zdestroy(ipfw_dyn_rule_zone); - return (error); - } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; - callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); - LIST_INIT(&V_layer3_chain.nat); return (0); } -void -ipfw_destroy(void) +static int vnet_ipfw_idetach(const void *unused) { + INIT_VNET_IPFW(curvnet); struct ip_fw *reap; - ip_fw_chk_ptr = NULL; - ip_fw_ctl_ptr = NULL; callout_drain(&V_ipfw_timeout); IPFW_WLOCK(&V_layer3_chain); flush_tables(&V_layer3_chain); @@ -4673,16 
+4677,30 @@ IPFW_WUNLOCK(&V_layer3_chain); if (reap != NULL) reap_rules(reap); - IPFW_DYN_LOCK_DESTROY(); - uma_zdestroy(ipfw_dyn_rule_zone); + IPFW_LOCK_DESTROY(&V_layer3_chain); if (V_ipfw_dyn_v != NULL) free(V_ipfw_dyn_v, M_IPFW); - IPFW_LOCK_DESTROY(&V_layer3_chain); + + uma_zdestroy(V_ipfw_dyn_rule_zone); + return 0; +} + +void +ipfw_destroy(void) +{ + ip_fw_chk_ptr = NULL; + ip_fw_ctl_ptr = NULL; + +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ipfw_modinfo); +#else + vnet_ipfw_idetach(NULL); +#endif -#ifdef INET6 - /* Free IPv6 fw sysctl tree. */ - sysctl_ctx_free(&ip6_fw_sysctl_ctx); +#ifdef IPFIREWALL_NAT + ipfw_nat_destroy(); #endif + IPFW_DYN_LOCK_DESTROY(); printf("IP firewall unloaded\n"); } Index: sys/netinet/ip_fw_nat.c =========================================================================== --- sys/netinet/ip_fw_nat.c 2009/02/22 13:41:20 #9 +++ sys/netinet/ip_fw_nat.c 2009/02/22 13:41:20 @@ -609,7 +609,7 @@ NULL, EVENTHANDLER_PRI_ANY); } -static void +void ipfw_nat_destroy(void) { INIT_VNET_IPFW(curvnet); Index: sys/netinet/ip_input.c =========================================================================== --- sys/netinet/ip_input.c 2009/02/22 13:41:20 #176 +++ sys/netinet/ip_input.c 2009/02/22 13:41:20 @@ -223,6 +223,20 @@ static void ip_freef(struct ipqhead *, struct ipq *); +#ifndef VIMAGE_GLOBALS +static void vnet_inet_register(void); + +VNET_MOD_DECLARE(INET, inet, NULL, NULL, NET, NULL) + +static void vnet_inet_register() +{ + + vnet_mod_register(&vnet_inet_modinfo); +} + +SYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0); +#endif + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -269,6 +283,23 @@ TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); + + /* Initialize IP reassembly queue. 
*/ + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&V_ipq[i]); + V_nipq = 0; + V_maxnipq = nmbclusters / 32; + V_maxfragsperpacket = 16; + V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, + NULL, UMA_ALIGN_PTR, 0); + maxnipq_update(); + + V_ip_id = time_second & 0xffff; + + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); @@ -296,26 +327,16 @@ printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); - /* Initialize IP reassembly queue. */ - IPQ_LOCK_INIT(); - for (i = 0; i < IPREASS_NHASH; i++) - TAILQ_INIT(&V_ipq[i]); - V_maxnipq = nmbclusters / 32; - V_maxfragsperpacket = 16; - V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, - NULL, UMA_ALIGN_PTR, 0); - maxnipq_update(); - /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); - ipport_tick(NULL); + callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. 
*/ - V_ip_id = time_second & 0xffff; + IPQ_LOCK_INIT(); ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); netisr_register(NETISR_IP, ip_input, &ipintrq, 0); Index: sys/netinet/ip_var.h =========================================================================== --- sys/netinet/ip_var.h 2009/02/22 13:41:20 #51 +++ sys/netinet/ip_var.h 2009/02/22 13:41:20 @@ -223,6 +223,9 @@ int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); +#ifdef VIMAGE +void rip_destroy(void); +#endif void rip_input(struct mbuf *, int); int rip_output(struct mbuf *, struct socket *, u_long); void ipip_input(struct mbuf *, int); Index: sys/netinet/raw_ip.c =========================================================================== --- sys/netinet/raw_ip.c 2009/02/22 13:41:20 #127 +++ sys/netinet/raw_ip.c 2009/02/22 13:41:20 @@ -82,6 +82,7 @@ #ifdef VIMAGE_GLOBALS struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; +static struct uma_zone *ripcb_zone; #endif /* control hooks for ipfw and dummynet */ @@ -184,6 +185,10 @@ { INIT_VNET_INET(curvnet); + V_ripcb_zone = uma_zcreate("ripcb", sizeof(struct inpcb), + NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + V_ripcbinfo.ipi_vnet = curvnet; + INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); LIST_INIT(&V_ripcb); V_ripcbinfo.ipi_listhead = &V_ripcb; @@ -191,13 +196,25 @@ hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask); V_ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); - V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), - NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + V_ripcbinfo.ipi_zone = V_ripcb_zone; uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +rip_destroy(void) +{ + INIT_VNET_INET(curvnet); + + 
hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB, + V_ripcbinfo.ipi_hashmask); + hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB, + V_ripcbinfo.ipi_porthashmask); +} +#endif + static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) Index: sys/netinet/sctp_crc32.c =========================================================================== --- sys/netinet/sctp_crc32.c 2009/02/22 13:41:20 #14 +++ sys/netinet/sctp_crc32.c 2009/02/22 13:41:20 @@ -30,17 +30,16 @@ /* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $ */ - #include __FBSDID("$FreeBSD: src/sys/netinet/sctp_crc32.c,v 1.14 2009/02/14 11:34:57 rrs Exp $"); +#include #include #include #include #include #include #include -#include #include #include Index: sys/netinet/tcp_hostcache.c =========================================================================== --- sys/netinet/tcp_hostcache.c 2009/02/22 13:41:20 #27 +++ sys/netinet/tcp_hostcache.c 2009/02/22 13:41:20 @@ -216,6 +216,8 @@ /* * Allocate the hostcache entries. + * + * XXX don't need a separate zone for each hc instance - revisit!!! */ V_tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), @@ -227,7 +229,17 @@ */ callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, - tcp_hc_purge, 0); + tcp_hc_purge, curvnet); +} + +void +tcp_hc_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX TODO walk the hashtable and free all entries */ + + callout_drain(&V_tcp_hc_callout); } /* @@ -634,9 +646,10 @@ static void tcp_hc_purge(void *arg) { + CURVNET_SET((struct vnet *) arg); INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry, *hc_next; - int all = (intptr_t)arg; + int all = 0; /* XXX FIXME! 
was: int all = (intptr_t)arg; */ int i; if (V_tcp_hostcache.purgeall) { @@ -662,4 +675,6 @@ callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, arg); + + CURVNET_RESTORE(); } Index: sys/netinet/tcp_reass.c =========================================================================== --- sys/netinet/tcp_reass.c 2009/02/22 13:41:20 #10 +++ sys/netinet/tcp_reass.c 2009/02/22 13:41:20 @@ -108,10 +108,12 @@ INIT_VNET_INET(curvnet); V_tcp_reass_maxseg = nmbclusters / 16; - uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); + uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); } +#ifdef VIMAGE_GLOBALS uma_zone_t tcp_reass_zone; +#endif void tcp_reass_init(void) @@ -126,9 +128,9 @@ V_tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &V_tcp_reass_maxseg); - tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), + V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); + uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -180,7 +182,7 @@ * Allocate a new queue entry. If we can't, or hit the zone limit * just drop the pkt. 
*/ - te = uma_zalloc(tcp_reass_zone, M_NOWAIT); + te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT); if (te == NULL) { V_tcpstat.tcps_rcvmemdrop++; m_freem(m); @@ -213,7 +215,7 @@ V_tcpstat.tcps_rcvduppack++; V_tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); - uma_zfree(tcp_reass_zone, te); + uma_zfree(V_tcp_reass_zone, te); tp->t_segqlen--; V_tcp_reass_qsize--; /* @@ -250,7 +252,7 @@ nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - uma_zfree(tcp_reass_zone, q); + uma_zfree(V_tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; q = nq; @@ -287,7 +289,7 @@ m_freem(q->tqe_m); else sbappendstream_locked(&so->so_rcv, q->tqe_m); - uma_zfree(tcp_reass_zone, q); + uma_zfree(V_tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; q = nq; Index: sys/netinet/tcp_sack.c =========================================================================== --- sys/netinet/tcp_sack.c 2009/02/22 13:41:20 #46 +++ sys/netinet/tcp_sack.c 2009/02/22 13:41:20 @@ -123,9 +123,8 @@ #include +#ifdef VIMAGE_GLOBALS extern struct uma_zone *sack_hole_zone; - -#ifdef VIMAGE_GLOBALS int tcp_do_sack; int tcp_sack_maxholes; int tcp_sack_globalmaxholes; @@ -265,7 +264,7 @@ return NULL; } - hole = (struct sackhole *)uma_zalloc(sack_hole_zone, M_NOWAIT); + hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT); if (hole == NULL) return NULL; @@ -287,7 +286,7 @@ { INIT_VNET_INET(tp->t_vnet); - uma_zfree(sack_hole_zone, hole); + uma_zfree(V_sack_hole_zone, hole); tp->snd_numholes--; V_tcp_sack_globalholes--; Index: sys/netinet/tcp_subr.c =========================================================================== --- sys/netinet/tcp_subr.c 2009/02/22 13:41:20 #223 +++ sys/netinet/tcp_subr.c 2009/02/22 13:41:20 @@ -269,7 +269,6 @@ struct tcp_timer tt; }; -static uma_zone_t tcpcb_zone; MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); struct callout isn_callout; static struct mtx isn_mtx; @@ -278,15 +277,21 @@ #define ISN_LOCK() mtx_lock(&isn_mtx) #define 
ISN_UNLOCK() mtx_unlock(&isn_mtx) +#ifdef VIMAGE_GLOBALS +static uma_zone_t tcpcb_zone; +static struct uma_zone *tcp_ipi_zone; +#endif + /* * TCP initialization. */ static void tcp_zone_change(void *tag) { + INIT_VNET_INET(curvnet); /* XXX */ - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); - uma_zone_set_max(tcpcb_zone, maxsockets); + uma_zone_set_max(V_tcp_ipi_zone, maxsockets); + uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } @@ -341,7 +346,20 @@ V_tcp_autosndbuf_inc = 8*1024; V_tcp_autosndbuf_max = 256*1024; - V_nolocaltimewait = 0; + + V_tcp_ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_tcp_ipi_zone, maxsockets); + /* + * These have to be type stable for the benefit of the timers. + */ + V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_tcpcb_zone, maxsockets); + V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + + tcp_tw_init(); V_tcp_do_sack = 1; V_tcp_sack_maxholes = 128; @@ -377,9 +395,8 @@ &V_tcbinfo.ipi_hashmask); V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, &V_tcbinfo.ipi_porthashmask); - V_tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), - NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); + V_tcbinfo.ipi_zone = V_tcp_ipi_zone; + V_tcbinfo.ipi_vnet = curvnet; #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ @@ -390,27 +407,42 @@ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR - /* - * These have to be type stable for the benefit of the timers. 
- */ - tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcpcb_zone, maxsockets); - tcp_tw_init(); + syncache_init(); tcp_hc_init(); tcp_reass_init(); + + if (!IS_DEFAULT_VNET(curvnet)) + return; + ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); - tcp_isn_tick(NULL); + callout_reset(&isn_callout, 1, tcp_isn_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); - sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +tcp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + tcp_tw_destroy(); + tcp_hc_destroy(); + syncache_destroy(); + + /* XXX check that hashes are empty! */ + hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, + V_tcbinfo.ipi_hashmask); + hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, + V_tcbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_tcbinfo); +} +#endif + void tcp_fini(void *xtp) { @@ -686,11 +718,14 @@ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); + tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; +#ifdef VIMAGE + tp->t_vnet = inp->inp_vnet; +#endif /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 @@ -846,7 +881,7 @@ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - uma_zfree(tcp_reass_zone, q); + uma_zfree(V_tcp_reass_zone, q); tp->t_segqlen--; V_tcp_reass_qsize--; } @@ -856,7 +891,7 @@ tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; - uma_zfree(tcpcb_zone, tp); + uma_zfree(V_tcpcb_zone, tp); } /* @@ -929,7 +964,7 @@ != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); - 
uma_zfree(tcp_reass_zone, te); + uma_zfree(V_tcp_reass_zone, te); tcpb->t_segqlen--; V_tcp_reass_qsize--; } Index: sys/netinet/tcp_syncache.c =========================================================================== --- sys/netinet/tcp_syncache.c 2009/02/22 13:41:20 #165 +++ sys/netinet/tcp_syncache.c 2009/02/22 13:41:20 @@ -259,6 +259,9 @@ /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { +#ifdef VIMAGE + V_tcp_syncache.hashbase[i].sch_vnet = curvnet; +#endif TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); @@ -268,11 +271,25 @@ } /* Create the syncache entry zone. */ + /* XXX one zone for all vnets should do fine - revisit!!! */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); } +#ifdef VIMAGE +void +syncache_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX walk the cache, free remaining objects, stop timers */ + + uma_zdestroy(V_tcp_syncache.zone); + FREE(V_tcp_syncache.hashbase, M_SYNCACHE); +} +#endif + /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. 
Index: sys/netinet/tcp_syncache.h =========================================================================== --- sys/netinet/tcp_syncache.h 2009/02/22 13:41:20 #5 +++ sys/netinet/tcp_syncache.h 2009/02/22 13:41:20 @@ -35,6 +35,9 @@ #ifdef _KERNEL void syncache_init(void); +#ifdef VIMAGE +void syncache_destroy(void); +#endif void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); @@ -96,6 +99,7 @@ #define SYNCOOKIE_LIFETIME 16 /* seconds */ struct syncache_head { + struct vnet *sch_vnet; struct mtx sch_mtx; TAILQ_HEAD(sch_head, syncache) sch_bucket; struct callout sch_timer; Index: sys/netinet/tcp_timewait.c =========================================================================== --- sys/netinet/tcp_timewait.c 2009/02/22 13:41:20 #17 +++ sys/netinet/tcp_timewait.c 2009/02/22 13:41:20 @@ -94,7 +94,6 @@ #include -static uma_zone_t tcptw_zone; static int maxtcptw; /* @@ -104,6 +103,7 @@ * tcbinfo lock, which must be held over queue iteration and modification. 
*/ #ifdef VIMAGE_GLOBALS +static uma_zone_t tcptw_zone; static TAILQ_HEAD(, tcptw) twq_2msl; int nolocaltimewait; #endif @@ -132,6 +132,7 @@ static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, new; if (maxtcptw == 0) @@ -142,7 +143,7 @@ if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; - uma_zone_set_max(tcptw_zone, maxtcptw); + uma_zone_set_max(V_tcptw_zone, maxtcptw); } return (error); } @@ -158,9 +159,10 @@ void tcp_tw_zone_change(void) { + INIT_VNET_INET(curvnet); /* XXX */ if (maxtcptw == 0) - uma_zone_set_max(tcptw_zone, tcptw_auto_size()); + uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); } void @@ -168,15 +170,30 @@ { INIT_VNET_INET(curvnet); - tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), + TAILQ_INIT(&V_twq_2msl); + + V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) - uma_zone_set_max(tcptw_zone, tcptw_auto_size()); + uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); else - uma_zone_set_max(tcptw_zone, maxtcptw); - TAILQ_INIT(&V_twq_2msl); + uma_zone_set_max(V_tcptw_zone, maxtcptw); +} + +#ifdef VIMAGE +void +tcp_tw_destroy(void) +{ + INIT_VNET_INET(curvnet); + struct tcptw *tw; + + INP_INFO_WLOCK(&V_tcbinfo); + while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + tcp_twclose(tw, 0); + INP_INFO_WUNLOCK(&V_tcbinfo); } +#endif /* * Move a TCP connection into TIME_WAIT state. 
@@ -204,7 +221,7 @@ return; } - tw = uma_zalloc(tcptw_zone, M_NOWAIT); + tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_tw_2msl_scan(1); if (tw == NULL) { @@ -522,7 +539,7 @@ tw->tw_cred = NULL; if (reuse) return; - uma_zfree(tcptw_zone, tw); + uma_zfree(V_tcptw_zone, tw); } int Index: sys/netinet/tcp_var.h =========================================================================== --- sys/netinet/tcp_var.h 2009/02/22 13:41:20 #102 +++ sys/netinet/tcp_var.h 2009/02/22 13:41:20 @@ -35,6 +35,8 @@ #include +struct vnet; + /* * Kernel variables for tcp. */ @@ -186,7 +188,8 @@ int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ - void *t_pspare[3]; /* toe usrreqs / toepcb * / congestion algo / vimage / 1 general use */ + struct vnet *t_vnet; /* back pointer to parent vnet */ + void *t_pspare[2]; /* toe usrreqs / toepcb * / congestion algo / vimage / 1 general use */ struct toe_usrreqs *t_tu; /* offload operations vector */ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ @@ -532,6 +535,7 @@ extern int ss_fltsz; extern int ss_fltsz_local; +extern int tcp_autorcvbuf; extern int blackhole; extern int drop_synfin; extern int tcp_do_rfc3042; @@ -575,6 +579,7 @@ void tcp_drain(void); void tcp_fasttimo(void); void tcp_init(void); +void tcp_destroy(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); @@ -596,6 +601,9 @@ void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); +#ifdef VIMAGE +void tcp_tw_destroy(void); +#endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); @@ -616,6 +624,7 @@ * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); +void tcp_hc_destroy(void); void 
tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); u_long tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, u_long); Index: sys/netinet/udp_usrreq.c =========================================================================== --- sys/netinet/udp_usrreq.c 2009/02/22 13:41:20 #156 +++ sys/netinet/udp_usrreq.c 2009/02/22 13:41:20 @@ -152,11 +152,16 @@ static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); +#ifdef VIMAGE_GLOBALS +static struct uma_zone *udp_ipi_zone; +#endif + static void udp_zone_change(void *tag) { + INIT_VNET_INET(curvnet); /* XXX */ - uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); + uma_zone_set_max(V_udp_ipi_zone, maxsockets); } static int @@ -176,6 +181,13 @@ V_udp_blackhole = 0; + V_udp_ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, + NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_udp_ipi_zone, maxsockets); + EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, + EVENTHANDLER_PRI_ANY); + V_udbinfo.ipi_vnet = curvnet; + INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); LIST_INIT(&V_udb); V_udbinfo.ipi_listhead = &V_udb; @@ -183,12 +195,22 @@ &V_udbinfo.ipi_hashmask); V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, &V_udbinfo.ipi_porthashmask); - V_udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, - NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); - EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, - EVENTHANDLER_PRI_ANY); + V_udbinfo.ipi_zone = V_udp_ipi_zone; +} + +#ifdef VIMAGE +void +udp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + hashdestroy(V_udbinfo.ipi_hashbase, M_PCB, + V_udbinfo.ipi_hashmask); + hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB, + V_udbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_udbinfo); } +#endif /* * Subroutine of udp_input(), which appends the provided mbuf chain to 
the Index: sys/netinet/udp_var.h =========================================================================== --- sys/netinet/udp_var.h 2009/02/22 13:41:20 #13 +++ sys/netinet/udp_var.h 2009/02/22 13:41:20 @@ -107,6 +107,9 @@ void udp_ctlinput(int, struct sockaddr *, void *); void udp_init(void); +#ifdef VIMAGE +void udp_destroy(void); +#endif void udp_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); int udp_shutdown(struct socket *so); Index: sys/netinet/vinet.h =========================================================================== --- sys/netinet/vinet.h 2009/02/22 13:41:20 #9 +++ sys/netinet/vinet.h 2009/02/22 13:41:20 @@ -86,6 +86,13 @@ struct tcp_hostcache _tcp_hostcache; struct callout _tcp_hc_callout; + struct uma_zone *_tcp_ipi_zone; + struct uma_zone *_tcp_reass_zone; + struct uma_zone *_tcpcb_zone; + struct uma_zone *_tcptw_zone; + struct uma_zone *_sack_hole_zone; + struct uma_zone *_udp_ipi_zone; + struct tcp_syncache _tcp_syncache; int _tcp_syncookies; int _tcp_syncookiesonly; @@ -149,6 +156,7 @@ struct inpcbhead _ripcb; struct inpcbinfo _ripcbinfo; + struct uma_zone *_ripcb_zone; struct socket *_ip_mrouter; struct socket *_ip_rsvpd; @@ -277,6 +285,7 @@ #define V_reply_src VNET_INET(reply_src) #define V_ripcb VNET_INET(ripcb) #define V_ripcbinfo VNET_INET(ripcbinfo) +#define V_ripcb_zone VNET_INET(ripcb_zone) #define V_router_info_head VNET_INET(router_info_head) #define V_rsvp_on VNET_INET(rsvp_on) #define V_rtq_minreallyold VNET_INET(rtq_minreallyold) @@ -284,6 +293,7 @@ #define V_rtq_timeout VNET_INET(rtq_timeout) #define V_rtq_timer VNET_INET(rtq_timer) #define V_rtq_toomany VNET_INET(rtq_toomany) +#define V_sack_hole_zone VNET_INET(sack_hole_zone) #define V_sameprefixcarponly VNET_INET(sameprefixcarponly) #define V_ss_fltsz VNET_INET(ss_fltsz) #define V_ss_fltsz_local VNET_INET(ss_fltsz_local) @@ -315,6 +325,7 @@ #define V_tcp_inflight_rttthresh VNET_INET(tcp_inflight_rttthresh) #define V_tcp_inflight_stab 
VNET_INET(tcp_inflight_stab) #define V_tcp_insecure_rst VNET_INET(tcp_insecure_rst) +#define V_tcp_ipi_zone VNET_INET(tcp_ipi_zone) #define V_tcp_isn_reseed_interval VNET_INET(tcp_isn_reseed_interval) #define V_tcp_minmss VNET_INET(tcp_minmss) #define V_tcp_mssdflt VNET_INET(tcp_mssdflt) @@ -322,6 +333,7 @@ #define V_tcp_reass_maxseg VNET_INET(tcp_reass_maxseg) #define V_tcp_reass_overflows VNET_INET(tcp_reass_overflows) #define V_tcp_reass_qsize VNET_INET(tcp_reass_qsize) +#define V_tcp_reass_zone VNET_INET(tcp_reass_zone) #define V_tcp_sack_globalholes VNET_INET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET_INET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET_INET(tcp_sack_maxholes) @@ -330,11 +342,14 @@ #define V_tcp_syncookies VNET_INET(tcp_syncookies) #define V_tcp_syncookiesonly VNET_INET(tcp_syncookiesonly) #define V_tcp_v6mssdflt VNET_INET(tcp_v6mssdflt) +#define V_tcpcb_zone VNET_INET(tcpcb_zone) +#define V_tcptw_zone VNET_INET(tcptw_zone) #define V_tcpstat VNET_INET(tcpstat) #define V_twq_2msl VNET_INET(twq_2msl) #define V_udb VNET_INET(udb) #define V_udbinfo VNET_INET(udbinfo) #define V_udp_blackhole VNET_INET(udp_blackhole) +#define V_udp_ipi_zone VNET_INET(udp_ipi_zone) #define V_udpstat VNET_INET(udpstat) #define V_useloopback VNET_INET(useloopback) Index: sys/netinet6/frag6.c =========================================================================== --- sys/netinet6/frag6.c 2009/02/22 13:41:20 #36 +++ sys/netinet6/frag6.c 2009/02/22 13:41:20 @@ -109,14 +109,17 @@ { INIT_VNET_INET6(curvnet); + V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; + + if (!IS_DEFAULT_VNET(curvnet)) + return; + EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); IP6Q_LOCK_INIT(); - - V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; } /* Index: sys/netinet6/in6_ifattach.c =========================================================================== 
--- sys/netinet6/in6_ifattach.c 2009/02/22 13:41:20 #50 +++ sys/netinet6/in6_ifattach.c 2009/02/22 13:41:20 @@ -870,8 +870,9 @@ } void -in6_tmpaddrtimer(void *ignored_arg) +in6_tmpaddrtimer(void *arg) { + CURVNET_SET((struct vnet *) arg); INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct nd_ifinfo *ndi; @@ -880,7 +881,7 @@ callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); + V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, arg); bzero(nullbuf, sizeof(nullbuf)); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; @@ -896,6 +897,7 @@ } } + CURVNET_RESTORE(); } static void Index: sys/netinet6/in6_proto.c =========================================================================== --- sys/netinet6/in6_proto.c 2009/02/22 13:41:20 #42 +++ sys/netinet6/in6_proto.c 2009/02/22 13:41:20 @@ -147,6 +147,9 @@ .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, +#ifdef VIMAGE + .pr_destroy = ip6_destroy, +#endif .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, @@ -347,6 +350,9 @@ }; extern int in6_inithead(void **, int); +#ifdef VIMAGE +extern int in6_detachhead(void **, int); +#endif struct domain inet6domain = { .dom_family = AF_INET6, @@ -442,6 +448,9 @@ sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); +#ifdef VIMAGE + SYSCTL_RESOLVE_V_ARG1(); +#endif int error = 0; int old; @@ -462,6 +471,9 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { INIT_VNET_INET6(curvnet); +#ifdef VIMAGE + SYSCTL_RESOLVE_V_ARG1(); +#endif int error = 0; int old; Index: sys/netinet6/in6_rmx.c =========================================================================== --- sys/netinet6/in6_rmx.c 2009/02/22 13:41:20 #31 +++ sys/netinet6/in6_rmx.c 2009/02/22 13:41:20 @@ -110,6 +110,9 @@ #include extern int in6_inithead(void **head, int off); +#ifdef VIMAGE +extern int in6_detachhead(void **head, int off); +#endif #define RTPRF_OURS 
RTF_PROTO3 /* set on routes we manage */ @@ -287,7 +290,7 @@ CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); - struct radix_node_head *rnh = rock; + struct radix_node_head *rnh = V_rt_tables[0][AF_INET6]; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; @@ -375,7 +378,7 @@ CURVNET_SET_QUIET((struct vnet *) rock); INIT_VNET_NET((struct vnet *) rock); INIT_VNET_INET6((struct vnet *) rock); - struct radix_node_head *rnh = rock; + struct radix_node_head *rnh = V_rt_tables[0][AF_INET6]; struct mtuex_arg arg; struct timeval atv; @@ -443,8 +446,20 @@ rnh->rnh_addaddr = in6_addroute; rnh->rnh_matchaddr = in6_matroute; callout_init(&V_rtq_timer6, CALLOUT_MPSAFE); - in6_rtqtimo(rnh); /* kick off timeout first time */ callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE); - in6_mtutimo(rnh); /* kick off timeout first time */ + in6_rtqtimo(curvnet); /* kick off timeout first time */ + in6_mtutimo(curvnet); /* kick off timeout first time */ + return 1; +} + +#ifdef VIMAGE +int +in6_detachhead(void **head, int off) +{ + INIT_VNET_INET6(curvnet); + + callout_drain(&V_rtq_timer6); + callout_drain(&V_rtq_mtutimer); return 1; } +#endif Index: sys/netinet6/in6_src.c =========================================================================== --- sys/netinet6/in6_src.c 2009/02/22 13:41:20 #63 +++ sys/netinet6/in6_src.c 2009/02/22 13:41:20 @@ -920,8 +920,6 @@ void addrsel_policy_init(void) { - ADDRSEL_LOCK_INIT(); - ADDRSEL_SXLOCK_INIT(); INIT_VNET_INET6(curvnet); V_ip6_prefer_tempaddr = 0; @@ -931,6 +929,12 @@ /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; + + if (!IS_DEFAULT_VNET(curvnet)) + return; + + ADDRSEL_LOCK_INIT(); + ADDRSEL_SXLOCK_INIT(); } static struct in6_addrpolicy * Index: sys/netinet6/ip6_input.c =========================================================================== --- 
sys/netinet6/ip6_input.c 2009/02/22 13:41:20 #87 +++ sys/netinet6/ip6_input.c 2009/02/22 13:41:20 @@ -154,6 +154,20 @@ static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif +#ifndef VIMAGE_GLOBALS +static void vnet_inet6_register(void); + +VNET_MOD_DECLARE(INET6, inet6, NULL, NULL, INET, NULL) + +static void +vnet_inet6_register(void) +{ + vnet_mod_register(&vnet_inet6_modinfo); +} + +SYSINIT(inet6, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet6_register, 0); +#endif + /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. @@ -233,6 +247,17 @@ /* 40 1K datagrams */ V_dad_init = 0; + scope6_init(); + addrsel_policy_init(); + nd6_init(); + frag6_init(); + + V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; + + /* Skip global initialization stuff for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); @@ -267,12 +292,18 @@ ip6intrq.ifq_maxlen = V_ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0); - scope6_init(); - addrsel_policy_init(); - nd6_init(); - frag6_init(); - V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; +} + +#ifdef VIMAGE +void +ip6_destroy() +{ + INIT_VNET_INET6(curvnet); + + nd6_destroy(); + callout_drain(&V_in6_tmpaddrtimer_ch); } +#endif static void ip6_init2(void *dummy) @@ -281,14 +312,14 @@ /* nd6_timer_init */ callout_init(&V_nd6_timer_ch, 0); - callout_reset(&V_nd6_timer_ch, hz, nd6_timer, NULL); + callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); /* timer for regeneranation of temporary addresses randomize ID */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, - in6_tmpaddrtimer, NULL); + 
in6_tmpaddrtimer, curvnet); } /* cheat */ @@ -342,7 +373,7 @@ #define M2MMAX (sizeof(V_ip6stat.ip6s_m2m)/sizeof(V_ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { - V_ip6stat.ip6s_m2m[V_loif[0].if_index]++; /* XXX */ + V_ip6stat.ip6s_m2m[V_loif->if_index]++; } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) V_ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else Index: sys/netinet6/ip6_mroute.c =========================================================================== --- sys/netinet6/ip6_mroute.c 2009/02/22 13:41:20 #46 +++ sys/netinet6/ip6_mroute.c 2009/02/22 13:41:20 @@ -185,7 +185,7 @@ #ifdef MRT6DEBUG #ifdef VIMAGE_GLOBALS -static u_int mrt6debug = 0; /* debug level */ +static u_int mrt6debug; /* debug level */ #endif #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 @@ -492,7 +492,11 @@ { INIT_VNET_INET6(curvnet); + V_ip6_mrouter_ver = 0; + #ifdef MRT6DEBUG + V_mrt6debug = 0; + if (V_mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n", Index: sys/netinet6/ip6_var.h =========================================================================== --- sys/netinet6/ip6_var.h 2009/02/22 13:41:20 #44 +++ sys/netinet6/ip6_var.h 2009/02/22 13:41:20 @@ -327,6 +327,9 @@ struct in6_ifaddr; void ip6_init __P((void)); +#ifdef VIMAGE +void ip6_destroy __P((void)); +#endif void ip6_input __P((struct mbuf *)); struct in6_ifaddr *ip6_getdstifaddr __P((struct mbuf *)); void ip6_freepcbopts __P((struct ip6_pktopts *)); Index: sys/netinet6/ip6protosw.h =========================================================================== --- sys/netinet6/ip6protosw.h 2009/02/22 13:41:20 #10 +++ sys/netinet6/ip6protosw.h 2009/02/22 13:41:20 @@ -129,6 +129,8 @@ /* utility hooks */ void (*pr_init) /* initialization hook */ __P((void)); + void (*pr_destroy) /* cleanup hook */ + __P((void)); void (*pr_fasttimo) /* fast timeout (200ms) */ __P((void)); Index: sys/netinet6/mld6.c =========================================================================== --- 
sys/netinet6/mld6.c 2009/02/22 13:41:20 #33 +++ sys/netinet6/mld6.c 2009/02/22 13:41:20 @@ -175,6 +175,7 @@ callout_stop(in6m->in6m_timer_ch); + CURVNET_SET(in6m->in6m_ifp->if_vnet); switch (in6m->in6m_state) { case MLD_REPORTPENDING: mld6_start_listening(in6m); @@ -183,6 +184,7 @@ mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); break; } + CURVNET_RESTORE(); splx(s); } Index: sys/netinet6/nd6.c =========================================================================== --- sys/netinet6/nd6.c 2009/02/22 13:41:20 #92 +++ sys/netinet6/nd6.c 2009/02/22 13:41:20 @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -134,14 +135,8 @@ nd6_init(void) { INIT_VNET_INET6(curvnet); - static int nd6_init_done = 0; int i; - if (nd6_init_done) { - log(LOG_NOTICE, "nd6_init called more than once(ignored)\n"); - return; - } - V_nd6_prune = 1; /* walk list every 1 seconds */ V_nd6_delay = 5; /* delay first probe time 5 second */ V_nd6_umaxtries = 3; /* maximum unicast query */ @@ -180,6 +175,8 @@ V_ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; V_ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; + V_ip6_desync_factor = 0; + all1_sa.sin6_family = AF_INET6; all1_sa.sin6_len = sizeof(struct sockaddr_in6); for (i = 0; i < sizeof(all1_sa.sin6_addr); i++) @@ -190,11 +187,19 @@ /* start timer */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, - nd6_slowtimo, NULL); + nd6_slowtimo, curvnet); +} - nd6_init_done = 1; +#ifdef VIMAGE +void +nd6_destroy() +{ + INIT_VNET_INET6(curvnet); + callout_drain(&V_nd6_slowtimo_ch); + callout_drain(&V_nd6_timer_ch); } +#endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) @@ -280,7 +285,6 @@ if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ -#undef MIN } void @@ -600,7 +604,7 @@ struct in6_addrlifetime *lt6; callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, - nd6_timer, NULL); + nd6_timer, arg); /* expire default router list */ s = 
splnet(); @@ -868,7 +872,6 @@ struct llentry * nd6_lookup(struct in6_addr *addr6, int flags, struct ifnet *ifp) { - INIT_VNET_INET6(curvnet); struct sockaddr_in6 sin6; struct llentry *ln; int llflags = 0; @@ -1150,7 +1153,6 @@ LLE_WUNLOCK(ln); } - int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { @@ -1665,7 +1667,7 @@ struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, - nd6_slowtimo, NULL); + nd6_slowtimo, arg); IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { Index: sys/netinet6/nd6.h =========================================================================== --- sys/netinet6/nd6.h 2009/02/22 13:41:20 #21 +++ sys/netinet6/nd6.h 2009/02/22 13:41:20 @@ -371,6 +371,9 @@ /* XXX: need nd6_var.h?? */ /* nd6.c */ void nd6_init __P((void)); +#ifdef VIMAGE +void nd6_destroy __P((void)); +#endif struct nd_ifinfo *nd6_ifattach __P((struct ifnet *)); void nd6_ifdetach __P((struct nd_ifinfo *)); int nd6_is_addr_neighbor __P((struct sockaddr_in6 *, struct ifnet *)); Index: sys/netinet6/nd6_nbr.c =========================================================================== --- sys/netinet6/nd6_nbr.c 2009/02/22 13:41:20 #52 +++ sys/netinet6/nd6_nbr.c 2009/02/22 13:41:20 @@ -86,7 +86,7 @@ static struct dadq *nd6_dad_find(struct ifaddr *); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_stoptimer(struct dadq *); -static void nd6_dad_timer(struct ifaddr *); +static void nd6_dad_timer(struct dadq *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); static void nd6_dad_na_input(struct ifaddr *); @@ -1105,7 +1105,6 @@ } } -TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; @@ -1115,10 +1114,11 @@ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; + struct vnet *dad_vnet; }; #ifdef VIMAGE_GLOBALS -static struct dadq_head dadq; +static TAILQ_HEAD(, dadq) dadq; int 
dad_init; #endif @@ -1140,7 +1140,7 @@ { callout_reset(&dp->dad_timer_ch, ticks, - (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); + (void (*)(void *))nd6_dad_timer, (void *)dp); } static void @@ -1208,6 +1208,9 @@ } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); +#ifdef VIMAGE + dp->dad_vnet = curvnet; +#endif TAILQ_INSERT_TAIL(&V_dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), @@ -1259,13 +1262,13 @@ } static void -nd6_dad_timer(struct ifaddr *ifa) +nd6_dad_timer(struct dadq *dp) { CURVNET_SET(dp->dad_vnet); INIT_VNET_INET6(curvnet); int s; + struct ifaddr *ifa = dp->dad_ifa; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; - struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; s = splnet(); /* XXX */ @@ -1275,11 +1278,6 @@ log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } - dp = nd6_dad_find(ifa); - if (dp == NULL) { - log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); - goto done; - } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", Index: sys/netinet6/nd6_rtr.c =========================================================================== --- sys/netinet6/nd6_rtr.c 2009/02/22 13:41:20 #52 +++ sys/netinet6/nd6_rtr.c 2009/02/22 13:41:20 @@ -1542,6 +1542,7 @@ int nd6_prefix_onlink(struct nd_prefix *pr) { + INIT_VNET_NET(curvnet); INIT_VNET_INET6(curvnet); struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; Index: sys/netinet6/scope6.c =========================================================================== --- sys/netinet6/scope6.c 2009/02/22 13:41:20 #21 +++ sys/netinet6/scope6.c 2009/02/22 13:41:20 @@ -81,8 +81,13 @@ #else V_ip6_use_defzone = 0; #endif + + bzero(&V_sid_default, sizeof(V_sid_default)); + + if (!IS_DEFAULT_VNET(curvnet)) + return; + SCOPE6_LOCK_INIT(); - bzero(&V_sid_default, sizeof(V_sid_default)); } struct scope6_id * Index: sys/netinet6/sctp6_usrreq.c 
=========================================================================== --- sys/netinet6/sctp6_usrreq.c 2009/02/22 13:41:20 #48 +++ sys/netinet6/sctp6_usrreq.c 2009/02/22 13:41:20 @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -803,6 +804,7 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { + INIT_VNET_INET6(curvnet); struct sctp_inpcb *inp; struct inpcb *in_inp; struct in6pcb *inp6; @@ -915,6 +917,7 @@ static int sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) { + INIT_VNET_INET6(curvnet); uint32_t vrf_id; int error = 0; struct sctp_inpcb *inp; Index: sys/netipsec/ipsec.c =========================================================================== --- sys/netipsec/ipsec.c 2009/02/22 13:41:20 #46 +++ sys/netipsec/ipsec.c 2009/02/22 13:41:20 @@ -103,6 +103,11 @@ #endif #endif +static int vnet_ipsec_iattach(const void *); +#ifdef VIMAGE +static int vnet_ipsec_idetach(const void *); +#endif + #ifdef VIMAGE_GLOBALS /* NB: name changed so netstat doesn't use it. */ struct ipsecstat ipsec4stat; @@ -228,6 +233,9 @@ "IPsec IPv6 statistics."); #endif /* INET6 */ +VNET_MOD_DECLARE(IPSEC, ipsec, vnet_ipsec_iattach, vnet_ipsec_idetach, + INET, NULL) + static int ipsec_setspidx_inpcb __P((struct mbuf *, struct inpcb *)); static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); @@ -1758,9 +1766,34 @@ ipsec_attach(void) { +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_ipsec_modinfo); +#else + vnet_ipsec_iattach(NULL); +#endif +} + +static int +vnet_ipsec_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + SECPOLICY_LOCK_INIT(&V_ip4_def_policy); V_ip4_def_policy.refcnt = 1; /* NB: disallow free. */ + + return (0); +} + +/* XXX finish this! 
*/ +#ifdef VIMAGE +static int +vnet_ipsec_idetach(unused) + const void *unused; +{ + return (0); } +#endif SYSINIT(ipsec, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, ipsec_attach, NULL); Index: sys/netipsec/key.c =========================================================================== --- sys/netipsec/key.c 2009/02/22 13:41:20 #40 +++ sys/netipsec/key.c 2009/02/22 13:41:20 @@ -7190,12 +7190,6 @@ V_ipsec_esp_auth = 0; V_ipsec_ah_keymin = 128; - SPTREE_LOCK_INIT(); - REGTREE_LOCK_INIT(); - SAHTREE_LOCK_INIT(); - ACQ_LOCK_INIT(); - SPACQ_LOCK_INIT(); - for (i = 0; i < IPSEC_DIR_MAX; i++) LIST_INIT(&V_sptree[i]); @@ -7211,6 +7205,15 @@ V_ip4_def_policy.policy = IPSEC_POLICY_NONE; V_ip4_def_policy.refcnt++; /*never reclaim this*/ + if (!IS_DEFAULT_VNET(curvnet)) + return; + + SPTREE_LOCK_INIT(); + REGTREE_LOCK_INIT(); + SAHTREE_LOCK_INIT(); + ACQ_LOCK_INIT(); + SPACQ_LOCK_INIT(); + #ifndef IPSEC_DEBUG2 timeout((void *)key_timehandler, (void *)0, hz); #endif /*IPSEC_DEBUG2*/ @@ -7219,9 +7222,82 @@ keystat.getspi_count = 1; printf("IPsec: Initialized Security Association Processing.\n"); +} + +#ifdef VIMAGE +/* + * Domain destroy hook (VIMAGE only): unlink and free every entry on the + * SP, SAH, registration, ACQ and SPACQ lists of the current vnet. Each + * walk saves the next pointer first so the entry can be freed in place. + */ +void key_destroy(void) +{ + INIT_VNET_IPSEC(curvnet); + struct secpolicy *sp, *nextsp; + struct secacq *acq, *nextacq; + struct secspacq *spacq, *nextspacq; + struct secashead *sah, *nextsah; + struct secreg *reg, *nextreg; + int i; + + SPTREE_LOCK(); + for (i = 0; i < IPSEC_DIR_MAX; i++) { + for (sp = LIST_FIRST(&V_sptree[i]); + sp != NULL; sp = nextsp) { + nextsp = LIST_NEXT(sp, chain); + if (__LIST_CHAINED(sp)) { + LIST_REMOVE(sp, chain); + free(sp, M_IPSEC_SP); + } + } + } + SPTREE_UNLOCK(); + + SAHTREE_LOCK(); + for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { + nextsah = LIST_NEXT(sah, chain); + if (__LIST_CHAINED(sah)) { + LIST_REMOVE(sah, chain); + free(sah, M_IPSEC_SAH); + } + } + SAHTREE_UNLOCK(); - return; + REGTREE_LOCK(); + for (i = 0; i <= SADB_SATYPE_MAX; i++) { + for (reg = LIST_FIRST(&V_regtree[i]); + reg != NULL; reg = nextreg) { + nextreg = LIST_NEXT(reg, chain); + if (__LIST_CHAINED(reg)) { + LIST_REMOVE(reg, chain); + free(reg, M_IPSEC_SAR);
+ } + } + } + REGTREE_UNLOCK(); + + ACQ_LOCK(); + for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { + nextacq = LIST_NEXT(acq, chain); + if (__LIST_CHAINED(acq)) { + LIST_REMOVE(acq, chain); + free(acq, M_IPSEC_SAQ); + } + } + ACQ_UNLOCK(); + + SPACQ_LOCK(); + for (spacq = LIST_FIRST(&V_spacqtree); + spacq != NULL; spacq = nextspacq) { + nextspacq = LIST_NEXT(spacq, chain); + if (__LIST_CHAINED(spacq)) { + LIST_REMOVE(spacq, chain); + free(spacq, M_IPSEC_SAQ); + } + } + SPACQ_UNLOCK(); } +#endif /* * XXX: maybe This function is called after INBOUND IPsec processing. Index: sys/netipsec/key.h =========================================================================== --- sys/netipsec/key.h 2009/02/22 13:41:20 #4 +++ sys/netipsec/key.h 2009/02/22 13:41:20 @@ -96,6 +96,9 @@ extern void key_freereg __P((struct socket *)); extern int key_parse __P((struct mbuf *, struct socket *)); extern void key_init __P((void)); +#ifdef VIMAGE +extern void key_destroy(void); +#endif extern void key_sa_recordxfer __P((struct secasvar *, struct mbuf *)); extern void key_sa_routechange __P((struct sockaddr *)); extern void key_sa_stir_iv __P((struct secasvar *)); Index: sys/netipsec/keysock.c =========================================================================== --- sys/netipsec/keysock.c 2009/02/22 13:41:20 #28 +++ sys/netipsec/keysock.c 2009/02/22 13:41:20 @@ -578,6 +578,9 @@ .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init0, +#ifdef VIMAGE + .dom_destroy = key_destroy, +#endif .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[sizeof(keysw)/sizeof(keysw[0])] }; Index: sys/netipsec/xform_ah.c =========================================================================== --- sys/netipsec/xform_ah.c 2009/02/22 13:41:20 #20 +++ sys/netipsec/xform_ah.c 2009/02/22 13:41:20 @@ -102,6 +102,10 @@ SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ah, IPSECCTL_STATS, stats, CTLFLAG_RD, ahstat, ahstat, ""); +static int ah_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(AH,
ah, ah_iattach, NULL, IPSEC) + static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */ static int ah_input_cb(struct cryptop*); @@ -1216,13 +1220,27 @@ ah_init, ah_zeroize, ah_input, ah_output, }; +static int +ah_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + V_ah_enable = 1; /* control flow of packets with AH */ + V_ah_cleartos = 1; /* clear ip_tos when doing AH calc */ + + return 0; +} + static void ah_attach(void) { - V_ah_enable = 1; /* control flow of packets with AH */ - V_ah_cleartos = 1; /* clear ip_tos when doing AH calc */ - +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_ah_modinfo); +#else + ah_iattach(NULL); +#endif xform_register(&ah_xformsw); } SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL); Index: sys/netipsec/xform_esp.c =========================================================================== --- sys/netipsec/xform_esp.c 2009/02/22 13:41:20 #24 +++ sys/netipsec/xform_esp.c 2009/02/22 13:41:20 @@ -91,6 +91,10 @@ static int esp_input_cb(struct cryptop *op); static int esp_output_cb(struct cryptop *crp); +static int esp_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(ESP, esp, esp_iattach, NULL, IPSEC) + /* * NB: this is public for use by the PF_KEY support. * NB: if you add support here; be sure to add code to esp_attach below! 
@@ -987,9 +991,12 @@ esp_output }; -static void -esp_attach(void) +static int +esp_iattach(unused) + const void *unused; { + INIT_VNET_IPSEC(curvnet); + #define MAXIV(xform) \ if (xform.blocksize > V_esp_max_ivlen) \ V_esp_max_ivlen = xform.blocksize \ @@ -1006,7 +1013,20 @@ MAXIV(enc_xform_null); /* SADB_EALG_NULL */ MAXIV(enc_xform_camellia); /* SADB_X_EALG_CAMELLIACBC */ +#undef MAXIV + + return 0; +} + +static void +esp_attach(void) +{ + +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_esp_modinfo); +#else + esp_iattach(NULL); +#endif xform_register(&esp_xformsw); -#undef MAXIV } SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL); Index: sys/netipsec/xform_ipcomp.c =========================================================================== --- sys/netipsec/xform_ipcomp.c 2009/02/22 13:41:20 #15 +++ sys/netipsec/xform_ipcomp.c 2009/02/22 13:41:20 @@ -78,6 +78,10 @@ SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipcomp, IPSECCTL_STATS, stats, CTLFLAG_RD, ipcompstat, ipcompstat, ""); +static int ipcomp_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(IPCOMP, ipcomp, ipcomp_iattach, NULL, IPSEC) + static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); @@ -596,11 +600,26 @@ ipcomp_output }; +static int +ipcomp_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + V_ipcomp_enable = 0; + + return 0; +} + static void ipcomp_attach(void) { - V_ipcomp_enable = 0; +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_ipcomp_modinfo); +#else + ipcomp_iattach(NULL); +#endif xform_register(&ipcomp_xformsw); } SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); Index: sys/netipsec/xform_ipip.c =========================================================================== --- sys/netipsec/xform_ipip.c 2009/02/22 13:41:20 #23 +++ sys/netipsec/xform_ipip.c 2009/02/22 13:41:20 @@ -108,6 +108,10 @@ static void _ipip_input(struct mbuf *m, int iphlen, struct 
ifnet *gifp); +static int ipip_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(IPIP, ipip, ipip_iattach, NULL, IPSEC) + #ifdef INET6 /* * Really only a wrapper for ipip_input(), for use with IPv6. @@ -696,12 +700,17 @@ return ((m->m_flags & M_IPSEC) != 0 ? 1 : 0); } -static void -ipe4_attach(void) +static int +ipip_iattach(unused) + const void *unused; { + INIT_VNET_IPSEC(curvnet); V_ipip_allow = 0; + if (!IS_DEFAULT_VNET(curvnet)) + return 0; + xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ @@ -711,6 +720,19 @@ (void) encap_attach_func(AF_INET6, -1, ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); #endif + + return 0; +} + +static void +ipe4_attach(void) +{ + +#ifndef VIMAGE_GLOBALS + vnet_mod_register(&vnet_ipip_modinfo); +#else + ipip_iattach(NULL); +#endif } SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL); #endif /* IPSEC */ Index: sys/netipsec/xform_tcp.c =========================================================================== --- sys/netipsec/xform_tcp.c 2009/02/22 13:41:20 #5 +++ sys/netipsec/xform_tcp.c 2009/02/22 13:41:20 @@ -82,6 +82,7 @@ static int tcpsignature_init(struct secasvar *sav, struct xformsw *xsp) { + INIT_VNET_IPSEC(curvnet); int keylen; if (sav->spi != htonl(TCP_SIG_SPI)) { Index: sys/nfsclient/bootp_subr.c =========================================================================== --- sys/nfsclient/bootp_subr.c 2009/02/22 13:41:20 #48 +++ sys/nfsclient/bootp_subr.c 2009/02/22 13:41:20 @@ -395,9 +395,11 @@ printf("\n"); } +/* XXX we are only going to look at interfaces in the base vimage */ void bootpboot_p_iflist(void) { + INIT_VNET_NET(basevnet); struct ifnet *ifp; struct ifaddr *ifa; @@ -1607,6 +1609,7 @@ void bootpc_init(void) { + INIT_VNET_NET(basevnet); /* XXX only look at base vnet interfaces?
*/ struct bootpc_ifcontext *ifctx, *nctx; /* Interface BOOTP contexts */ struct bootpc_globalcontext *gctx; /* Global BOOTP context */ struct ifnet *ifp; Index: sys/nfsclient/nfs_socket.c =========================================================================== --- sys/nfsclient/nfs_socket.c 2009/02/22 13:41:20 #99 +++ sys/nfsclient/nfs_socket.c 2009/02/22 13:41:20 @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1513,6 +1514,7 @@ mtx_unlock(&nmp->nm_mtx); continue; } + CURVNET_SET(so->so_vnet); /* * If there is enough space and the window allows.. * Resend it @@ -1578,6 +1580,7 @@ mtx_unlock(&rep->r_mtx); mtx_unlock(&nmp->nm_mtx); } + CURVNET_RESTORE(); } mtx_unlock(&nfs_reqq_mtx); callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL); Index: sys/nfsclient/nfs_vfsops.c =========================================================================== --- sys/nfsclient/nfs_vfsops.c 2009/02/22 13:41:20 #120 +++ sys/nfsclient/nfs_vfsops.c 2009/02/22 13:41:20 @@ -423,14 +423,17 @@ char buf[128]; char *cp; + CURVNET_SET(TD_TO_VNET(td)); #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif - if (nfs_diskless_valid == 0) + if (nfs_diskless_valid == 0) { + CURVNET_RESTORE(); return (-1); + } if (nfs_diskless_valid == 1) nfs_convert_diskless(); @@ -514,6 +517,7 @@ nd->root_args.hostname = buf; if ((error = nfs_mountdiskless(buf, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { + CURVNET_RESTORE(); return (error); } @@ -530,6 +534,8 @@ break; mtx_unlock(&hostname_mtx); inittodr(ntohl(nd->root_time)); + + CURVNET_RESTORE(); return (0); } Index: sys/nfsclient/nfs_vnops.c =========================================================================== --- sys/nfsclient/nfs_vnops.c 2009/02/22 13:41:20 #129 +++ sys/nfsclient/nfs_vnops.c 2009/02/22 13:41:20 @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include #include @@ 
-1444,7 +1444,11 @@ if (v3) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { +#ifdef NFS_LEGACYRPC CURVNET_SET(VFSTONFS(dvp->v_mount)->nm_so->so_vnet); +#else + CURVNET_SET(VFSTONFS(dvp->v_mount)->nm_rpcclnt.rc_so->so_vnet); +#endif *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET Index: sys/nlm/nlm_advlock.c =========================================================================== --- sys/nlm/nlm_advlock.c 2009/02/22 13:41:20 #3 +++ sys/nlm/nlm_advlock.c 2009/02/22 13:41:20 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1222,12 +1223,12 @@ } mtx_lock(&hostname_mtx); - snprintf(oh_space, 32, "%d@%s", svid, hostname); + snprintf(oh_space, 32, "%d@%s", svid, G_hostname); mtx_unlock(&hostname_mtx); oh_len = strlen(oh_space); memset(lock, 0, sizeof(*lock)); - lock->caller_name = hostname; + lock->caller_name = G_hostname; lock->fh.n_len = fhlen; lock->fh.n_bytes = fh; lock->oh.n_len = oh_len; Index: sys/rpc/rpc.h =========================================================================== --- sys/rpc/rpc.h 2009/02/22 13:41:20 #1 +++ sys/rpc/rpc.h 2009/02/22 13:41:20 @@ -42,6 +42,7 @@ #ifndef _RPC_RPC_H #define _RPC_RPC_H +#include #include /* some typedefs */ #include #include Index: sys/rpc/rpc_generic.c =========================================================================== --- sys/rpc/rpc_generic.c 2009/02/22 13:41:20 #4 +++ sys/rpc/rpc_generic.c 2009/02/22 13:41:20 @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -184,9 +185,12 @@ struct sockopt opt; int error; + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); - if (error) + if (error) { + CURVNET_RESTORE(); return 0; + } sip->si_alen = sa->sa_len; family = sa->sa_family; @@ -199,6 +203,7 @@ opt.sopt_valsize = sizeof type; opt.sopt_td = NULL; error = sogetopt(so, &opt); + CURVNET_RESTORE(); if (error) return 0; @@ -695,7 +700,9 @@ struct 
sockaddr *sa; int error, bound; + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); + CURVNET_RESTORE(); if (error) return (0); Index: sys/rpc/svc_dg.c =========================================================================== --- sys/rpc/svc_dg.c 2009/02/22 13:41:20 #3 +++ sys/rpc/svc_dg.c 2009/02/22 13:41:20 @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -123,7 +124,9 @@ xprt->xp_p2 = NULL; xprt->xp_ops = &svc_dg_ops; + CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); + CURVNET_RESTORE(); if (error) goto freedata; Index: sys/rpc/svc_generic.c =========================================================================== --- sys/rpc/svc_generic.c 2009/02/22 13:41:20 #3 +++ sys/rpc/svc_generic.c 2009/02/22 13:41:20 @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -235,6 +236,7 @@ /* * If the socket is unbound, try to bind it. */ + CURVNET_SET(so->so_vnet); if (madeso || !__rpc_sockisbound(so)) { if (bindaddr == NULL) { if (bindresvport(so, NULL)) { @@ -304,9 +306,11 @@ if (nconf) { xprt->xp_netid = strdup(nconf->nc_netid, M_RPC); } + CURVNET_RESTORE(); return (xprt); freedata: + CURVNET_RESTORE(); if (madeso) (void)soclose(so); if (xprt) { Index: sys/sys/domain.h =========================================================================== --- sys/sys/domain.h 2009/02/22 13:41:20 #10 +++ sys/sys/domain.h 2009/02/22 13:41:20 @@ -48,6 +48,8 @@ char *dom_name; void (*dom_init) /* initialize domain data structures */ (void); + void (*dom_destroy) /* cleanup structures / state */ + (void); int (*dom_externalize) /* externalize access rights */ (struct mbuf *, struct mbuf **); void (*dom_dispose) /* dispose of internalized rights */ @@ -56,6 +58,8 @@ struct domain *dom_next; int (*dom_rtattach) /* initialize routing table */ (void **, int); + int (*dom_rtdetach) /* clean up routing table */ + (void **, int); int dom_rtoffset; /* an arg to rtattach, in bits */ /* 
XXX MRT. * rtoffset May be 0 if the domain supplies its own rtattach(), Index: sys/sys/kernel.h =========================================================================== --- sys/sys/kernel.h 2009/02/22 13:41:20 #54 +++ sys/sys/kernel.h 2009/02/22 13:41:20 @@ -58,8 +58,10 @@ extern struct mtx hostname_mtx; extern unsigned long hostid; extern char hostuuid[64]; +#ifndef VIMAGE extern char hostname[MAXHOSTNAMELEN]; extern char domainname[MAXHOSTNAMELEN]; +#endif extern char kernelname[MAXPATHLEN]; extern int tick; /* usec per tick (1000000 / hz) */ @@ -169,6 +171,7 @@ SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ + SI_SUB_VIMAGE_DONE = 0xef00000, /* clear curvnet*/ SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler*/ }; Index: sys/sys/mbuf.h =========================================================================== --- sys/sys/mbuf.h 2009/02/22 13:41:20 #147 +++ sys/sys/mbuf.h 2009/02/22 13:41:20 @@ -195,6 +195,7 @@ #define M_PROTO6 0x00080000 /* protocol-specific */ #define M_PROTO7 0x00100000 /* protocol-specific */ #define M_PROTO8 0x00200000 /* protocol-specific */ +#define M_REMOTE_VNET 0x00400000 /* mbuf crossed boundary between two vnets */ /* * For RELENG_{6,7} steal these flags for limited multiple routing table * support. In RELENG_8 and beyond, use just one flag and a tag. Index: sys/sys/proc.h =========================================================================== --- sys/sys/proc.h 2009/02/22 13:41:20 #353 +++ sys/sys/proc.h 2009/02/22 13:41:20 @@ -277,6 +277,8 @@ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. 
*/ + struct vnet *td_vnet; /* (*) Effective vnet */ + const char *td_vnet_lpush; /* (*) Debugging vnet push / pop */ }; struct mtx *thread_lock_block(struct thread *); Index: sys/sys/protosw.h =========================================================================== --- sys/sys/protosw.h 2009/02/22 13:41:20 #30 +++ sys/sys/protosw.h 2009/02/22 13:41:20 @@ -70,6 +70,7 @@ typedef void pr_ctlinput_t (int, struct sockaddr *, void *); typedef int pr_ctloutput_t (struct socket *, struct sockopt *); typedef void pr_init_t (void); +typedef void pr_destroy_t (void); typedef void pr_fasttimo_t (void); typedef void pr_slowtimo_t (void); typedef void pr_drain_t (void); @@ -86,6 +87,7 @@ pr_ctloutput_t *pr_ctloutput; /* control output (from above) */ /* utility hooks */ pr_init_t *pr_init; + pr_destroy_t *pr_destroy; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ Index: sys/sys/socketvar.h =========================================================================== --- sys/sys/socketvar.h 2009/02/22 13:41:20 #110 +++ sys/sys/socketvar.h 2009/02/22 13:41:20 @@ -45,6 +45,8 @@ #include #endif +struct vnet; + /* * Kernel structure per socket. * Contains send and receive buffer queues, @@ -72,6 +74,7 @@ short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ + struct vnet *so_vnet; /* network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. 
@@ -292,6 +295,7 @@ MALLOC_DECLARE(M_SONAME); #endif +extern int accf_unloadable; extern int maxsockets; extern u_long sb_max; extern struct uma_zone *socket_zone; Index: sys/sys/sockio.h =========================================================================== --- sys/sys/sockio.h 2009/02/22 13:41:20 #14 +++ sys/sys/sockio.h 2009/02/22 13:41:20 @@ -108,6 +108,10 @@ #define SIOCGPRIVATE_0 _IOWR('i', 80, struct ifreq) /* device private 0 */ #define SIOCGPRIVATE_1 _IOWR('i', 81, struct ifreq) /* device private 1 */ +#define SIOCSPVIMAGE _IOW('i', 101, struct vi_req) /* set proc vimage */ +#define SIOCGPVIMAGE _IOWR('i', 102, struct vi_req) /* get proc vimage */ +#define SIOCSIFVIMAGE _IOWR('i', 103, struct vi_req) /* set ifc vi/net */ + #define SIOCSDRVSPEC _IOW('i', 123, struct ifdrv) /* set driver-specific parameters */ #define SIOCGDRVSPEC _IOWR('i', 123, struct ifdrv) /* get driver-specific Index: sys/sys/sysctl.h =========================================================================== --- sys/sys/sysctl.h 2009/02/22 13:41:20 #72 +++ sys/sys/sysctl.h 2009/02/22 13:41:20 @@ -163,6 +163,8 @@ const char *oid_fmt; int oid_refcnt; const char *oid_descr; + short oid_v_subs; + short oid_v_mod; }; #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) @@ -440,6 +442,37 @@ #define FEATURE(name, desc) \ SYSCTL_INT(_kern_features, OID_AUTO, name, CTLFLAG_RD, 0, 1, desc) +/* + * Resolve void *arg1 in a proper virtualization container. + */ +#ifdef VIMAGE +#define SYSCTL_RESOLVE_V_ARG1() do { \ + char *cp; \ + switch (oidp->oid_v_subs) { \ + case V_GLOBAL: \ + /* do nothing - this is NOT a virtualized variable! 
*/ \ + break; \ + case V_NET: \ + cp = (char *) \ + TD_TO_VNET(curthread)->mod_data[oidp->oid_v_mod]; \ + arg1 = cp + (size_t) arg1; \ + break; \ + case V_PROCG: \ + cp = (char *) TD_TO_VPROCG(curthread); \ + arg1 = cp + (size_t) arg1; \ + break; \ + case V_CPU: \ + cp = (char *) TD_TO_VCPU(curthread); \ + arg1 = cp + (size_t) arg1; \ + break; \ + default: \ + panic("unsupported module id %d", oidp->oid_v_subs); \ + } \ +} while (0) +#else +#define SYSCTL_RESOLVE_V_ARG1() +#endif + #endif /* _KERNEL */ /* Index: sys/sys/ucred.h =========================================================================== --- sys/sys/ucred.h 2009/02/22 13:41:20 #32 +++ sys/sys/ucred.h 2009/02/22 13:41:20 @@ -35,6 +35,8 @@ #include +struct vimage; + /* * Credentials. * @@ -55,7 +57,9 @@ struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(2) */ - void *cr_pspare[3]; /* vimage 2; general use 1 */ + struct vimage *cr_vimage; /* effective vimage */ + struct vimage *cr_rvimage; /* real vimage */ + void *cr_pspare[1]; /* vimage 2; general use 1 */ #define cr_endcopy cr_label struct label *cr_label; /* MAC label */ struct auditinfo_addr cr_audit; /* Audit properties. */ Index: sys/sys/vimage.h =========================================================================== --- sys/sys/vimage.h 2009/02/22 13:41:20 #8 +++ sys/sys/vimage.h 2009/02/22 13:41:20 @@ -30,57 +30,288 @@ * $FreeBSD: src/sys/sys/vimage.h,v 1.8 2008/12/11 15:44:53 bz Exp $ */ -#ifndef _SYS_VIMAGE_H_ -#define _SYS_VIMAGE_H_ +#ifndef _SYS_VIMAGE_H_ +#define _SYS_VIMAGE_H_ + +#include +#include +#include +#include + +#ifdef INVARIANTS +#define VNET_DEBUG +#endif + +#if defined(VIMAGE) && defined(VIMAGE_GLOBALS) +#error "You cannot have both option VIMAGE and option VIMAGE_GLOBALS!" 
+#endif + +struct vimage; +struct vprocg; +struct vnet; +struct vi_req; +struct kld_sym_lookup; + +struct ifnet; /* XXX must go away */ + +#ifdef VIMAGE_GLOBALS +#define VSYM(base, sym) (sym) +#else +#ifdef VIMAGE +#define VSYM(base, sym) ((base)->_##sym) +#else +#define VSYM(base, sym) (base ## _0._ ## sym) +#endif +#endif + +#ifdef VIMAGE +#define curvnet curthread->td_vnet +#else +#define curvnet NULL +#endif + +#define VNET_SYMMAP(mod, name) \ + { #name, offsetof(struct vnet_##mod, _##name), \ + sizeof(((struct vnet_##mod *) curthread)->_##name) } + +#define VNET_MOD_NONE_VERSION 2008122301 + +#ifndef VIMAGE_GLOBALS +#ifdef VIMAGE +#define VNET_MOD_DECLARE(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson, m_symmap) \ + static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ + .vmi_id = VNET_MOD_##m_name_uc, \ + .vmi_dependson = VNET_MOD_##m_dependson, \ + .vmi_name = #m_name_lc, \ + .vmi_iattach = m_iattach, \ + .vmi_idetach = m_idetach, \ + .vmi_struct_size = \ + sizeof(struct vnet_##m_name_lc), \ + .vmi_symmap = m_symmap \ +}; +#define VNET_MOD_DECLARE_STATELESS(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson) \ + static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ + .vmi_id = VNET_MOD_##m_name_uc, \ + .vmi_dependson = VNET_MOD_##m_dependson, \ + .vmi_name = #m_name_lc, \ + .vmi_iattach = m_iattach, \ + .vmi_idetach = m_idetach \ +}; +#else /* !VIMAGE */ +#define VNET_MOD_DECLARE(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson, m_symmap) \ + static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ + .vmi_id = VNET_MOD_##m_name_uc, \ + .vmi_dependson = VNET_MOD_##m_dependson, \ + .vmi_name = #m_name_lc, \ + .vmi_iattach = m_iattach, \ + .vmi_struct_size = \ + sizeof(struct vnet_##m_name_lc), \ + .vmi_symmap = m_symmap \ +}; +#define VNET_MOD_DECLARE_STATELESS(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson) \ + static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ + 
.vmi_id = VNET_MOD_##m_name_uc, \ + .vmi_dependson = VNET_MOD_##m_dependson, \ + .vmi_name = #m_name_lc, \ + .vmi_iattach = m_iattach \ +}; +#endif +#else /* VIMAGE_GLOBALS */ +#define VNET_MOD_DECLARE(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson, m_symmap) +#define VNET_MOD_DECLARE_STATELESS(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson) +#endif -#include +typedef int vnet_attach_fn(const void *); +typedef int vnet_detach_fn(const void *); -struct kld_sym_lookup; +#ifndef VIMAGE_GLOBALS struct vnet_symmap { char *name; - void *base; + size_t offset; size_t size; }; struct vnet_modinfo { + u_int vmi_id; + u_int vmi_dependson; char *vmi_name; + vnet_attach_fn *vmi_iattach; + vnet_detach_fn *vmi_idetach; + size_t vmi_struct_size; struct vnet_symmap *vmi_symmap; }; struct vnet_modlink { TAILQ_ENTRY(vnet_modlink) vml_mod_le; const struct vnet_modinfo *vml_modinfo; + const void *vml_iarg; + const char *vml_iname; }; +#endif + +#define VNET_SYMMAP_END { NULL, 0 } + +#define basevnet thread0.td_ucred->cr_vimage->v_net +#define basevprocg thread0.td_ucred->cr_vimage->v_procg +#define basevcpu thread0.td_ucred->cr_vimage->v_cpu + +#define V_GLOBAL 0 +#define V_NET 1 +#define V_PROCG 2 +#define V_CPU 3 + +#define VNET_MOD_NONE (-1) +/* stateful modules */ +#define VNET_MOD_NET 0 +#define VNET_MOD_NETGRAPH 1 +#define VNET_MOD_INET 2 +#define VNET_MOD_INET6 3 +#define VNET_MOD_IPSEC 4 +#define VNET_MOD_IPFW 5 +#define VNET_MOD_DUMMYNET 6 +#define VNET_MOD_PF 7 +#define VNET_MOD_ALTQ 8 +#define VNET_MOD_IPX 9 +#define VNET_MOD_ATALK 10 +#define VNET_MOD_ACCF_HTTP 11 +/* stateless modules */ +#define VNET_MOD_NG_ETHER 20 +#define VNET_MOD_NG_IFACE 21 +#define VNET_MOD_NG_EIFACE 22 +#define VNET_MOD_ESP 23 +#define VNET_MOD_IPIP 24 +#define VNET_MOD_AH 25 +#define VNET_MOD_IPCOMP 26 +#define VNET_MOD_GIF 27 +#define VNET_MOD_ARP 28 +#define VNET_MOD_RTABLE 29 +#define VNET_MOD_LOIF 30 +#define VNET_MOD_DOMAIN 31 +#define VNET_MOD_DYNAMIC_START 32
+#define VNET_MOD_MAX 64 + +/* Needed for ugly sysctl virtualization macros */ +#define V_MOD_vnet_net VNET_MOD_NET +#define V_MOD_vnet_netgraph VNET_MOD_NETGRAPH +#define V_MOD_vnet_inet VNET_MOD_INET +#define V_MOD_vnet_inet6 VNET_MOD_INET6 +#define V_MOD_vnet_ipfw VNET_MOD_IPFW +#define V_MOD_vnet_pf VNET_MOD_PF +#define V_MOD_vnet_gif VNET_MOD_GIF +#define V_MOD_vnet_ipsec VNET_MOD_IPSEC + +#define V_MOD_vprocg 0 +#define V_MOD_vcpu 0 + +#ifdef VIMAGE + +struct vnet { + void *mod_data[VNET_MOD_MAX]; + + u_int vnet_ref; /* reference count */ + LIST_ENTRY(vnet) vnet_le; /* all vnets list */ + u_int vnet_id; /* ID num */ -#define VNET_MOD_DECLARE(m_name_uc, m_name_lc, m_iattach, m_idetach, \ - m_dependson, m_symmap) \ - static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ - .vmi_name = #m_name_lc, \ - .vmi_symmap = m_symmap \ + u_int ifccnt; + u_int sockcnt; + + u_int vnet_magic_n; }; -#if defined(VIMAGE) && defined(VIMAGE_GLOBALS) -#error "You cannot have both option VIMAGE and option VIMAGE_GLOBALS!" 
-#endif +#define VNET_MAGIC_N 0x3e0d8f29 + + +#ifdef VNET_DEBUG + +#define VNET_ASSERT(condition) \ + if (!(condition)) { \ + printf("VNET_ASSERT @ %s:%d %s():\n", \ + __FILE__, __LINE__, __FUNCTION__); \ + panic(#condition); \ + } + +#define CURVNET_SET_QUIET(arg) \ + VNET_ASSERT((arg)->vnet_magic_n == VNET_MAGIC_N); \ + struct vnet *saved_vnet = curvnet; \ + const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ + curvnet = arg; \ + curthread->td_vnet_lpush = __FUNCTION__; + +#define CURVNET_SET_VERBOSE(arg) \ + CURVNET_SET_QUIET(arg) \ + if (saved_vnet) \ + printf("curvnet_set(%p) in %s() on cpu %d, prev %p in %s()\n", \ + curvnet, curthread->td_vnet_lpush, curcpu, \ + saved_vnet, saved_vnet_lpush); + +#define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) + +#define CURVNET_RESTORE() \ + VNET_ASSERT(saved_vnet == NULL || \ + saved_vnet->vnet_magic_n == VNET_MAGIC_N); \ + curvnet = saved_vnet; \ + curthread->td_vnet_lpush = saved_vnet_lpush; + +#define INIT_FROM_VNET(vnet, modindex, modtype, sym) \ + if (vnet != curvnet) \ + panic("in %s:%d %s()\n vnet=%p curvnet=%p", \ + __FILE__, __LINE__, __FUNCTION__, \ + vnet, curvnet); \ + modtype *sym = (vnet)->mod_data[modindex]; + +#else /* !VNET_DEBUG */ + +#define VNET_ASSERT(condition) + +#define CURVNET_SET(arg) \ + struct vnet *saved_vnet = curvnet; \ + curvnet = arg; + +#define CURVNET_SET_VERBOSE(arg) CURVNET_SET(arg) +#define CURVNET_SET_QUIET(arg) CURVNET_SET(arg) + +#define CURVNET_RESTORE() \ + curvnet = saved_vnet; + +#define INIT_FROM_VNET(vnet, modindex, modtype, sym) \ + modtype *sym = (vnet)->mod_data[modindex]; + +#endif /* !VNET_DEBUG */ + +#define VNET_ITERATOR_DECL(arg) struct vnet *arg; +#define VNET_FOREACH(arg) LIST_FOREACH(arg, &vnet_head, vnet_le) + +#define INIT_VPROCG(arg) struct vprocg *vprocg = (arg); + +#define VPROCG_ITERLOOP_BEGIN() \ + struct vprocg *vprocg_iter; \ + LIST_FOREACH(vprocg_iter, &vprocg_head, vprocg_le) { \ + +#define VPROCG_ITERLOOP_END() \ + } \ -#ifdef VIMAGE_GLOBALS 
-#define VSYM(base, sym) (sym) -#else -#ifdef VIMAGE -#error "No option VIMAGE yet!" -#else -#define VSYM(base, sym) (base ## _0._ ## sym) -#endif -#endif +#define INIT_VCPU(arg) struct vcpu *vcpu = (arg); -#define VNET_SYMMAP(mod, name) \ - { #name, &(vnet_ ## mod ## _0._ ## name), \ - sizeof(vnet_ ## mod ## _0._ ## name) } +#define TD_TO_VIMAGE(td) (td)->td_ucred->cr_vimage +#define TD_TO_VNET(td) (td)->td_ucred->cr_vimage->v_net +#define TD_TO_VPROCG(td) (td)->td_ucred->cr_vimage->v_procg +#define TD_TO_VCPU(td) (td)->td_ucred->cr_vimage->v_cpu +#define P_TO_VIMAGE(p) (p)->p_ucred->cr_vimage +#define P_TO_VNET(p) (p)->p_ucred->cr_vimage->v_net +#define P_TO_VPROCG(p) (p)->p_ucred->cr_vimage->v_procg +#define P_TO_VCPU(p) (p)->p_ucred->cr_vimage->v_cpu -#define VNET_SYMMAP_END { NULL, 0 } +#else /* !VIMAGE */ /* Non-VIMAGE null-macros */ +#define IS_DEFAULT_VNET(arg) 1 #define CURVNET_SET(arg) #define CURVNET_SET_QUIET(arg) #define CURVNET_RESTORE() @@ -91,6 +322,8 @@ #define VNET_LIST_RLOCK() #define VNET_LIST_RUNLOCK() #define INIT_VPROCG(arg) +#define VPROCG_ITERLOOP_BEGIN() +#define VPROCG_ITERLOOP_END() #define INIT_VCPU(arg) #define TD_TO_VIMAGE(td) #define TD_TO_VNET(td) @@ -101,15 +334,159 @@ #define P_TO_VPROCG(p) #define P_TO_VCPU(p) +#endif /* !VIMAGE */ + /* XXX those defines bellow should probably go into vprocg.h and vcpu.h */ -#define VPROCG(sym) (sym) -#define VCPU(sym) (sym) +#define VPROCG(sym) VSYM(vprocg, sym) +#define VCPU(sym) VSYM(vcpu, sym) + +#define V_hostname VPROCG(hostname) +#ifdef VIMAGE +#define G_hostname VSYM(basevprocg, hostname) /* global hostname */ +#else +#define G_hostname VSYM(vprocg, hostname) /* global hostname */ +#endif +#define V_domainname VPROCG(domainname) +#define V_morphing_symlinks VPROCG(morphing_symlinks) -#define V_hostname VPROCG(hostname) -#define G_hostname VPROCG(hostname) /* global hostname */ -#define V_domainname VPROCG(domainname) +#define V_acc_statcalls VCPU(acc_statcalls) +#define V_avg1_fixp 
VCPU(avg1_fixp) +#define V_avg2_fixp VCPU(avg2_fixp) +#ifndef VIMAGE_GLOBALS +void vnet_mod_register(const struct vnet_modinfo *); +void vnet_mod_deregister(const struct vnet_modinfo *); +void vnet_mod_register_multi(const struct vnet_modinfo *, void *, char *); +void vnet_mod_deregister_multi(const struct vnet_modinfo *, void *, char *); int vi_symlookup(struct kld_sym_lookup *, char *); -void vnet_mod_register(const struct vnet_modinfo *); +#endif + +#ifdef VIMAGE +int vi_td_ioctl(u_long, struct vi_req *, struct thread *); +int vi_if_move(struct vi_req *, struct ifnet *, struct vimage *); +void if_reassign_common(struct ifnet *, struct vnet *, const char *); + +struct vimage *vnet2vimage(struct vnet *); +struct vimage *vimage_by_name(struct vimage *, char *); +char *vnet_name(struct vnet *); +int vi_child_of(struct vimage *, struct vimage *); + +LIST_HEAD(vimage_list_head, vimage); +extern struct vimage_list_head vimage_head; + +LIST_HEAD(vprocg_list_head, vprocg); +extern struct vprocg_list_head vprocg_head; + +LIST_HEAD(vcpu_list_head, vcpu); +extern struct vcpu_list_head vcpu_head; + +LIST_HEAD(vnet_list_head, vnet); +extern struct vnet_list_head vnet_head; +extern int vnet_list_refc; +extern struct mtx vnet_list_refc_mtx; +extern struct cv vnet_list_condvar; +extern struct mtx vcpu_list_mtx; + +#define VNET_LIST_RLOCK() do { \ + mtx_lock(&vnet_list_refc_mtx); \ + vnet_list_refc++; \ + mtx_unlock(&vnet_list_refc_mtx); \ +} while (0) + +#define VNET_LIST_RUNLOCK() do { \ + mtx_lock(&vnet_list_refc_mtx); \ + vnet_list_refc--; \ + mtx_unlock(&vnet_list_refc_mtx); \ + cv_signal(&vnet_list_condvar); \ +} while (0) + +#define IS_DEFAULT_VIMAGE(arg) ((arg)->vi_id == 0) +#define IS_DEFAULT_VNET(arg) ((arg)->vnet_id == 0) + +struct vimage { + LIST_ENTRY(vimage) vi_le; /* all vimage list */ + LIST_ENTRY(vimage) vi_sibling; /* vimages with same parent */ + LIST_HEAD(, vimage) vi_child_head; /* direct offspring list */ + struct vimage *vi_parent; /* ptr to parent vimage 
*/ + u_int vi_id; /* ID num */ + u_int vi_ucredrefc; /* refc of ucreds pointing to us */ + + char vi_name[MAXHOSTNAMELEN]; /* assigned by parent */ + + struct vprocg *v_procg; + struct vcpu *v_cpu; + struct vnet *v_net; +}; + +#endif /* VIMAGE */ + +struct vprocg { + LIST_ENTRY(vprocg) vprocg_le; + u_int vprocg_ref; /* reference count */ + u_int vprocg_id; /* ID num */ + + u_int nprocs; + + char _hostname[MAXHOSTNAMELEN]; + char _domainname[MAXHOSTNAMELEN]; + + int _morphing_symlinks; +}; + +struct vcpu { + LIST_ENTRY(vcpu) vcpu_le; + u_int vcpu_ref; /* reference count */ + u_int vcpu_id; /* ID num */ + + u_int _acc_statcalls; /* statclocks since last avg update */ + u_int _avg1_fixp; /* "fast" avg in 16:16 bit fixedpoint */ + u_int _avg2_fixp; /* "slow" avg in 16:16 bit fixedpoint */ +}; + +#ifndef VIMAGE_GLOBALS +#ifndef VIMAGE +extern struct vprocg vprocg_0; +#endif +#endif + +struct vi_req { /* vimage ioctl request/reply buffer */ + int req_action; /* What to do with this request? */ + u_int vi_cpu_min; /* Guaranteed CPU share */ + u_int vi_cpu_max; /* Maximum average CPU usage */ + u_int vi_cpu_weight; /* Prop. share scheduling priority */ + int vi_intr_limit; /* Limit on CPU usage in intr ctx */ + int vi_maxsockets; /* max. number of sockets */ + u_short vi_proc_limit; /* max. number of processes */ + u_short vi_proc_count; /* current number of processes */ + u_short vi_child_limit; /* max. 
number of child vnets */ + u_short vi_child_count; /* current number of child vnets */ + int vi_if_count; /* current number of network interfaces */ + int vi_sock_count; /* current number of sockets */ + char vi_name[MAXPATHLEN]; + char vi_chroot[MAXPATHLEN]; + char vi_if_xname[MAXPATHLEN]; /* XXX should be IFNAMSIZ */ + u_int cp_time_avg; + struct loadavg averunnable; +}; + +#define VI_CREATE 0x00000001 +#define VI_DESTROY 0x00000002 +#define VI_MODIFY 0x00000004 +#define VI_SWITCHTO 0x00000008 +#define VI_IFACE 0x00000010 + +#define VI_GET 0x00000100 +#define VI_GETNEXT 0x00000200 +#define VI_GETNEXT_RECURSE 0x00000300 + +#define VI_SET_CPU_MIN 0x00001000 +#define VI_SET_CPU_MAX 0x00002000 +#define VI_SET_CPU_WEIGHT 0x00004000 +#define VI_SET_INTR_LIMIT 0x00008000 +#define VI_SET_PROC_LIMIT 0x00010000 +#define VI_SET_CHILD_LIMIT 0x00020000 +#define VI_SET_SOCK_LIMIT 0x00040000 +#define VI_SET_NAME 0x00100000 +#define VI_SET_CHROOT 0x00200000 #endif /* !_SYS_VIMAGE_H_ */ Index: usr.bin/kdump/mkioctls =========================================================================== --- usr.bin/kdump/mkioctls 2009/02/22 13:41:20 #16 +++ usr.bin/kdump/mkioctls 2009/02/22 13:41:20 @@ -40,6 +40,7 @@ print "#include " print "#include " print "#include " + print "#include " print "#include " print "#include " print "#include " Index: usr.sbin/vimage/Makefile =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- usr.sbin/vimage/Makefile Sun Feb 22 13:41:23 2009 *************** *** 0 **** --- 1,13 ---- + # $FreeBSD$ + + PROG= vimage + + WARNS?= 2 + CFLAGS+= -I../../sys + + MAN= vimage.8 + + BINDIR?= /usr/sbin + NO_SHARED?= YES + + .include Index: usr.sbin/vimage/vimage.8 =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- usr.sbin/vimage/vimage.8 Sun Feb 22 13:41:24 2009 *************** *** 0 **** --- 1,252 ---- + .\" Copyright (c) 2002, 2003 
Marko Zec + .\" + .\" All rights 
reserved. + .\" + .\" Redistribution and use in source and binary forms, with or without + .\" modification, are permitted provided that the following conditions + .\" are met: + .\" 1. Redistributions of source code must retain the above copyright + .\" notice, this list of conditions and the following disclaimer. + .\" 2. Redistributions in binary form must reproduce the above copyright + .\" notice, this list of conditions and the following disclaimer in the + .\" documentation and/or other materials provided with the distribution. + .\" + .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + .\" SUCH DAMAGE. + .\" + .\" + .\" @(#)vimage.8 1.01 (M. Zec) 2003/09/06 + .\" + .Dd November 3, 2003 + .Dt VIMAGE 8 + .Os + .Sh NAME + .Nm vimage + .Nd manage the FreeBSD virtual image facility + .Sh SYNOPSIS + .Nm + .Nm + .Ar vi_name + .Op command + .Nm + .Brq Fl c | m + .Ar vi_name + .Op options + .Nm + .Fl d + .Ar vi_name + .Nm + .Fl l + .Op Ar vi_name + .Nm + .Fl i + .Ar vi_name interface + .Op target_interface + .Sh DESCRIPTION + .Nm + command is the user interface for controlling the virtual image facility + in FreeBSD. 
+ .Ss Overview + Each virtual image presents an isolated operating environment with its own + private view of vital system resources, most notably user processes, + CPU time share and an independent network stack instance. + Accordingly, every process and every network interface present + in the system is always assigned + to a single and unique virtual image. During the system bootup sequence + the + .So default + .Sc virtual image is created to which all the configured + interfaces and user processes are initially assigned. + Assuming that enough system resources + and per virtual image privileges are provided, the super-user can create and + manage a hierarchy of subordinated virtual images. The + .Nm + command allows + creation, deletion, modification and monitoring of virtual images, as well as + execution of arbitrary processes in the target virtual image. + .Ss Invocation + With no arguments provided, the + .Nm + command returns the name of the current virtual image + on the standard output and exits. + .Pp + If invoked with no modifiers, the + .Nm + command spawns a new process in virtual + image + .Ar vi_name . + If provided, the optional arguments following the virtual image name + .Ar vi_name + are executed as a standard command line, otherwise an interactive + shell is started in the target virtual image. + .Pp + The following parameters are available: + .Bl -tag -width indent + .It Fl c + Create a new virtual image named + .So + .Ar vi_name + .Sc . + If additional arguments are present following the + .Ar vi_name + parameter, they are interpreted as custom options (see below). + .It Fl m + Modify the custom options of the existing virtual image + .Ar vi_name , + in accordance with the additional arguments following the + .Ar vi_name + parameter (see below). + .It Fl d + Delete the virtual image + .Ar vi_name . + No processes should exist in the target virtual image, in order for + deletion to succeed. 
Non-loopback interfaces residing in the target + virtual image will be reassigned to the virtual image's parent. + .It Fl l + List the properties, custom parameters and statistics for virtual + images below the current one in the hierarchy. If an optional argument + .Ar vi_name + is provided, only the information regarding the target virtual image + .Ar vi_name + is displayed. + .It Fl i + Move the interface + .Ar interface + to the target virtual image + .Ar vi_name . + If the value of + .Ar vi_name + argument is + .So - + .Sc , + the interface is returned to the parent of the current virtual image. + .El + .Pp + The following options to + .Fl c + and + .Fl m + modifiers are available: + .Bl -tag -width indent + .It Cm cpumin + Set the minimum guaranteed average CPU share for the target virtual image. + The parameter is specified as a percentage in the range 0 to 90. + The guaranteed CPU share for the + .So default + .Sc virtual image cannot be set below 10%. + Note that the system does not enforce strict global budgeting on guaranteed + CPU time shares. Therefore it is the sole responsibility of the system + administrator whether he/she will allow for guaranteed CPU shares to be + oversubscribed or not. By default no virtual image is granted a guaranteed + CPU share, except the + .So default + .Sc virtual image, which normally runs with + .Cm cpumin + level of 10%. + .It Cm cpumax + Set the upper limit to average total CPU usage for the target virtual image. + The limit is specified as a percentage (1-100%). However, the limit cannot + be raised above the current upper CPU limit of the parent virtual image. + By default there is no CPU usage limit (100%). + .It Cm cpuweight + If the current average CPU usage of a virtual image is above the + .Cm cpumin + level, but below the + .Cm cpumax , + the virtual image becomes subject to a proportional share CPU scheduler. 
+ The + .Cm cpuweight + parameter determines how the virtual image will compete for the available + CPU time. The higher the + .Cm cpuweight , + the less often the virtual image will be allocated a CPU time slice. + Valid parameter values range from 1 (default) to 10. + .It Cm proc + Set the maximum number of processes that are allowed to exist simultaneously + in the target virtual image. The default is 0, which means no process limit. + .It Cm chroot + Set the chroot directory for the virtual image. All new processes spawned + into the target virtual image using the + .Nm + command will be initially chrooted to that directory. This parameter can + be changed only when no processes are running within the target virtual + image. Note that it is not required to have a chrooted environment for + a virtual image to operate; running without one is the default behavior. + .It Cm child + Limit the number of children the target virtual image is allowed to create. + The limit cannot be raised above the lowest child limit of all the ancestors + of the target virtual image. By default all created virtual images are + prohibited from creating new virtual images, except the + .So default + .Sc virtual image. + .El + .Sh EXAMPLES + Create a new virtual image named + .So v1 + .Sc with average CPU usage limited to 20%: + .Pp + .Dl vimage -c v1 cpumax 20% + .Pp + Execute the + .So ifconfig + .Sc command in the virtual image + .So v1 + .Sc : + .Pp + .Dl vimage v1 ifconfig + .Pp + Move the interface + .So vlan0 + .Sc to the virtual image + .So v1 + .Sc : + .Pp + .Dl vimage -i v1 vlan0 + .Pp + Show the status information for virtual image + .So v1 + .Sc : + .Pp + .Dl vimage -l v1 + .Sh DIAGNOSTICS + The + .Nm + command exits 0 on success, and >0 if an error occurs. 
+ .Sh SEE ALSO + .Xr jail 8 + .Sh BUGS + If a memory allocation failure occurs during the vimage creation, it will remain + undetected/ignored in the current implementation, thus latently scheduling + an almost imminent system crash in the future. + .Pp + The current (experimental) implementation provides support only for the IPv4 + protocol, though many features are not included, such as IPSEC or IPF. + IPv6, IPX, AppleTalk, XNS and OSI/ISO protocols are not yet supported. + .Pp + .Xr netgraph 4 + naming has to be extended to reflect virtual image association of netgraph + nodes and interfaces. + .Pp + No testing has been performed on SMP systems. There is absolutely no guarantee + that the kernel will even compile with SMP options enabled. + .Pp + At the time of writing this document the code is still in a highly experimental + phase, so one should expect to encounter numerous undocumented problems. + The author will welcome and appreciate all (decently documented) bug reports. + You can check for updated versions of the vimage framework at + http://www.tel.fer.hr/zec/BSD/vimage/ + .Sh AUTHOR + .An "Marko Zec" Aq zec@tel.fer.hr + .Sh HISTORY + The + .Nm + facility first appeared as a FreeBSD 4.7-RELEASE patch. Index: usr.sbin/vimage/vimage.c =========================================================================== *** /dev/null Sun Feb 22 13:33:00 2009 --- usr.sbin/vimage/vimage.c Sun Feb 22 13:41:24 2009 *************** *** 0 **** --- 1,300 ---- + /* + * Copyright (c) 2002, 2003, 2004 Marko Zec + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + + #include + #include + #include + #include + #include + + + int main __P((int, char *[])); + void vi_print(struct vi_req *); + + + void vi_print(struct vi_req *vi_req) + { + double lf = 1.0/vi_req->averunnable.fscale; /* factor applied to each ldavg[] value below */ + + printf("\"%s\":\n", vi_req->vi_name); + printf(" Processes (cur/max): %d/%d;", + vi_req->vi_proc_count, vi_req->vi_proc_limit); + printf(" load averages: %3.2f, %3.2f, %3.2f\n", + lf * vi_req->averunnable.ldavg[0], + lf * vi_req->averunnable.ldavg[1], + lf * vi_req->averunnable.ldavg[2]); + + printf(" CPU usage: %3.2f%%\n", vi_req->cp_time_avg / 655.04); + + printf(" Sockets (cur/max): %d/%d;", vi_req->vi_sock_count, + vi_req->vi_maxsockets); + printf(" %d network interfaces\n", 
vi_req->vi_if_count); + + #if 0 + printf(" CPU limits: 
min %3.2f%%, ", 0.0001 * vi_req->vi_cpu_min); + if (vi_req->vi_cpu_max == 0) + vi_req->vi_cpu_max = 1000000; + printf("max %3.2f%%, ", 0.0001 * vi_req->vi_cpu_max); + printf("weight %d, ", vi_req->vi_cpu_weight); + if (vi_req->vi_intr_limit) + printf("intr limit: %3.2f%%\n", + 0.0001 * vi_req->vi_intr_limit); + else + printf("no intr limit\n"); + + if (vi_req->vi_child_limit) + printf(" child limit: %d\n", vi_req->vi_child_limit); + if (vi_req->vi_child_count) + printf(" %d child vimages\n", vi_req->vi_child_count); + if (vi_req->vi_chroot[0]) + printf(" Chroot dir: %s\n", vi_req->vi_chroot); + #endif + } + + + /* + * The command syntax and argument parser are both ugly, as they have been + * "stitched" together on the fly, but they fulfill their current experimental + * purpose. The whole code should be rewritten properly one day... + */ + + int + main(argc, argv) + int argc; + char *argv[]; + { + int s, i; + char *shell; + int cmd = VI_SWITCHTO; + struct vi_req vi_req; + + s = socket(AF_INET, SOCK_DGRAM, 0); + if (s == -1) + goto abort; + + bzero(&vi_req, sizeof(vi_req)); + if (argc == 1) { + strcpy(vi_req.vi_name, "."); + cmd = VI_GET; + } + + if (argc == 2 && strcmp(argv[1], "-l") == 0) { + strcpy(vi_req.vi_name, "."); + cmd = VI_GETNEXT; + } + + if (argc == 2 && strcmp(argv[1], "-lr") == 0) { + strcpy(vi_req.vi_name, "."); + cmd = VI_GETNEXT_RECURSE; + } + + if (argc == 3) { + strcpy(vi_req.vi_name, argv[2]); + if (strcmp(argv[1], "-l") == 0) + cmd = VI_GET; + if (strcmp(argv[1], "-c") == 0) + cmd = VI_CREATE; + if (strcmp(argv[1], "-d") == 0) + cmd = VI_DESTROY; + } + + if (argc >= 3) { + strcpy(vi_req.vi_name, argv[2]); + if (strcmp(argv[1], "-c") == 0) + cmd = VI_CREATE; + if (strcmp(argv[1], "-m") == 0) + cmd = VI_MODIFY; + if (strcmp(argv[1], "-i") == 0) + cmd = VI_IFACE; + } + + vi_req.req_action = cmd; + switch (cmd) { + + case VI_GET: + if (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) < 0) + goto abort; + if (argc == 1) + printf("%s\n", vi_req.vi_name); + 
else + vi_print(&vi_req); + exit(0); + + case VI_GETNEXT: + case VI_GETNEXT_RECURSE: + vi_req.req_action = VI_GET; + if (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) < 0) + goto abort; + vi_print(&vi_req); + vi_req.req_action = VI_GETNEXT_RECURSE; + while (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) == 0) { + vi_print(&vi_req); + vi_req.req_action = cmd; + } + exit(0); + + case VI_IFACE: + /* here vi_chroot stores the current ifc name; NOTE(review): argv[3] is NULL when argc == 3 (-i given with no interface argument), so strncpy() below would read a NULL source */ + strncpy(vi_req.vi_chroot, argv[3], sizeof(vi_req.vi_chroot)); + if (argc >= 5) + strncpy(vi_req.vi_if_xname, argv[4], + sizeof(vi_req.vi_if_xname)); + else + vi_req.vi_if_xname[0] = 0; + if (ioctl(s, SIOCSIFVIMAGE, (caddr_t)&vi_req) < 0) + goto abort; + printf("%s@%s\n", vi_req.vi_chroot, vi_req.vi_name); + exit(0); + + case VI_CREATE: + case VI_MODIFY: + for (i = 3; i < argc-1; i += 2) { /* options are consumed as name/value pairs */ + if (strcmp(argv[i], "maxsockets") == 0) { + vi_req.req_action |= VI_SET_SOCK_LIMIT; + vi_req.vi_maxsockets = strtod(argv[i+1], NULL); + } + if (strcmp(argv[i], "cpumin") == 0) { + vi_req.req_action |= VI_SET_CPU_MIN; + vi_req.vi_cpu_min = + strtod(argv[i+1], NULL) * 10000; + if (vi_req.vi_cpu_min > 900000) { + fprintf(stderr, "error: cpumin must be between 0 and 90\n"); + exit(1); + } + } + if (strcmp(argv[i], "cpumax") == 0) { + vi_req.req_action |= VI_SET_CPU_MAX; + vi_req.vi_cpu_max = + strtod(argv[i+1], NULL) * 10000; + if (vi_req.vi_cpu_max < 10000 || + vi_req.vi_cpu_max > 1000000) { + fprintf(stderr, "error: cpumax must be between 1 and 100\n"); + exit(1); + } + } + if (strcmp(argv[i], "cpuweight") == 0) { + vi_req.req_action |= VI_SET_CPU_WEIGHT; + vi_req.vi_cpu_weight = strtod(argv[i+1], NULL); + if (vi_req.vi_cpu_weight < 1 || + vi_req.vi_cpu_weight > 10) { + fprintf(stderr, "error: cpuweight must be between 1 and 10\n"); + exit(1); + } + } + if (strcmp(argv[i], "intr") == 0) { 
+ vi_req.req_action |= VI_SET_INTR_LIMIT; + vi_req.vi_intr_limit = + strtod(argv[i+1], NULL) * 10000; + if (vi_req.vi_intr_limit < 10000 || + vi_req.vi_intr_limit > 
1000000) { + fprintf(stderr, "error: intr limit must be between 1 and 100\n"); + exit(1); + } + } + if (strcmp(argv[i], "child") == 0) { + vi_req.req_action |= VI_SET_CHILD_LIMIT; + vi_req.vi_child_limit = atoi(argv[i+1]); + } + if (strcmp(argv[i], "proc") == 0) { + vi_req.req_action |= VI_SET_PROC_LIMIT; + vi_req.vi_proc_limit = atoi(argv[i+1]); + } + if (strcmp(argv[i], "chroot") == 0) { + vi_req.req_action |= VI_SET_CHROOT; + strncpy(vi_req.vi_chroot, argv[i+1], + sizeof(vi_req.vi_chroot)); + } + } + if (ioctl(s, SIOCSPVIMAGE, (caddr_t)&vi_req) < 0) + goto abort; + exit(0); + + case VI_SWITCHTO: + strcpy(vi_req.vi_name, argv[1]); + if (ioctl(s, SIOCSPVIMAGE, (caddr_t)&vi_req) < 0) + goto abort; + + vi_req.req_action = VI_GET; + strcpy(vi_req.vi_name, "."); + if (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) < 0) { + printf("XXX this should have not happened!\n"); + goto abort; + } + + if (strlen(vi_req.vi_chroot) && (chdir(vi_req.vi_chroot) || + chroot(vi_req.vi_chroot))) + goto abort; + close(s); + + if (argc == 2) { + printf("Switched to vimage %s\n", argv[1]); + if ((shell=getenv("SHELL")) == NULL) + execlp("/bin/sh", argv[0], NULL); + else + execlp(shell, argv[0], NULL); + } else + execvp(argv[2], &argv[2]); + break; /* reached only if the exec call returned, i.e. failed; continues to abort: */ + + case VI_DESTROY: + if (ioctl(s, SIOCSPVIMAGE, (caddr_t)&vi_req) < 0) + goto abort; + exit(0); + + default: + fprintf(stderr, "usage: %s bla bla\n", argv[0]); + exit(1); + } + + abort: + perror("Error"); + exit(1); + }