diff -uprN linux-2.6.24/COPYING.SWsoft linux-2.6.24.ovz/COPYING.SWsoft --- linux-2.6.24/COPYING.SWsoft 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/COPYING.SWsoft 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,350 @@ + +Nothing in this license should be construed as a grant by SWsoft of any rights +beyond the rights specified in the GNU General Public License, and nothing in +this license should be construed as a waiver by SWsoft of its patent, copyright +and/or trademark rights, beyond the waiver required by the GNU General Public +License. This license is expressly inapplicable to any product that is not +within the scope of the GNU General Public License + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. 
+ +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff -uprN linux-2.6.24/Documentation/video4linux/CARDLIST.cx23885 linux-2.6.24.ovz/Documentation/video4linux/CARDLIST.cx23885 --- linux-2.6.24/Documentation/video4linux/CARDLIST.cx23885 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/Documentation/video4linux/CARDLIST.cx23885 2008-03-25 18:53:59.000000000 -0500 @@ -1,5 +1,5 @@ 0 -> UNKNOWN/GENERIC [0070:3400] 1 -> Hauppauge WinTV-HVR1800lp [0070:7600] - 2 -> Hauppauge WinTV-HVR1800 [0070:7800,0070:7801] + 2 -> Hauppauge WinTV-HVR1800 [0070:7800,0070:7801,0070:7809] 3 -> Hauppauge WinTV-HVR1250 [0070:7911] 4 -> DViCO FusionHDTV5 Express [18ac:d500] diff -uprN linux-2.6.24/Makefile linux-2.6.24.ovz/Makefile --- linux-2.6.24/Makefile 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -597,7 +598,7 @@ export mod_strip_cmd ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ grsecurity/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff -uprN linux-2.6.24/arch/arm/kernel/smp.c linux-2.6.24.ovz/arch/arm/kernel/smp.c --- linux-2.6.24/arch/arm/kernel/smp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/arm/kernel/smp.c 2008-03-25 18:53:59.000000000 -0500 @@ -201,7 +201,7 @@ int __cpuexit __cpu_disable(void) local_flush_tlb_all(); read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (p->mm) cpu_clear(cpu, p->mm->cpu_vm_mask); } diff -uprN linux-2.6.24/arch/ia64/Kconfig linux-2.6.24.ovz/arch/ia64/Kconfig --- linux-2.6.24/arch/ia64/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -570,6 +570,7 @@ source "fs/Kconfig" source "lib/Kconfig" +source "kernel/bc/Kconfig" # # Use the generic interrupt handling code in kernel/irq/: # @@ -596,6 +597,8 @@ source "kernel/Kconfig.instrumentation" source "arch/ia64/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" diff -uprN linux-2.6.24/arch/ia64/ia32/binfmt_elf32.c 
linux-2.6.24.ovz/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.24/arch/ia64/ia32/binfmt_elf32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/ia32/binfmt_elf32.c 2008-03-25 18:53:59.000000000 -0500 @@ -17,6 +17,8 @@ #include #include +#include + #include "ia32priv.h" #include "elfcore32.h" @@ -132,6 +134,12 @@ ia64_elf32_init (struct pt_regs *regs) up_write(¤t->mm->mmap_sem); } + if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * + IA32_LDT_ENTRY_SIZE), + VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, + NULL, UB_SOFT)) + goto skip; + /* * Install LDT as anonymous memory. This gives us all-zero segment descriptors * until a task modifies them via modify_ldt(). @@ -152,7 +160,12 @@ ia64_elf32_init (struct pt_regs *regs) } } up_write(¤t->mm->mmap_sem); - } + } else + ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * + IA32_LDT_ENTRY_SIZE), + VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); + +skip: ia64_psr(regs)->ac = 0; /* turn off alignment checking */ regs->loadrs = 0; diff -uprN linux-2.6.24/arch/ia64/kernel/asm-offsets.c linux-2.6.24.ovz/arch/ia64/kernel/asm-offsets.c --- linux-2.6.24/arch/ia64/kernel/asm-offsets.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/asm-offsets.c 2008-03-25 18:53:59.000000000 -0500 @@ -46,11 +46,9 @@ void foo(void) DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); - DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); - DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); diff -uprN linux-2.6.24/arch/ia64/kernel/entry.S linux-2.6.24.ovz/arch/ia64/kernel/entry.S --- linux-2.6.24/arch/ia64/kernel/entry.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/entry.S 2008-03-25 18:53:59.000000000 -0500 @@ -504,6 +504,74 @@ GLOBAL_ENTRY(clone) br.ret.sptk.many rp END(clone) +GLOBAL_ENTRY(ia64_ret_from_resume) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. + */ + nop.m 0 + nop.i 0 + /* + * We need to call schedule_tail() to complete the scheduling process. + * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the + * address of the previously executing task. + */ + br.call.sptk.many rp=ia64_invoke_schedule_tail +} + br.call.sptk.many rp=ia64_invoke_resume + ;; + adds sp=256,sp + ;; + /* Return from interrupt, we are all right. */ +(pNonSys) br ia64_leave_kernel + ;; + /* Tricky part follows. We must restore correct syscall + * register frame before doing normal syscall exit job. + * It would the most natural to keep sw->ar_pfs correct, + * then we would be here with correct register frame. + * Unfortunately, IA64 has a feature. Registers were in backstore + * after context switch, and the first br.ret does _NOT_ fetch + * output registers. 
+ * It is quite natural: look, if caller has output regs in his + * frame, they should be consumed. If callee does not have (enough of) + * input/local registers (1 in this case), the situation is unusual. + * Practical evidence: they are filled with something random crap. + * The only case, when this is essential in mainstream kernel + * is sys_clone(). The result is that new process gets some kernel + * information in its register frame. Which is a security problem, btw. + * + * So, we set sw->ar_pfs to pretend the whole frame is of local + * regs. And we have to repartition the frame it manually, using + * information from pt->cr_ifs (the register is invalid in this + * case, but it holds correct pfm). + */ + adds r3=PT(CR_IFS)+16,sp + ;; + ld8 r2=[r3],-(PT(CR_IFS)-PT(R8)) + ;; + extr.u r2=r2,0,37 + mov r8=ar.ec + ;; + extr.u r8=r8,0,5 + ;; + shl r8=r8,52 + ;; + or r2=r2,r8 + ;; + mov ar.pfs=r2 + ;; + movl r2=ia64_leave_syscall + ;; + mov rp=r2 + /* Plus, we should fetch r8 and r10 from pt_regs. Something else? */ + ld8 r8=[r3],PT(R10)-PT(R8) + ;; + ld8 r10=[r3] + ;; + br.ret.sptk.many rp +END(ia64_ret_from_resume) + /* * Invoke a system call, but do some tracing before and after the call. * We MUST preserve the current register frame throughout this routine @@ -1167,6 +1235,34 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail) br.ret.sptk.many rp END(ia64_invoke_schedule_tail) +GLOBAL_ENTRY(ia64_invoke_resume) + alloc loc1=ar.pfs,0,3,1,0 + mov loc0=rp + adds out0=16,sp + ;; + ld8 r8=[out0] + ;; + cmp.eq p6,p0=r8,r0 + ;; +(p6) br.cond.sptk 1f + ;; + mov loc2=gp + ;; + ld8 r10=[r8],8 + ;; + ld8 gp=[r8] + ;; + mov b7=r10 + ;; + br.call.sptk.many rp=b7 + ;; + mov gp=loc2 +1: + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(ia64_invoke_resume) + /* * Setup stack and call do_notify_resume_user(). Note that pSys and pNonSys need to * be set up by the caller. 
We declare 8 input registers so the system call @@ -1588,5 +1684,20 @@ sys_call_table: data8 sys_signalfd data8 sys_timerfd data8 sys_eventfd +.rept 1499-1310 + data8 sys_ni_syscall +.endr + data8 sys_fairsched_vcpus + data8 sys_fairsched_mknod // 1500 + data8 sys_fairsched_rmnod + data8 sys_fairsched_chwt + data8 sys_fairsched_mvpr + data8 sys_fairsched_rate + data8 sys_getluid // 1505 + data8 sys_setluid + data8 sys_setublimit + data8 sys_ubstat + data8 sys_lchmod + data8 sys_lutime // 1510 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff -uprN linux-2.6.24/arch/ia64/kernel/fsys.S linux-2.6.24.ovz/arch/ia64/kernel/fsys.S --- linux-2.6.24/arch/ia64/kernel/fsys.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/fsys.S 2008-03-25 18:53:59.000000000 -0500 @@ -57,96 +57,6 @@ ENTRY(fsys_ni_syscall) FSYS_RETURN END(fsys_ni_syscall) -ENTRY(fsys_getpid) - .prologue - .altrp b6 - .body - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - ;; - ld4 r9=[r9] - add r8=IA64_TASK_TGID_OFFSET,r16 - ;; - and r9=TIF_ALLWORK_MASK,r9 - ld4 r8=[r8] // r8 = current->tgid - ;; - cmp.ne p8,p0=0,r9 -(p8) br.spnt.many fsys_fallback_syscall - FSYS_RETURN -END(fsys_getpid) - -ENTRY(fsys_getppid) - .prologue - .altrp b6 - .body - add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 - ;; - ld8 r17=[r17] // r17 = current->group_leader - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - ;; - - ld4 r9=[r9] - add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = ¤t->group_leader->real_parent - ;; - and r9=TIF_ALLWORK_MASK,r9 - -1: ld8 r18=[r17] // r18 = current->group_leader->real_parent - ;; - cmp.ne p8,p0=0,r9 - add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = ¤t->group_leader->real_parent->tgid - ;; - - /* - * The .acq is needed to ensure that the read of tgid has returned its data before - * we re-check "real_parent". - */ - ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid -#ifdef CONFIG_SMP - /* - * Re-read current->group_leader->real_parent. - */ - ld8 r19=[r17] // r19 = current->group_leader->real_parent -(p8) br.spnt.many fsys_fallback_syscall - ;; - cmp.ne p6,p0=r18,r19 // did real_parent change? - mov r19=0 // i must not leak kernel bits... -(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check - ;; - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... -#else - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... - mov r19=0 // i must not leak kernel bits... -#endif - FSYS_RETURN -END(fsys_getppid) - -ENTRY(fsys_set_tid_address) - .prologue - .altrp b6 - .body - add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - ;; - ld4 r9=[r9] - tnat.z p6,p7=r32 // check argument register for being NaT - ;; - and r9=TIF_ALLWORK_MASK,r9 - add r8=IA64_TASK_PID_OFFSET,r16 - add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 - ;; - ld4 r8=[r8] - cmp.ne p8,p0=0,r9 - mov r17=-1 - ;; -(p6) st8 [r18]=r32 -(p7) st8 [r18]=r17 -(p8) br.spnt.many fsys_fallback_syscall - ;; - mov r17=0 // i must not leak kernel bits... - mov r18=0 // i must not leak kernel bits... 
- FSYS_RETURN -END(fsys_set_tid_address) - #if IA64_GTOD_LOCK_OFFSET !=0 #error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t #endif @@ -718,8 +628,8 @@ fsyscall_table: data8 0 // chmod data8 0 // chown data8 0 // lseek // 1040 - data8 fsys_getpid // getpid - data8 fsys_getppid // getppid + data8 0 // getpid + data8 0 // getppid data8 0 // mount data8 0 // umount data8 0 // setuid // 1045 @@ -910,7 +820,7 @@ fsyscall_table: data8 0 // futex // 1230 data8 0 // sched_setaffinity data8 0 // sched_getaffinity - data8 fsys_set_tid_address // set_tid_address + data8 0 // set_tid_address data8 0 // fadvise64_64 data8 0 // tgkill // 1235 data8 0 // exit_group diff -uprN linux-2.6.24/arch/ia64/kernel/head.S linux-2.6.24.ovz/arch/ia64/kernel/head.S --- linux-2.6.24/arch/ia64/kernel/head.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/head.S 2008-03-25 18:53:59.000000000 -0500 @@ -1011,7 +1011,7 @@ GLOBAL_ENTRY(start_kernel_thread) mov out1 = r11;; br.call.sptk.many rp = kernel_thread_helper;; mov out0 = r8 - br.call.sptk.many rp = sys_exit;; + br.call.sptk.many rp = do_exit;; 1: br.sptk.few 1b // not reached END(start_kernel_thread) diff -uprN linux-2.6.24/arch/ia64/kernel/ia64_ksyms.c linux-2.6.24.ovz/arch/ia64/kernel/ia64_ksyms.c --- linux-2.6.24/arch/ia64/kernel/ia64_ksyms.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/ia64_ksyms.c 2008-03-25 18:53:59.000000000 -0500 @@ -78,6 +78,8 @@ EXPORT_SYMBOL(xor_ia64_4); EXPORT_SYMBOL(xor_ia64_5); #endif +EXPORT_SYMBOL(empty_zero_page); + #include EXPORT_SYMBOL(ia64_pal_call_phys_stacked); EXPORT_SYMBOL(ia64_pal_call_phys_static); diff -uprN linux-2.6.24/arch/ia64/kernel/mca.c linux-2.6.24.ovz/arch/ia64/kernel/mca.c --- linux-2.6.24/arch/ia64/kernel/mca.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/mca.c 2008-03-25 18:53:59.000000000 -0500 @@ -1551,10 +1551,10 @@ default_monarch_init_process(struct noti } printk("\n\n"); if (read_trylock(&tasklist_lock)) { - do_each_thread (g, t) { + do_each_thread_all (g, t) { printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); show_stack(t, NULL); - } while_each_thread (g, t); + } while_each_thread_all (g, t); read_unlock(&tasklist_lock); } /* FIXME: This will not restore zapped printk locks. */ diff -uprN linux-2.6.24/arch/ia64/kernel/perfmon.c linux-2.6.24.ovz/arch/ia64/kernel/perfmon.c --- linux-2.6.24/arch/ia64/kernel/perfmon.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/perfmon.c 2008-03-25 18:53:59.000000000 -0500 @@ -4216,12 +4216,12 @@ pfm_check_task_exist(pfm_context_t *ctx) read_lock(&tasklist_lock); - do_each_thread (g, t) { + do_each_thread_ve (g, t) { if (t->thread.pfm_context == ctx) { ret = 0; break; } - } while_each_thread (g, t); + } while_each_thread_ve (g, t); read_unlock(&tasklist_lock); diff -uprN linux-2.6.24/arch/ia64/kernel/process.c linux-2.6.24.ovz/arch/ia64/kernel/process.c --- linux-2.6.24/arch/ia64/kernel/process.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/process.c 2008-03-25 18:53:59.000000000 -0500 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -369,6 +370,9 @@ ia64_load_extra (struct task_struct *tas #endif } +extern char ia64_ret_from_resume; +EXPORT_SYMBOL(ia64_ret_from_resume); + /* * Copy the state of an ia-64 thread. 
* @@ -442,7 +446,6 @@ copy_thread (int nr, unsigned long clone child_ptregs->r12 = user_stack_base + user_stack_size - 16; child_ptregs->ar_bspstore = user_stack_base; child_ptregs->ar_rnat = 0; - child_ptregs->loadrs = 0; } } else { /* @@ -684,16 +687,25 @@ out: return error; } +extern void start_kernel_thread (void); +EXPORT_SYMBOL(start_kernel_thread); + pid_t kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) { - extern void start_kernel_thread (void); unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; struct { struct switch_stack sw; struct pt_regs pt; } regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ regs.pt.r1 = helper_fptr[1]; /* set GP */ diff -uprN linux-2.6.24/arch/ia64/kernel/ptrace.c linux-2.6.24.ovz/arch/ia64/kernel/ptrace.c --- linux-2.6.24/arch/ia64/kernel/ptrace.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/ptrace.c 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,7 @@ * Derived from the x86 and Alpha versions. */ #include +#include #include #include #include @@ -100,6 +101,8 @@ ia64_get_scratch_nat_bits (struct pt_reg # undef GET_BITS } +EXPORT_SYMBOL(ia64_get_scratch_nat_bits); +EXPORT_SYMBOL(__ia64_save_fpu); /* * Set the NaT bits for the scratch registers according to NAT and @@ -456,6 +459,7 @@ ia64_peek (struct task_struct *child, st *val = ret; return 0; } +EXPORT_SYMBOL(ia64_peek); long ia64_poke (struct task_struct *child, struct switch_stack *child_stack, @@ -520,6 +524,7 @@ ia64_get_user_rbs_end (struct task_struc *cfmp = cfm; return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty); } +EXPORT_SYMBOL(ia64_get_user_rbs_end); /* * Synchronize (i.e, write) the RSE backing store living in kernel @@ -757,20 +762,20 @@ access_nat_bits (struct task_struct *chi if (write_access) { nat_bits = *data; scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits); - if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) { - dprintk("ptrace: failed to set ar.unat\n"); - return -1; - } + if (info->pri_unat_loc) + *info->pri_unat_loc = scratch_unat; + else + info->sw->caller_unat = scratch_unat; for (regnum = 4; regnum <= 7; ++regnum) { unw_get_gr(info, regnum, &dummy, &nat); unw_set_gr(info, regnum, dummy, (nat_bits >> regnum) & 1); } } else { - if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) { - dprintk("ptrace: failed to read ar.unat\n"); - return -1; - } + if (info->pri_unat_loc) + scratch_unat = *info->pri_unat_loc; + else + scratch_unat = info->sw->caller_unat; nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat); for (regnum = 4; regnum <= 7; ++regnum) { unw_get_gr(info, regnum, &dummy, &nat); diff -uprN linux-2.6.24/arch/ia64/kernel/signal.c linux-2.6.24.ovz/arch/ia64/kernel/signal.c --- linux-2.6.24/arch/ia64/kernel/signal.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/signal.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -446,6 +447,12 @@ ia64_do_signal (struct sigscratch *scr, if (!user_mode(&scr->pt)) return; + if (try_to_freeze() && !signal_pending(current)) { + if ((long) scr->pt.r10 != -1) + restart = 0; + goto no_signal; + } + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else @@ -501,8 +508,10 @@ ia64_do_signal (struct sigscratch 
*scr, if (IS_IA32_PROCESS(&scr->pt)) { scr->pt.r8 = scr->pt.r1; scr->pt.cr_iip -= 2; - } else + } else { ia64_decrement_ip(&scr->pt); + scr->pt.r10 = 0; + } restart = 0; /* don't restart twice if handle_signal() fails... */ } } @@ -523,6 +532,7 @@ ia64_do_signal (struct sigscratch *scr, } /* Did we come from a system call? */ +no_signal: if (restart) { /* Restart the system call - no handlers present */ if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR @@ -542,6 +552,7 @@ ia64_do_signal (struct sigscratch *scr, ia64_decrement_ip(&scr->pt); if (errno == ERESTART_RESTARTBLOCK) scr->pt.r15 = __NR_restart_syscall; + scr->pt.r10 = 0; } } } diff -uprN linux-2.6.24/arch/ia64/kernel/sys_ia64.c linux-2.6.24.ovz/arch/ia64/kernel/sys_ia64.c --- linux-2.6.24/arch/ia64/kernel/sys_ia64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/sys_ia64.c 2008-03-25 18:53:59.000000000 -0500 @@ -204,7 +204,7 @@ do_mmap2 (unsigned long addr, unsigned l /* Careful about overflows.. */ len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { + if (len > TASK_SIZE) { addr = -EINVAL; goto out; } diff -uprN linux-2.6.24/arch/ia64/kernel/unaligned.c linux-2.6.24.ovz/arch/ia64/kernel/unaligned.c --- linux-2.6.24/arch/ia64/kernel/unaligned.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/kernel/unaligned.c 2008-03-25 18:53:59.000000000 -0500 @@ -1289,7 +1289,7 @@ within_logging_rate_limit (void) { static unsigned long count, last_time; - if (jiffies - last_time > 5*HZ) + if (jiffies - last_time > 60 * HZ) count = 0; if (count < 5) { last_time = jiffies; diff -uprN linux-2.6.24/arch/ia64/mm/contig.c linux-2.6.24.ovz/arch/ia64/mm/contig.c --- linux-2.6.24/arch/ia64/mm/contig.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/mm/contig.c 2008-03-25 18:53:59.000000000 -0500 @@ -94,6 +94,7 @@ void show_mem(void) quicklist_total_size()); printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); } +EXPORT_SYMBOL(show_mem); /* physical address where the bootmem map is located */ diff -uprN linux-2.6.24/arch/ia64/mm/discontig.c linux-2.6.24.ovz/arch/ia64/mm/discontig.c --- linux-2.6.24/arch/ia64/mm/discontig.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/mm/discontig.c 2008-03-25 18:53:59.000000000 -0500 @@ -49,6 +49,7 @@ static struct early_node_data mem_data[M static nodemask_t memory_less_mask __initdata; pg_data_t *pgdat_list[MAX_NUMNODES]; +EXPORT_SYMBOL(pgdat_list); /* * To prevent cache aliasing effects, align per-node structures so that they @@ -567,6 +568,7 @@ void show_mem(void) quicklist_total_size()); printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); } +EXPORT_SYMBOL(show_mem); /** * call_pernode_memory - use SRAT to call callback functions with node info diff -uprN linux-2.6.24/arch/ia64/mm/fault.c linux-2.6.24.ovz/arch/ia64/mm/fault.c --- linux-2.6.24/arch/ia64/mm/fault.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/mm/fault.c 2008-03-25 18:53:59.000000000 -0500 @@ -148,7 +148,6 @@ ia64_do_page_fault (unsigned long addres if ((vma->vm_flags & mask) != mask) goto bad_area; - survive: /* * If for any reason at all we couldn't handle the fault, make * sure we exit gracefully rather than endlessly redo the @@ -274,13 +273,13 @@ ia64_do_page_fault (unsigned long addres out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk(KERN_CRIT "VM: killing process %s\n", current->comm); - 
if (user_mode(regs)) - do_group_exit(SIGKILL); + if (user_mode(regs)) { + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. + */ + force_sig(SIGKILL, current); + return; + } goto no_context; } diff -uprN linux-2.6.24/arch/ia64/mm/init.c linux-2.6.24.ovz/arch/ia64/mm/init.c --- linux-2.6.24/arch/ia64/mm/init.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ia64/mm/init.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,6 +37,8 @@ #include #include +#include + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); extern void ia64_tlb_init (void); @@ -117,6 +119,10 @@ ia64_init_addr_space (void) ia64_set_rbs_bot(); + if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, + NULL, UB_SOFT)) + goto skip; + /* * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore * the problem. When the process attempts to write to the register backing store @@ -133,11 +139,16 @@ ia64_init_addr_space (void) if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); kmem_cache_free(vm_area_cachep, vma); + ub_memory_uncharge(current->mm, PAGE_SIZE, + VM_DATA_DEFAULT_FLAGS, NULL); return; } up_write(¤t->mm->mmap_sem); - } + } else + ub_memory_uncharge(current->mm, PAGE_SIZE, + VM_DATA_DEFAULT_FLAGS, NULL); +skip: /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); diff -uprN linux-2.6.24/arch/powerpc/Kconfig linux-2.6.24.ovz/arch/powerpc/Kconfig --- linux-2.6.24/arch/powerpc/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -674,10 +674,14 @@ source "arch/powerpc/sysdev/qe_lib/Kconf source "lib/Kconfig" +source "kernel/bc/Kconfig" + source "kernel/Kconfig.instrumentation" source "arch/powerpc/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" config KEYS_COMPAT diff -uprN linux-2.6.24/arch/powerpc/kernel/misc_32.S linux-2.6.24.ovz/arch/powerpc/kernel/misc_32.S --- linux-2.6.24/arch/powerpc/kernel/misc_32.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/kernel/misc_32.S 2008-03-25 18:53:59.000000000 -0500 @@ -766,7 +766,7 @@ _GLOBAL(abs) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) stwu r1,-16(r1) stw r30,8(r1) stw r31,12(r1) diff -uprN linux-2.6.24/arch/powerpc/kernel/misc_64.S linux-2.6.24.ovz/arch/powerpc/kernel/misc_64.S --- linux-2.6.24/arch/powerpc/kernel/misc_64.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/kernel/misc_64.S 2008-03-25 18:53:59.000000000 -0500 @@ -427,7 +427,7 @@ _GLOBAL(scom970_write) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) std r29,-24(r1) std r30,-16(r1) stdu r1,-STACK_FRAME_OVERHEAD(r1) diff -uprN linux-2.6.24/arch/powerpc/kernel/process.c linux-2.6.24.ovz/arch/powerpc/kernel/process.c --- linux-2.6.24/arch/powerpc/kernel/process.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/kernel/process.c 2008-03-25 18:53:59.000000000 -0500 @@ -48,6 +48,8 @@ #include #endif +#include + extern unsigned long _get_SP(void); #ifndef CONFIG_SMP @@ -446,8 +448,9 @@ void show_regs(struct pt_regs * regs) printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); - printk("REGS: %p TRAP: %04lx %s (%s)\n", - regs, regs->trap, print_tainted(), 
init_utsname()->release); + printk("REGS: %p TRAP: %04lx %s (%s %s)\n", + regs, regs->trap, print_tainted(), init_utsname()->release, + VZVERSION); printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); @@ -1015,6 +1018,20 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); +long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + extern long ppc_kernel_thread(int (*fn)(void *), void *arg, + unsigned long flags); + + if (!ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + + return ppc_kernel_thread(fn, arg, flags); +} + #ifdef CONFIG_PPC64 void ppc64_runlatch_on(void) { diff -uprN linux-2.6.24/arch/powerpc/kernel/systbl.S linux-2.6.24.ovz/arch/powerpc/kernel/systbl.S --- linux-2.6.24/arch/powerpc/kernel/systbl.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/kernel/systbl.S 2008-03-25 18:53:59.000000000 -0500 @@ -43,5 +43,9 @@ .p2align 3 #endif +#define SYS_SKIP(from, to) .rept to - from \ + SYSCALL(sys_ni_syscall) \ + .endr + _GLOBAL(sys_call_table) #include diff -uprN linux-2.6.24/arch/powerpc/kernel/vdso.c linux-2.6.24.ovz/arch/powerpc/kernel/vdso.c --- linux-2.6.24/arch/powerpc/kernel/vdso.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/kernel/vdso.c 2008-03-25 18:53:59.000000000 -0500 @@ -184,7 +184,7 @@ static void dump_vdso_pages(struct vm_ar * vDSO and insert it into the mm struct tree */ int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack) + int executable_stack, unsigned long map_adress) { struct mm_struct *mm = current->mm; struct page **vdso_pagelist; diff -uprN linux-2.6.24/arch/powerpc/mm/fault.c linux-2.6.24.ovz/arch/powerpc/mm/fault.c --- linux-2.6.24/arch/powerpc/mm/fault.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/mm/fault.c 2008-03-25 18:53:59.000000000 -0500 @@ -335,7 +335,6 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - survive: ret = handle_mm_fault(mm, vma, address, is_write); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) @@ -375,14 +374,12 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); if (user_mode(regs)) - do_group_exit(SIGKILL); + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. 
Den + */ + force_sig(SIGKILL, current); return SIGKILL; do_sigbus: diff -uprN linux-2.6.24/arch/powerpc/mm/init_64.c linux-2.6.24.ovz/arch/powerpc/mm/init_64.c --- linux-2.6.24/arch/powerpc/mm/init_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/mm/init_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -171,7 +171,7 @@ void pgtable_cache_init(void) "for size: %08x...\n", name, i, size); pgtable_cache[i] = kmem_cache_create(name, size, size, - SLAB_PANIC, + SLAB_PANIC|SLAB_UBC|SLAB_NO_CHARGE, zero_ctor); } } diff -uprN linux-2.6.24/arch/powerpc/mm/mem.c linux-2.6.24.ovz/arch/powerpc/mm/mem.c --- linux-2.6.24/arch/powerpc/mm/mem.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/mm/mem.c 2008-03-25 18:53:59.000000000 -0500 @@ -170,6 +170,7 @@ void show_mem(void) printk("%ld pages shared\n", shared); printk("%ld pages swap cached\n", cached); } +EXPORT_SYMBOL(show_mem); /* * Initialize the bootmem system and give it all the memory we diff -uprN linux-2.6.24/arch/powerpc/mm/pgtable_32.c linux-2.6.24.ovz/arch/powerpc/mm/pgtable_32.c --- linux-2.6.24/arch/powerpc/mm/pgtable_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/mm/pgtable_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -82,7 +82,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | + __GFP_ZERO, PGDIR_ORDER); return ret; } @@ -116,6 +117,7 @@ struct page *pte_alloc_one(struct mm_str #else gfp_t flags = GFP_KERNEL | __GFP_REPEAT; #endif + flags |= (__GFP_UBC | __GFP_SOFT_UBC); ptepage = alloc_pages(flags, 0); if (ptepage) diff -uprN linux-2.6.24/arch/powerpc/platforms/cell/spu_callbacks.c linux-2.6.24.ovz/arch/powerpc/platforms/cell/spu_callbacks.c --- linux-2.6.24/arch/powerpc/platforms/cell/spu_callbacks.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/powerpc/platforms/cell/spu_callbacks.c 2008-03-25 18:53:59.000000000 -0500 @@ -46,6 +46,8 @@ static void *spu_syscall_table[] = { #define PPC_SYS_SPU(func) ppc_##func, #define SYSX_SPU(f, f3264, f32) f, +#define SYS_SKIP(from, to) [from ... to] = sys_ni_syscall, + #include }; diff -uprN linux-2.6.24/arch/ppc/Kconfig linux-2.6.24.ovz/arch/ppc/Kconfig --- linux-2.6.24/arch/ppc/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ppc/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -1321,6 +1321,10 @@ source "kernel/Kconfig.instrumentation" source "arch/ppc/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" +source "kernel/bc/Kconfig" + source "crypto/Kconfig" diff -uprN linux-2.6.24/arch/ppc/kernel/misc.S linux-2.6.24.ovz/arch/ppc/kernel/misc.S --- linux-2.6.24/arch/ppc/kernel/misc.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ppc/kernel/misc.S 2008-03-25 18:53:59.000000000 -0500 @@ -868,7 +868,7 @@ _GLOBAL(_get_SP) * Create a kernel thread * kernel_thread(fn, arg, flags) */ -_GLOBAL(kernel_thread) +_GLOBAL(ppc_kernel_thread) stwu r1,-16(r1) stw r30,8(r1) stw r31,12(r1) diff -uprN linux-2.6.24/arch/ppc/mm/fault.c linux-2.6.24.ovz/arch/ppc/mm/fault.c --- linux-2.6.24/arch/ppc/mm/fault.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ppc/mm/fault.c 2008-03-25 18:53:59.000000000 -0500 @@ -249,7 +249,6 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - survive: fault = handle_mm_fault(mm, vma, address, is_write); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) @@ -290,14 +289,12 @@ bad_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); if (user_mode(regs)) - do_group_exit(SIGKILL); + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. Den + */ + force_sig(SIGKILL, current); return SIGKILL; do_sigbus: diff -uprN linux-2.6.24/arch/ppc/mm/init.c linux-2.6.24.ovz/arch/ppc/mm/init.c --- linux-2.6.24/arch/ppc/mm/init.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ppc/mm/init.c 2008-03-25 18:53:59.000000000 -0500 @@ -131,6 +131,7 @@ void show_mem(void) printk("%d pages shared\n",shared); printk("%d pages swap cached\n",cached); } +EXPORT_SYMBOL(show_mem); /* Free up now-unused memory */ static void free_sec(unsigned long start, unsigned long end, const char *name) diff -uprN linux-2.6.24/arch/ppc/mm/pgtable.c linux-2.6.24.ovz/arch/ppc/mm/pgtable.c --- linux-2.6.24/arch/ppc/mm/pgtable.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/ppc/mm/pgtable.c 2008-03-25 18:53:59.000000000 -0500 @@ -83,7 +83,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | + __GFP_ZERO, PGDIR_ORDER); return ret; } @@ -117,6 +118,7 @@ struct page *pte_alloc_one(struct mm_str #else gfp_t flags = GFP_KERNEL | __GFP_REPEAT; #endif + flags |= (__GFP_UBC | __GFP_SOFT_UBC); ptepage = alloc_pages(flags, 0); if (ptepage) diff -uprN linux-2.6.24/arch/s390/Kconfig linux-2.6.24.ovz/arch/s390/Kconfig --- linux-2.6.24/arch/s390/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/s390/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -533,8 +533,12 @@ source "kernel/Kconfig.instrumentation" source "arch/s390/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/bc/Kconfig" diff -uprN linux-2.6.24/arch/s390/kernel/smp.c linux-2.6.24.ovz/arch/s390/kernel/smp.c --- linux-2.6.24/arch/s390/kernel/smp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/s390/kernel/smp.c 2008-03-25 18:53:59.000000000 -0500 @@ -430,8 +430,19 @@ static unsigned int __init smp_count_cpu */ int __cpuinit start_secondary(void *cpuvoid) { - /* Setup the cpu */ - cpu_init(); + /* Setup the cpu */ + cpu_init(); + +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; + /* + * Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. + */ + VE_TASK_INFO(idle)->sleep_time = 0; +#endif + preempt_disable(); /* Enable TOD clock interrupts on the secondary cpu. */ init_cpu_timer(); @@ -677,6 +688,11 @@ void __init smp_prepare_cpus(unsigned in for_each_possible_cpu(cpu) if (cpu != smp_processor_id()) smp_create_idle(cpu); + +#ifdef CONFIG_VE + /* TSC reset. 
kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif } void __init smp_prepare_boot_cpu(void) diff -uprN linux-2.6.24/arch/s390/mm/init.c linux-2.6.24.ovz/arch/s390/mm/init.c --- linux-2.6.24/arch/s390/mm/init.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/s390/mm/init.c 2008-03-25 18:53:59.000000000 -0500 @@ -77,6 +77,7 @@ void show_mem(void) global_page_state(NR_SLAB_UNRECLAIMABLE)); printk("%lu pages pagetables\n", global_page_state(NR_PAGETABLE)); } +EXPORT_SYMBOL(show_mem); static void __init setup_ro_region(void) { diff -uprN linux-2.6.24/arch/sh64/kernel/process.c linux-2.6.24.ovz/arch/sh64/kernel/process.c --- linux-2.6.24/arch/sh64/kernel/process.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sh64/kernel/process.c 2008-03-25 18:53:59.000000000 -0500 @@ -663,7 +663,7 @@ asids_proc_info(char *buf, char **start, int len=0; struct task_struct *p; read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_ve(p) { int pid = p->pid; struct mm_struct *mm; if (!pid) continue; diff -uprN linux-2.6.24/arch/sparc64/Kconfig linux-2.6.24.ovz/arch/sparc64/Kconfig --- linux-2.6.24/arch/sparc64/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sparc64/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -175,6 +175,8 @@ config NR_CPUS depends on SMP default "64" +source "kernel/Kconfig.fairsched" + source "drivers/cpufreq/Kconfig" config US3_FREQ @@ -466,8 +468,12 @@ source "kernel/Kconfig.instrumentation" source "arch/sparc64/Kconfig.debug" +source "kernel/Kconfig.openvz" + source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/bc/Kconfig" diff -uprN linux-2.6.24/arch/sparc64/kernel/process.c linux-2.6.24.ovz/arch/sparc64/kernel/process.c --- linux-2.6.24/arch/sparc64/kernel/process.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sparc64/kernel/process.c 2008-03-25 18:53:59.000000000 -0500 @@ -699,6 +699,13 @@ pid_t kernel_thread(int (*fn)(void *), v { long retval; + /* Don't allow kernel_thread() inside VE */ + if (!ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + /* If the parent runs before fn(arg) is called by the child, * the input registers of this function can be clobbered. * So we stash 'fn' and 'arg' into global registers which diff -uprN linux-2.6.24/arch/sparc64/kernel/sparc64_ksyms.c linux-2.6.24.ovz/arch/sparc64/kernel/sparc64_ksyms.c --- linux-2.6.24/arch/sparc64/kernel/sparc64_ksyms.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sparc64/kernel/sparc64_ksyms.c 2008-03-25 18:53:59.000000000 -0500 @@ -310,6 +310,7 @@ EXPORT_SYMBOL(copy_from_user_fixup); EXPORT_SYMBOL(copy_in_user_fixup); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(mem_map_zero); /* Various address conversion macros use this. 
*/ EXPORT_SYMBOL(sparc64_valid_addr_bitmap); diff -uprN linux-2.6.24/arch/sparc64/kernel/systbls.S linux-2.6.24.ovz/arch/sparc64/kernel/systbls.S --- linux-2.6.24/arch/sparc64/kernel/systbls.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sparc64/kernel/systbls.S 2008-03-25 18:53:59.000000000 -0500 @@ -82,6 +82,24 @@ sys_call_table32: .word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, compat_sys_timerfd, sys_eventfd, compat_sys_fallocate + .rept 500-315 + .word sys_nis_syscall + .endr + .word sys_fairsched_mknod /* 500 */ + .word sys_fairsched_rmnod + .word sys_fairsched_chwt + .word sys_fairsched_mvpr + .word sys_fairsched_rate + .word sys_nis_syscall /* 505 */ + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_getluid /* 510 */ + .word sys_setluid + .word compat_sys_setublimit + .word compat_sys_ubstat + #endif /* CONFIG_COMPAT */ /* Now the 64-bit native Linux syscall table. */ @@ -154,6 +172,25 @@ sys_call_table: .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate + .rept 500-315 + .word sys_nis_syscall + .endr + .word sys_fairsched_mknod /* 500 */ + .word sys_fairsched_rmnod + .word sys_fairsched_chwt + .word sys_fairsched_mvpr + .word sys_fairsched_rate + .word sys_nis_syscall /* 505 */ + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_nis_syscall + .word sys_getluid /* 510 */ + .word sys_setluid + .word sys_setublimit + .word sys_ubstat + + #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \ defined(CONFIG_SOLARIS_EMUL_MODULE) /* Now the 32-bit SunOS syscall table. 
*/ @@ -272,5 +309,8 @@ sunos_sys_table: .word sunos_nosys /*310*/ .word sunos_nosys, sunos_nosys, sunos_nosys .word sunos_nosys, sunos_nosys + .rept 520-315 + .word sunos_nosys + .endr #endif diff -uprN linux-2.6.24/arch/sparc64/kernel/traps.c linux-2.6.24.ovz/arch/sparc64/kernel/traps.c --- linux-2.6.24/arch/sparc64/kernel/traps.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sparc64/kernel/traps.c 2008-03-25 18:53:59.000000000 -0500 @@ -2229,6 +2229,10 @@ void die_if_kernel(char *str, struct pt_ " \\__U_/\n"); printk("%s(%d): %s [#%d]\n", current->comm, task_pid_nr(current), str, ++die_counter); + printk("VE:EXCVE %d:%d, CPU %d, VCPU %d:%d\n", + VEID(VE_TASK_INFO(current)->owner_env), VEID(get_exec_env()), + smp_processor_id(), + task_vsched_id(current), task_cpu(current)); notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV); __asm__ __volatile__("flushw"); __show_regs(regs); diff -uprN linux-2.6.24/arch/sparc64/mm/init.c linux-2.6.24.ovz/arch/sparc64/mm/init.c --- linux-2.6.24/arch/sparc64/mm/init.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/sparc64/mm/init.c 2008-03-25 18:53:59.000000000 -0500 @@ -434,6 +434,7 @@ void show_mem(void) printk(KERN_INFO "%lu pages pagetables\n", global_page_state(NR_PAGETABLE)); } +EXPORT_SYMBOL(show_mem); void mmu_info(struct seq_file *m) { diff -uprN linux-2.6.24/arch/x86/Kconfig linux-2.6.24.ovz/arch/x86/Kconfig --- linux-2.6.24/arch/x86/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -536,6 +536,14 @@ config X86_VISWS_APIC depends on X86_32 && X86_VISWS default y +config NMI_WATCHDOG + bool "NMI Watchdog" + default y + help + If you say Y here the kernel will activate NMI watchdog by default + on boot. You can still activate NMI watchdog via nmi_watchdog + command line option even if you say N here. 
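
The new CONFIG_NMI_WATCHDOG option only selects the boot-time default: nmi_64.c later in this patch picks NMI_DEFAULT or NMI_NONE based on it, and the nmi_watchdog= command line still overrides either choice. A quick userspace check of the effective setting, assuming the usual /proc/sys/kernel/nmi_watchdog sysctl is present on the running kernel:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/nmi_watchdog", "r");
		int val;

		if (!f) {
			perror("/proc/sys/kernel/nmi_watchdog");
			return 1;
		}
		if (fscanf(f, "%d", &val) != 1)
			val = -1;
		fclose(f);
		printf("nmi_watchdog = %d\n", val);
		return 0;
	}
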
+ config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER @@ -1602,6 +1610,7 @@ config SYSVIPC_COMPAT endmenu +source "kernel/Kconfig.openvz" source "net/Kconfig" @@ -1620,3 +1629,5 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/bc/Kconfig" diff -uprN linux-2.6.24/arch/x86/ia32/ia32_binfmt.c linux-2.6.24.ovz/arch/x86/ia32/ia32_binfmt.c --- linux-2.6.24/arch/x86/ia32/ia32_binfmt.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/ia32/ia32_binfmt.c 2008-03-25 18:53:59.000000000 -0500 @@ -47,7 +47,7 @@ #define AT_SYSINFO 32 #define AT_SYSINFO_EHDR 33 -int sysctl_vsyscall32 = 1; +int sysctl_vsyscall32 = 0; #undef ARCH_DLINFO #define ARCH_DLINFO do { \ @@ -225,9 +225,7 @@ MODULE_AUTHOR("Eric Youngdale, Andi Klee static void elf32_init(struct pt_regs *); -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -#define arch_setup_additional_pages syscall32_setup_pages -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); +extern int arch_setup_additional_pages(struct linux_binprm *, int exstack, unsigned long map_address); #include "../../../fs/binfmt_elf.c" diff -uprN linux-2.6.24/arch/x86/ia32/ia32entry.S linux-2.6.24.ovz/arch/x86/ia32/ia32entry.S --- linux-2.6.24/arch/x86/ia32/ia32entry.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/ia32/ia32entry.S 2008-03-25 18:53:59.000000000 -0500 @@ -104,7 +104,8 @@ ENTRY(ia32_sysenter_target) pushfq CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ - movl $VSYSCALL32_SYSEXIT, %r10d + GET_THREAD_INFO(%r10) + movl threadinfo_sysenter_return(%r10), %r10d CFI_REGISTER rip,r10 pushq $__USER32_CS CFI_ADJUST_CFA_OFFSET 8 @@ -149,7 +150,7 @@ sysenter_do_call: popq %rcx /* User %esp */ CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rsp,rcx - movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ + movl threadinfo_sysenter_return(%r10),%edx /* User %eip */ CFI_REGISTER rip,rdx TRACE_IRQS_ON swapgs @@ -514,7 +515,7 @@ ia32_sys_call_table: .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ + .quad quiet_ni_syscall /* vm86old */ .quad compat_sys_wait4 .quad sys_swapoff /* 115 */ .quad compat_sys_sysinfo @@ -567,7 +568,7 @@ ia32_sys_call_table: .quad sys_mremap .quad sys_setresuid16 .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll .quad compat_sys_nfsservctl diff -uprN linux-2.6.24/arch/x86/ia32/sys_ia32.c linux-2.6.24.ovz/arch/x86/ia32/sys_ia32.c --- linux-2.6.24/arch/x86/ia32/sys_ia32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/ia32/sys_ia32.c 2008-03-25 18:53:59.000000000 -0500 @@ -842,18 +842,6 @@ long sys32_fadvise64_64(int fd, __u32 of advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, char __user * buf, size_t len) { diff -uprN linux-2.6.24/arch/x86/ia32/syscall32.c linux-2.6.24.ovz/arch/x86/ia32/syscall32.c --- linux-2.6.24/arch/x86/ia32/syscall32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/ia32/syscall32.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,27 +10,54 @@ #include #include #include 
+#include #include #include #include #include +#include + extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; extern int sysctl_vsyscall32; -static struct page *syscall32_pages[1]; +char *syscall32_page; +EXPORT_SYMBOL_GPL(syscall32_page); +struct page *syscall32_pages[1]; +EXPORT_SYMBOL_GPL(syscall32_pages); static int use_sysenter = -1; struct linux_binprm; /* Setup a VMA at program startup for the vsyscall page */ -int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address) { + int npages = (__VSYSCALL32_END - __VSYSCALL32_BASE) >> PAGE_SHIFT; struct mm_struct *mm = current->mm; + unsigned long flags; + unsigned long addr = map_address ? : __VSYSCALL32_BASE; int ret; + if (sysctl_vsyscall32 == 0 && map_address == 0) + return 0; + + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | + mm->def_flags; + + ret = -ENOMEM; + if (ub_memory_charge(mm, __VSYSCALL32_END - __VSYSCALL32_BASE, + flags, NULL, UB_SOFT)) + goto err_charge; + down_write(&mm->mmap_sem); + addr = get_unmapped_area(NULL, addr, PAGE_SIZE * npages, 0, + MAP_PRIVATE | MAP_FIXED); + if (unlikely(addr & ~PAGE_MASK)) { + ret = addr; + goto err_ins; + } /* * MAYWRITE to allow gdb to COW and set breakpoints * @@ -40,18 +67,27 @@ int syscall32_setup_pages(struct linux_b * what PC values meant. */ /* Could randomize here */ - ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE, + ret = install_special_mapping(mm, addr, PAGE_SIZE * npages, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, syscall32_pages); + if (ret == 0) { + mm->context.vdso = (void *)addr; + current_thread_info()->sysenter_return = VSYSCALL32_SYSEXIT; + } up_write(&mm->mmap_sem); + if (ret < 0) +err_ins: + ub_memory_uncharge(mm, __VSYSCALL32_END - __VSYSCALL32_BASE, flags, NULL); +err_charge: return ret; } +EXPORT_SYMBOL_GPL(syscall32_setup_pages); static int __init init_syscall32(void) { - char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); + syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); if (!syscall32_page) panic("Cannot allocate syscall32 page"); syscall32_pages[0] = virt_to_page(syscall32_page); diff -uprN linux-2.6.24/arch/x86/ia32/vsyscall-sysenter.S linux-2.6.24.ovz/arch/x86/ia32/vsyscall-sysenter.S --- linux-2.6.24/arch/x86/ia32/vsyscall-sysenter.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/ia32/vsyscall-sysenter.S 2008-03-25 18:53:59.000000000 -0500 @@ -20,9 +20,9 @@ __kernel_vsyscall: .Lenter_kernel: movl %esp,%ebp sysenter - .space 7,0x90 + .space 23,0x90 jmp .Lenter_kernel - /* 16: System call normal return point is here! */ + /* 32: System call normal return point is here! */ pop %ebp .Lpop_ebp: pop %edx diff -uprN linux-2.6.24/arch/x86/ia32/vsyscall.lds linux-2.6.24.ovz/arch/x86/ia32/vsyscall.lds --- linux-2.6.24/arch/x86/ia32/vsyscall.lds 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/ia32/vsyscall.lds 2008-03-25 18:53:59.000000000 -0500 @@ -4,11 +4,11 @@ */ /* This must match . */ -VSYSCALL_BASE = 0xffffe000; +__VSYSCALL_BASE = 0xffffe000; SECTIONS { - . = VSYSCALL_BASE + SIZEOF_HEADERS; + . = __VSYSCALL_BASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -22,18 +22,18 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. 
Just increase it here. */ - . = VSYSCALL_BASE + 0x400; + . = __VSYSCALL_BASE + 0x400; .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 /* This is an 32bit object and we cannot easily get the offsets into the 64bit kernel. Just hardcode them here. This assumes that all the stubs don't need more than 0x100 bytes. */ - . = VSYSCALL_BASE + 0x500; + . = __VSYSCALL_BASE + 0x500; .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 - . = VSYSCALL_BASE + 0x600; + . = __VSYSCALL_BASE + 0x600; .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 diff -uprN linux-2.6.24/arch/x86/kernel/Makefile_32 linux-2.6.24.ovz/arch/x86/kernel/Makefile_32 --- linux-2.6.24/arch/x86/kernel/Makefile_32 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/Makefile_32 2008-03-25 18:53:59.000000000 -0500 @@ -19,7 +19,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_PCI) += early-quirks.o -obj-$(CONFIG_APM) += apm_32.o +apm-y := apm_32.o +obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o obj-$(CONFIG_SMP) += smpcommon_32.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o diff -uprN linux-2.6.24/arch/x86/kernel/asm-offsets_64.c linux-2.6.24.ovz/arch/x86/kernel/asm-offsets_64.c --- linux-2.6.24/arch/x86/kernel/asm-offsets_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/asm-offsets_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -47,6 +47,7 @@ int main(void) ENTRY(addr_limit); ENTRY(preempt_count); ENTRY(status); + ENTRY(sysenter_return); BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) diff -uprN linux-2.6.24/arch/x86/kernel/cpu/mtrr/if.c linux-2.6.24.ovz/arch/x86/kernel/cpu/mtrr/if.c --- linux-2.6.24/arch/x86/kernel/cpu/mtrr/if.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/cpu/mtrr/if.c 2008-03-25 18:53:59.000000000 -0500 @@ -427,7 +427,7 @@ static int __init mtrr_if_init(void) return -ENODEV; proc_root_mtrr = - create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); + create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL); if (proc_root_mtrr) { proc_root_mtrr->owner = THIS_MODULE; proc_root_mtrr->proc_fops = &mtrr_fops; diff -uprN linux-2.6.24/arch/x86/kernel/entry_32.S linux-2.6.24.ovz/arch/x86/kernel/entry_32.S --- linux-2.6.24/arch/x86/kernel/entry_32.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/entry_32.S 2008-03-25 18:53:59.000000000 -0500 @@ -221,6 +221,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%ebp) popl %eax CFI_ADJUST_CFA_OFFSET -4 +ret_from_fork_tail: pushl $0x0202 # Reset kernel eflags CFI_ADJUST_CFA_OFFSET 4 popfl @@ -229,6 +230,25 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +ENTRY(i386_ret_from_resume) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + movl (%esp),%eax + testl %eax,%eax + jz 1f + pushl %esp + call *%eax + addl $4,%esp +1: + addl $256,%esp + jmp ret_from_fork_tail + CFI_ENDPROC + /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to diff -uprN linux-2.6.24/arch/x86/kernel/entry_64.S linux-2.6.24.ovz/arch/x86/kernel/entry_64.S --- linux-2.6.24/arch/x86/kernel/entry_64.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/entry_64.S 2008-03-25 18:53:59.000000000 -0500 @@ -160,7 +160,12 @@ ENTRY(ret_from_fork) popf # reset kernel 
eflags CFI_ADJUST_CFA_OFFSET -4 call schedule_tail +ret_from_fork_tail: GET_THREAD_INFO(%rcx) + btr $TIF_RESUME,threadinfo_flags(%rcx) + jc x86_64_ret_from_resume + +ret_from_fork_check: testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) jnz rff_trace rff_action: @@ -176,6 +181,19 @@ rff_trace: call syscall_trace_leave GET_THREAD_INFO(%rcx) jmp rff_action + +x86_64_ret_from_resume: + movq (%rsp),%rax + testq %rax,%rax + jz 1f + movq %rsp,%rdi + call *%rax +1: + addq $256,%rsp + cmpq $0,ORIG_RAX(%rsp) + jge ret_from_fork_tail + RESTORE_REST + jmp int_ret_from_sys_call CFI_ENDPROC END(ret_from_fork) @@ -283,7 +301,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f /* Really a signal */ @@ -377,7 +395,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -603,7 +621,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz retint_swapgs TRACE_IRQS_ON sti @@ -960,7 +978,7 @@ ENTRY(kernel_thread) xorl %r9d,%r9d # clone now - call do_fork + call do_fork_kthread movq %rax,RAX(%rsp) xorl %edi,%edi diff -uprN linux-2.6.24/arch/x86/kernel/ldt_32.c linux-2.6.24.ovz/arch/x86/kernel/ldt_32.c --- linux-2.6.24/arch/x86/kernel/ldt_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/ldt_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,8 @@ #include #include +#include + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) { @@ -36,9 +39,9 @@ static int alloc_ldt(mm_context_t *pc, i oldsize = pc->size; mincount = (mincount+511)&(~511); if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL_UBC); if (!newldt) return -ENOMEM; @@ -102,6 +105,7 @@ int init_new_context(struct task_struct } return retval; } +EXPORT_SYMBOL_GPL(init_new_context); /* * No need to lock the MM as we are the last user diff -uprN linux-2.6.24/arch/x86/kernel/ldt_64.c linux-2.6.24.ovz/arch/x86/kernel/ldt_64.c --- linux-2.6.24/arch/x86/kernel/ldt_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/ldt_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,8 @@ #include #include +#include + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) { @@ -39,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, u oldsize = pc->size; mincount = (mincount+511)&(~511); if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL_UBC); if (!newldt) return -ENOMEM; @@ -106,6 +109,7 @@ int init_new_context(struct task_struct } return retval; } 
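
Both ldt_32.c and ldt_64.c above switch the LDT buffers to the beancounter-charged allocators, so descriptor tables created inside a container are accounted against that container. A minimal sketch of the shared allocation pattern; the helper name is illustrative, not part of the patch:

	/* Illustrative sketch of the charged allocation used by alloc_ldt()
	 * above: large tables go through ub_vmalloc(), small ones through the
	 * UBC slab flag, so either way the memory is charged to the current
	 * beancounter instead of plain GFP_KERNEL. */
	static void *ldt_alloc_charged(size_t bytes)
	{
		if (bytes > PAGE_SIZE)
			return ub_vmalloc(bytes);
		return kmalloc(bytes, GFP_KERNEL_UBC);
	}
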
+EXPORT_SYMBOL_GPL(init_new_context); /* * diff -uprN linux-2.6.24/arch/x86/kernel/nmi_32.c linux-2.6.24.ovz/arch/x86/kernel/nmi_32.c --- linux-2.6.24/arch/x86/kernel/nmi_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/nmi_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -318,6 +318,21 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); +void smp_show_regs(struct pt_regs *regs, void *info) +{ + static DEFINE_SPINLOCK(show_regs_lock); + + if (regs == NULL) + return; + + spin_lock(&show_regs_lock); + bust_spinlocks(1); + printk("----------- IPI show regs -----------"); + show_regs(regs); + bust_spinlocks(0); + spin_unlock(&show_regs_lock); +} + __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { diff -uprN linux-2.6.24/arch/x86/kernel/nmi_64.c linux-2.6.24.ovz/arch/x86/kernel/nmi_64.c --- linux-2.6.24/arch/x86/kernel/nmi_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/nmi_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -41,7 +41,12 @@ static cpumask_t backtrace_mask = CPU_MA atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ int panic_on_timeout; +#ifdef CONFIG_NMI_WATCHDOG unsigned int nmi_watchdog = NMI_DEFAULT; +#else +unsigned int nmi_watchdog = NMI_NONE; +#endif + static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); @@ -354,10 +359,10 @@ int __kprobes nmi_watchdog_tick(struct p if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... + * wait a few IRQs (30 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 30*nmi_hz) die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, panic_on_timeout); } else { @@ -385,15 +390,34 @@ int __kprobes nmi_watchdog_tick(struct p static unsigned ignore_nmis; +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; + asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); add_pda(__nmi_count,1); - if (!ignore_nmis) + if (!ignore_nmis) { default_do_nmi(regs); + nmi_ipi_callback(regs, smp_processor_id()); + } nmi_exit(); } +void set_nmi_ipi_callback(nmi_callback_t callback) +{ + nmi_ipi_callback = callback; +} + +void unset_nmi_ipi_callback(void) +{ + nmi_ipi_callback = dummy_nmi_callback; +} + int do_nmi_callback(struct pt_regs * regs, int cpu) { #ifdef CONFIG_SYSCTL diff -uprN linux-2.6.24/arch/x86/kernel/process_32.c linux-2.6.24.ovz/arch/x86/kernel/process_32.c --- linux-2.6.24/arch/x86/kernel/process_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/process_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -52,11 +53,15 @@ #endif #include +#include #include #include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +EXPORT_SYMBOL(ret_from_fork); +asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume"); +EXPORT_SYMBOL_GPL(i386_ret_from_resume); static int hlt_counter; @@ -324,16 +329,17 @@ void __show_registers(struct pt_regs *re } printk("\n"); - printk("Pid: %d, comm: %s %s (%s %.*s)\n", + printk("Pid: %d, comm: %s %s (%s %.*s %s)\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, 
(int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, VZVERSION); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 0xffff & regs->xcs, regs->eip, regs->eflags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + if (decode_call_traces) + print_symbol("EIP is at %s\n", regs->eip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->eax, regs->ebx, regs->ecx, regs->edx); @@ -370,6 +376,8 @@ void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); show_trace(NULL, regs, ®s->esp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n",regs->eip); } /* @@ -378,6 +386,7 @@ void show_regs(struct pt_regs *regs) * the "args". */ extern void kernel_thread_helper(void); +EXPORT_SYMBOL_GPL(kernel_thread_helper); /* * Create a kernel thread @@ -386,6 +395,13 @@ int kernel_thread(int (*fn)(void *), voi { struct pt_regs regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.ebx = (unsigned long) fn; diff -uprN linux-2.6.24/arch/x86/kernel/process_64.c linux-2.6.24.ovz/arch/x86/kernel/process_64.c --- linux-2.6.24/arch/x86/kernel/process_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/process_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,12 +26,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -52,8 +54,6 @@ #include #include -asmlinkage extern void ret_from_fork(void); - unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; unsigned long boot_option_idle_override = 0; @@ -325,13 +325,14 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", + printk("Pid: %d, comm: %.20s %s %s %.*s %s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, VZVERSION); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); + if (decode_call_traces) + printk_address(regs->rip); printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", @@ -379,7 +380,22 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); + show_trace(NULL, regs, ®s->rsp); +} + +void smp_show_regs(struct pt_regs *regs, void *data) +{ + static DEFINE_SPINLOCK(show_regs_lock); + + if (regs == NULL) + return; + + spin_lock(&show_regs_lock); + bust_spinlocks(1); + printk("----------- IPI show regs -----------\n"); + show_regs(regs); + bust_spinlocks(0); + spin_unlock(&show_regs_lock); } /* @@ -914,3 +930,20 @@ unsigned long arch_align_stack(unsigned sp -= get_random_int() % 8192; return sp & ~0xf; } + +long do_fork_kthread(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + if (ve_allow_kthreads || ve_is_super(get_exec_env())) + return do_fork(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr); + + /* Don't allow kernel_thread() inside VE */ + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; +} diff -uprN 
linux-2.6.24/arch/x86/kernel/ptrace_32.c linux-2.6.24.ovz/arch/x86/kernel/ptrace_32.c --- linux-2.6.24/arch/x86/kernel/ptrace_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/ptrace_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -682,8 +682,11 @@ int do_syscall_trace(struct pt_regs *reg return 0; /* Fake a debug trap */ - if (is_singlestep) + if (is_singlestep) { + set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY); send_sigtrap(current, regs, 0); + clear_pn_state(current); + } if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) goto out; diff -uprN linux-2.6.24/arch/x86/kernel/ptrace_64.c linux-2.6.24.ovz/arch/x86/kernel/ptrace_64.c --- linux-2.6.24/arch/x86/kernel/ptrace_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/ptrace_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -616,6 +616,10 @@ asmlinkage void syscall_trace_leave(stru if ((test_thread_flag(TIF_SYSCALL_TRACE) || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) + && (current->ptrace & PT_PTRACED)) { + set_pn_state(current, (regs->rax != -ENOSYS) ? + PN_STOP_LEAVE : PN_STOP_ENTRY); syscall_trace(regs); + clear_pn_state(current); + } } diff -uprN linux-2.6.24/arch/x86/kernel/setup64.c linux-2.6.24.ovz/arch/x86/kernel/setup64.c --- linux-2.6.24/arch/x86/kernel/setup64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/setup64.c 2008-03-25 18:53:59.000000000 -0500 @@ -293,3 +293,5 @@ void __cpuinit cpu_init (void) raw_local_save_flags(kernel_eflags); } + +EXPORT_SYMBOL_GPL(cpu_gdt_descr); diff -uprN linux-2.6.24/arch/x86/kernel/signal_32.c linux-2.6.24.ovz/arch/x86/kernel/signal_32.c --- linux-2.6.24/arch/x86/kernel/signal_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/signal_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -587,6 +588,9 @@ static void fastcall do_signal(struct pt if (!user_mode(regs)) return; + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else @@ -615,6 +619,7 @@ static void fastcall do_signal(struct pt return; } +no_signal: /* Did we come from a system call? */ if (regs->orig_eax >= 0) { /* Restart the system call - no handlers present */ diff -uprN linux-2.6.24/arch/x86/kernel/signal_64.c linux-2.6.24.ovz/arch/x86/kernel/signal_64.c --- linux-2.6.24/arch/x86/kernel/signal_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/signal_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -403,6 +404,9 @@ static void do_signal(struct pt_regs *re if (!user_mode(regs)) return; + if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else @@ -429,6 +433,7 @@ static void do_signal(struct pt_regs *re return; } +no_signal: /* Did we come from a system call? 
*/ if ((long)regs->orig_rax >= 0) { /* Restart the system call - no handlers present */ diff -uprN linux-2.6.24/arch/x86/kernel/smp_32.c linux-2.6.24.ovz/arch/x86/kernel/smp_32.c --- linux-2.6.24/arch/x86/kernel/smp_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/smp_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -427,6 +428,8 @@ void flush_tlb_mm (struct mm_struct * mm preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); + void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; @@ -602,6 +605,89 @@ native_smp_call_function_mask(cpumask_t return 0; } +static DEFINE_SPINLOCK(nmi_call_lock); +static struct nmi_call_data_struct { + smp_nmi_function func; + void *info; + atomic_t started; + atomic_t finished; + cpumask_t cpus_called; + int wait; +} *nmi_call_data; + +static int smp_nmi_callback(struct pt_regs * regs, int cpu) +{ + smp_nmi_function func; + void *info; + int wait; + + func = nmi_call_data->func; + info = nmi_call_data->info; + wait = nmi_call_data->wait; + ack_APIC_irq(); + /* prevent from calling func() multiple times */ + if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) + return 0; + /* + * notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&nmi_call_data->started); + /* at this point the nmi_call_data structure is out of scope */ + irq_enter(); + func(regs, info); + irq_exit(); + if (wait) + atomic_inc(&nmi_call_data->finished); + + return 0; +} + +/* + * This function tries to call func(regs, info) on each cpu. + * Func must be fast and non-blocking. + * May be called with disabled interrupts and from any context. + */ +int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + struct nmi_call_data_struct data; + int cpus; + + cpus = num_online_cpus() - 1; + if (!cpus) + return 0; + + data.func = func; + data.info = info; + data.wait = wait; + atomic_set(&data.started, 0); + atomic_set(&data.finished, 0); + cpus_clear(data.cpus_called); + /* prevent this cpu from calling func if NMI happens */ + cpu_set(smp_processor_id(), data.cpus_called); + + if (!spin_trylock(&nmi_call_lock)) + return -1; + + nmi_call_data = &data; + set_nmi_ipi_callback(smp_nmi_callback); + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(APIC_DM_NMI); + while (atomic_read(&data.started) != cpus) + barrier(); + + unset_nmi_ipi_callback(); + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + spin_unlock(&nmi_call_lock); + + return 0; +} + static void stop_this_cpu (void * dummy) { local_irq_disable(); diff -uprN linux-2.6.24/arch/x86/kernel/smp_64.c linux-2.6.24.ovz/arch/x86/kernel/smp_64.c --- linux-2.6.24/arch/x86/kernel/smp_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/smp_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -27,6 +27,7 @@ #include #include #include +#include /* * Smarter SMP flushing macros. 
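
smp_nmi_call_function() above delivers its callback by NMI, so it reaches CPUs that are spinning with interrupts disabled; as the comment notes, the callback runs in NMI context and must stay fast and non-blocking. die_nmi() in traps_32.c later in this patch uses it roughly as in this sketch to collect registers from every other CPU:

	/* Illustrative sketch of the die_nmi() usage further down in this
	 * patch: broadcast an NMI, let each CPU print its registers through
	 * smp_show_regs(), and wait (wait = 1) until all of them finish. */
	static void dump_regs_on_all_cpus(void)
	{
		smp_nmi_call_function(smp_show_regs, NULL, 1);
	}
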
@@ -463,6 +464,84 @@ int smp_call_function (void (*func) (voi } EXPORT_SYMBOL(smp_call_function); +static DEFINE_SPINLOCK(nmi_call_lock); +static struct nmi_call_data_struct { + smp_nmi_function func; + void *info; + atomic_t started; + atomic_t finished; + cpumask_t cpus_called; + int wait; +} *nmi_call_data; + +static int smp_nmi_callback(struct pt_regs * regs, int cpu) +{ + smp_nmi_function func; + void *info; + int wait; + + func = nmi_call_data->func; + info = nmi_call_data->info; + wait = nmi_call_data->wait; + ack_APIC_irq(); + /* prevent from calling func() multiple times */ + if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) + return 0; + /* + * notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&nmi_call_data->started); + /* at this point the nmi_call_data structure is out of scope */ + irq_enter(); + func(regs, info); + irq_exit(); + if (wait) + atomic_inc(&nmi_call_data->finished); + + return 0; +} + +int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + struct nmi_call_data_struct data; + int cpus; + + cpus = num_online_cpus() - 1; + if (!cpus) + return 0; + + data.func = func; + data.info = info; + data.wait = wait; + atomic_set(&data.started, 0); + atomic_set(&data.finished, 0); + cpus_clear(data.cpus_called); + /* prevent this cpu from calling func if NMI happens */ + cpu_set(smp_processor_id(), data.cpus_called); + + if (!spin_trylock(&nmi_call_lock)) + return -1; + + nmi_call_data = &data; + set_nmi_ipi_callback(smp_nmi_callback); + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(APIC_DM_NMI); + while (atomic_read(&data.started) != cpus) + barrier(); + + unset_nmi_ipi_callback(); + if (wait) + while (atomic_read(&data.finished) != cpus) + barrier(); + spin_unlock(&nmi_call_lock); + + return 0; +} + static void stop_this_cpu(void *dummy) { local_irq_disable(); diff -uprN linux-2.6.24/arch/x86/kernel/smpboot_32.c linux-2.6.24.ovz/arch/x86/kernel/smpboot_32.c --- linux-2.6.24/arch/x86/kernel/smpboot_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/smpboot_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -800,6 +800,13 @@ static int __cpuinit do_boot_cpu(int api early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); idle->thread.eip = (unsigned long) start_secondary; + +#ifdef CONFIG_VE + /* Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. */ + VE_TASK_INFO(idle)->sleep_time = 0; +#endif + /* start_eip had better be page-aligned! 
*/ start_eip = setup_trampoline(); diff -uprN linux-2.6.24/arch/x86/kernel/syscall_table_32.S linux-2.6.24.ovz/arch/x86/kernel/syscall_table_32.S --- linux-2.6.24/arch/x86/kernel/syscall_table_32.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/syscall_table_32.S 2008-03-25 18:53:59.000000000 -0500 @@ -324,3 +324,24 @@ ENTRY(sys_call_table) .long sys_timerfd .long sys_eventfd .long sys_fallocate + .rept 500-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + .long sys_fairsched_mknod /* 500 */ + .long sys_fairsched_rmnod + .long sys_fairsched_chwt + .long sys_fairsched_mvpr + .long sys_fairsched_rate + .long sys_fairsched_vcpus /* 505 */ + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_getluid /* 510 */ + .long sys_setluid + .long sys_setublimit + .long sys_ubstat + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_lchmod /* 516 */ + .long sys_lutime diff -uprN linux-2.6.24/arch/x86/kernel/sysenter_32.c linux-2.6.24.ovz/arch/x86/kernel/sysenter_32.c --- linux-2.6.24/arch/x86/kernel/sysenter_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/sysenter_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -215,7 +215,10 @@ static int __init gate_vma_init(void) */ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; -static struct page *syscall_pages[1]; +void *syscall_page; +EXPORT_SYMBOL(syscall_page); +struct page *syscall_pages[1]; +EXPORT_SYMBOL_GPL(syscall_pages); static void map_compat_vdso(int map) { @@ -235,10 +238,10 @@ static void map_compat_vdso(int map) int __init sysenter_setup(void) { - void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); const void *vsyscall; size_t vsyscall_len; + syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); syscall_pages[0] = virt_to_page(syscall_page); gate_vma_init(); @@ -261,15 +264,22 @@ int __init sysenter_setup(void) /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; +EXPORT_SYMBOL_GPL(SYSENTER_RETURN); /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address) { struct mm_struct *mm = current->mm; unsigned long addr; int ret = 0; bool compat; + if (unlikely(!vdso_enabled) && map_address == 0) { + current->mm->context.vdso = NULL; + return 0; + } + down_write(&mm->mmap_sem); /* Test compat mode once here, in case someone @@ -281,7 +291,7 @@ int arch_setup_additional_pages(struct l if (compat) addr = VDSO_HIGH_BASE; else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + addr = get_unmapped_area(NULL, map_address, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; @@ -315,6 +325,7 @@ int arch_setup_additional_pages(struct l return ret; } +EXPORT_SYMBOL(arch_setup_additional_pages); const char *arch_vma_name(struct vm_area_struct *vma) { diff -uprN linux-2.6.24/arch/x86/kernel/traps_32.c linux-2.6.24.ovz/arch/x86/kernel/traps_32.c --- linux-2.6.24/arch/x86/kernel/traps_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/traps_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -58,6 +58,7 @@ #include #include +#include #include "mach_traps.h" @@ -220,7 +221,8 @@ static int print_trace_stack(void *data, static void print_trace_address(void *data, unsigned long addr) { printk("%s [<%08lx>] ", (char *)data, addr); - print_symbol("%s\n", 
addr); + if (decode_call_traces) + print_symbol("%s\n", addr); touch_nmi_watchdog(); } @@ -236,7 +238,10 @@ show_trace_log_lvl(struct task_struct *t unsigned long * stack, char *log_lvl) { dump_trace(task, regs, stack, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); + if (decode_call_traces) + printk("%s =======================\n", log_lvl); + else + printk("%s ==", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -266,8 +271,13 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } - printk("\n%sCall Trace:\n", log_lvl); + if (decode_call_traces) + printk("\n%s Call Trace:\n", log_lvl); + else + printk("\n%s Call Trace: ", log_lvl); show_trace_log_lvl(task, regs, esp, log_lvl); + if (!decode_call_traces) + printk("\n"); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -289,6 +299,8 @@ void dump_stack(void) (int)strcspn(init_utsname()->version, " "), init_utsname()->version); show_trace(current, NULL, &stack); + if (!decode_call_traces) + printk("\n"); } EXPORT_SYMBOL(dump_stack); @@ -299,8 +311,9 @@ void show_registers(struct pt_regs *regs print_modules(); __show_registers(regs, 0); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, task_pid_nr(current), + VEID(current->ve_task_info.owner_env), current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the @@ -351,6 +364,13 @@ int is_valid_bugaddr(unsigned long eip) return ud2 == 0x0b0f; } +static void inline check_kernel_csum_bug(void) +{ + if (kernel_text_csum_broken) + printk("Kernel code checksum mismatch detected %d times\n", + kernel_text_csum_broken); +} + /* * This is gone through when something in the kernel has done something bad and * is about to be terminated. 
@@ -420,6 +440,7 @@ void die(const char * str, struct pt_reg } else printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + check_kernel_csum_bug(); bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); @@ -690,12 +711,27 @@ unknown_nmi_error(unsigned char reason, printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); +/* + * Voyager doesn't implement these + */ +void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) +{ +} +#ifdef CONFIG_SMP +int __attribute__((weak)) +smp_nmi_call_function(smp_nmi_function func, void *info, int wait) +{ + return 0; +} +#endif + void __kprobes die_nmi(struct pt_regs *regs, const char *msg) { + static DEFINE_SPINLOCK(nmi_print_lock); + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == - NOTIFY_STOP) + NOTIFY_STOP) return; spin_lock(&nmi_print_lock); @@ -708,6 +744,10 @@ void __kprobes die_nmi(struct pt_regs *r printk(" on CPU%d, eip %08lx, registers:\n", smp_processor_id(), regs->eip); show_registers(regs); + smp_nmi_call_function(smp_show_regs, NULL, 1); + bust_spinlocks(1); + if (!decode_call_traces) + show_registers(regs); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); @@ -723,6 +763,13 @@ void __kprobes die_nmi(struct pt_regs *r do_exit(SIGSEGV); } +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; + static __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -742,10 +789,15 @@ static __kprobes void default_do_nmi(str */ if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs, smp_processor_id())) -#endif + if (!do_nmi_callback(regs, smp_processor_id())) { unknown_nmi_error(reason, regs); + return; + } +#endif + if (nmi_ipi_callback != dummy_nmi_callback) + return; + unknown_nmi_error(reason, regs); return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -773,12 +825,24 @@ fastcall __kprobes void do_nmi(struct pt ++nmi_count(cpu); - if (!ignore_nmis) + if (!ignore_nmis) { default_do_nmi(regs); + nmi_ipi_callback(regs, cpu); + } nmi_exit(); } +void set_nmi_ipi_callback(nmi_callback_t callback) +{ + nmi_ipi_callback = callback; +} + +void unset_nmi_ipi_callback(void) +{ + nmi_ipi_callback = dummy_nmi_callback; +} + void stop_nmi(void) { acpi_nmi_disable(); diff -uprN linux-2.6.24/arch/x86/kernel/traps_64.c linux-2.6.24.ovz/arch/x86/kernel/traps_64.c --- linux-2.6.24/arch/x86/kernel/traps_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/traps_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -107,6 +107,11 @@ void printk_address(unsigned long addres char *delim = ":"; char namebuf[128]; + if (!decode_call_traces) { + printk("[<%016lx>]", address); + return; + } + symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); if (!symname) { @@ -382,7 +387,7 @@ _show_stack(struct task_struct *tsk, str if (((long) stack & (THREAD_SIZE-1)) == 0) break; } - if (i && ((i % 4) == 0)) + if (i && ((i % 4) == 0) && decode_call_traces) printk("\n"); printk(" %016lx", *stack++); touch_nmi_watchdog(); @@ -421,10 +426,12 @@ void show_registers(struct pt_regs *regs struct task_struct *cur = cpu_pda(cpu)->pcurrent; rsp = regs->rsp; - printk("CPU %d ", cpu); + printk("CPU: %d ", cpu); __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk("Process %s (pid: %d, 
veid=%d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, + VEID(VE_TASK_INFO(current)->owner_env), + task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the diff -uprN linux-2.6.24/arch/x86/kernel/tsc_sync.c linux-2.6.24.ovz/arch/x86/kernel/tsc_sync.c --- linux-2.6.24/arch/x86/kernel/tsc_sync.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/tsc_sync.c 2008-03-25 18:53:59.000000000 -0500 @@ -145,7 +145,10 @@ void __cpuinit check_tsc_sync_source(int } else { printk(" passed.\n"); } - +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif /* * Let the target continue with the bootup: */ diff -uprN linux-2.6.24/arch/x86/kernel/vsyscall-sigreturn_32.S linux-2.6.24.ovz/arch/x86/kernel/vsyscall-sigreturn_32.S --- linux-2.6.24/arch/x86/kernel/vsyscall-sigreturn_32.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/vsyscall-sigreturn_32.S 2008-03-25 18:53:59.000000000 -0500 @@ -15,7 +15,7 @@ */ .text - .org __kernel_vsyscall+32,0x90 + .org __kernel_vsyscall+0x100,0x90 .globl __kernel_sigreturn .type __kernel_sigreturn,@function __kernel_sigreturn: @@ -27,6 +27,7 @@ __kernel_sigreturn: .size __kernel_sigreturn,.-.LSTART_sigreturn .balign 32 + .org __kernel_vsyscall+0x200,0x90 .globl __kernel_rt_sigreturn .type __kernel_rt_sigreturn,@function __kernel_rt_sigreturn: diff -uprN linux-2.6.24/arch/x86/kernel/vsyscall-sysenter_32.S linux-2.6.24.ovz/arch/x86/kernel/vsyscall-sysenter_32.S --- linux-2.6.24/arch/x86/kernel/vsyscall-sysenter_32.S 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/vsyscall-sysenter_32.S 2008-03-25 18:53:59.000000000 -0500 @@ -39,12 +39,12 @@ __kernel_vsyscall: movl %esp,%ebp sysenter - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 + /* 17: align return point with nop's to make disassembly easier */ + .space 13,0x90 - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ + /* 30: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel - /* 16: System call normal return point is here! */ + /* 32: System call normal return point is here! */ .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ SYSENTER_RETURN: pop %ebp diff -uprN linux-2.6.24/arch/x86/kernel/x8664_ksyms_64.c linux-2.6.24.ovz/arch/x86/kernel/x8664_ksyms_64.c --- linux-2.6.24/arch/x86/kernel/x8664_ksyms_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/kernel/x8664_ksyms_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -3,12 +3,14 @@ #include #include +#include #include #include #include #include +EXPORT_SYMBOL(kernel_execve); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(__down_failed); diff -uprN linux-2.6.24/arch/x86/mm/fault_32.c linux-2.6.24.ovz/arch/x86/mm/fault_32.c --- linux-2.6.24/arch/x86/mm/fault_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/mm/fault_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -279,7 +279,7 @@ static inline int vmalloc_fault(unsigned return 0; } -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; /* * This routine handles page faults. 
It determines the address, @@ -420,7 +420,6 @@ good_area: goto bad_area; } - survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -593,14 +592,14 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; + if (error_code & 4) { + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. + */ + force_sig(SIGKILL, tsk); + return; } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); goto no_context; do_sigbus: diff -uprN linux-2.6.24/arch/x86/mm/fault_64.c linux-2.6.24.ovz/arch/x86/mm/fault_64.c --- linux-2.6.24/arch/x86/mm/fault_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/mm/fault_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -285,7 +285,7 @@ static int vmalloc_fault(unsigned long a return 0; } -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; /* * This routine handles page faults. It determines the address, @@ -375,7 +375,6 @@ asmlinkage void __kprobes do_page_fault( if (user_mode_vm(regs)) error_code |= PF_USER; - again: /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -487,7 +486,7 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk( + ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", tsk->pid > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, tsk->pid, address, regs->rip, @@ -537,7 +536,8 @@ no_context: else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); + if (decode_call_traces) + printk_address(regs->rip); dump_pagetable(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -554,13 +554,14 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - goto again; + if (error_code & 4) { + /* + * 0-order allocation always success if something really + * fatal not happen: beancounter overdraft or OOM. 
+ */ + force_sig(SIGKILL, tsk); + return; } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); goto no_context; do_sigbus: diff -uprN linux-2.6.24/arch/x86/mm/hugetlbpage.c linux-2.6.24.ovz/arch/x86/mm/hugetlbpage.c --- linux-2.6.24/arch/x86/mm/hugetlbpage.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/mm/hugetlbpage.c 2008-03-25 18:53:59.000000000 -0500 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -206,6 +207,7 @@ int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_PSE); } +EXPORT_SYMBOL(pmd_huge); struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, diff -uprN linux-2.6.24/arch/x86/mm/init_32.c linux-2.6.24.ovz/arch/x86/mm/init_32.c --- linux-2.6.24/arch/x86/mm/init_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/mm/init_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -757,7 +757,7 @@ void __init pgtable_cache_init(void) pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, + SLAB_PANIC|SLAB_UBC, pmd_ctor); } diff -uprN linux-2.6.24/arch/x86/mm/init_64.c linux-2.6.24.ovz/arch/x86/mm/init_64.c --- linux-2.6.24/arch/x86/mm/init_64.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/mm/init_64.c 2008-03-25 18:53:59.000000000 -0500 @@ -96,6 +96,7 @@ void show_mem(void) printk(KERN_INFO "%lu pages shared\n",shared); printk(KERN_INFO "%lu pages swap cached\n",cached); } +EXPORT_SYMBOL(show_mem); int after_bootmem; diff -uprN linux-2.6.24/arch/x86/mm/pgtable_32.c linux-2.6.24.ovz/arch/x86/mm/pgtable_32.c --- linux-2.6.24/arch/x86/mm/pgtable_32.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/mm/pgtable_32.c 2008-03-25 18:53:59.000000000 -0500 @@ -4,9 +4,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -71,6 +73,7 @@ void show_mem(void) printk(KERN_INFO "%lu pages pagetables\n", global_page_state(NR_PAGETABLE)); } +EXPORT_SYMBOL(show_mem); /* * Associate a virtual page frame with a given physical page frame @@ -188,9 +191,11 @@ struct page *pte_alloc_one(struct mm_str struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_HIGHMEM| + __GFP_REPEAT|__GFP_ZERO, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC| + __GFP_REPEAT|__GFP_ZERO, 0); #endif return pte; } diff -uprN linux-2.6.24/arch/x86/vdso/vma.c linux-2.6.24.ovz/arch/x86/vdso/vma.c --- linux-2.6.24/arch/x86/vdso/vma.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/arch/x86/vdso/vma.c 2008-03-25 18:53:59.000000000 -0500 @@ -4,6 +4,7 @@ * Subject to the GPL, v.2 */ #include +#include #include #include #include @@ -100,18 +101,24 @@ static unsigned long vdso_addr(unsigned /* Setup a VMA at program startup for the vsyscall page. 
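
init_32.c and pgtable_32.c above move page-table allocations onto the beancounter-charged variants (SLAB_UBC for the pmd cache, GFP_KERNEL_UBC plus __GFP_SOFT_UBC for pte pages), so page tables built by a container count against its memory beancounter. A sketch of the flag combination only; the exact barrier/limit behaviour behind __GFP_SOFT_UBC is defined by the beancounter core and is not shown in this section:

	/* Illustrative: the charged, soft-charged, zeroed allocation the
	 * patched pte_alloc_one() performs in the non-HIGHPTE case. */
	#define PTE_ALLOC_GFP	(GFP_KERNEL_UBC | __GFP_SOFT_UBC | \
				 __GFP_REPEAT | __GFP_ZERO)

	static struct page *pte_page_alloc_charged(void)
	{
		return alloc_pages(PTE_ALLOC_GFP, 0);
	}
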
Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address) { struct mm_struct *mm = current->mm; unsigned long addr; int ret; unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); - if (!vdso_enabled) + if (!vdso_enabled && map_address == 0) { + current->mm->context.vdso = NULL; return 0; + } down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, len); + if (map_address) + addr = map_address; + else + addr = vdso_addr(mm->start_stack, len); addr = get_unmapped_area(NULL, addr, len, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; @@ -131,6 +138,7 @@ up_fail: up_write(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL_GPL(arch_setup_additional_pages); static __init int vdso_setup(char *s) { diff -uprN linux-2.6.24/block/cfq-iosched.c linux-2.6.24.ovz/block/cfq-iosched.c --- linux-2.6.24/block/cfq-iosched.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/block/cfq-iosched.c 2008-03-25 18:53:59.000000000 -0500 @@ -11,6 +11,11 @@ #include #include #include +#include +#include +#include +#include +#include /* * tunables @@ -24,6 +29,7 @@ static const int cfq_slice_sync = HZ / 1 static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_ub_slice = HZ / 2; /* * grace period before allowing idle class to get disk access @@ -40,13 +46,11 @@ static int cfq_slice_idle = HZ / 125; #define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) #define RQ_CFQQ(rq) ((rq)->elevator_private2) -static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; -#define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -55,107 +59,6 @@ static struct completion *ioc_gone; #define sample_valid(samples) ((samples) > 80) -/* - * Most of our rbtree usage is for sorting with min extraction, so - * if we cache the leftmost node we don't have to walk down the tree - * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should - * move this into the elevator for the rq sorting as well. 
- */ -struct cfq_rb_root { - struct rb_root rb; - struct rb_node *left; -}; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } - -/* - * Per block device queue structure - */ -struct cfq_data { - struct request_queue *queue; - - /* - * rr list of queues with requests and the count of them - */ - struct cfq_rb_root service_tree; - unsigned int busy_queues; - - int rq_in_driver; - int sync_flight; - int hw_tag; - - /* - * idle window management - */ - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; - struct cfq_io_context *active_cic; - - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - - struct timer_list idle_class_timer; - - sector_t last_position; - unsigned long last_end_request; - - /* - * tunables, see top of file - */ - unsigned int cfq_quantum; - unsigned int cfq_fifo_expire[2]; - unsigned int cfq_back_penalty; - unsigned int cfq_back_max; - unsigned int cfq_slice[2]; - unsigned int cfq_slice_async_rq; - unsigned int cfq_slice_idle; - - struct list_head cic_list; -}; - -/* - * Per process-grouping structure - */ -struct cfq_queue { - /* reference count */ - atomic_t ref; - /* parent cfq_data */ - struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - unsigned long rb_key; - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* pending metadata requests */ - int meta_pending; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - unsigned long slice_end; - long slice_resid; - - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; - - /* various state flags, see below */ - unsigned int flags; -}; - enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -200,6 +103,67 @@ CFQ_CFQQ_FNS(sync); static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, struct task_struct *, gfp_t); +static void cfq_put_queue(struct cfq_queue *cfqq); + +static void __cfq_put_async_queues(struct cfq_bc_data *cfq_bc) +{ + int i; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) { + if (cfq_bc->async_cfqq[0][i]) { + cfq_put_queue(cfq_bc->async_cfqq[0][i]); + cfq_bc->async_cfqq[0][i] = NULL; + } + if (cfq_bc->async_cfqq[1][i]) { + cfq_put_queue(cfq_bc->async_cfqq[1][i]); + cfq_bc->async_cfqq[1][i] = NULL; + } + } + if (cfq_bc->async_idle_cfqq) { + cfq_put_queue(cfq_bc->async_idle_cfqq); + cfq_bc->async_idle_cfqq = NULL; + } +} + +#ifdef CONFIG_BC_IO_SCHED +static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync) +{ + int mode; + + mode = sync ? cfqd->virt_mode : cfqd->write_virt_mode; + return mode ? 
&get_io_ub()->iopriv : &get_ub0()->iopriv; +} + +static inline void cfq_put_async_queues(struct cfq_data *cfqd) +{ + struct user_beancounter *ub; + struct cfq_bc_data *cfq_bc; + + rcu_read_lock(); + for_each_beancounter(ub) { + write_lock(&ub->iopriv.cfq_bc_list_lock); + cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); + if (!cfq_bc) { + write_unlock(&ub->iopriv.cfq_bc_list_lock); + continue; + } + __cfq_put_async_queues(cfq_bc); + write_unlock(&ub->iopriv.cfq_bc_list_lock); + } + rcu_read_unlock(); +} +#else +static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync) +{ + return NULL; +} + +static inline void cfq_put_async_queues(struct cfq_data *cfqd) +{ + __cfq_put_async_queues(&cfqd->cfq_bc); +} +#endif + static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *, struct io_context *); @@ -286,6 +250,11 @@ static inline int cfq_slice_used(struct return 1; } +static inline struct user_beancounter *ub_by_iopriv(struct ub_iopriv *iopriv) +{ + return container_of(iopriv, struct user_beancounter, iopriv); +} + /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closest to the head right now. Distance @@ -446,11 +415,15 @@ static unsigned long cfq_slice_offset(st static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, int add_front) { - struct rb_node **p = &cfqd->service_tree.rb.rb_node; + struct cfq_bc_data *cfq_bc; + struct rb_node **p; struct rb_node *parent = NULL; unsigned long rb_key; int left; + cfq_bc = cfqq->cfq_bc; + + p = &cfq_bc->service_tree.rb.rb_node; if (!add_front) { rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; rb_key += cfqq->slice_resid; @@ -465,7 +438,7 @@ static void cfq_service_tree_add(struct if (rb_key == cfqq->rb_key) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree); } left = 1; @@ -501,11 +474,11 @@ static void cfq_service_tree_add(struct } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + cfq_bc->service_tree.left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &cfq_bc->service_tree.rb); } /* @@ -530,6 +503,7 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, s BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + bc_inc_rqnum(cfqq); cfq_resort_rr_list(cfqd, cfqq); } @@ -541,14 +515,19 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, s static inline void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_bc_data *cfq_bc; + BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); + cfq_bc = cfqq->cfq_bc; + if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + bc_dec_rqnum(cfqq); } /* @@ -665,8 +644,7 @@ static void cfq_remove_request(struct re } } -static int cfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) +static int cfq_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; struct request *__rq; @@ -795,7 +773,7 @@ static int start_idle_class_timer(struct unsigned long now = jiffies; if (time_before(now, end) && - time_after_eq(now, cfqd->last_end_request)) { + time_after_eq(now, cfqd->last_end_request)) { mod_timer(&cfqd->idle_class_timer, end); return 1; } @@ -809,13 +787,18 @@ static int 
start_idle_class_timer(struct */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { + struct cfq_bc_data *cfq_bc; struct cfq_queue *cfqq; struct rb_node *n; - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) return NULL; - n = cfq_rb_first(&cfqd->service_tree); + if (RB_EMPTY_ROOT(&cfq_bc->service_tree.rb)) + return NULL; + + n = cfq_rb_first(&cfq_bc->service_tree); cfqq = rb_entry(n, struct cfq_queue, rb_node); if (cfq_class_idle(cfqq)) { @@ -837,9 +820,17 @@ static struct cfq_queue *cfq_get_next_qu */ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) { - struct cfq_queue *cfqq; + struct cfq_queue *cfqq = NULL; + struct cfq_bc_data *cfq_bc; + + bc_schedule_active(cfqd); + + cfq_bc = cfqd->active_cfq_bc; + if (!cfq_bc) + goto out; cfqq = cfq_get_next_queue(cfqd); +out: __cfq_set_active_queue(cfqd, cfqq); return cfqq; } @@ -930,6 +921,7 @@ static void cfq_dispatch_insert(struct r cfq_remove_request(rq); cfqq->dispatched++; + cfqq->cfq_bc->on_dispatch++; elv_dispatch_sort(q, rq); if (cfq_cfqq_sync(cfqq)) @@ -987,7 +979,7 @@ static struct cfq_queue *cfq_select_queu /* * The active queue has run out of time, expire it and select new. */ - if (cfq_slice_used(cfqq)) + if (cfq_slice_used(cfqq) || bc_expired(cfqd)) goto expire; /* @@ -1085,17 +1077,36 @@ static inline int __cfq_forced_dispatch_ * Drain our current requests. Used for barriers and when switching * io schedulers on-the-fly. */ -static int cfq_forced_dispatch(struct cfq_data *cfqd) +static int __cfq_forced_dispatch(struct cfq_bc_data *cfq_bc) { int dispatched = 0; struct rb_node *n; - while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) { + while ((n = cfq_rb_first(&cfq_bc->service_tree)) != NULL) { struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node); dispatched += __cfq_forced_dispatch_cfqq(cfqq); } + return dispatched; +} + +static int cfq_forced_dispatch(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + struct cfq_bc_data *cfq_bc_tmp; + int dispatched; + + dispatched = 0; + /* + * We use here _safe iterating, because + * __cfq_forced_dispatch() produces list_del() implicitly + */ + list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, + &cfqd->act_cfq_bc_head, act_cfq_bc_list) { + dispatched += __cfq_forced_dispatch(cfq_bc); + } + cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); @@ -1208,6 +1219,10 @@ static void __cfq_exit_single_io_context smp_wmb(); cic->key = NULL; + /* + * cic->cfqq[ASYNC] is always NULL and the put of async queues + * happens on appropriate bc death or device unplug + */ if (cic->cfqq[ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); cic->cfqq[ASYNC] = NULL; @@ -1327,6 +1342,10 @@ static inline void changed_ioprio(struct spin_lock_irqsave(cfqd->queue->queue_lock, flags); + /* + * cic->cfqq[ASYNC] is always NULL, ioprio change + * for async queues happens automatically + */ cfqq = cic->cfqq[ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; @@ -1367,8 +1386,11 @@ cfq_find_alloc_queue(struct cfq_data *cf { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct ub_iopriv *iopriv; + struct cfq_bc_data *cfq_bc = NULL; retry: + iopriv = cfqq_ub_iopriv(cfqd, is_sync); cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1386,18 +1408,32 @@ retry: */ spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_NOFAIL | __GFP_ZERO, + gfp_mask|__GFP_NOFAIL|__GFP_ZERO, cfqd->queue->node); + if 
(new_cfqq) { + cfq_bc = bc_findcreate_cfq_bc(iopriv, + cfqd, gfp_mask); + if (!cfq_bc) { + kmem_cache_free(cfq_pool, new_cfqq); + new_cfqq = NULL; + } + } spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else { cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); + gfp_mask|__GFP_ZERO, cfqd->queue->node); if (!cfqq) goto out; + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); + if (!cfq_bc) { + kmem_cache_free(cfq_pool, cfqq); + cfqq = NULL; + goto out; + } } + cfqq->cfq_bc = cfq_bc; RB_CLEAR_NODE(&cfqq->rb_node); INIT_LIST_HEAD(&cfqq->fifo); @@ -1424,15 +1460,15 @@ out: } static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_bc_data *cfq_bc, int ioprio_class, int ioprio) { switch(ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfq_bc->async_cfqq[0][ioprio]; case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfq_bc->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfq_bc->async_idle_cfqq; default: BUG(); } @@ -1446,12 +1482,18 @@ cfq_get_queue(struct cfq_data *cfqd, int const int ioprio_class = task_ioprio_class(tsk); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; + struct cfq_bc_data *cfq_bc; + struct ub_iopriv *iopriv; + + iopriv = cfqq_ub_iopriv(cfqd, is_sync); if (!is_sync) { - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); + if (!cfq_bc) + return NULL; + async_cfqq = cfq_async_queue_prio(cfq_bc, ioprio_class, ioprio); cfqq = *async_cfqq; } - if (!cfqq) { cfqq = cfq_find_alloc_queue(cfqd, is_sync, tsk, gfp_mask); if (!cfqq) @@ -1815,6 +1857,7 @@ static void cfq_completed_request(struct WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; + cfqq->cfq_bc->on_dispatch--; if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight--; @@ -1927,6 +1970,7 @@ static void cfq_put_request(struct reque rq->elevator_private = NULL; rq->elevator_private2 = NULL; + put_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv)); cfq_put_queue(cfqq); } } @@ -1944,14 +1988,19 @@ cfq_set_request(struct request_queue *q, const int is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; unsigned long flags; + struct ub_iopriv *iopriv; + struct cfq_bc_data *cfq_bc = NULL; might_sleep_if(gfp_mask & __GFP_WAIT); cic = cfq_get_io_context(cfqd, gfp_mask); + iopriv = cfqq_ub_iopriv(cfqd, is_sync); + if (!is_sync) + cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); spin_lock_irqsave(q->queue_lock, flags); - if (!cic) + if (!cic || (!is_sync && cfq_bc == NULL)) goto queue_fail; cfqq = cic_to_cfqq(cic, is_sync); @@ -1972,6 +2021,7 @@ cfq_set_request(struct request_queue *q, rq->elevator_private = cic; rq->elevator_private2 = cfqq; + get_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv)); return 0; queue_fail: @@ -2065,21 +2115,6 @@ static void cfq_shutdown_timer_wq(struct kblockd_flush_work(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(elevator_t *e) { struct cfq_data *cfqd = e->elevator_data; @@ -2106,6 +2141,8 @@ static void cfq_exit_queue(elevator_t *e 
cfq_shutdown_timer_wq(cfqd); + bc_cfq_exit_queue(cfqd); + kfree(cfqd); } @@ -2113,11 +2150,19 @@ static void *cfq_init_queue(struct reque { struct cfq_data *cfqd; - cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL|__GFP_ZERO, q->node); if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; + INIT_LIST_HEAD(&cfqd->act_cfq_bc_head); +#ifndef CONFIG_BC_IO_SCHED + cfq_init_cfq_bc(&cfqd->cfq_bc); + /* + * Adding ub0 to active list in order to serve force dispatching + * case uniformally. Note, that nobody removes ub0 from this list. + */ + list_add_tail(&cfqd->cfq_bc.act_cfq_bc_list, &cfqd->act_cfq_bc_head); +#endif INIT_LIST_HEAD(&cfqd->cic_list); cfqd->queue = q; @@ -2142,6 +2187,9 @@ static void *cfq_init_queue(struct reque cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_ub_slice = cfq_ub_slice; + cfqd->virt_mode = 1; + cfqd->write_virt_mode = 1; return cfqd; } @@ -2206,6 +2254,9 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd- SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); +SHOW_FUNCTION(cfq_ub_slice_show, cfqd->cfq_ub_slice, 1); +SHOW_FUNCTION(cfq_virt_mode_show, cfqd->virt_mode, 0); +SHOW_FUNCTION(cfq_write_virt_mode_show, cfqd->write_virt_mode, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -2233,7 +2284,9 @@ STORE_FUNCTION(cfq_slice_idle_store, &cf STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); -#undef STORE_FUNCTION +STORE_FUNCTION(cfq_ub_slice_store, &cfqd->cfq_ub_slice, 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_virt_mode_store, &cfqd->virt_mode, 0, 1, 0); +STORE_FUNCTION(cfq_write_virt_mode_store, &cfqd->write_virt_mode, 0, 1, 0); #define CFQ_ATTR(name) \ __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) @@ -2248,6 +2301,9 @@ static struct elv_fs_entry cfq_attrs[] = CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(ub_slice), + CFQ_ATTR(virt_mode), + CFQ_ATTR(write_virt_mode), __ATTR_NULL }; @@ -2271,6 +2327,7 @@ static struct elevator_type iosched_cfq .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, .trim = cfq_free_io_context, + .put_queue = cfq_put_queue, }, .elevator_attrs = cfq_attrs, .elevator_name = "cfq", diff -uprN linux-2.6.24/block/elevator.c linux-2.6.24.ovz/block/elevator.c --- linux-2.6.24/block/elevator.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/block/elevator.c 2008-03-25 18:53:59.000000000 -0500 @@ -40,6 +40,9 @@ static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); +struct kmem_cache *cfq_pool; +EXPORT_SYMBOL_GPL(cfq_pool); + /* * Merge hash stuff. 
*/ @@ -987,12 +990,12 @@ void elv_unregister(struct elevator_type */ if (e->ops.trim) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (p->io_context) e->ops.trim(p->io_context); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } diff -uprN linux-2.6.24/block/genhd.c linux-2.6.24.ovz/block/genhd.c --- linux-2.6.24/block/genhd.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/block/genhd.c 2008-03-25 18:53:59.000000000 -0500 @@ -18,6 +18,7 @@ #include struct kset block_subsys; +EXPORT_SYMBOL(block_subsys); static DEFINE_MUTEX(block_subsys_lock); /* diff -uprN linux-2.6.24/drivers/acpi/blacklist.c linux-2.6.24.ovz/drivers/acpi/blacklist.c --- linux-2.6.24/drivers/acpi/blacklist.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/acpi/blacklist.c 2008-03-25 18:53:59.000000000 -0500 @@ -208,24 +208,24 @@ static struct dmi_system_id acpi_osi_dmi * Disable OSI(Linux) warnings on all "Acer, inc." * * _OSI(Linux) disables the latest Windows BIOS code: + * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 3100"), * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5050"), + * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5100"), * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5580"), * DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 3010"), * _OSI(Linux) effect unknown: * DMI_MATCH(DMI_PRODUCT_NAME, "Ferrari 5000"), */ - { - .callback = dmi_disable_osi_linux, - .ident = "Acer, inc.", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Acer, inc."), - }, - }, + /* + * note that dmi_check_system() uses strstr() + * to match sub-strings rather than !strcmp(), + * so "Acer" below matches "Acer, inc." above. + */ /* * Disable OSI(Linux) warnings on all "Acer" * * _OSI(Linux) effect unknown: - * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5100"), + * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5315"), * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5610"), * DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 7720Z"), * DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 5520"), @@ -300,7 +300,7 @@ static struct dmi_system_id acpi_osi_dmi DMI_MATCH(DMI_BIOS_VENDOR, "COMPAL"), }, }, - { /* OSI(Linux) touches USB, breaks suspend to disk */ + { /* OSI(Linux) touches USB, unknown side-effect */ .callback = dmi_disable_osi_linux, .ident = "Dell Dimension 5150", .matches = { @@ -474,6 +474,11 @@ static struct dmi_system_id acpi_osi_dmi * * _OSI(Linux) confirmed to be a NOP: * DMI_MATCH(DMI_PRODUCT_NAME, "P1-J150B"), + * with DMI_MATCH(DMI_BOARD_NAME, "ROCKY"), + * + * unknown: + * DMI_MATCH(DMI_PRODUCT_NAME, "S1-MDGDG"), + * with DMI_MATCH(DMI_BOARD_NAME, "ROCKY"), */ { .callback = dmi_disable_osi_linux, diff -uprN linux-2.6.24/drivers/acpi/osl.c linux-2.6.24.ovz/drivers/acpi/osl.c --- linux-2.6.24/drivers/acpi/osl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/acpi/osl.c 2008-03-25 18:53:59.000000000 -0500 @@ -120,7 +120,7 @@ static char osi_additional_string[OSI_ST */ #define OSI_LINUX_ENABLE 0 -struct osi_linux { +static struct osi_linux { unsigned int enable:1; unsigned int dmi:1; unsigned int cmdline:1; @@ -1213,24 +1213,24 @@ acpi_status acpi_os_release_object(acpi_ * * Returns 0 on success */ -int acpi_dmi_dump(void) +static int acpi_dmi_dump(void) { if (!dmi_available) return -1; printk(KERN_NOTICE PREFIX "DMI System Vendor: %s\n", - dmi_get_slot(DMI_SYS_VENDOR)); + dmi_get_system_info(DMI_SYS_VENDOR)); printk(KERN_NOTICE PREFIX "DMI Product Name: %s\n", - dmi_get_slot(DMI_PRODUCT_NAME)); + dmi_get_system_info(DMI_PRODUCT_NAME)); printk(KERN_NOTICE PREFIX "DMI Product 
Version: %s\n", - dmi_get_slot(DMI_PRODUCT_VERSION)); + dmi_get_system_info(DMI_PRODUCT_VERSION)); printk(KERN_NOTICE PREFIX "DMI Board Name: %s\n", - dmi_get_slot(DMI_BOARD_NAME)); + dmi_get_system_info(DMI_BOARD_NAME)); printk(KERN_NOTICE PREFIX "DMI BIOS Vendor: %s\n", - dmi_get_slot(DMI_BIOS_VENDOR)); + dmi_get_system_info(DMI_BIOS_VENDOR)); printk(KERN_NOTICE PREFIX "DMI BIOS Date: %s\n", - dmi_get_slot(DMI_BIOS_DATE)); + dmi_get_system_info(DMI_BIOS_DATE)); return 0; } diff -uprN linux-2.6.24/drivers/base/class.c linux-2.6.24.ovz/drivers/base/class.c --- linux-2.6.24/drivers/base/class.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/base/class.c 2008-03-25 18:53:59.000000000 -0500 @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "base.h" #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) @@ -71,8 +73,13 @@ static struct kobj_type class_ktype = { }; /* Hotplug events for classes go to the class_obj subsys */ -static decl_subsys(class, &class_ktype, NULL); +decl_subsys(class, &class_ktype, NULL); +#ifndef CONFIG_VE +#define visible_class_subsys class_subsys +#else +#define visible_class_subsys (*get_exec_env()->class_subsys) +#endif int class_create_file(struct class * cls, const struct class_attribute * attr) { @@ -149,7 +156,7 @@ int class_register(struct class * cls) if (error) return error; - cls->subsys.kobj.kset = &class_subsys; + cls->subsys.kobj.kset = &visible_class_subsys; error = subsystem_register(&cls->subsys); if (!error) { @@ -452,8 +459,13 @@ static struct kset_uevent_ops class_ueve .uevent = class_uevent, }; -static decl_subsys(class_obj, &class_device_ktype, &class_uevent_ops); +decl_subsys(class_obj, &class_device_ktype, &class_uevent_ops); +#ifndef CONFIG_VE +#define visible_class_obj_subsys class_obj_subsys +#else +#define visible_class_obj_subsys (*get_exec_env()->class_obj_subsys) +#endif static int class_device_add_attrs(struct class_device * cd) { @@ -537,7 +549,7 @@ static struct class_device_attribute cla void class_device_initialize(struct class_device *class_dev) { - kobj_set_kset_s(class_dev, class_obj_subsys); + kobj_set_kset_s(class_dev, visible_class_obj_subsys); kobject_init(&class_dev->kobj); INIT_LIST_HEAD(&class_dev->node); } @@ -851,10 +863,19 @@ void class_interface_unregister(struct c class_put(parent); } +void prepare_sysfs_classes(void) +{ +#ifdef CONFIG_VE + get_ve0()->class_subsys = &class_subsys; + get_ve0()->class_obj_subsys = &class_obj_subsys; +#endif +} + int __init classes_init(void) { int retval; + prepare_sysfs_classes(); retval = subsystem_register(&class_subsys); if (retval) return retval; @@ -890,3 +911,6 @@ EXPORT_SYMBOL_GPL(class_device_remove_bi EXPORT_SYMBOL_GPL(class_interface_register); EXPORT_SYMBOL_GPL(class_interface_unregister); + +EXPORT_SYMBOL(class_subsys); +EXPORT_SYMBOL(class_obj_subsys); diff -uprN linux-2.6.24/drivers/base/core.c linux-2.6.24.ovz/drivers/base/core.c --- linux-2.6.24/drivers/base/core.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/base/core.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -406,7 +407,12 @@ static struct device_attribute devt_attr */ decl_subsys(devices, &device_ktype, &device_uevent_ops); - +EXPORT_SYMBOL_GPL(devices_subsys); +#ifdef CONFIG_VE +#define ve_devices_subsys (get_exec_env()->devices_subsys) +#else +#define ve_devices_subsys (&devices_subsys) +#endif /** * device_create_file - create sysfs attribute file for 
device. @@ -525,7 +531,7 @@ static void klist_children_put(struct kl void device_initialize(struct device *dev) { - kobj_set_kset_s(dev, devices_subsys); + dev->kobj.kset = ve_devices_subsys; kobject_init(&dev->kobj); klist_init(&dev->klist_children, klist_children_get, klist_children_put); @@ -556,12 +562,17 @@ static struct kobject * get_device_paren return NULL; } #else +#ifdef CONFIG_VE +#include +#define virtual_dir (get_exec_env()->_virtual_dir) +#else +static struct kobject *virtual_dir = NULL; +#endif + static struct kobject *virtual_device_parent(struct device *dev) { - static struct kobject *virtual_dir = NULL; - if (!virtual_dir) - virtual_dir = kobject_add_dir(&devices_subsys.kobj, "virtual"); + virtual_dir = kobject_add_dir(&get_exec_env()->devices_subsys->kobj, "virtual"); return virtual_dir; } @@ -1073,6 +1084,9 @@ struct device * device_find_child(struct int __init devices_init(void) { +#ifdef CONFIG_VE + ve0.devices_subsys = &devices_subsys; +#endif return subsystem_register(&devices_subsys); } diff -uprN linux-2.6.24/drivers/base/firmware_class.c linux-2.6.24.ovz/drivers/base/firmware_class.c --- linux-2.6.24/drivers/base/firmware_class.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/base/firmware_class.c 2008-03-25 18:53:59.000000000 -0500 @@ -292,7 +292,8 @@ firmware_class_timeout(u_long data) static inline void fw_setup_device_id(struct device *f_dev, struct device *dev) { - snprintf(f_dev->bus_id, BUS_ID_SIZE, "firmware-%s", dev->bus_id); + /* XXX warning we should watch out for name collisions */ + strlcpy(f_dev->bus_id, dev->bus_id, BUS_ID_SIZE); } static int fw_register_device(struct device **dev_p, const char *fw_name, diff -uprN linux-2.6.24/drivers/block/Kconfig linux-2.6.24.ovz/drivers/block/Kconfig --- linux-2.6.24/drivers/block/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/Kconfig 2008-03-25 18:54:00.000000000 -0500 @@ -394,6 +394,8 @@ config CDROM_PKTCDVD_WCACHE this option is dangerous unless the CD-RW media is known good, as we don't do deferred write error handling yet. +source "drivers/block/drbd/Kconfig" + config ATA_OVER_ETH tristate "ATA over Ethernet support" depends on NET diff -uprN linux-2.6.24/drivers/block/Makefile linux-2.6.24.ovz/drivers/block/Makefile --- linux-2.6.24/drivers/block/Makefile 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/Makefile 2008-03-25 18:54:00.000000000 -0500 @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o +obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff -uprN linux-2.6.24/drivers/block/drbd/Kconfig linux-2.6.24.ovz/drivers/block/drbd/Kconfig --- linux-2.6.24/drivers/block/drbd/Kconfig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/Kconfig 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,34 @@ +# +# DRBD device driver configuration +# +config BLK_DEV_DRBD + tristate "DRBD Distributed replicated block device support" + select INET + select PROC_FS + ---help--- + Drbd is a block device which is designed to build high availability + clusters. This is done by mirroring a whole block device via (a + dedicated) network. You could see it as a network RAID 1. + + Each device (drbd provides more than one of these devices) has a + state, which can be 'primary' or 'secondary'. 
On the node with the + primary device the application is supposed to run and to access the + device (/dev/drbdX). Every write is sent to the local 'lower level + block device' and via network to the node with the device in + 'secondary' state. + The secondary device simply writes the data to its lower level block + device. Reads are always carried out locally. + + Drbd management is done through user-space tools. + + Historically DRBD hijacked the NBD major number (43) + and device nodes (/dev/nbX). + We now have an officially assigned major number (147) + and /dev/drbdX. + + If for whatever weird reason you want to keep the old behaviour, + you can give a "use_nbd_major" module parameter. + + http://www.drbd.org/ + + If unsure, say N. diff -uprN linux-2.6.24/drivers/block/drbd/Makefile linux-2.6.24.ovz/drivers/block/drbd/Makefile --- linux-2.6.24/drivers/block/drbd/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/Makefile 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,11 @@ +#CFLAGS_drbd_sizeof_sanity_check.o = -Wpadded # -Werror + +drbd-objs := drbd_buildtag.o drbd_bitmap.o drbd_proc.o \ + drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o \ + lru_cache.o drbd_main.o drbd_strings.o drbd_nl.o + +ifndef CONFIG_CONNECTOR + drbd-objs += connector.o cn_queue.o +endif + +obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff -uprN linux-2.6.24/drivers/block/drbd/drbd_actlog.c linux-2.6.24.ovz/drivers/block/drbd/drbd_actlog.c --- linux-2.6.24/drivers/block/drbd/drbd_actlog.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_actlog.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,1260 @@ +/* +-*- linux-c -*- + drbd_actlog.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2007, Philipp Reisner . + Copyright (C) 2003-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include "drbd_int.h" + +/* This is what I like so much about the linux kernel: + * if you have a close look, you can almost always reuse code by someone else + * ;) + * this is mostly from drivers/md/md.c + */ +STATIC int _drbd_md_sync_page_io(struct drbd_backing_dev *bdev, + struct page *page, sector_t sector, + int rw, int size) +{ + struct bio *bio = bio_alloc(GFP_NOIO, 1); + struct completion event; + int ok; + + bio->bi_bdev = bdev->md_bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = drbd_md_io_complete; + + if (FAULT_ACTIVE((rw & WRITE)? 
DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio,bio->bi_size,-EIO); + } + else { +#ifdef BIO_RW_SYNC + submit_bio(rw | (1 << BIO_RW_SYNC), bio); +#else + submit_bio(rw, bio); + drbd_blk_run_queue(bdev_get_queue(bdev->md_bdev)); +#endif + } + wait_for_completion(&event); + + ok = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ok; +} + +int drbd_md_sync_page_io(drbd_dev *mdev, struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + int hardsect,mask,ok,offset=0; + struct page *iop = mdev->md_io_page; + + D_ASSERT(semaphore_is_locked(&mdev->md_io_mutex)); + + if (!bdev->md_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("bdev->md_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + + + hardsect = drbd_get_hardsect(bdev->md_bdev); + + // in case hardsect != 512 [ s390 only? ] + if( hardsect != MD_HARDSECT ) { + if(!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_NOIO); + if(!page) return 0; + + WARN("Meta data's bdev hardsect_size != %d\n", + MD_HARDSECT); + WARN("Workaround engaged (has performace impact).\n"); + + mdev->md_io_tmpp = page; + } + + mask = ( hardsect / MD_HARDSECT ) - 1; + D_ASSERT( mask == 1 || mask == 3 || mask == 7 ); + D_ASSERT( hardsect == (mask+1) * MD_HARDSECT ); + offset = sector & mask; + sector = sector & ~mask; + iop = mdev->md_io_tmpp; + + if (rw == WRITE) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + ok = _drbd_md_sync_page_io(bdev,iop, + sector,READ,hardsect); + + if (unlikely(!ok)) { + ERR("drbd_md_sync_page_io(,%llus,READ [hardsect!=512]) failed!\n", + (unsigned long long)sector); + return 0; + } + + memcpy(hp + offset*MD_HARDSECT , p, MD_HARDSECT); + } + } + +#if DUMP_MD >= 3 + INFO("%s [%d]:%s(,%llus,%s)\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, rw ? "WRITE" : "READ"); +#endif + + if (sector < drbd_md_first_sector(bdev) || sector > drbd_md_last_sector(bdev)) { + ALERT("%s [%d]:%s(,%llus,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, rw ? "WRITE" : "READ"); + } + + ok = _drbd_md_sync_page_io(bdev,iop,sector,rw,hardsect); + if (unlikely(!ok)) { + ERR("drbd_md_sync_page_io(,%llus,%s) failed!\n", + (unsigned long long)sector,rw ? "WRITE" : "READ"); + return 0; + } + + if( hardsect != MD_HARDSECT && rw == READ ) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_HARDSECT, MD_HARDSECT); + } + + return ok; +} + + +struct __attribute__((packed)) al_transaction { + u32 magic; + u32 tr_number; + // u32 tr_generation; //TODO + struct __attribute__((packed)) { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; + // I do not believe that all storage medias can guarantee atomic + // 512 byte write operations. When the journal is read, only + // transactions with correct xor_sums are considered. 
+}; // sizeof() = 512 byte + + +struct update_odbm_work { + struct drbd_work w; + unsigned int enr; +} ; + +struct update_al_work { + struct drbd_work w; + struct lc_element * al_ext; + struct completion event; + unsigned int enr; +}; + +STATIC int w_al_write_transaction(struct Drbd_Conf *, struct drbd_work *, int); + +static inline +struct lc_element* _al_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + unsigned long al_flags=0; + + spin_lock_irq(&mdev->al_lock); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr/AL_EXT_PER_BM_SECT); + if (unlikely(bm_ext!=NULL)) { + if(test_bit(BME_NO_WRITES,&bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + //INFO("Delaying app write until sync read is done\n"); + return 0; + } + } + al_ext = lc_get(mdev->act_log,enr); + al_flags = mdev->act_log->flags; + spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + WARN("Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + WARN("Ongoing AL update (AL device too slow?)\n"); + } + */ + + return al_ext; +} + +/* FIXME + * this should be able to return failure when meta data update has failed. + */ +void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *al_ext; + struct update_al_work al_work; + + D_ASSERT(atomic_read(&mdev->local_cnt)>0); + + MTRACE(TraceTypeALExts,TraceLvlMetrics, + INFO("al_begin_io( sec=%llus (al_enr=%u) (rs_enr=%d) )\n", + sector, enr, (int)BM_SECT_TO_EXT(sector)); + ); + + wait_event(mdev->al_wait, (al_ext = _al_get(mdev,enr)) ); + + if (al_ext->lc_number != enr) { + // We have to do write an transaction to AL. + unsigned int evicted; + + evicted = al_ext->lc_number; + + if(mdev->state.conn < Connected && evicted != LC_FREE ) { + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT ); + } + + /* drbd_al_write_transaction(mdev,al_ext,enr); + generic_make_request() are serialized on the + current->bio_tail list now. Therefore we have + to deligate writing something to AL to the + worker thread. 
*/ + init_completion(&al_work.event); + al_work.al_ext = al_ext; + al_work.enr = enr; + al_work.w.cb = w_al_write_transaction; + drbd_queue_work_front(&mdev->data.work,&al_work.w); + wait_for_completion(&al_work.event); + + mdev->al_writ_cnt++; + + /* + DUMPI(al_ext->lc_number); + DUMPI(mdev->act_log->new_number); + */ + spin_lock_irq(&mdev->al_lock); + lc_changed(mdev->act_log,al_ext); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); + } +} + +void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *extent; + unsigned long flags; + + MTRACE(TraceTypeALExts,TraceLvlMetrics, + INFO("al_complete_io( sec=%llus (al_enr=%u) (rs_enr=%d) )\n", + sector, enr, (int)BM_SECT_TO_EXT(sector)); + ); + + spin_lock_irqsave(&mdev->al_lock,flags); + + extent = lc_find(mdev->act_log,enr); + + if(!extent) { + spin_unlock_irqrestore(&mdev->al_lock,flags); + ERR("al_complete_io() called on inactive extent %u\n",enr); + return; + } + + if( lc_put(mdev->act_log,extent) == 0 ) { + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock,flags); +} + +STATIC int +w_al_write_transaction(struct Drbd_Conf *mdev, struct drbd_work *w, int unused) +{ + int i,n,mx; + unsigned int extent_nr; + struct al_transaction* buffer; + sector_t sector; + u32 xor_sum=0; + + struct lc_element *updated = ((struct update_al_work*)w)->al_ext; + unsigned int new_enr = ((struct update_al_work*)w)->enr; + + down(&mdev->md_io_mutex); // protects md_io_buffer, al_tr_cycle, ... + buffer = (struct al_transaction*)page_address(mdev->md_io_page); + + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); + + n = lc_index_of(mdev->act_log, updated); + + buffer->updates[0].pos = cpu_to_be32(n); + buffer->updates[0].extent = cpu_to_be32(new_enr); + +#if 0 /* Use this printf with the test_al.pl program */ + ERR("T%03d S%03d=E%06d\n", mdev->al_tr_number,n,new_enr); +#endif + + xor_sum ^= new_enr; + + mx = min_t(int,AL_EXTENTS_PT, + mdev->act_log->nr_elements - mdev->al_tr_cycle); + for(i=0;iact_log, + mdev->al_tr_cycle+i)->lc_number; + buffer->updates[i+1].pos = cpu_to_be32(mdev->al_tr_cycle+i); + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); + xor_sum ^= extent_nr; + } + for(;iupdates[i+1].pos = __constant_cpu_to_be32(-1); + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); + xor_sum ^= LC_FREE; + } + mdev->al_tr_cycle += AL_EXTENTS_PT; + if(mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle=0; + + buffer->xor_sum = cpu_to_be32(xor_sum); + +// warning LGE check outcome of addition u64/sector_t/s32 +// warning LGE "FIXME code missing" + sector = mdev->bc->md.md_offset + mdev->bc->md.al_offset + mdev->al_tr_pos; + + if(!drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) { + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + } + + if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) { + mdev->al_tr_pos=0; + } + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); + mdev->al_tr_number++; + + up(&mdev->md_io_mutex); + + complete(&((struct update_al_work*)w)->event); + + return 1; +} + +/** + * drbd_al_read_tr: Reads a single transaction record form the + * on disk activity log. + * Returns -1 on IO error, 0 on checksum error and 1 if it is a valid + * record. 
+ */ +STATIC int drbd_al_read_tr(struct Drbd_Conf *mdev, + struct drbd_backing_dev *bdev, + struct al_transaction* b, + int index) +{ + sector_t sector; + int rv,i; + u32 xor_sum=0; + + sector = bdev->md.md_offset + bdev->md.al_offset + index; + + if(!drbd_md_sync_page_io(mdev,bdev,sector,READ)) { + // Dont process error normally as this is done before + // disk is atached! + return -1; + } + + rv = ( be32_to_cpu(b->magic) == DRBD_MAGIC ); + + for(i=0;iupdates[i].extent); + } + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); + + return rv; +} + +/** + * drbd_al_read_log: Restores the activity log from its on disk + * representation. Returns 1 on success, returns 0 when + * reading the log failed due to IO errors. + */ +int drbd_al_read_log(struct Drbd_Conf *mdev,struct drbd_backing_dev *bdev) +{ + struct al_transaction* buffer; + int from=-1,to=-1,i,cnr, overflow=0,rv; + u32 from_tnr=-1, to_tnr=0; + int active_extents=0; + int transactions=0; + int mx; + + mx = div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT); + + /* lock out all other meta data io for now, + * and make sure the page is mapped. + */ + down(&mdev->md_io_mutex); + buffer = page_address(mdev->md_io_page); + + // Find the valid transaction in the log + for(i=0;i<=mx;i++) { + rv = drbd_al_read_tr(mdev,bdev,buffer,i); + if(rv == 0) continue; + if(rv == -1) { + up(&mdev->md_io_mutex); + return 0; + } + cnr = be32_to_cpu(buffer->tr_number); + // INFO("index %d valid tnr=%d\n",i,cnr); + + if(cnr == -1) overflow=1; + + if(cnr < from_tnr && !overflow) { + from = i; + from_tnr = cnr; + } + if(cnr > to_tnr) { + to = i; + to_tnr = cnr; + } + } + + if(from == -1 || to == -1) { + WARN("No usable activity log found.\n"); + + up(&mdev->md_io_mutex); + return 1; + } + + // Read the valid transactions. + // INFO("Reading from %d to %d.\n",from,to); + + /* this should better be handled by a for loop, no? + */ + i=from; + while(1) { + int j,pos; + unsigned int extent_nr; + unsigned int trn; + + rv = drbd_al_read_tr(mdev,bdev,buffer,i); + ERR_IF(rv == 0) goto cancel; + if(rv == -1) { + up(&mdev->md_io_mutex); + return 0; + } + + trn=be32_to_cpu(buffer->tr_number); + + spin_lock_irq(&mdev->al_lock); + + /* This loop runs backwards because in the cyclic + elements there might be an old version of the + updated element (in slot 0). So the element in slot 0 + can overwrite old versions. */ + for(j=AL_EXTENTS_PT;j>=0;j--) { + pos = be32_to_cpu(buffer->updates[j].pos); + extent_nr = be32_to_cpu(buffer->updates[j].extent); + + if(extent_nr == LC_FREE) continue; + + //if(j<3) INFO("T%03d S%03d=E%06d\n",trn,pos,extent_nr); + lc_set(mdev->act_log,extent_nr,pos); + active_extents++; + } + spin_unlock_irq(&mdev->al_lock); + + transactions++; + + cancel: + if( i == to) break; + i++; + if( i > mx ) i=0; + } + + mdev->al_tr_number = to_tnr+1; + mdev->al_tr_pos = to; + if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) { + mdev->al_tr_pos=0; + } + + /* ok, we are done with it */ + up(&mdev->md_io_mutex); + + INFO("Found %d transactions (%d active extents) in activity log.\n", + transactions,active_extents); + + return 1; +} + +/** + * drbd_al_to_on_disk_bm: + * Writes the areas of the bitmap which are covered by the AL. + * called when we detach (unconfigure) local storage, + * or when we go from Primary to Secondary state. 
+ */ +void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev) +{ + int i; + unsigned int enr; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + if (inc_local_if_state(mdev,Attaching)) { + for(i=0;iact_log->nr_elements;i++) { + enr = lc_entry(mdev->act_log,i)->lc_number; + if(enr == LC_FREE) continue; + /* TODO encapsulate and optimize within drbd_bitmap + * currently, if we have al-extents 16..19 active, + * sector 4 will be written four times! */ + drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT ); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + dec_local(mdev); + } else D_ASSERT(0); +} + +/** + * drbd_al_apply_to_bm: Sets the bits in the bitmap that are described + * by the active extents of the AL. + */ +void drbd_al_apply_to_bm(struct Drbd_Conf *mdev) +{ + unsigned int enr; + unsigned long add=0; + char ppb[10]; + int i; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + for(i=0;iact_log->nr_elements;i++) { + enr = lc_entry(mdev->act_log,i)->lc_number; + if(enr == LC_FREE) continue; + add += drbd_bm_ALe_set_all(mdev, enr); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + INFO("Marked additional %s as out-of-sync based on AL.\n", + ppsize(ppb,Bit2KB(add))); +} + +static inline int _try_lc_del(struct Drbd_Conf *mdev,struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&mdev->al_lock); + rv = (al_ext->refcnt == 0); + if(likely(rv)) lc_del(mdev->act_log,al_ext); + spin_unlock_irq(&mdev->al_lock); + + if(unlikely(!rv)) INFO("Waiting for extent in drbd_al_shrink()\n"); + + return rv; +} + +/** + * drbd_al_shrink: Removes all active extents form the AL. (but does not + * write any transactions) + * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct Drbd_Conf *mdev) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT( test_bit(__LC_DIRTY,&mdev->act_log->flags) ); + + for(i=0;iact_log->nr_elements;i++) { + al_ext = lc_entry(mdev->act_log,i); + if(al_ext->lc_number == LC_FREE) continue; + wait_event(mdev->al_wait, _try_lc_del(mdev,al_ext)); + } + + wake_up(&mdev->al_wait); +} + +STATIC int w_update_odbm(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct update_odbm_work *udw = (struct update_odbm_work*)w; + + if( !inc_local_if_state(mdev,Attaching) ) { + if (DRBD_ratelimit(5*HZ,5)) + WARN("Can not update on disk bitmap, local IO disabled.\n"); + return 1; + } + + drbd_bm_write_sect(mdev, udw->enr ); + dec_local(mdev); + + kfree(udw); + + if(drbd_bm_total_weight(mdev) <= mdev->rs_failed && + ( mdev->state.conn == SyncSource || mdev->state.conn == SyncTarget || + mdev->state.conn == PausedSyncS || mdev->state.conn == PausedSyncT ) ) { + drbd_bm_lock(mdev); + drbd_resync_finished(mdev); + drbd_bm_unlock(mdev); + } + + return 1; +} + + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the + * resync LRU-cache are 16MB each. + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +STATIC void drbd_try_clear_on_disk_bm(struct Drbd_Conf *mdev,sector_t sector, + int count, int success) +{ + struct bm_extent* ext; + struct update_odbm_work * udw; + + unsigned int enr; + + MUST_HOLD(&mdev->al_lock); + + // I simply assume that a sector/size pair never crosses + // a 16 MB extent border. (Currently this is true...) 
+ enr = BM_SECT_TO_EXT(sector); + + ext = (struct bm_extent *) lc_get(mdev->resync,enr); + if (ext) { + if( ext->lce.lc_number == enr) { + if (success) + ext->rs_left -= count; + else + ext->rs_failed += count; + if (ext->rs_left < ext->rs_failed) { + ERR("BAD! sector=%llus enr=%u rs_left=%d rs_failed=%d count=%d\n", + (unsigned long long)sector, + ext->lce.lc_number, ext->rs_left, ext->rs_failed, count); + dump_stack(); + // FIXME brrrgs. should never happen! + drbd_force_state(mdev,NS(conn,Disconnecting)); + return; + } + } else { + //WARN("Counting bits in %d (resync LRU small?)\n",enr); + // This element should be in the cache + // since drbd_rs_begin_io() pulled it already in. + + // OR an application write finished, and therefore + // we set something in this area in sync. + int rs_left = drbd_bm_e_weight(mdev,enr); + if (ext->flags != 0) { + WARN("changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + if( ext->rs_failed ) { + WARN("Kicking resync_lru element enr=%u " + "out with rs_failed=%d\n", + ext->lce.lc_number, ext->rs_failed); + set_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags); + } + ext->rs_left = rs_left; + ext->rs_failed = success ? 0 : count; + lc_changed(mdev->resync,&ext->lce); + } + lc_put(mdev->resync,&ext->lce); + // no race, we are within the al_lock! + + if (ext->rs_left == ext->rs_failed) { + ext->rs_failed = 0; + + udw=kmalloc(sizeof(*udw),GFP_ATOMIC); + if(udw) { + udw->enr = ext->lce.lc_number; + udw->w.cb = w_update_odbm; + drbd_queue_work_front(&mdev->data.work,&udw->w); + } else { + WARN("Could not kmalloc an udw\n"); + set_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags); + } + } + } else { + ERR("lc_get() failed! locked=%d/%d flags=%lu\n", + mdev->resync_locked, + mdev->resync->nr_elements, + mdev->resync->flags); + } +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on SyncTarget and receiver on SyncSource. + * + */ +void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr,ebnr,lbnr,bnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int wake_up=0; + unsigned long flags; + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("drbd_set_in_sync: sector=%llus size=%d nonsense!\n", + (unsigned long long)sector,size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we clear it (in sync). + * round up start sector, round down end sector. we make sure we only + * clear full, alligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) { + return; + } else if (unlikely(esector == (nr_sectors-1))) { + ebnr = lbnr; + } else { + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + } + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", + (unsigned long long)sector, size, sbnr, ebnr); + ); + + if (sbnr > ebnr) return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... 
+ * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irqsave(&mdev->al_lock,flags); + for(bnr=sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_clear_bit(mdev,bnr)) count++; + } + if (count) { + // we need the lock for drbd_try_clear_on_disk_bm + if(jiffies - mdev->rs_mark_time > HZ*10) { + /* should be roling marks, but we estimate only anyways. */ + if( mdev->rs_mark_left != drbd_bm_total_weight(mdev)) { + mdev->rs_mark_time =jiffies; + mdev->rs_mark_left =drbd_bm_total_weight(mdev); + } + } + drbd_try_clear_on_disk_bm(mdev,sector,count,TRUE); + /* just wake_up unconditional now, + * various lc_chaged(), lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up=1; + } + spin_unlock_irqrestore(&mdev->al_lock,flags); + if(wake_up) wake_up(&mdev->al_wait); +} + +/* + * this is intended to set one request worth of data out of sync. + * affects at least 1 bit, and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. + * + * called by tl_clear and drbd_send_dblock (==drbd_make_request). + * so this can be _any_ process. + */ +void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + unsigned long sbnr,ebnr,lbnr; + sector_t esector, nr_sectors; + + /* Find codepoints that call set_out_of_sync() + unsigned long flags; + unsigned int enr; + struct bm_extent* ext; + + if(inc_local(mdev)) { + enr = BM_SECT_TO_EXT(sector); + spin_lock_irqsave(&mdev->al_lock,flags); + ext = (struct bm_extent *) lc_find(mdev->resync,enr); + if (ext) { + WARN("BAD! things will happen, find this.\n"); + dump_stack(); + } + spin_unlock_irqrestore(&mdev->al_lock,flags); + dec_local(mdev); + } + */ + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("sector: %llus, size: %d\n",(unsigned long long)sector,size); + return; + } + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we set it out of sync, + * we do not need to round anything here */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", + (unsigned long long)sector, size, sbnr, ebnr); + ); + + /* ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. 
*/ + drbd_bm_set_bits_in_irq(mdev,sbnr,ebnr); +} + +static inline +struct bm_extent* _bme_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_locked > mdev->resync->nr_elements-3) { + //WARN("bme_get() does not lock all elements\n"); + spin_unlock_irq(&mdev->al_lock); + return NULL; + } + bm_ext = (struct bm_extent*) lc_get(mdev->resync,enr); + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev,enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync,(struct lc_element*)bm_ext); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) mdev->resync_locked++; + set_bit(BME_NO_WRITES,&bm_ext->flags); + } + rs_flags=mdev->resync->flags; + spin_unlock_irq(&mdev->al_lock); + if (wakeup) wake_up(&mdev->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) { + WARN("Have to wait for element" + " (resync LRU too small?)\n"); + } + if (rs_flags & LC_DIRTY) { + BUG(); // WARN("Ongoing RS update (???)\n"); + } + } + + return bm_ext; +} + +static inline int _is_in_al(drbd_dev* mdev, unsigned int enr) +{ + struct lc_element* al_ext; + int rv=0; + + spin_lock_irq(&mdev->al_lock); + if(unlikely(enr == mdev->act_log->new_number)) rv=1; + else { + al_ext = lc_find(mdev->act_log,enr); + if(al_ext) { + if (al_ext->refcnt) rv=1; + } + } + spin_unlock_irq(&mdev->al_lock); + + /* + if(unlikely(rv)) { + INFO("Delaying sync read until app's write is done\n"); + } + */ + return rv; +} + +/** + * drbd_rs_begin_io: Gets an extent in the resync LRU cache and sets it + * to BME_LOCKED. + * + * @sector: The sector number + * + * sleeps on al_wait. + * returns 1 if successful. + * returns 0 if interrupted. + */ +int drbd_rs_begin_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent* bm_ext; + int i, sig; + + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("drbd_rs_begin_io: sector=%llus (rs_end=%d)\n", + (unsigned long long)sector,enr); + ); + + sig = wait_event_interruptible( mdev->al_wait, + (bm_ext = _bme_get(mdev,enr)) ); + if (sig) return 0; + + if(test_bit(BME_LOCKED,&bm_ext->flags)) return 1; + + for(i=0;ial_wait, + !_is_in_al(mdev,enr*AL_EXT_PER_BM_SECT+i) ); + if (sig) { + spin_lock_irq(&mdev->al_lock); + if( lc_put(mdev->resync,&bm_ext->lce) == 0 ) { + clear_bit(BME_NO_WRITES,&bm_ext->flags); + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } + spin_unlock_irq(&mdev->al_lock); + return 0; + } + } + + set_bit(BME_LOCKED,&bm_ext->flags); + + return 1; +} + +/** + * drbd_try_rs_begin_io: Gets an extent in the resync LRU cache, sets it + * to BME_NO_WRITES, then tries to set it to BME_LOCKED. + * + * @sector: The sector number + * + * does not sleep. + * returns zero if we could set BME_LOCKED and can proceed, + * -EAGAIN if we need to try again. + */ +int drbd_try_rs_begin_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; + struct bm_extent* bm_ext; + int i; + + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("drbd_try_rs_begin_io: sector=%llus\n", + (unsigned long long)sector); + ); + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { + /* in case you have very heavy scattered io, it may + * stall the syncer undefined if we giveup the ref count + * when we try again and requeue. 
+ * + * if we don't give up the refcount, but the next time + * we are scheduled this extent has been "synced" by new + * application writes, we'd miss the lc_put on the + * extent we keept the refcount on. + * so we remembered which extent we had to try agin, and + * if the next requested one is something else, we do + * the lc_put here... + * we also have to wake_up + */ + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("dropping %u, aparently got 'synced' " + "by application io\n", mdev->resync_wenr); + ); + bm_ext = (struct bm_extent*)lc_find(mdev->resync,mdev->resync_wenr); + if (bm_ext) { + D_ASSERT(!test_bit(BME_LOCKED,&bm_ext->flags)); + D_ASSERT(test_bit(BME_NO_WRITES,&bm_ext->flags)); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + mdev->resync_wenr = LC_FREE; + lc_put(mdev->resync,&bm_ext->lce); + wake_up(&mdev->al_wait); + } else { + ALERT("LOGIC BUG\n"); + } + } + bm_ext = (struct bm_extent*)lc_try_get(mdev->resync,enr); + if (bm_ext) { + if (test_bit(BME_LOCKED,&bm_ext->flags)) { + goto proceed; + } + if (!test_and_set_bit(BME_NO_WRITES,&bm_ext->flags)) { + mdev->resync_locked++; + } else { + /* we did set the BME_NO_WRITES, + * but then could not set BME_LOCKED, + * so we tried again. + * drop the extra reference. */ + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("dropping extra reference on %u\n",enr); + ); + bm_ext->lce.refcnt--; + D_ASSERT(bm_ext->lce.refcnt > 0); + } + goto check_al; + } else { + if (mdev->resync_locked > mdev->resync->nr_elements-3) + goto try_again; + bm_ext = (struct bm_extent*)lc_get(mdev->resync,enr); + if (!bm_ext) { + const unsigned long rs_flags = mdev->resync->flags; + if (rs_flags & LC_STARVING) { + WARN("Have to wait for element" + " (resync LRU too small?)\n"); + } + if (rs_flags & LC_DIRTY) { + BUG(); // WARN("Ongoing RS update (???)\n"); + } + goto try_again; + } + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev,enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync,(struct lc_element*)bm_ext); + wake_up(&mdev->al_wait); + D_ASSERT(test_bit(BME_LOCKED,&bm_ext->flags) == 0); + } + set_bit(BME_NO_WRITES,&bm_ext->flags); + D_ASSERT(bm_ext->lce.refcnt == 1); + mdev->resync_locked++; + goto check_al; + } + check_al: + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("checking al for %u\n",enr); + ); + for (i=0;iact_log->new_number)) + goto try_again; + if (lc_is_used(mdev->act_log,al_enr+i)) + goto try_again; + } + set_bit(BME_LOCKED,&bm_ext->flags); + proceed: + mdev->resync_wenr = LC_FREE; + spin_unlock_irq(&mdev->al_lock); + return 0; + + try_again: + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("need to try again for %u\n",enr); + ); + if (bm_ext) mdev->resync_wenr = enr; + spin_unlock_irq(&mdev->al_lock); + return -EAGAIN; +} + +void drbd_rs_complete_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent* bm_ext; + unsigned long flags; + + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n", + (long long)sector, enr); + ); + + spin_lock_irqsave(&mdev->al_lock,flags); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr); + if(!bm_ext) { + spin_unlock_irqrestore(&mdev->al_lock,flags); + ERR("drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if( lc_put(mdev->resync,(struct lc_element *)bm_ext) == 0 ) { + clear_bit(BME_LOCKED,&bm_ext->flags); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock,flags); +} + +/** + * 
drbd_rs_cancel_all: Removes extents from the resync LRU. Even + * if they are BME_LOCKED. + */ +void drbd_rs_cancel_all(drbd_dev* mdev) +{ + struct bm_extent* bm_ext; + int i; + + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_rs_cancel_all\n"); + ); + + spin_lock_irq(&mdev->al_lock); + + if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there. + for(i=0;iresync->nr_elements;i++) { + bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i); + if(bm_ext->lce.lc_number == LC_FREE) continue; + bm_ext->lce.refcnt = 0; // Rude but ok. + bm_ext->rs_left = 0; + clear_bit(BME_LOCKED,&bm_ext->flags); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + lc_del(mdev->resync,&bm_ext->lce); + } + mdev->resync->used=0; + dec_local(mdev); + } + mdev->resync_locked = 0; + mdev->resync_wenr = LC_FREE; + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); +} + +/** + * drbd_rs_del_all: Gracefully remove all extents from the resync LRU. + * there may be still a reference hold by someone. In that this function + * returns -EAGAIN. + * In case all elements got removed it returns zero. + */ +int drbd_rs_del_all(drbd_dev* mdev) +{ + struct bm_extent* bm_ext; + int i; + + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_rs_del_all\n"); + ); + + spin_lock_irq(&mdev->al_lock); + + if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there. + for(i=0;iresync->nr_elements;i++) { + bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i); + if(bm_ext->lce.lc_number == LC_FREE) continue; + if (bm_ext->lce.lc_number == mdev->resync_wenr) { + INFO("dropping %u in drbd_rs_del_all, " + "aparently got 'synced' by application io\n", + mdev->resync_wenr); + D_ASSERT(!test_bit(BME_LOCKED,&bm_ext->flags)); + D_ASSERT(test_bit(BME_NO_WRITES,&bm_ext->flags)); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + mdev->resync_wenr = LC_FREE; + lc_put(mdev->resync,&bm_ext->lce); + } + if(bm_ext->lce.refcnt != 0) { + INFO("Retrying drbd_rs_del_all() later. " + "refcnt=%d\n",bm_ext->lce.refcnt); + dec_local(mdev); + spin_unlock_irq(&mdev->al_lock); + return -EAGAIN; + } + D_ASSERT(bm_ext->rs_left == 0); + D_ASSERT(!test_bit(BME_LOCKED,&bm_ext->flags)); + D_ASSERT(!test_bit(BME_NO_WRITES,&bm_ext->flags)); + lc_del(mdev->resync,&bm_ext->lce); + } + D_ASSERT(mdev->resync->used==0); + dec_local(mdev); + } + spin_unlock_irq(&mdev->al_lock); + + return 0; +} + +/* Record information on a failure to resync the specified blocks + * + * called on SyncTarget when resync write fails or NegRSDReply received + * + */ +void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr,ebnr,lbnr,bnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int wake_up=0; + + MTRACE(TraceTypeResync, TraceLvlSummary, + INFO("drbd_rs_failed_io: sector=%llus, size=%u\n", + (unsigned long long)sector,size); + ); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", + (unsigned long long)sector,size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* + * round up start sector, round down end sector. 
we make sure we only + * handle full, alligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) { + return; + } else if (unlikely(esector == (nr_sectors-1))) { + ebnr = lbnr; + } else { + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + } + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + if (sbnr > ebnr) return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irq(&mdev->al_lock); + for(bnr=sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_test_bit(mdev,bnr)>0) count++; + } + if (count) { + mdev->rs_failed += count; + + drbd_try_clear_on_disk_bm(mdev,sector,count,FALSE); + + /* just wake_up unconditional now, + * various lc_chaged(), lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up=1; + } + spin_unlock_irq(&mdev->al_lock); + if(wake_up) wake_up(&mdev->al_wait); +} diff -uprN linux-2.6.24/drivers/block/drbd/drbd_bitmap.c linux-2.6.24.ovz/drivers/block/drbd/drbd_bitmap.c --- linux-2.6.24/drivers/block/drbd/drbd_bitmap.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_bitmap.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,1168 @@ +/* +-*- linux-c -*- + drbd_bitmap.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2007, Philipp Reisner . + Copyright (C) 2004-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include // for memset +#include /* for D_ASSERT(in_interrupt()) */ + + +#include +#include "drbd_int.h" + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + * + * unfortunately this currently means that this file is not + * yet selfcontained, because it needs to know about how to receive + * the bitmap from the peer via the data socket. + * This is to be solved with some sort of + * drbd_bm_copy(mdev,offset,size,unsigned long*) ... + + * Note that since find_first_bit returns int, this implementation + * "only" supports up to 1<<(32+12) == 16 TB... non issue, since + * currently DRBD is limited to ca 3.8 TB storage anyways. + * + * we will eventually change the implementation to not allways hold the full + * bitmap in memory, but only some 'lru_cache' of the on disk bitmap, + * since vmalloc'ing mostly unused 128M is antisocial. + + * THINK + * I'm not yet sure whether this file should be bits only, + * or wether I want it to do all the sector<->bit calculation in here. + */ + +// warning LGE "verify all spin_lock_irq here, and their call path" +// warning LGE "and change to irqsave where applicable" +// warning LGE "so we don't accidentally nest spin_lock_irq()" +/* + * NOTE + * Access to the *bm is protected by bm_lock. + * It is safe to read the other members within the lock. 
+ * + * drbd_bm_set_bit is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * FIXME + * for performance reasons, when we _know_ we have irq disabled, we should + * probably introduce some _in_irq variants, so we know to only spin_lock(). + * + * FIXME + * Actually you need to serialize all resize operations. + * but then, resize is a drbd state change, and it should be serialized + * already. Unfortunately it is not (yet), so two concurrent resizes, like + * attach storage (drbdsetup) and receive the peers size (drbd receiver) + * may eventually blow things up. + * Therefore, + * you may only change the other members when holding + * the bm_change mutex _and_ the bm_lock. + * thus reading them holding either is safe. + * this is sort of overkill, but I rather do it right + * than have two resize operations interfere somewhen. + */ +struct drbd_bitmap { + unsigned long *bm; + spinlock_t bm_lock; + /* WARNING unsigned long bm_fo and friends: + * 32bit number of bit offset is just enough for 512 MB bitmap. + * it will blow up if we make the bitmap bigger... + * not that it makes much sense to have a bitmap that large, + * rather change the granularity to 16k or 64k or something. + * (that implies other problems, however...) + */ + unsigned long bm_fo; // next offset for drbd_bm_find_next + unsigned long bm_set; // nr of set bits; THINK maybe atomic_t ? + unsigned long bm_bits; + size_t bm_words; + sector_t bm_dev_capacity; + struct semaphore bm_change; // serializes resize operations + + atomic_t bm_async_io; + wait_queue_head_t bm_io_wait; + + unsigned long bm_flags; + + // { REMOVE + unsigned long bm_line; + char *bm_file; + // } +}; + +// { REMOVE once we serialize all state changes properly +#define D_BUG_ON(x) ERR_IF(x) { dump_stack(); } +#define BM_LOCKED 0 +#define BM_MD_IO_ERROR (BITS_PER_LONG-1) // 31? 63? + +#if 0 // simply disabled for now... +#define MUST_NOT_BE_LOCKED() do { \ + if (test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap is locked by %s:%lu\n", \ + __FILE__, __LINE__, b->bm_file,b->bm_line); \ + dump_stack(); \ + } \ + } \ +} while (0) +#define MUST_BE_LOCKED() do { \ + if (!test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap not locked!\n", \ + __FILE__, __LINE__); \ + dump_stack(); \ + } \ + } \ +} while (0) +#else +#define MUST_NOT_BE_LOCKED() do {(void)b;} while (0) +#define MUST_BE_LOCKED() do {(void)b;} while (0) +#endif +void __drbd_bm_lock(drbd_dev *mdev, char* file, int line) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_set_bit(BM_LOCKED,&b->bm_flags)) { + b->bm_file = file; + b->bm_line = line; + } else if (DRBD_ratelimit(5*HZ,5)) { + ERR("%s:%d: bitmap already locked by %s:%lu\n", + file, line, b->bm_file,b->bm_line); + /* + dump_stack(); + ERR("This is no oops, but debug stack trace only.\n"); + ERR("If you get this often, or in reproducable situations, " + "notify \n"); + */ + } + spin_unlock_irq(&b->bm_lock); +} +void drbd_bm_unlock(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_clear_bit(BM_LOCKED,&mdev->bitmap->bm_flags)) { + ERR("bitmap not locked in bm_unlock\n"); + } else { + /* FIXME if we got a "is already locked" previously, + * we unlock here even though we actually MUST NOT do so... 
*/ + b->bm_file = NULL; + b->bm_line = -1; + } + spin_unlock_irq(&b->bm_lock); +} + +#if 0 +// has been very helpful to indicate that rs_total and rs_left have been +// used in a non-smp safe way... +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + D_ASSERT(b->bm_dev_capacity == drbd_get_capacity(mdev->this_bdev)); \ + if ( (b->bm_set != mdev->rs_total) && \ + (b->bm_set != mdev->rs_left) ) { \ + if ( DRBD_ratelimit(5*HZ,5) ) { \ + ERR("%s:%d: ?? bm_set=%lu; rs_total=%lu, rs_left=%lu\n",\ + __FILE__ , __LINE__ , \ + b->bm_set, mdev->rs_total, mdev->rs_left ); \ + } \ + } \ +} while (0) +#else +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + if (b->bm_dev_capacity != drbd_get_capacity(mdev->this_bdev)) { \ + ERR("%s:%d: bm_dev_capacity:%llu drbd_get_capacity:%llu\n", \ + __FILE__, __LINE__, \ + (unsigned long long) b->bm_dev_capacity, \ + (unsigned long long) drbd_get_capacity(mdev->this_bdev));\ + } \ +} while (0) +#endif +// } + +#if DUMP_MD >= 3 +/* debugging aid */ +STATIC void bm_end_info(drbd_dev *mdev, const char* where) +{ + struct drbd_bitmap *b = mdev->bitmap; + size_t w = (b->bm_bits-1) >> LN2_BPL; + + INFO("%s: bm_set=%lu\n", where, b->bm_set); + INFO("bm[%d]=0x%lX\n", w, b->bm[w]); + w++; + + if ( w < b->bm_words ) { + D_ASSERT(w == b->bm_words -1); + INFO("bm[%d]=0x%lX\n",w,b->bm[w]); + } +} +#else +#define bm_end_info(ignored...) ((void)(0)) +#endif + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * drbd_dev*, but for the debug macros I like to have the mdev around + * to be able to report device specific. + */ + +/* FIXME TODO sometimes I use "int offset" as index into the bitmap. + * since we currently are LIMITED to (128<<11)-64-8 sectors of bitmap, + * this is ok [as long as we dont run on a 24 bit arch :)]. + * But it is NOT strictly ok. + */ + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in mdev->bitmap. + */ +int drbd_bm_init(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + D_BUG_ON(b); + b = kmalloc(sizeof(struct drbd_bitmap),GFP_KERNEL); + if (!b) + return -ENOMEM; + memset(b,0,sizeof(*b)); + spin_lock_init(&b->bm_lock); + init_MUTEX(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + mdev->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(drbd_dev *mdev) +{ + ERR_IF(!mdev->bitmap) return 0; + return mdev->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(drbd_dev *mdev) +{ + ERR_IF (!mdev->bitmap) return; + /* FIXME I think we should explicitly change the device size to zero + * before this... + * + D_BUG_ON(mdev->bitmap->bm); + */ + vfree(mdev->bitmap->bm); + kfree(mdev->bitmap); + mdev->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Rerturns the number of bits cleared. 
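+ * (e.g. with bm_bits == 70 on a 64bit host: mask == 0x3f, only the low
+ * 6 bits of the last used word survive, the remaining 58 bits of that
+ * word are zeroed and the number of 1s among them is returned.)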
+ */ +STATIC int bm_clear_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + int cleared=0; + + if ( w < b->bm_words ) { + cleared = hweight_long(b->bm[w] & ~mask); + b->bm[w++] &= mask; + } + + if ( w < b->bm_words ) { + cleared += hweight_long(b->bm[w]); + b->bm[w++]=0; + } + + return cleared; +} + +STATIC void bm_set_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + + if ( w < b->bm_words ) { + b->bm[w++] |= ~mask; + } + + if ( w < b->bm_words ) { + b->bm[w++] = ~(0UL); + } +} + +STATIC unsigned long bm_count_bits(struct drbd_bitmap * b, int just_read) +{ + unsigned long *bm = b->bm; + unsigned long *ep = b->bm + b->bm_words; + unsigned long bits = 0; + + while ( bm < ep ) { + /* on little endian, this is *bm = *bm; + * and should be optimized away by the compiler */ + if (just_read) *bm = lel_to_cpu(*bm); + bits += hweight_long(*bm++); + } + + return bits; +} + +#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) + +/* + * make sure the bitmap has enough room for the attached storage, + * if neccessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initiallized to all bits set. + */ +int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bits, bytes, words, *nbm, *obm = 0; + int err = 0, growing; + + ERR_IF(!b) return -ENOMEM; + MUST_BE_LOCKED(); + + ERR_IF (down_trylock(&b->bm_change)) { + down(&b->bm_change); + } + + INFO("drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + obm = b->bm; + b->bm = NULL; + b->bm_fo = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + goto free_obm; + } else { + bits = BM_SECT_TO_BIT(ALIGN(capacity,BM_SECTORS_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits,64) >> LN2_BPL; + + D_ASSERT((u64)bits <= (((u64)mdev->bc->md.md_size_sect-MD_BM_OFFSET) << 12)); + + if ( words == b->bm_words ) { + /* optimize: capacity has changed, + * but only within one long word worth of bits. + * just update the bm_dev_capacity and bm_bits members. + */ + spin_lock_irq(&b->bm_lock); + b->bm_bits = bits; + b->bm_dev_capacity = capacity; + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + goto out; + } else { + /* one extra long to catch off by one errors */ + bytes = (words+1)*sizeof(long); + nbm = vmalloc(bytes); + if (!nbm) { + ERR("bitmap: failed to vmalloc %lu bytes\n",bytes); + err = -ENOMEM; + goto out; + } + } + spin_lock_irq(&b->bm_lock); + obm = b->bm; + // brgs. move several MB within spinlock... + // FIXME this should go into userspace! + if (obm) { + bm_set_surplus(b); + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); + memcpy(nbm,obm,min_t(size_t,b->bm_words,words)*sizeof(long)); + } + growing = words > b->bm_words; + if (growing) { // set all newly allocated bits + // start at -1, just to be sure. 
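+			/* The fill starts one long word before the old end, so the
+			 * possibly only partially used last old word is covered as
+			 * well; everything the grown bitmap newly describes is thus
+			 * marked out-of-sync, and bm_set is bumped by the number of
+			 * added bits right below. */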
+ memset( nbm + (b->bm_words?:1)-1 , 0xff, + (words - ((b->bm_words?:1)-1)) * sizeof(long) ); + b->bm_set += bits - b->bm_bits; + } + nbm[words] = DRBD_MAGIC; + b->bm = nbm; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + bm_clear_surplus(b); + if( !growing ) b->bm_set = bm_count_bits(b,0); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + INFO("resync bitmap: bits=%lu words=%lu\n",bits,words); + } + free_obm: + vfree(obm); // vfree(NULL) is noop + out: + up(&b->bm_change); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +unsigned long drbd_bm_total_weight(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long s; + unsigned long flags; + + ERR_IF(!b) return 0; + // MUST_BE_LOCKED(); well. yes. but ... + + spin_lock_irqsave(&b->bm_lock,flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock,flags); + + return s; +} + +size_t drbd_bm_words(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + + /* FIXME + * actually yes. really. otherwise it could just change its size ... + * but it triggers all the time... + * MUST_BE_LOCKED(); + */ + + return b->bm_words; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + */ +void drbd_bm_merge_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + if (number == 0) return; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = *bm | lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + */ +void drbd_bm_set_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + if (number == 0) return; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. 
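+	/* Overwrite the words one by one: each buffer word is converted from
+	 * little endian to host byte order, and bm_set is adjusted by the
+	 * popcount (hweight_long) difference between new and old word. */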
+ bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + + if (number == 0) return; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + if ( (offset >= b->bm_words) || + (offset+number > b->bm_words) || + (number > PAGE_SIZE/sizeof(long)) || + (number <= 0) ) { + // yes, there is "%z", but that gives compiler warnings... + ERR("offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + return; + } + + // MUST_BE_LOCKED(); yes. but not neccessarily globally... + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + bm = b->bm + offset; + while(number--) *buffer++ = cpu_to_lel(*bm++); + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,0xff,b->bm_words*sizeof(long)); + bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +int drbd_bm_async_io_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + struct drbd_bitmap *b = bio->bi_private; + + if (bio->bi_size) + return 1; + + if (error) { + /* doh. what now? + * for now, set all bits, and flag MD_IO_ERROR + */ + /* FIXME kmap_atomic memset etc. pp. */ + __set_bit(BM_MD_IO_ERROR,&b->bm_flags); + } + if (atomic_dec_and_test(&b->bm_async_io)) + wake_up(&b->bm_io_wait); + + bio_put(bio); + + return 0; +} + +STATIC void drbd_bm_page_io_async(drbd_dev *mdev, struct drbd_bitmap *b, int page_nr, int rw) +{ + /* we are process context. we always get a bio */ + /* THINK: do we need GFP_NOIO here? */ + struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct page *page = vmalloc_to_page((char*)(b->bm) + (PAGE_SIZE*page_nr)); + unsigned int len; + sector_t on_disk_sector = mdev->bc->md.md_offset + mdev->bc->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small flexible external meta data device */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(mdev->bc) - on_disk_sector + 1)<<9); + + D_DUMPLU(on_disk_sector); + D_DUMPI(len); + + bio->bi_bdev = mdev->bc->md_bdev; + bio->bi_sector = on_disk_sector; + bio_add_page(bio, page, len, 0); + bio->bi_private = b; + bio->bi_end_io = drbd_bm_async_io_complete; + + if (FAULT_ACTIVE((rw&WRITE)?DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio,bio->bi_size,-EIO); + } + else + submit_bio(rw, bio); +} +/* read one sector of the on disk bitmap into memory. + * on disk bitmap is little endian. + * @enr is _sector_ offset from start of on disk bitmap (aka bm-extent nr). 
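+ * (one such 512 byte sector holds 512*8 = 4096 bits; at 4K per bit a
+ *  single bm-extent thus describes 16 MB of storage)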
+ * returns 0 on success, -EIO on failure + */ +int drbd_bm_read_sect(drbd_dev *mdev,unsigned long enr) +{ + sector_t on_disk_sector = mdev->bc->md.md_offset + mdev->bc->md.bm_offset + enr; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global ... + + down(&mdev->md_io_mutex); + if(drbd_md_sync_page_io(mdev,mdev->bc,on_disk_sector,READ)) { + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("read_sect: sector=%lus offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + drbd_bm_set_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + } else { + int i; + err = -EIO; + ERR( "IO ERROR reading bitmap sector %lu " + "(meta-disk sector %llu)\n", + enr, (unsigned long long)on_disk_sector ); + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_read: Read the whole bitmap from its on disk location. + * + * currently only called from "drbd_ioctl_set_disk" + * FIXME need to be able to return an error!! + * + */ +# if defined(__LITTLE_ENDIAN) + /* nothing to do, on disk == in memory */ +# define bm_cpu_to_lel(x) ((void)0) +# else +void bm_cpu_to_lel(struct drbd_bitmap *b) +{ + /* need to cpu_to_lel all the pages ... + * this may be optimized by using + * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; + * the following is still not optimal, but better than nothing */ + const unsigned long *end = b->bm+b->bm_words; + unsigned long *bm; + if (b->bm_set == 0) { + /* no page at all; avoid swap if all is 0 */ + return; + } else if (b->bm_set == b->bm_bits) { + /* only the last words */ + bm = end-2; + } else { + /* all pages */ + bm = b->bm; + } + for (; bm < end; bm++) { + *bm = cpu_to_lel(*bm); + } +} +# endif +/* lel_to_cpu == cpu_to_lel */ +# define bm_lel_to_cpu(x) bm_cpu_to_lel(x) + +STATIC int drbd_bm_rw(struct Drbd_Conf *mdev, int rw) +{ + struct drbd_bitmap *b = mdev->bitmap; + /* sector_t sector; */ + int bm_words, num_pages, i; + unsigned long now; + char ppb[10]; + int err = 0; + + MUST_BE_LOCKED(); + + bm_words = drbd_bm_words(mdev); + num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; + + /* OK, I manipulate the bitmap low level, + * and I expect to be the exclusive user. + * If not, I am really in a bad mood... + * to catch such bugs early, make all people who want to access the + * bitmap while I read/write it dereference a NULL pointer :-> + */ + mdev->bitmap = NULL; + + if(rw == WRITE) bm_cpu_to_lel(b); + + now = jiffies; + atomic_set(&b->bm_async_io, num_pages); + __clear_bit(BM_MD_IO_ERROR,&b->bm_flags); + + for (i = 0; i < num_pages; i++) { + /* let the layers below us try to merge these bios... */ + drbd_bm_page_io_async(mdev,b,i,rw); + } + + drbd_blk_run_queue(bdev_get_queue(mdev->bc->md_bdev)); + wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); + INFO("%s of bitmap took %lu jiffies\n", + rw == READ ? 
"reading" : "writing", jiffies - now); + + if (test_bit(BM_MD_IO_ERROR,&b->bm_flags)) { + ALERT("we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + err = -EIO; + } + + now = jiffies; + if(rw == WRITE) { + bm_lel_to_cpu(b); + } else /* rw == READ */ { + /* just read, if neccessary adjust endianness */ + b->bm_set = bm_count_bits(b, 1); + INFO("recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + + /* ok, done, + * now it is visible again + */ + + mdev->bitmap = b; + + INFO("%s marked out-of-sync by on disk bit-map.\n", + ppsize(ppb,drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10)) ); + + return err; +} + +int drbd_bm_read(struct Drbd_Conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + int err=0; + + if (b->bm) { + // bitmap size > 0 + err = drbd_bm_rw(mdev, READ); + + if (err == 0) + b->bm[b->bm_words] = DRBD_MAGIC; + } + + return err; +} + +/** + * drbd_bm_write_sect: Writes a 512 byte piece of the bitmap to its + * on disk location. On disk bitmap is little endian. + * + * @enr: The _sector_ offset from the start of the bitmap. + * + */ +int drbd_bm_write_sect(struct Drbd_Conf *mdev,unsigned long enr) +{ + sector_t on_disk_sector = enr + mdev->bc->md.md_offset + mdev->bc->md.bm_offset; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global... + + down(&mdev->md_io_mutex); + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("write_sect: sector=%lu offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + if (num_words < S2W(1)) { + memset(page_address(mdev->md_io_page),0,MD_HARDSECT); + } + drbd_bm_get_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + if (!drbd_md_sync_page_io(mdev,mdev->bc,on_disk_sector,WRITE)) { + int i; + err = -EIO; + ERR( "IO ERROR writing bitmap sector %lu " + "(meta-disk sector %llus)\n", + enr, (unsigned long long)on_disk_sector ); + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + mdev->bm_writ_cnt++; + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_write: Write the whole bitmap to its on disk location. + */ +int drbd_bm_write(struct Drbd_Conf *mdev) +{ + int err = drbd_bm_rw(mdev, WRITE); + + INFO("%lu KB now marked out-of-sync by on disk bit-map.\n", + drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10) ); + + return err; +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); \ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,0,b->bm_words*sizeof(long)); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +void drbd_bm_reset_find(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + + ERR_IF(!b) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + b->bm_fo = 0; + spin_unlock_irq(&b->bm_lock); + +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * should not make much difference anyways, but ... + * this returns a bit number, NOT a sector! 
+ */ +unsigned long drbd_bm_find_next(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + + ERR_IF(!b) return i; + ERR_IF(!b->bm) return i; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + if (b->bm_fo < b->bm_bits) { + i = find_next_bit(b->bm,b->bm_bits,b->bm_fo); + } else if (b->bm_fo > b->bm_bits) { + ERR("bm_fo=%lu bm_bits=%lu\n",b->bm_fo, b->bm_bits); + } + if (i >= b->bm_bits) { + i = -1UL; + b->bm_fo = 0; + } else { + b->bm_fo = i+1; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +void drbd_bm_set_find(drbd_dev *mdev, unsigned long i) +{ + struct drbd_bitmap *b = mdev->bitmap; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + + b->bm_fo = min_t(unsigned long, i, b->bm_bits); + + spin_unlock_irq(&b->bm_lock); +} + + +int drbd_bm_rs_done(drbd_dev *mdev) +{ + return mdev->bitmap->bm_fo == 0; +} + +// THINK maybe the D_BUG_ON(i<0)s in set/clear/test should be not that strict? + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_set_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 1; + ERR_IF(!b->bm) return 1; + +/* + * only called from drbd_set_out_of_sync. + * strange_state blubber is already in place there... + strange_state = ( mdev->cstate > Connected ) || + ( mdev->cstate == Connected && + !(test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags)) ); + if (strange_state) + ERR("%s in drbd_bm_set_bit\n", conns_to_name(mdev->cstate)); +*/ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_set_bit(bitnr, b->bm)); + b->bm_set += !i; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* returns number of bits actually changed (0->1) + * wants bitnr, not sector */ +int drbd_bm_set_bits_in_irq(drbd_dev *mdev, const unsigned long s, const unsigned long e) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bitnr; + int c = 0; + ERR_IF(!b) return 1; + ERR_IF(!b->bm) return 1; + +#if 0 + /* hm. I assumed that, when inside of lock_irq/unlock_irq, + * in_interrupt() would be true ? + * how else can I assert that this called with irq disabled without using + * spin_lock_irqsave? */ + D_BUG_ON(!in_interrupt()); /* called within spin_lock_irq(&mdev->req_lock) */ +#endif + + spin_lock(&b->bm_lock); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + for (bitnr = s; bitnr <=e; bitnr++) { + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + } else { + c += (0 == __test_and_set_bit(bitnr, b->bm)); + } + } + b->bm_set += c; + spin_unlock(&b->bm_lock); + return c; +} + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_clear_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long flags; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_clear_bit(bitnr, b->bm)); + b->bm_set -= i; + } + spin_unlock_irqrestore(&b->bm_lock,flags); + + /* clearing bits should only take place when sync is in progress! + * this is only called from drbd_set_in_sync. + * strange_state blubber is already in place there ... 
+ if (i && mdev->cstate <= Connected) + ERR("drbd_bm_clear_bit: cleared a bitnr=%lu while %s\n", + bitnr, conns_to_name(mdev->cstate)); + */ + + return i; +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + if (bitnr < b->bm_bits) { + i = test_bit(bitnr, b->bm) ? 1 : 0; + } else if (bitnr == b->bm_bits) { + i = -1; + } else /* (bitnr > b->bm_bits) */ { + ERR("bitnr=%lu > bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(drbd_dev *mdev, unsigned long enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int count, s, e; + unsigned long flags; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + + s = S2W(enr); + e = min((size_t)S2W(enr+1),b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + } else { + ERR("start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock,flags); +#if DUMP_MD >= 3 + INFO("enr=%lu weight=%d e=%d s=%d\n", enr, count, e, s); +#endif + return count; +} + +/* set all bits covered by the AL-extent al_enr */ +unsigned long drbd_bm_ALe_set_all(drbd_dev *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long weight; + int count, s, e; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + n = e-s; + memset(b->bm+s,-1,n*sizeof(long)); + b->bm_set += n*BITS_PER_LONG - count; + if (e == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + } + } else { + ERR("start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} diff -uprN linux-2.6.24/drivers/block/drbd/drbd_buildtag.c linux-2.6.24.ovz/drivers/block/drbd/drbd_buildtag.c --- linux-2.6.24/drivers/block/drbd/drbd_buildtag.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_buildtag.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,6 @@ +/* automatically generated. DO NOT EDIT. 
*/ +const char * drbd_buildtag(void) +{ + return "SVN Revision: 2713" + " build by root@dhcp0-176.sw.ru, 2007-03-05 16:06:51"; +} diff -uprN linux-2.6.24/drivers/block/drbd/drbd_compat_wrappers.h linux-2.6.24.ovz/drivers/block/drbd/drbd_compat_wrappers.h --- linux-2.6.24/drivers/block/drbd/drbd_compat_wrappers.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_compat_wrappers.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,329 @@ +/* + * FIXME this file is bound to die, renamed or included in drbd_int.h + */ + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +# error "use a 2.6 kernel, please" +#endif + + +/* struct page has a union in 2.6.15 ... + * an anonymous union and struct since 2.6.16 + * or in fc5 "2.6.15" */ +#include +#ifndef page_private +# define page_private(page) ((page)->private) +# define set_page_private(page, v) ((page)->private = (v)) +#endif + +#include // for fsync_bdev + +/* see get_sb_bdev and bd_claim */ +extern char* drbd_sec_holder; + +// bi_end_io handlers +// int (bio_end_io_t) (struct bio *, unsigned int, int); +extern int drbd_md_io_complete (struct bio *bio, unsigned int bytes_done, int error); + +extern int drbd_endio_read_sec (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_endio_write_sec(struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_endio_pri (struct bio *bio, unsigned int bytes_done, int error); + +static inline sector_t drbd_get_hardsect(struct block_device *bdev) +{ + return bdev->bd_disk->queue->hardsect_size; +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? bdev->bd_inode->i_size >> 9 : 0; +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(drbd_dev *mdev, + sector_t size) +{ + /* set_capacity(mdev->this_bdev->bd_disk, size); */ + set_capacity(mdev->vdisk,size); + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +static inline int drbd_sync_me(drbd_dev *mdev) +{ + return fsync_bdev(mdev->this_bdev); +} + +#define drbd_bio_uptodate(bio) bio_flagged(bio,BIO_UPTODATE) + +#ifdef CONFIG_HIGHMEM +/* + * I don't know why there is no bvec_kmap, only bvec_kmap_irq ... + * + * we do a sock_recvmsg into the target buffer, + * so we obviously cannot use the bvec_kmap_irq variant. -lge + * + * Most likely it is only due to performance anyways: + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + unsigned long addr; + + addr = (unsigned long) kmap(bvec->bv_page); + + if (addr & ~PAGE_MASK) + BUG(); + + return (char *) addr + bvec->bv_offset; +} + +static inline void drbd_bio_kunmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + + kunmap(bvec->bv_page); +} + +#else +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + return page_address(bvec->bv_page) + bvec->bv_offset; +} +static inline void drbd_bio_kunmap(struct bio *bio) +{ + // do nothing. 
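+	// without CONFIG_HIGHMEM every page is permanently mapped, so
+	// drbd_bio_kmap() above is just page_address() + offset and there
+	// is nothing to undo here.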
+} +#endif + +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) return 1; + } + + return 0; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(int rw, int fault_type, struct bio *bio) +{ + bio->bi_rw = rw; // on the receiver side, e->..rw was not yet defined. + + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + dump_stack(); + bio_endio(bio, bio->bi_size, -ENODEV); + return; + } + + if (FAULT_ACTIVE(fault_type)) + bio_endio(bio,bio->bi_size,-EIO); + else + generic_make_request(bio); +} + +static inline void drbd_plug_device(drbd_dev *mdev) +{ + request_queue_t *q; + q = bdev_get_queue(mdev->this_bdev); + + spin_lock_irq(q->queue_lock); + +/* XXX the check on !blk_queue_plugged is redundant, + * implicitly checked in blk_plug_device */ + + if(!blk_queue_plugged(q)) { + blk_plug_device(q); + del_timer(&q->unplug_timer); + // unplugging should not happen automatically... + } + spin_unlock_irq(q->queue_lock); +} + +static inline int _drbd_send_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + struct page *page = bvec->bv_page; + size_t size = bvec->bv_len; + int offset = bvec->bv_offset; + int ret; + + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +#ifdef DEFINE_SOCK_CREATE_KERN +#define sock_create_kern sock_create +#endif + +#ifdef USE_KMEM_CACHE_S +typedef struct kmem_cache_s drbd_kmem_cache_t; +#else +typedef struct kmem_cache drbd_kmem_cache_t; +#endif + +#ifdef NEED_BACKPORT_OF_ATOMIC_ADD + +#if defined(__x86_64__) + +static __inline__ int atomic_add_return(int i, atomic_t *v) +{ + int __i = i; + __asm__ __volatile__( + LOCK_PREFIX "xaddl %0, %1;" + :"=r"(i) + :"m"(v->counter), "0"(i)); + return i + __i; +} + +static __inline__ int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i,v); +} + +#define atomic_inc_return(v) (atomic_add_return(1,v)) +#define atomic_dec_return(v) (atomic_sub_return(1,v)) + +#elif defined(__i386__) || defined(__arch_um__) + +static __inline__ int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if(unlikely(boot_cpu_data.x86==3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + __asm__ __volatile__( + LOCK_PREFIX "xaddl %0, %1;" + :"=r"(i) + :"m"(v->counter), "0"(i)); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +static __inline__ int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i,v); +} + +#define atomic_inc_return(v) (atomic_add_return(1,v)) +#define atomic_dec_return(v) (atomic_sub_return(1,v)) + +#else +# error "You need to copy/past atomic_inc_return()/atomic_dec_return() here" +# error "for your architecture. (Hint: Kernels after 2.6.10 have those" +# error "by default! Using a later kernel might be less effort!)" +#endif + +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +/* With Linux-2.6.19 the crypto API changed! */ +/* This is not a generic backport of the new api, it just implements + the corner case of "hmac(xxx)". 
*/ + +#define CRYPTO_ALG_ASYNC 4711 +#define CRYPTO_ALG_TYPE_HASH CRYPTO_ALG_TYPE_DIGEST + +struct crypto_hash { + struct crypto_tfm *base; + const u8 *key; + int keylen; +}; + +struct hash_desc { + struct crypto_hash *tfm; + u32 flags; +}; + +static inline struct crypto_hash * +crypto_alloc_hash(char *alg_name, u32 type, u32 mask) +{ + struct crypto_hash *ch; + char *closing_bracket; + + // "hmac(xxx)" is in alg_name we need that xxx. + closing_bracket = strchr(alg_name,')'); + if(!closing_bracket) return NULL; + if(closing_bracket-alg_name < 6) return NULL; + + ch = kmalloc(sizeof(struct crypto_hash),GFP_KERNEL); + if(!ch) return NULL; + + *closing_bracket = 0; + ch->base = crypto_alloc_tfm(alg_name + 5, 0); + *closing_bracket = ')'; + + if (ch->base == NULL) { + kfree(ch); + return NULL; + } + + return ch; +} + +static inline int +crypto_hash_setkey(struct crypto_hash *hash,const u8 *key,unsigned int keylen) +{ + hash->key = key; + hash->keylen = keylen; + + return 0; +} + +static inline int +crypto_hash_digest(struct hash_desc *desc, struct scatterlist *sg, + unsigned int nbytes, u8 *out) +{ + + crypto_hmac(desc->tfm->base, (u8*)desc->tfm->key, + &desc->tfm->keylen, sg, 1 /* ! */ , out); + /* ! this is not generic. Would need to convert nbytes -> nsg */ + + return 0; +} + +static inline void crypto_free_hash(struct crypto_hash *tfm) +{ + crypto_free_tfm(tfm->base); + kfree(tfm); +} + +static inline unsigned int crypto_hash_digestsize(struct crypto_hash *tfm) +{ + return crypto_tfm_alg_digestsize(tfm->base); +} + +static inline struct crypto_tfm *crypto_hash_tfm(struct crypto_hash *tfm) +{ + return tfm->base; +} + +#endif diff -uprN linux-2.6.24/drivers/block/drbd/drbd_int.h linux-2.6.24.ovz/drivers/block/drbd/drbd_int.h --- linux-2.6.24/drivers/block/drbd/drbd_int.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_int.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,1920 @@ +/* + drbd_int.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lru_cache.h" +//#include "linux/drbd.h" + +// module parameter, defined in drbd_main.c +extern int minor_count; +extern int allow_oos; +extern int major_nr; +extern int use_nbd_major; + +#ifdef DRBD_ENABLE_FAULTS +extern int enable_faults; +extern int fault_rate; +#endif + +#include +#ifdef DRBD_MAJOR +# warning "FIXME. DRBD_MAJOR is now officially defined in major.h" +#endif + +#include +#include +#define MAJOR_NR major_nr + +#undef DEVICE_NAME +#define DEVICE_NAME "drbd" + +// XXX do we need this? 
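+// (the #ifndef guards below only provide TRUE/FALSE where no other
+// header has defined them already)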
+#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + * + * FIXME btw, we should register some reboot notifier. + */ +#define DRBD_SIGKILL SIGHUP + +#define ID_SYNCER (-1ULL) +#define ID_VACANT 0 // All EEs on the free list should have this value + // freshly allocated EEs get !ID_VACANT (== 1) + // so if it says "cannot dereference null + // pointer at adress 0x00000001, it is most + // probably one of these :( +#define is_syncer_block_id(id) ((id)==ID_SYNCER) + +struct Drbd_Conf; +typedef struct Drbd_Conf drbd_dev; + +#ifdef DBG_ALL_SYMBOLS +# define STATIC +#else +# define STATIC static +#endif + +#ifdef PARANOIA +# define PARANOIA_BUG_ON(x) BUG_ON(x) +#else +# define PARANOIA_BUG_ON(x) +#endif + +/* + * Some Message Macros + *************************/ + +// handy macro: DUMPP(somepointer) +#define DUMPP(A) ERR( #A " = %p in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPLU(A) ERR( #A " = %lu in %s:%d\n", (unsigned long)(A),__FILE__,__LINE__); +#define DUMPLLU(A) ERR( #A " = %llu in %s:%d\n",(unsigned long long)(A),__FILE__,__LINE__); +#define DUMPLX(A) ERR( #A " = %lx in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPI(A) ERR( #A " = %d in %s:%d\n", (int)(A),__FILE__,__LINE__); + +#define DUMPST(A) DUMPLLU((unsigned long long)(A)) + +#if 0 +#define D_DUMPP(A) DUMPP(A) +#define D_DUMPLU(A) DUMPLU(A) +#define D_DUMPLLU(A) DUMPLLU(A) +#define D_DUMPLX(A) DUMPLX(A) +#define D_DUMPI(A) DUMPI(A) +#else +#define D_DUMPP(A) +#define D_DUMPLU(A) +#define D_DUMPLLU(A) +#define D_DUMPLX(A) +#define D_DUMPI(A) +#endif + +// Info: do not remove the spaces around the "," before ## +// Otherwise this is not portable from gcc-2.95 to gcc-3.3 +#define PRINTK(level,fmt,args...) \ + printk(level DEVICE_NAME "%d: " fmt, \ + mdev->minor , ##args) + +#define ALERT(fmt,args...) PRINTK(KERN_ALERT, fmt , ##args) +#define ERR(fmt,args...) PRINTK(KERN_ERR, fmt , ##args) +#define WARN(fmt,args...) PRINTK(KERN_WARNING, fmt , ##args) +#define INFO(fmt,args...) PRINTK(KERN_INFO, fmt , ##args) +#define DBG(fmt,args...) PRINTK(KERN_DEBUG, fmt , ##args) + +/* see kernel/printk.c:printk_ratelimit + * macro, so it is easy do have independend rate limits at different locations + * "initializer element not constant ..." 
with kernel 2.4 :( + * so I initialize toks to something large + */ +#define DRBD_ratelimit(ratelimit_jiffies,ratelimit_burst) \ +({ \ + int __ret; \ + static unsigned long toks = 0x80000000UL; \ + static unsigned long last_msg; \ + static int missed; \ + unsigned long now = jiffies; \ + toks += now - last_msg; \ + last_msg = now; \ + if (toks > (ratelimit_burst * ratelimit_jiffies)) \ + toks = ratelimit_burst * ratelimit_jiffies; \ + if (toks >= ratelimit_jiffies) { \ + int lost = missed; \ + missed = 0; \ + toks -= ratelimit_jiffies; \ + if (lost) \ + WARN("%d messages suppressed in %s:%d.\n",\ + lost , __FILE__ , __LINE__ ); \ + __ret=1; \ + } else { \ + missed++; \ + __ret=0; \ + } \ + __ret; \ +}) + + +#ifdef DBG_ASSERTS +extern void drbd_assert_breakpoint(drbd_dev*, char *, char *, int ); +# define D_ASSERT(exp) if (!(exp)) \ + drbd_assert_breakpoint(mdev,#exp,__FILE__,__LINE__) +#else +# define D_ASSERT(exp) if (!(exp)) \ + ERR("ASSERT( " #exp " ) in %s:%d\n", __FILE__,__LINE__) +#endif +#define ERR_IF(exp) if (({ \ + int _b = (exp)!=0; \ + if (_b) ERR("%s: (" #exp ") in %s:%d\n", __func__, __FILE__,__LINE__); \ + _b; \ + })) + +// Defines to control fault insertion +enum { + DRBD_FAULT_MD_WR = 0, + DRBD_FAULT_MD_RD, + DRBD_FAULT_RS_WR, + DRBD_FAULT_RS_RD, + DRBD_FAULT_DT_WR, + DRBD_FAULT_DT_RD, + DRBD_FAULT_DT_RA, // READA = Read ahead + + DRBD_FAULT_MAX, +}; + +#ifdef DRBD_ENABLE_FAULTS +#define FAULT_ACTIVE(_t) \ + (fault_rate && (enable_faults & (1<<(_t))) && _drbd_insert_fault(_t)) + +extern unsigned int _drbd_insert_fault(unsigned int type); +#else +#define FAULT_ACTIVE(_t) (0) +#endif + +#include +// integer division, round _UP_ to the next integer +#define div_ceil(A,B) ( (A)/(B) + ((A)%(B) ? 1 : 0) ) +// usual integer division +#define div_floor(A,B) ( (A)/(B) ) + +/* + * Compatibility Section + *************************/ + +#define LOCK_SIGMASK(task,flags) spin_lock_irqsave(&task->sighand->siglock, flags) +#define UNLOCK_SIGMASK(task,flags) spin_unlock_irqrestore(&task->sighand->siglock, flags) +#define RECALC_SIGPENDING() recalc_sigpending(); + +#if defined(DBG_SPINLOCKS) && defined(__SMP__) +# define MUST_HOLD(lock) if(!spin_is_locked(lock)) { ERR("Not holding lock! in %s\n", __FUNCTION__ ); } +#else +# define MUST_HOLD(lock) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) +# define HAVE_KERNEL_SENDMSG 1 +#else +# define HAVE_KERNEL_SENDMSG 0 +#endif + + +/* + * our structs + *************************/ + +#define SET_MDEV_MAGIC(x) \ + ({ typecheck(struct Drbd_Conf*,x); \ + (x)->magic = (long)(x) ^ DRBD_MAGIC; }) +#define IS_VALID_MDEV(x) \ + ( typecheck(struct Drbd_Conf*,x) && \ + ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0)) + +/* drbd_meta-data.c (still in drbd_main.c) */ +#define DRBD_MD_MAGIC (DRBD_MAGIC+4) // 4th incarnation of the disk layout. + +extern struct Drbd_Conf **minor_table; + +/*** + * on the wire + *********************************************************************/ + +typedef enum { + Data, + DataReply, // Response to DataRequest + RSDataReply, // Response to RSDataRequest + Barrier, + ReportBitMap, + BecomeSyncTarget, + BecomeSyncSource, + UnplugRemote, // Used at various times to hint the peer to hurry up + DataRequest, // Used to ask for a data block + RSDataRequest, // Used to ask for a data block + SyncParam, + ReportProtocol, + ReportUUIDs, + ReportSizes, + ReportState, + ReportSyncUUID, + AuthChallenge, + AuthResponse, + StateChgRequest, + + Ping, // These are sent on the meta socket... 
+ PingAck, + RecvAck, // Used in protocol B + WriteAck, // Used in protocol C + RSWriteAck, // Is a WriteAck, additionally call set_in_sync(). + DiscardAck, // Used in protocol C, two-primaries conflict detection + NegAck, // Sent if local disk is unusable + NegDReply, // Local disk is broken... + NegRSDReply, // Local disk is broken... + BarrierAck, + StateChgReply, + + MAX_CMD, + MayIgnore = 0x100, // Flag only to test if (cmd > MayIgnore) ... + MAX_OPT_CMD, + + /* FIXME + * to get a more useful error message with drbd-8 <-> drbd 0.7.x, + * these could be reimplemented as special case of HandShake. */ + HandShakeM = 0xfff1, // First Packet on the MetaSock + HandShakeS = 0xfff2, // First Packet on the Socket + + HandShake = 0xfffe // FIXED for the next century! +} Drbd_Packet_Cmd; + +static inline const char* cmdname(Drbd_Packet_Cmd cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [Data] = "Data", + [DataReply] = "DataReply", + [RSDataReply] = "RSDataReply", + [Barrier] = "Barrier", + [ReportBitMap] = "ReportBitMap", + [BecomeSyncTarget] = "BecomeSyncTarget", + [BecomeSyncSource] = "BecomeSyncSource", + [UnplugRemote] = "UnplugRemote", + [DataRequest] = "DataRequest", + [RSDataRequest] = "RSDataRequest", + [SyncParam] = "SyncParam", + [ReportProtocol] = "ReportProtocol", + [ReportUUIDs] = "ReportUUIDs", + [ReportSizes] = "ReportSizes", + [ReportState] = "ReportState", + [ReportSyncUUID] = "ReportSyncUUID", + [AuthChallenge] = "AuthChallenge", + [AuthResponse] = "AuthResponse", + [Ping] = "Ping", + [PingAck] = "PingAck", + [RecvAck] = "RecvAck", + [WriteAck] = "WriteAck", + [RSWriteAck] = "RSWriteAck", + [DiscardAck] = "DiscardAck", + [NegAck] = "NegAck", + [NegDReply] = "NegDReply", + [NegRSDReply] = "NegRSDReply", + [BarrierAck] = "BarrierAck", + [StateChgRequest] = "StateChgRequest", + [StateChgReply] = "StateChgReply" + }; + + if (Data > cmd || cmd >= MAX_CMD) { + switch (cmd) { + case HandShakeM: + return "HandShakeM"; + break; + case HandShakeS: + return "HandShakeS"; + break; + case HandShake: + return "HandShake"; + break; + default: + return "Unknown"; + break; + } + } + return cmdnames[cmd]; +} + + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +typedef struct { + u32 magic; + u16 command; + u16 length; // bytes of data after this header + char payload[0]; +} __attribute((packed)) Drbd_Header; +// 8 bytes. packet FIXED for the next century! + +/* + * short commands, packets without payload, plain Drbd_Header: + * Ping + * PingAck + * BecomeSyncTarget + * BecomeSyncSource + * UnplugRemote + */ + +/* + * commands with out-of-struct payload: + * ReportBitMap (no additional fields) + * Data, DataReply (see Drbd_Data_Packet) + */ + +#define DP_HARDBARRIER 1 +#define DP_RW_SYNC 2 +#define DP_MAY_SET_IN_SYNC 4 + +typedef struct { + Drbd_Header head; + u64 sector; // 64 bits sector number + u64 block_id; // Used in protocol B&C for the address of the req. 
+ u32 seq_num; + u32 dp_flags; +} __attribute((packed)) Drbd_Data_Packet; + +/* + * commands which share a struct: + * RecvAck (proto B), WriteAck (proto C) (see Drbd_BlockAck_Packet) + * DataRequest, RSDataRequest (see Drbd_BlockRequest_Packet) + */ +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __attribute((packed)) Drbd_BlockAck_Packet; + + +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_BlockRequest_Packet; + +/* + * commands with their own struct for additional fields: + * HandShake + * Barrier + * BarrierAck + * SyncParam + * ReportParams + */ + +typedef struct { + Drbd_Header head; // 8 bytes + u32 protocol_version; + u32 feature_flags; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u64 reserverd[8]; +} __attribute((packed)) Drbd_HandShake_Packet; +// 80 bytes, FIXED for the next century + +/* FIXME do we actually send a barrier packet with "0" as barrier number? + * what for? + * couldn't we send the pointer as handle as well, as we do with block_id? + */ +typedef struct { + Drbd_Header head; + u32 barrier; // may be 0 or a barrier number + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_Barrier_Packet; + +typedef struct { + Drbd_Header head; + u32 barrier; + u32 set_size; +} __attribute((packed)) Drbd_BarrierAck_Packet; + +typedef struct { + Drbd_Header head; + u32 rate; +} __attribute((packed)) Drbd_SyncParam_Packet; + +typedef struct { + Drbd_Header head; + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 want_lose; + u32 two_primaries; +} __attribute((packed)) Drbd_Protocol_Packet; + +typedef struct { + Drbd_Header head; + u64 uuid[EXT_UUID_SIZE]; +} __attribute((packed)) Drbd_GenCnt_Packet; + +typedef struct { + Drbd_Header head; + u64 uuid; +} __attribute((packed)) Drbd_SyncUUID_Packet; + +typedef struct { + Drbd_Header head; + u64 d_size; // size of disk + u64 u_size; // user requested size + u64 c_size; // current exported size + u32 max_segment_size; // Maximal size of a BIO + u32 queue_order_type; +} __attribute((packed)) Drbd_Sizes_Packet; + +typedef struct { + Drbd_Header head; + u32 state; +} __attribute((packed)) Drbd_State_Packet; + +typedef struct { + Drbd_Header head; + u32 mask; + u32 val; +} __attribute((packed)) Drbd_Req_State_Packet; + +typedef struct { + Drbd_Header head; + u32 retcode; +} __attribute((packed)) Drbd_RqS_Reply_Packet; + +typedef struct { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __attribute((packed)) Drbd06_Parameter_P; + +typedef struct { + Drbd_Header head; + u64 block_id; + u32 seq_num; + u32 pad; +} __attribute((packed)) Drbd_Discard_Packet; + +typedef union { + Drbd_Header head; + Drbd_HandShake_Packet HandShake; + Drbd_Data_Packet Data; + Drbd_BlockAck_Packet BlockAck; + Drbd_Barrier_Packet Barrier; + Drbd_BarrierAck_Packet BarrierAck; + Drbd_SyncParam_Packet SyncParam; + Drbd_Protocol_Packet Protocol; + Drbd_Sizes_Packet Sizes; + Drbd_GenCnt_Packet GenCnt; + Drbd_State_Packet State; + Drbd_Req_State_Packet ReqState; + Drbd_RqS_Reply_Packet RqSReply; + Drbd_BlockRequest_Packet BlockRequest; +} __attribute((packed)) Drbd_Polymorph_Packet; + +/**********************************************************************/ + +typedef enum { + None, + Running, + 
Exiting, + Restarting +} Drbd_thread_state; + +struct Drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion startstop; + Drbd_thread_state t_state; + int (*function) (struct Drbd_thread *); + drbd_dev *mdev; +}; + +static inline Drbd_thread_state get_t_state(struct Drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return (volatile int)thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and Tl_epoch_entry are descendants of drbd_work. + */ +struct drbd_work; +typedef int (*drbd_work_cb)(drbd_dev*, struct drbd_work*, int cancel); +struct drbd_work { + struct list_head list; + drbd_work_cb cb; +}; + +struct drbd_barrier; +struct drbd_request { + struct drbd_work w; + drbd_dev *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int epoch; /* barrier_nr */ + + /* barrier_nr: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * starting a new epoch... + */ + + /* up to here, the struct layout is identical to Tl_epoch_entry; + * we might be able to use that to our advantage... */ + + struct list_head tl_requests; /* ring list in the transfer log */ + struct bio *master_bio; /* master bio pointer */ + unsigned long rq_state; /* see comments above _req_mod() */ + int seq_num; +}; + +struct drbd_barrier { + struct drbd_work w; + struct list_head requests; // requests before + struct drbd_barrier *next; // pointer to the next barrier + unsigned int br_number; // the barriers identifier. + int n_req; // number of requests attached before this barrier +}; + +typedef struct drbd_request drbd_request_t; + +/* These Tl_epoch_entries may be in one of 6 lists: + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send WriteAck + read_ee .. [RS]DataRequest being read +*/ + +struct Tl_epoch_entry { + struct drbd_work w; + drbd_dev *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int barrier_nr; + + /* up to here, the struct layout is identical to drbd_request; + * we might be able to use that to our advantage... */ + + unsigned int barrier_nr2; + /* If we issue the bio with BIO_RW_BARRIER we have to + send a barrier ACK before we send the ACK to this + write. We store the barrier number in here. 
+ In case the barrier after this write has been coalesced + as well, we set it's barrier_nr into barrier_nr2 */ + + unsigned int flags; + u64 block_id; +}; + +/* ee flag bits */ +enum { + __EE_CALL_AL_COMPLETE_IO, + __EE_CONFLICT_PENDING, + __EE_MAY_SET_IN_SYNC, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) + +/* global flag bits */ +enum { + ISSUE_BARRIER, // next Data is preceeded by a Barrier + SIGNAL_ASENDER, // whether asender wants to be interrupted + SEND_PING, // whether asender should send a ping asap + WRITE_ACK_PENDING, // so BarrierAck won't overtake WriteAck + WORK_PENDING, // completion flag for drbd_disconnect + STOP_SYNC_TIMER, // tell timer to cancel itself + UNPLUG_QUEUED, // only relevant with kernel 2.4 + UNPLUG_REMOTE, // whether sending a "UnplugRemote" makes sense + MD_DIRTY, // current gen counts and flags not yet on disk + DISCARD_CONCURRENT, // Set on one node, cleared on the peer! + USE_DEGR_WFC_T, // Use degr-wfc-timeout instead of wfc-timeout. + CLUSTER_ST_CHANGE, // Cluster wide state change going on... + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, // This node was a crashed primary. Gets + // cleared when the state.conn goes into + // Connected state. + WRITE_BM_AFTER_RESYNC // A kmalloc() during resync failed +}; + +struct drbd_bitmap; // opaque for Drbd_Conf + +// TODO sort members for performance +// MAYBE group them further + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; // producers up it, worker down()s it + spinlock_t q_lock; // to protect the list. +}; + +/* If Philipp agrees, we remove the "mutex", and make_request will only + * (throttle on "queue full" condition and) queue it to the worker thread... + * which then is free to do whatever is needed, and has exclusive send access + * to the data socket ... + */ +struct drbd_socket { + struct drbd_work_queue work; + struct semaphore mutex; + struct socket *socket; + Drbd_Polymorph_Packet sbuf; // this way we get our + Drbd_Polymorph_Packet rbuf; // send/receive buffers off the stack +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + u64 uuid[UUID_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to al area */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* u32 al_nr_extents; important for restoring the AL + * is stored into sync_conf.al_extents, which in turn + * gets applied to act_log->nr_elements + */ +}; + +// for sync_conf and other types... +#define PACKET(name, number, fields) struct name { fields }; +#define INTEGER(pn,pr,member) int member; +#define INT64(pn,pr,member) __u64 member; +#define BIT(pn,pr,member) unsigned member : 1; +#define STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; +#include "linux/drbd_nl.h" + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct file *lo_file; + struct file *md_file; + struct drbd_md md; + struct disk_conf dc; /* The user provided config... 
*/ +}; + +struct Drbd_Conf { +#ifdef PARANOIA + long magic; +#endif + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct net_conf *net_conf; // protected by inc_net() and dec_net() + struct syncer_conf sync_conf; + struct drbd_backing_dev *bc; // protected by inc_local() dec_local() + + sector_t p_size; /* partner's disk size */ + request_queue_t *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + struct drbd_socket data; // for data/barrier/cstate/parameter packets + struct drbd_socket meta; // for ping/ack (metadata) packets + volatile unsigned long last_received; // in jiffies, either socket + volatile unsigned int ko_count; + struct drbd_work resync_work, + unplug_work, + md_sync_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + + drbd_state_t new_state_tmp; // Used after attach while negotiating new disk state. + drbd_state_t state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; // upon each state change. + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; // Requests we need to complete + atomic_t ap_pending_cnt; // AP data packets on the wire, ack expected + atomic_t rs_pending_cnt; // RS request/data packets on the wire + atomic_t unacked_cnt; // Need to send replys for + atomic_t local_cnt; // Waiting for local disk to signal completion + atomic_t net_cnt; // Users of net_conf + spinlock_t req_lock; + struct drbd_barrier* unused_spare_barrier; /* for pre-allocation */ + struct drbd_barrier* newest_barrier; + struct drbd_barrier* oldest_barrier; + struct hlist_head * tl_hash; + unsigned int tl_hash_s; + // sector_t rs_left; // blocks not up-to-date [unit BM_BLOCK_SIZE] + // moved into bitmap->bm_set + unsigned long rs_total; // blocks to sync in this run [unit BM_BLOCK_SIZE] + unsigned long rs_failed; // number of sync IOs that failed in this run + unsigned long rs_start; // Syncer's start time [unit jiffies] + unsigned long rs_paused; // cumulated time in PausedSyncX state [unit jiffies] + unsigned long rs_mark_left;// block not up-to-date at mark [unit BM_BLOCK_SIZE] + unsigned long rs_mark_time;// marks's time [unit jiffies] + struct Drbd_thread receiver; + struct Drbd_thread worker; + struct Drbd_thread asender; + struct drbd_bitmap* bitmap; + struct lru_cache* resync; // Used to track operations of resync... + unsigned int resync_locked; // Number of locked elements in resync LRU + unsigned int resync_wenr; // resync extent number waiting for application requests + int open_cnt; + u64 *p_uuid; + /* FIXME clean comments, restructure so it is more obvious which + * members are protected by what */ + unsigned int epoch_size; + struct list_head active_ee; // IO in progress + struct list_head sync_ee; // IO in progress + struct list_head done_ee; // send ack + struct list_head read_ee; // IO in progress + struct list_head net_ee; // zero-copy network send in progress + struct hlist_head * ee_hash; // is proteced by req_lock! 
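+	/* number of slots in the ee_hash table above
+	 * (same scheme as tl_hash / tl_hash_s) */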
+ unsigned int ee_hash_s; + struct Tl_epoch_entry * last_write_w_barrier; // ee_lock, single thread + int next_barrier_nr; // ee_lock, single thread + struct hlist_head * app_reads_hash; // is proteced by req_lock + struct list_head resync_reads; + atomic_t pp_in_use; + wait_queue_head_t ee_wait; + struct page *md_io_page; // one page buffer for md_io + struct page *md_io_tmpp; // in case hardsect != 512 [ s390 only? ] + struct semaphore md_io_mutex; // protects the md_io_buffer + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache* act_log; // activity log + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; // position of the next transaction in the journal + struct crypto_hash* cram_hmac_tfm; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + int minor; + unsigned long comm_bm_set; // communicated number of set bits. +}; + +static inline drbd_dev *minor_to_mdev(int minor) +{ + drbd_dev *mdev; + + mdev = minor < minor_count ? minor_table[minor] : NULL; + + return mdev; +} + +static inline int mdev_to_minor(drbd_dev *mdev) +{ + return mdev->minor; +} + +/* returns 1 if it was successfull, + * returns 0 if there was no data socket. + * so wherever you are going to use the data.socket, e.g. do + * if (!drbd_get_data_sock(mdev)) + * return 0; + * CODE(); + * drbd_put_data_sock(mdev); + */ +static inline int drbd_get_data_sock(drbd_dev *mdev) +{ + down(&mdev->data.mutex); + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (unlikely(mdev->data.socket == NULL)) { + up(&mdev->data.mutex); + return 0; + } + return 1; +} + +static inline void drbd_put_data_sock(drbd_dev *mdev) +{ + up(&mdev->data.mutex); +} + + +/* + * function declarations + *************************/ + +// drbd_main.c + +enum chg_state_flags { + ChgStateHard = 1, + ChgStateVerbose = 2, + ScheduleAfter = 4, +}; + +extern int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, + drbd_state_t mask, drbd_state_t val); +extern void drbd_force_state(drbd_dev*, drbd_state_t, drbd_state_t); +extern int _drbd_request_state(drbd_dev*, drbd_state_t, drbd_state_t, + enum chg_state_flags); +extern int _drbd_set_state(drbd_dev*, drbd_state_t, enum chg_state_flags ); +extern void print_st_err(drbd_dev*, drbd_state_t, drbd_state_t, int ); +extern void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, + enum chg_state_flags); +extern int drbd_thread_start(struct Drbd_thread *thi); +extern void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait); +extern void drbd_free_resources(drbd_dev *mdev); +extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(drbd_dev *mdev); +extern struct drbd_barrier *_tl_add_barrier(drbd_dev *,struct drbd_barrier *); +extern void drbd_free_sock(drbd_dev *mdev); +extern int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags); +extern int drbd_send_protocol(drbd_dev *mdev); +extern int drbd_send_uuids(drbd_dev *mdev); +extern int drbd_send_sync_uuid(drbd_dev *mdev, u64 val); +extern int drbd_send_sizes(drbd_dev *mdev); +extern int drbd_send_state(drbd_dev *mdev); +extern int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags); +#define USE_DATA_SOCKET 1 +#define USE_META_SOCKET 0 +extern int drbd_send_cmd(drbd_dev *mdev, int use_data_socket, + Drbd_Packet_Cmd cmd, Drbd_Header *h, 
size_t size); +extern int drbd_send_cmd2(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + char* data, size_t size); +extern int drbd_send_sync_param(drbd_dev *mdev, struct syncer_conf *sc); +extern int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e); +extern int drbd_send_ack_rp(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + Drbd_BlockRequest_Packet *rp); +extern int drbd_send_ack_dp(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + Drbd_Data_Packet *dp); +extern int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size); +extern int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e); +extern int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req); +extern int _drbd_send_barrier(drbd_dev *mdev, struct drbd_barrier *barrier); +extern int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id); +extern int drbd_send_bitmap(drbd_dev *mdev); +extern int _drbd_send_bitmap(drbd_dev *mdev); +extern int drbd_send_sr_reply(drbd_dev *mdev, int retcode); +extern void drbd_free_bc(struct drbd_backing_dev* bc); +extern int drbd_io_error(drbd_dev* mdev, int forcedetach); +extern void drbd_mdev_cleanup(drbd_dev *mdev); + +// drbd_meta-data.c (still in drbd_main.c) +extern void drbd_md_sync(drbd_dev *mdev); +extern int drbd_md_read(drbd_dev *mdev, struct drbd_backing_dev * bdev); +// maybe define them below as inline? +extern void drbd_uuid_set(drbd_dev *mdev,int idx, u64 val); +extern void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val); +extern void drbd_uuid_new_current(drbd_dev *mdev); +extern void drbd_uuid_set_bm(drbd_dev *mdev, u64 val); +extern void drbd_md_set_flag(drbd_dev *mdev, int flags); +extern void drbd_md_clear_flag(drbd_dev *mdev, int flags); +extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +extern void drbd_md_mark_dirty(drbd_dev *mdev); + +/* Meta data layout + We reserve a 128MB Block (4k aligned) + * either at the end of the backing device + * or on a seperate meta data device. */ + +#define MD_RESERVED_SECT ( 128LU << 11 ) // 128 MB, unit sectors +// The following numbers are sectors +#define MD_AL_OFFSET 8 // 8 Sectors after start of meta area +#define MD_AL_MAX_SIZE 64 // = 32 kb LOG ~ 3776 extents ~ 14 GB Storage +#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) //Allows up to about 3.8TB + +#define MD_HARDSECT_B 9 // Since the smalles IO unit is usually 512 byte +#define MD_HARDSECT (1< we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. + */ + +#define BM_BLOCK_SIZE_B 12 // 4k per bit +#define BM_BLOCK_SIZE (1<>(BM_BLOCK_SIZE_B-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SIZE_B-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SIZE_B-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SIZE_B-9)) + +/* who much _storage_ sectors we have per bitmap sector */ +#define BM_SECT_PER_EXT (1ULL << (BM_EXT_SIZE_B-9)) + +/* in one sector of the bitmap, we have this many activity_log extents. 
*/ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SIZE_B - AL_EXTENT_SIZE_B) ) +#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + + +#define BM_BLOCKS_PER_BM_EXT_B ( BM_EXT_SIZE_B - BM_BLOCK_SIZE_B ) +#define BM_BLOCKS_PER_BM_EXT_MASK ( (1<= level) && (type & trace_type)); +} +static inline int +is_mdev_trace(drbd_dev *mdev, unsigned int type, unsigned int level) { + return (is_trace(type, level) && + ( ( 1 << mdev_to_minor(mdev)) & trace_devs)); +} + +#define MTRACE(type,lvl,code...) \ +do { \ + if (unlikely(is_mdev_trace(mdev,type,lvl))) { \ + code \ + } \ +} while (0) + +#define TRACE(type,lvl,code...) \ +do { \ + if (unlikely(is_trace(type,lvl))) { \ + code \ + } \ +} while (0) + +// Buffer printing support +// DbgPrintFlags: used for Flags arg to DbgPrintBuffer +// - DBGPRINT_BUFFADDR; if set, each line starts with the +// virtual address of the line being output. If clear, +// each line starts with the offset from the beginning +// of the buffer. +typedef enum { + DBGPRINT_BUFFADDR = 0x0001, +} DbgPrintFlags; + +extern void drbd_print_uuid(drbd_dev *mdev, unsigned int idx); + +extern void drbd_print_buffer(const char *prefix,unsigned int flags,int size, + const void *buffer,const void *buffer_va, + unsigned int length); + +// Bio printing support +extern void _dump_bio(drbd_dev *mdev, struct bio *bio, int complete); + +static inline void dump_bio(drbd_dev *mdev, struct bio *bio, int complete) { + MTRACE(TraceTypeRq,TraceLvlSummary, + _dump_bio(mdev, bio, complete); + ); +} + +// Packet dumping support +extern void _dump_packet(drbd_dev *mdev, struct socket *sock, + int recv, Drbd_Polymorph_Packet *p, char* file, int line); + +static inline void +dump_packet(drbd_dev *mdev, struct socket *sock, + int recv, Drbd_Polymorph_Packet *p, char* file, int line) +{ + MTRACE(TraceTypePacket, TraceLvlSummary, + _dump_packet(mdev,sock,recv,p,file,line); + ); +} + +#else + +#define MTRACE(ignored...) ((void)0) +#define TRACE(ignored...) ((void)0) + +#define dump_bio(ignored...) ((void)0) +#define dump_packet(ignored...) ((void)0) +#endif + +// drbd_req +extern int drbd_make_request_26(request_queue_t *q, struct bio *bio); +extern int drbd_read_remote(drbd_dev *mdev, drbd_request_t *req); +extern int drbd_merge_bvec(request_queue_t *, struct bio *, struct bio_vec *); +extern int is_valid_ar_handle(drbd_request_t *, sector_t); + + +// drbd_nl.c +extern char* ppsize(char* buf, unsigned long long size); +extern sector_t drbd_new_dev_size(struct Drbd_Conf*, struct drbd_backing_dev*); +extern int drbd_determin_dev_size(drbd_dev*); +extern void drbd_setup_queue_param(drbd_dev *mdev, unsigned int); +extern int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force); +extern int drbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +drbd_disks_t drbd_try_outdate_peer(drbd_dev *mdev); +extern long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg); +extern int drbd_khelper(drbd_dev *mdev, char* cmd); + +// drbd_worker.c +extern int drbd_worker(struct Drbd_thread *thi); +extern void drbd_alter_sa(drbd_dev *mdev, int na); +extern void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side); +extern void resume_next_sg(drbd_dev* mdev); +extern void suspend_other_sg(drbd_dev* mdev); +extern int drbd_resync_finished(drbd_dev *mdev); +// maybe rather drbd_main.c ? 
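+
+/* Illustrative sketch (the names example_work / w_example are made up, not
+ * part of the driver): how a "derived" work item is built on top of
+ * struct drbd_work and handed to the worker, compare the "inheritance"
+ * comment above struct drbd_work and the after_state_chg_work usage in
+ * drbd_main.c.
+ *
+ *	struct example_work {
+ *		struct drbd_work w;	// must stay the first member
+ *		int payload;
+ *	};
+ *
+ *	static int w_example(drbd_dev *mdev, struct drbd_work *w, int cancel)
+ *	{
+ *		struct example_work *ew = (struct example_work *)w;
+ *		if (cancel)
+ *			return 1;	// nothing to undo in this sketch
+ *		// ... use ew->payload ...
+ *		return 1;
+ *	}
+ *
+ * and from process context:
+ *
+ *	ew->w.cb = w_example;
+ *	drbd_queue_work(&mdev->data.work, &ew->w);
+ */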
+extern int drbd_md_sync_page_io(drbd_dev *mdev, struct drbd_backing_dev *bdev, + sector_t sector, int rw); +// worker callbacks +extern int w_req_cancel_conflict (drbd_dev *, struct drbd_work *, int); +extern int w_read_retry_remote (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_data_req (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_rsdata_req (drbd_dev *, struct drbd_work *, int); +extern int w_resync_inactive (drbd_dev *, struct drbd_work *, int); +extern int w_resume_next_sg (drbd_dev *, struct drbd_work *, int); +extern int w_io_error (drbd_dev *, struct drbd_work *, int); +extern int w_send_write_hint (drbd_dev *, struct drbd_work *, int); +extern int w_make_resync_request (drbd_dev *, struct drbd_work *, int); +extern int w_send_dblock (drbd_dev *, struct drbd_work *, int); +extern int w_send_barrier (drbd_dev *, struct drbd_work *, int); +extern int w_send_read_req (drbd_dev *, struct drbd_work *, int); +extern int w_prev_work_done (drbd_dev *, struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); + +#if 0 +#define BD_CLAIM(bdev,holder) ({ \ + int r = bd_claim(bdev,holder); \ + printk(KERN_INFO "drbd: %u = bd_claim(%p,%p); [%p;%u]\n", \ + r, bdev, holder, bdev->bd_holder, bdev->bd_holders); \ + r; }) + +#define BD_RELEASE(bdev) do { \ + printk(KERN_INFO "drbd: pre: bd_release(%p); [%p;%u]\n", \ + bdev, bdev->bd_holder, bdev->bd_holders); \ + bd_release(bdev); \ + printk(KERN_INFO "drbd: post: bd_release(%p); [%p;%u]\n", \ + bdev, bdev->bd_holder, bdev->bd_holders); \ + } while (0) +#else +#define BD_CLAIM(bdev,holder) bd_claim(bdev,holder) +#define BD_RELEASE(bdev) bd_release(bdev) +#endif + +// drbd_receiver.c +extern int drbd_release_ee(drbd_dev* mdev,struct list_head* list); +extern struct Tl_epoch_entry* drbd_alloc_ee(drbd_dev *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + unsigned int gfp_mask); +extern void drbd_free_ee(drbd_dev *mdev, struct Tl_epoch_entry* e); +extern void drbd_wait_ee_list_empty(drbd_dev *mdev, struct list_head *head); +extern void _drbd_wait_ee_list_empty(drbd_dev *mdev, struct list_head *head); +extern void drbd_set_recv_tcq(drbd_dev *mdev, int tcq_enabled); +extern void _drbd_clear_done_ee(drbd_dev *mdev); + +static inline void drbd_tcp_cork(struct socket *sock) +{ +#if 1 + mm_segment_t oldfs = get_fs(); + int val = 1; + + set_fs(KERNEL_DS); + tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val) ); + set_fs(oldfs); +#else + tcp_sk(sock->sk)->nonagle |= TCP_NAGLE_CORK; +#endif +} + +static inline void drbd_tcp_flush(struct socket *sock) +{ +#if 1 + mm_segment_t oldfs = get_fs(); + int val = 0; + + set_fs(KERNEL_DS); + tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val) ); + set_fs(oldfs); +#else + tcp_sk(sock->sk)->nonagle &= ~TCP_NAGLE_CORK; + tcp_push_pending_frames(sock->sk, tcp_sk(sock->sk)); +#endif +} + +// drbd_proc.c +extern struct proc_dir_entry *drbd_proc; +extern struct file_operations drbd_proc_fops; +extern const char* conns_to_name(drbd_conns_t s); +extern const char* roles_to_name(drbd_role_t s); + +// drbd_actlog.c +extern void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern int drbd_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern int drbd_try_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_cancel_all(drbd_dev* mdev); +extern int 
drbd_rs_del_all(drbd_dev* mdev); +extern void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size); +extern int drbd_al_read_log(struct Drbd_Conf *mdev,struct drbd_backing_dev *); +extern void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_in_sync(mdev,sector,size) \ + __drbd_set_in_sync(mdev,sector,size, __FILE__, __LINE__ ) +extern void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_out_of_sync(mdev,sector,size) \ + __drbd_set_out_of_sync(mdev,sector,size, __FILE__, __LINE__ ) +extern void drbd_al_apply_to_bm(struct Drbd_Conf *mdev); +extern void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev); +extern void drbd_al_shrink(struct Drbd_Conf *mdev); + + +// drbd_nl.c + +void drbd_nl_cleanup(void); +int __init drbd_nl_init(void); +void drbd_bcast_state(drbd_dev *mdev); + +/* + * inline helper functions + *************************/ + +#define peer_mask role_mask +#define pdsk_mask disk_mask +#define susp_mask 1 +#define user_isp_mask 1 +#define aftr_isp_mask 1 + +#define NS(T,S) ({drbd_state_t mask; mask.i=0; mask.T = T##_mask; mask;}), \ + ({drbd_state_t val; val.i=0; val.T = (S); val;}) +#define NS2(T1,S1,T2,S2) \ + ({drbd_state_t mask; mask.i=0; mask.T1 = T1##_mask; \ + mask.T2 = T2##_mask; mask;}), \ + ({drbd_state_t val; val.i=0; val.T1 = (S1); \ + val.T2 = (S2); val;}) +#define NS3(T1,S1,T2,S2,T3,S3) \ + ({drbd_state_t mask; mask.i=0; mask.T1 = T1##_mask; \ + mask.T2 = T2##_mask; mask.T3 = T3##_mask; mask;}), \ + ({drbd_state_t val; val.i=0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val;}) + +#define _NS(D,T,S) D,({drbd_state_t ns; ns.i = D->state.i; ns.T = (S); ns;}) +#define _NS2(D,T1,S1,T2,S2) \ + D,({drbd_state_t ns; ns.i = D->state.i; ns.T1 = (S1); \ + ns.T2 = (S2); ns;}) +#define _NS3(D,T1,S1,T2,S2,T3,S3) \ + D,({drbd_state_t ns; ns.i = D->state.i; ns.T1 = (S1); \ + ns.T2 = (S2); ns.T3 = (S3); ns;}) + +static inline void drbd_state_lock(drbd_dev *mdev) +{ + wait_event(mdev->misc_wait, + !test_and_set_bit(CLUSTER_ST_CHANGE,&mdev->flags)); +} + +static inline void drbd_state_unlock(drbd_dev *mdev) +{ + clear_bit(CLUSTER_ST_CHANGE,&mdev->flags); + wake_up(&mdev->misc_wait); +} + +static inline int drbd_request_state(drbd_dev* mdev, drbd_state_t mask, + drbd_state_t val) +{ + return _drbd_request_state(mdev, mask, val, ChgStateVerbose); +} + +/** + * drbd_chk_io_error: Handles the on_io_error setting, should be called from + * all io completion handlers. See also drbd_io_error(). + */ +static inline void __drbd_chk_io_error(drbd_dev* mdev, int forcedetach) +{ + switch(mdev->bc->dc.on_io_error) { + case PassOn: /* FIXME would this be better named "Ignore"? */ + if (!forcedetach) { + if (printk_ratelimit()) + ERR("Local IO failed. Passing error on...\n"); + break; + } + /* NOTE fall through to detach case if forcedetach set */ + case Detach: + if (_drbd_set_state(_NS(mdev,disk,Failed),ChgStateHard) + == SS_Success) { + if (printk_ratelimit()) + ERR("Local IO failed. 
Detaching...\n"); + } + break; + case CallIOEHelper: + _drbd_set_state(_NS(mdev,disk,Failed),ChgStateHard); + break; + } +} + +static inline void drbd_chk_io_error(drbd_dev* mdev, int error, int forcedetach) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + __drbd_chk_io_error(mdev,forcedetach); + spin_unlock_irqrestore(&mdev->req_lock,flags); + } +} + +static inline int semaphore_is_locked(struct semaphore* s) +{ + if(!down_trylock(s)) { + up(s); + return 0; + } + return 1; +} + +/* Returns the first sector number of our meta data, + * which, for internal meta data, happens to be the maximum capacity + * we could agree upon with our peer + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/* returns the last sector number of our meta data, + * to be able to catch out of band md access */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_AL_OFFSET -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect; + } +} + +/* returns the capacity we announce to out peer */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return drbd_get_capacity(bdev->backing_bdev) + ? drbd_md_first_sector(bdev) + : 0; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return drbd_get_capacity(bdev->backing_bdev); + } +} + +/* returns the sector number of our meta data 'super' block */ +static inline sector_t drbd_md_ss__(drbd_dev *mdev, + struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + default: /* external, some index */ + return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; + case DRBD_MD_INDEX_INTERNAL: + /* with drbd08, internal meta data is always "flexible" */ + case DRBD_MD_INDEX_FLEX_INT: + /* sizeof(struct md_on_disk_07) == 4k + * position: last 4k aligned block of 4k size */ + if (!bdev->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("bdev->backing_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) + - MD_AL_OFFSET; + case DRBD_MD_INDEX_FLEX_EXT: + return 0; + } +} + +static inline void +_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add_tail(&w->list,&q->q); + up(&q->s); +} + +static inline void +drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock,flags); + list_add(&w->list,&q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock,flags); +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock,flags); + list_add_tail(&w->list,&q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock,flags); +} + +static inline void wake_asender(drbd_dev *mdev) { + if(test_bit(SIGNAL_ASENDER, &mdev->flags)) { + force_sig(DRBD_SIG, mdev->asender.task); + } +} + +static inline void request_ping(drbd_dev *mdev) { + 
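+	/* only set the flag and kick the asender out of its blocking
+	 * receive; the asender then sends the actual Ping packet on the
+	 * meta socket (drbd_send_ping() below), and the peer is expected
+	 * to answer with a PingAck. */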
set_bit(SEND_PING,&mdev->flags); + wake_asender(mdev); +} + +static inline int drbd_send_short_cmd(drbd_dev *mdev, Drbd_Packet_Cmd cmd) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,USE_DATA_SOCKET,cmd,&h,sizeof(h)); +} + +static inline int drbd_send_ping(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,USE_META_SOCKET,Ping,&h,sizeof(h)); +} + +static inline int drbd_send_ping_ack(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,USE_META_SOCKET,PingAck,&h,sizeof(h)); +} + +static inline void drbd_thread_stop(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,TRUE); +} + +static inline void drbd_thread_stop_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,FALSE); +} + +static inline void drbd_thread_restart_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,TRUE,FALSE); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, queue_for_net_write or queue_for_net_read); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, data_received) + * [from receive_DataReply] + * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) + * [from got_BlockAck (WriteAck, RecvAck)] + * FIXME + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, send_failed or send_canceled) + * _req_mod(req, connection_lost_while_pending) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if(atomic_read(&mdev->which)<0) \ + ERR("in %s:%d: " #which " = %d < 0 !\n", \ + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) + +#define dec_ap_pending(mdev) do { \ + typecheck(drbd_dev*,mdev); \ + if(atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->misc_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * SyncTarget sends RSDataRequest (and expects RSDataReply) + * SyncSource sends RSDataReply (and expects WriteAck whith ID_SYNCER) + * (or NegAck with ID_SYNCER) + */ +static inline void inc_rs_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->rs_pending_cnt); +} + +#define dec_rs_pending(mdev) do { \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) + +/* counts how many answers we still need to send to the peer. 
+ * increased on + * receive_Data unless protocol A; + * we need to send a RecvAck (proto B) + * or WriteAck (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a WriteAck + * receive_DataRequest (receive_RSDataRequest) we need to send back Data + * receive_Barrier_* we need to send a BarrierAck + */ +static inline void inc_unacked(drbd_dev* mdev) +{ + atomic_inc(&mdev->unacked_cnt); +} + +#define dec_unacked(mdev) do { \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + +#define sub_unacked(mdev, n) do { \ + typecheck(drbd_dev*,mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + + +static inline void dec_net(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->net_cnt)) { + wake_up(&mdev->misc_wait); + } +} + +/** + * inc_net: Returns TRUE when it is ok to access mdev->net_conf. You + * should call dec_net() when finished looking at mdev->net_conf. + */ +static inline int inc_net(drbd_dev* mdev) +{ + int have_net_conf; + + atomic_inc(&mdev->net_cnt); + have_net_conf = mdev->state.conn >= Unconnected; + if(!have_net_conf) dec_net(mdev); + return have_net_conf; +} + +/* strictly speaking, + * these would have to hold the req_lock while looking at + * the disk state. But since we cannot submit within a spinlock, + * this is mood... + */ + +static inline void dec_local(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->local_cnt)) { + wake_up(&mdev->misc_wait); + } + D_ASSERT(atomic_read(&mdev->local_cnt)>=0); +} +/** + * inc_local: Returns TRUE when local IO is possible. If it returns + * TRUE you should call dec_local() after IO is completed. + */ +static inline int inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins ); + if( !io_allowed ) { + dec_local(mdev); + } + return io_allowed; +} +static inline int inc_local(drbd_dev* mdev) +{ + return inc_local_if_state(mdev, Inconsistent); +} + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(drbd_dev* mdev) +{ + int mxb = 1000000; /* arbitrary limit on open requests */ + if(inc_net(mdev)) { + mxb = mdev->net_conf->max_buffers; + dec_net(mdev); + } + return mxb; +} + +static inline int __inc_ap_bio_cond(drbd_dev* mdev) { + int mxb = drbd_get_max_buffers(mdev); + if (mdev->state.susp) return 0; + if (mdev->state.conn == WFBitMapS) return 0; + if (mdev->state.conn == WFBitMapT) return 0; + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&mdev->ap_bio_cnt) > mxb) return 0; + atomic_inc(&mdev->ap_bio_cnt); + return 1; +} + +/* I'd like to use wait_event_lock_irq, + * but I'm not sure when it got introduced, + * and not sure when it has 3 or 4 arguments */ +static inline void inc_ap_bio(drbd_dev* mdev) +{ + /* compare with after_state_ch, + * os.conn != WFBitMapS && ns.conn == WFBitMapS */ + DEFINE_WAIT(wait); + + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection handshake + * as long as we would exeed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. 
*/ + + spin_lock_irq(&mdev->req_lock); + while (!__inc_ap_bio_cond(mdev)) { + prepare_to_wait(&mdev->misc_wait,&wait,TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + schedule(); + finish_wait(&mdev->misc_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } + spin_unlock_irq(&mdev->req_lock); +} + +static inline void dec_ap_bio(drbd_dev* mdev) +{ + int mxb = drbd_get_max_buffers(mdev); + int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); + + D_ASSERT(ap_bio>=0); + if (ap_bio < mxb) wake_up(&mdev->misc_wait); +} + +static inline int seq_cmp(u32 a, u32 b) +{ + /* we assume wrap around at 32bit. + * for wrap around at 24bit (old atomic_t), + * we'd have to + * a <<= 8; b <<= 8; + */ + return ((s32)(a) - (s32)(b)); +} +#define seq_lt(a,b) (seq_cmp((a),(b)) < 0) +#define seq_gt(a,b) (seq_cmp((a),(b)) > 0) +#define seq_ge(a,b) (seq_cmp((a),(b)) >= 0) +#define seq_le(a,b) (seq_cmp((a),(b)) <= 0) +/* CAUTION: please no side effects in arguments! */ +#define seq_max(a,b) ((u32)(seq_gt((a),(b)) ? (a) : (b))) + +static inline void update_peer_seq(drbd_dev* mdev, unsigned int new_seq) +{ + unsigned int m; + spin_lock(&mdev->peer_seq_lock); + m = seq_max(mdev->peer_seq, new_seq); + mdev->peer_seq = m; + spin_unlock(&mdev->peer_seq_lock); + if (m == new_seq) wake_up(&mdev->seq_wait); +} + +static inline int drbd_queue_order_type(drbd_dev* mdev) +{ + int rv; +#if !defined(QUEUE_FLAG_ORDERED) + ERR_IF(mdev->bc == NULL) return QUEUE_ORDERED_NONE; + rv = bdev_get_queue(mdev->bc->backing_bdev)->ordered; +#else +# define QUEUE_ORDERED_NONE 0 +# define QUEUE_ORDERED_TAG 1 +# define QUEUE_ORDERED_FLUSH 2 +# warning "TCQ code disabled at compile time." + rv = QUEUE_ORDERED_NONE; // Kernels before 2.6.12 had not had TCQ support. +#endif + return rv; +} + +/* + * FIXME investigate what makes most sense: + * a) blk_run_queue(q); + * + * b) struct backing_dev_info *bdi; + * b1) bdi = &q->backing_dev_info; + * b2) bdi = mdev->bc->backing_bdev->bd_inode->i_mapping->backing_dev_info; + * blk_run_backing_dev(bdi,NULL); + * + * c) generic_unplug(q) ? __generic_unplug(q) ? + * + * d) q->unplug_fn(q), which is what all the drivers/md/ stuff uses... + * + */ +static inline void drbd_blk_run_queue(request_queue_t *q) +{ + if (q && q->unplug_fn) + q->unplug_fn(q); +} + +static inline void drbd_kick_lo(drbd_dev *mdev) +{ + if (!mdev->bc->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("backing_bdev==NULL in drbd_kick_lo! The following call trace is for debuggin purposes only. Don't worry.\n"); + dump_stack(); + } + } else { + drbd_blk_run_queue(bdev_get_queue(mdev->bc->backing_bdev)); + } +} +#endif diff -uprN linux-2.6.24/drivers/block/drbd/drbd_main.c linux-2.6.24.ovz/drivers/block/drbd/drbd_main.c --- linux-2.6.24/drivers/block/drbd/drbd_main.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_main.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,3246 @@ +/* +-*- Linux-c -*- + drbd.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __KERNEL_SYSCALLS__ +#include +#include + +#include +#include +#include "drbd_int.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ + +/* YES. We got an official device major from lanana + */ +#define LANANA_DRBD_MAJOR 147 + +struct after_state_chg_work { + struct drbd_work w; + drbd_state_t os; + drbd_state_t ns; + enum chg_state_flags flags; +}; + +int drbdd_init(struct Drbd_thread*); +int drbd_worker(struct Drbd_thread*); +int drbd_asender(struct Drbd_thread*); + +int drbd_init(void); +STATIC int drbd_open(struct inode *inode, struct file *file); +STATIC int drbd_close(struct inode *inode, struct file *file); +STATIC int w_after_state_ch(drbd_dev *mdev, struct drbd_work *w, int unused); +STATIC int w_md_sync(drbd_dev *mdev, struct drbd_work *w, int unused); +STATIC void md_sync_timer_fn(unsigned long data); + +MODULE_AUTHOR("Philipp Reisner , Lars Ellenberg "); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +MODULE_ALIAS_BLOCKDEV_MAJOR(LANANA_DRBD_MAJOR); + +#include +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, int,0); +module_param(allow_oos, bool,0); + +#ifdef DRBD_ENABLE_FAULTS +int enable_faults = 0; +int fault_rate; +int fault_count; +module_param(enable_faults,int,0664); // bitmap of enabled faults +module_param(fault_rate,int,0664); // fault rate % value - applies to all enabled faults +module_param(fault_count,int,0664); // count of faults inserted +#endif + +// module parameter, defined +int major_nr = LANANA_DRBD_MAJOR; +int minor_count = 32; + +int allow_oos = 0; + +#ifdef ENABLE_DYNAMIC_TRACE +int trace_type = 0; // Bitmap of trace types to enable +int trace_level= 0; // Current trace level +int trace_devs = 0; // Bitmap of devices to trace + +module_param(trace_level,int,0644); +module_param(trace_type,int,0644); +module_param(trace_devs,int,0644); +#endif + +// global panic flag +volatile int drbd_did_panic = 0; + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct Drbd_Conf **minor_table = NULL; + +drbd_kmem_cache_t *drbd_request_cache; +drbd_kmem_cache_t *drbd_ee_cache; +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; + +/* I do not use a standard mempool, because: + 1) I want to hand out the preallocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. 
+ */ +struct page* drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +STATIC struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_close, +}; + +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) + +/************************* The transfer log start */ +STATIC int tl_init(drbd_dev *mdev) +{ + struct drbd_barrier *b; + + b=kmalloc(sizeof(struct drbd_barrier),GFP_KERNEL); + if(!b) return 0; + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->next=0; + b->br_number=4711; + b->n_req=0; + + mdev->oldest_barrier = b; + mdev->newest_barrier = b; + + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + + return 1; +} + +STATIC void tl_cleanup(drbd_dev *mdev) +{ + D_ASSERT(mdev->oldest_barrier == mdev->newest_barrier); + kfree(mdev->oldest_barrier); + if(mdev->tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = 0; + } +} + +/** + * _tl_add_barrier: Adds a barrier to the TL. + * It returns the previously newest barrier + * (not the just created barrier) to the caller. + */ +struct drbd_barrier *_tl_add_barrier(drbd_dev *mdev,struct drbd_barrier *new) +{ + struct drbd_barrier *newest_before; + + INIT_LIST_HEAD(&new->requests); + INIT_LIST_HEAD(&new->w.list); + new->next=0; + new->n_req=0; + + newest_before = mdev->newest_barrier; + /* never send a barrier number == 0, because that is special-cased + * when using TCQ for our write ordering code */ + new->br_number = (newest_before->br_number+1) ?: 1; + mdev->newest_barrier->next = new; + mdev->newest_barrier = new; + + return newest_before; +} + +/* when we receive a barrier ack */ +void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_barrier *b; + struct list_head *le, *tle; + struct drbd_request *r; + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_barrier; + mdev->oldest_barrier = b->next; + + /* in protocol C this list should be empty, + * unless there is local io pending. + * in protocol A and B, this should not be empty, even though the + * master_bio's could already been completed. 
*/ + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request,tl_requests); + _req_mod(r, barrier_acked, 0); + } + list_del(&b->requests); + /* There could be requests on the list waiting for completion + of the write to the local disk, to avoid corruptions of + slab's data structures we have to remove the lists head */ + + spin_unlock_irq(&mdev->req_lock); + + D_ASSERT(b->br_number == barrier_nr); + D_ASSERT(b->n_req == set_size); + +#if 1 + if(b->br_number != barrier_nr) { + DUMPI(b->br_number); + DUMPI(barrier_nr); + } + if(b->n_req != set_size) { + DUMPI(b->n_req); + DUMPI(set_size); + } +#endif + + kfree(b); +} + + +/* called by drbd_disconnect (exiting receiver thread) + * or from some after_state_ch */ +void tl_clear(drbd_dev *mdev) +{ + struct drbd_barrier *b, *tmp; + + WARN("tl_clear()\n"); + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_barrier; + while ( b ) { + struct list_head *le, *tle; + struct drbd_request *r; + + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request,tl_requests); + _req_mod(r, connection_lost_while_pending, 0); + } + tmp = b->next; + + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + if (b == mdev->newest_barrier) { + D_ASSERT(tmp == NULL); + b->br_number=4711; + b->n_req=0; + INIT_LIST_HEAD(&b->requests); + mdev->oldest_barrier = b; + break; + } + kfree(b); + b = tmp; + /* dec_ap_pending corresponding to _drbd_send_barrier; + * note: the barrier for the current epoch (newest_barrier) + * has not been sent yet, so we don't dec_ap_pending for it + * here, either */ + dec_ap_pending(mdev); + } + D_ASSERT(mdev->newest_barrier == mdev->oldest_barrier); + D_ASSERT(mdev->newest_barrier->br_number == 4711); + spin_unlock_irq(&mdev->req_lock); +} + +/** + * drbd_io_error: Handles the on_io_error setting, should be called in the + * unlikely(!drbd_bio_uptodate(e->bio)) case from kernel thread context. + * See also drbd_chk_io_error + * + * NOTE: we set ourselves FAILED here if on_io_error is Detach or Panic OR + * if the forcedetach flag is set. This flag is set when failures + * occur writing the meta data portion of the disk as they are + * not recoverable. We also try to write the "need full sync bit" here + * anyways. This is to make sure that you get a resynchronisation of + * the full device the next time you connect. + */ +int drbd_io_error(drbd_dev* mdev, int forcedetach) +{ + enum io_error_handler eh; + unsigned long flags; + int send,ok=1; + + eh = PassOn; + if(inc_local_if_state(mdev,Failed)) { + eh = mdev->bc->dc.on_io_error; + dec_local(mdev); + } + + if(!forcedetach && eh == PassOn) + return 1; + + spin_lock_irqsave(&mdev->req_lock,flags); + if( (send = (mdev->state.disk == Failed)) ) { + _drbd_set_state(_NS(mdev,disk,Diskless), + ChgStateHard|ScheduleAfter); + } + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if(!send) return ok; + + ok = drbd_send_state(mdev); + if (ok) WARN("Notified peer that my disk is broken.\n"); + else ERR("Sending state in drbd_io_error() failed\n"); + + // Make sure we try to flush meta-data to disk - we come + // in here because of a local disk error so it might fail + // but we still need to try -- both because the error might + // be in the data portion of the disk and because we need + // to ensure the md-sync-timer is stopped if running. 
+ drbd_md_sync(mdev); + + /* Releasing the backing device is done in after_state_ch() */ + + if(eh == CallIOEHelper) { + drbd_khelper(mdev,"local-io-error"); + } + + return ok; +} + +/** + * cl_wide_st_chg: + * Returns TRUE if this state change should be preformed as a cluster wide + * transaction. Of course it returns 0 as soon as the connection is lost. + */ +STATIC int cl_wide_st_chg(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns) +{ + return ( os.conn >= Connected && ns.conn >= Connected && + ( ( os.role != Primary && ns.role == Primary ) || + ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) || + ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) || + ( os.disk != Diskless && ns.disk == Diskless ) ) ) || + (os.conn >= Connected && ns.conn == Disconnecting); +} + +int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, + drbd_state_t mask, drbd_state_t val) +{ + unsigned long flags; + drbd_state_t os,ns; + int rv; + + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock,flags); + if (rv==SS_Success && !(f&ScheduleAfter)) after_state_ch(mdev,os,ns,f); + + return rv; +} + +void drbd_force_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val) +{ + drbd_change_state(mdev,ChgStateHard,mask,val); +} + +STATIC int is_valid_state(drbd_dev* mdev, drbd_state_t ns); +STATIC int is_valid_state_transition(drbd_dev*, drbd_state_t, drbd_state_t); +STATIC int drbd_send_state_req(drbd_dev *, drbd_state_t, drbd_state_t); + +set_st_err_t _req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val) +{ + drbd_state_t os,ns; + unsigned long flags; + int rv; + + if(test_and_clear_bit(CL_ST_CHG_SUCCESS,&mdev->flags)) + return SS_CW_Success; + + if(test_and_clear_bit(CL_ST_CHG_FAIL,&mdev->flags)) + return SS_CW_FailedByPeer; + + rv=0; + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + if( !cl_wide_st_chg(mdev,os,ns) ) rv = SS_CW_NoNeed; + if( !rv ) { + rv = is_valid_state(mdev,ns); + if(rv==SS_Success) { + rv = is_valid_state_transition(mdev,ns,os); + if(rv==SS_Success) rv = 0; // cont waiting, otherwise fail. + } + } + spin_unlock_irqrestore(&mdev->req_lock,flags); + + return rv; +} + +/** + * _drbd_request_state: + * This function is the most gracefull way to change state. For some state + * transition this function even does a cluster wide transaction. + * It has a cousin named drbd_request_state(), which is always verbose. + */ +int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, + enum chg_state_flags f) +{ + unsigned long flags; + drbd_state_t os,ns; + int rv; + + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + + if(cl_wide_st_chg(mdev,os,ns)) { + rv = is_valid_state(mdev,ns); + if(rv == SS_Success ) rv = is_valid_state_transition(mdev,ns,os); + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if( rv < SS_Success ) { + if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + + drbd_state_lock(mdev); + if( !drbd_send_state_req(mdev,mask,val) ) { + drbd_state_unlock(mdev); + rv = SS_CW_FailedByPeer; + if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + + wait_event(mdev->state_wait,(rv=_req_st_cond(mdev,mask,val))); + + if( rv < SS_Success ) { + // nearly dead code. 
+ drbd_state_unlock(mdev); + if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + spin_lock_irqsave(&mdev->req_lock,flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + drbd_state_unlock(mdev); + } + + rv = _drbd_set_state(mdev, ns, f); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if (rv==SS_Success && !(f&ScheduleAfter)) after_state_ch(mdev,os,ns,f); + + return rv; +} + + +STATIC void print_st(drbd_dev* mdev, char *name, drbd_state_t ns) +{ + ERR(" %s = { cs:%s st:%s/%s ds:%s/%s %c%c%c%c }\n", + name, + conns_to_name(ns.conn), + roles_to_name(ns.role), + roles_to_name(ns.peer), + disks_to_name(ns.disk), + disks_to_name(ns.pdsk), + ns.susp ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-' + ); +} + +void print_st_err(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, int err) +{ + ERR("State change failed: %s\n",set_st_err_name(err)); + print_st(mdev," state",os); + print_st(mdev,"wanted",ns); +} + + +#define peers_to_name roles_to_name +#define pdsks_to_name disks_to_name + +#define susps_to_name(A) ( (A) ? "1" : "0" ) +#define aftr_isps_to_name(A) ( (A) ? "1" : "0" ) +#define peer_isps_to_name(A) ( (A) ? "1" : "0" ) +#define user_isps_to_name(A) ( (A) ? "1" : "0" ) + +#define PSC(A) \ + ({ if( ns.A != os.A ) { \ + pbp += sprintf(pbp, #A "( %s -> %s ) ", \ + A##s_to_name(os.A), \ + A##s_to_name(ns.A)); \ + } }) + +STATIC int is_valid_state(drbd_dev* mdev, drbd_state_t ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum fencing_policy fp; + int rv=SS_Success; + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + if(inc_net(mdev)) { + if( !mdev->net_conf->two_primaries && + ns.role == Primary && ns.peer == Primary ) + rv=SS_TwoPrimaries; + dec_net(mdev); + } + + if( rv <= 0 ) /* already found a reason to abort */; + else if( ns.role == Secondary && mdev->open_cnt ) + rv=SS_DeviceInUse; + + else if( ns.role == Primary && ns.conn < Connected && + ns.disk < UpToDate ) rv=SS_NoUpToDateDisk; + + else if( fp >= Resource && + ns.role == Primary && ns.conn < Connected && + ns.pdsk >= DUnknown ) rv=SS_PrimaryNOP; + + else if( ns.role == Primary && ns.disk <= Inconsistent && + ns.pdsk <= Inconsistent ) rv=SS_NoUpToDateDisk; + + else if( ns.conn > Connected && + ns.disk < UpToDate && ns.pdsk < UpToDate ) + rv=SS_BothInconsistent; + + else if( ns.conn > Connected && + (ns.disk == Diskless || ns.pdsk == Diskless ) ) + rv=SS_SyncingDiskless; + + else if( (ns.conn == Connected || + ns.conn == WFBitMapS || + ns.conn == SyncSource || + ns.conn == PausedSyncS) && + ns.disk == Outdated ) rv=SS_ConnectedOutdates; + + return rv; +} + +STATIC int is_valid_state_transition(drbd_dev* mdev,drbd_state_t ns,drbd_state_t os) +{ + int rv=SS_Success; + + if( (ns.conn == StartingSyncT || ns.conn == StartingSyncS ) && + os.conn > Connected) rv=SS_ResyncRunning; + + if( ns.conn == Disconnecting && os.conn == StandAlone) + rv=SS_AlreadyStandAlone; + + if( ns.disk == Outdated && os.disk == Diskless) + rv=SS_CanNotOutdateDL; + + return rv; +} + +int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) +{ + drbd_state_t os; + int rv=SS_Success, warn_sync_abort=0; + enum fencing_policy fp; + + MUST_HOLD(&mdev->req_lock); + + os = mdev->state; + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + /* Early state sanitising. 
Dissalow the invalidate ioctl to connect */ + if( (ns.conn == StartingSyncS || ns.conn == StartingSyncT) && + os.conn < Connected ) { + ns.conn = os.conn; + ns.pdsk = os.pdsk; + } + + /* Dissalow Network errors to configure a device's network part */ + if( (ns.conn >= Timeout && ns.conn <= TearDown ) && + os.conn <= Disconnecting ) { + ns.conn = os.conn; + } + + /* Dissalow network errors (+TearDown) to overwrite each other. + Dissalow network errors to overwrite the Disconnecting state. */ + if( ( (os.conn >= Timeout && os.conn <= TearDown) + || os.conn == Disconnecting ) && + ns.conn >= Timeout && ns.conn <= TearDown ) { + ns.conn = os.conn; + } + + if( ns.conn < Connected ) { + ns.peer_isp = 0; + ns.peer = Unknown; + if ( ns.pdsk > DUnknown || + ns.pdsk < Inconsistent ) ns.pdsk = DUnknown; + } + + if( ns.conn <= Disconnecting && ns.disk == Diskless ) { + ns.pdsk = DUnknown; + } + + if( ns.conn > Connected && (ns.disk <= Failed || ns.pdsk <= Failed )) { + warn_sync_abort=1; + ns.conn = Connected; + } + + if( ns.conn >= Connected && + ( ns.disk == Consistent || ns.disk == Outdated ) ) { + switch(ns.conn) { + case WFBitMapT: + case PausedSyncT: + ns.disk = Outdated; + break; + case Connected: + case WFBitMapS: + case SyncSource: + case PausedSyncS: + ns.disk = UpToDate; + break; + case SyncTarget: + ns.disk = Inconsistent; + WARN("Implicit set disk state Inconsistent!\n"); + break; + } + if( os.disk == Outdated && ns.disk == UpToDate ) { + WARN("Implicit set disk from Outdate to UpToDate\n"); + } + } + + if( ns.conn >= Connected && + ( ns.pdsk == Consistent || ns.pdsk == Outdated ) ) { + switch(ns.conn) { + case Connected: + case WFBitMapT: + case PausedSyncT: + case SyncTarget: + ns.pdsk = UpToDate; + break; + case WFBitMapS: + case PausedSyncS: + ns.pdsk = Outdated; + break; + case SyncSource: + ns.pdsk = Inconsistent; + WARN("Implicit set pdsk Inconsistent!\n"); + break; + } + if( os.pdsk == Outdated && ns.pdsk == UpToDate ) { + WARN("Implicit set pdsk from Outdate to UpToDate\n"); + } + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < Connected && ns.disk == Negotiating ) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } + + if( fp == Stonith ) { + if(ns.role == Primary && + ns.conn < Connected && + ns.pdsk > Outdated ) { + ns.susp = 1; + } + } + + if( ns.aftr_isp || ns.peer_isp || ns.user_isp ) { + if(ns.conn == SyncSource) ns.conn=PausedSyncS; + if(ns.conn == SyncTarget) ns.conn=PausedSyncT; + } else { + if(ns.conn == PausedSyncS) ns.conn=SyncSource; + if(ns.conn == PausedSyncT) ns.conn=SyncTarget; + } + + if( ns.i == os.i ) return SS_NothingToDo; + + if( !(flags & ChgStateHard) ) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(mdev,ns); + if(rv < SS_Success) { + /* If the old state was illegal as well, then let + this happen...*/ + + if( is_valid_state(mdev,os) == rv ) { + ERR("Forcing state change from bad state. 
" + "Error would be: '%s'\n", + set_st_err_name(rv)); + print_st(mdev,"old",os); + print_st(mdev,"new",ns); + rv = SS_Success; + } + } else rv = is_valid_state_transition(mdev,ns,os); + } + + if(rv < SS_Success) { + if( flags & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); + return rv; + } + + if(warn_sync_abort) { + WARN("Resync aborted.\n"); + } + +#if DUMP_MD >= 2 + { + char *pbp,pb[300]; + pbp = pb; + *pbp=0; + PSC(role); + PSC(peer); + PSC(conn); + PSC(disk); + PSC(pdsk); + PSC(susp); + PSC(aftr_isp); + PSC(peer_isp); + PSC(user_isp); + INFO("%s\n", pb); + } +#endif + + mdev->state.i = ns.i; + wake_up(&mdev->misc_wait); + wake_up(&mdev->state_wait); + + /** post-state-change actions **/ + if ( os.conn >= SyncSource && ns.conn <= Connected ) { + set_bit(STOP_SYNC_TIMER,&mdev->flags); + mod_timer(&mdev->resync_timer,jiffies); + } + + if( (os.conn == PausedSyncT || os.conn == PausedSyncS) && + (ns.conn == SyncTarget || ns.conn == SyncSource) ) { + INFO("Syncer continues.\n"); + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + if( ns.conn == SyncTarget ) { + D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags)); + clear_bit(STOP_SYNC_TIMER,&mdev->flags); + mod_timer(&mdev->resync_timer,jiffies); + } + } + + if( (os.conn == SyncTarget || os.conn == SyncSource) && + (ns.conn == PausedSyncT || ns.conn == PausedSyncS) ) { + INFO("Resync suspended\n"); + mdev->rs_mark_time = jiffies; + if( ns.conn == PausedSyncT ) { + set_bit(STOP_SYNC_TIMER,&mdev->flags); + } + } + + if ( os.disk == Diskless && os.conn == StandAlone && + (ns.disk > Diskless || ns.conn >= Unconnected) ) { + int i; + i = try_module_get(THIS_MODULE); + D_ASSERT(i); + } + + if( flags & ScheduleAfter ) { + struct after_state_chg_work* ascw; + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if(ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + drbd_queue_work(&mdev->data.work,&ascw->w); + } else { + WARN("Could not kmalloc an ascw\n"); + } + } + + return rv; +} + +STATIC int w_after_state_ch(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct after_state_chg_work* ascw; + + ascw = (struct after_state_chg_work*) w; + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + kfree(ascw); + + return 1; +} + +void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, + enum chg_state_flags flags) +{ + enum fencing_policy fp; + u32 mdf; + + if ( (os.conn != Connected && ns.conn == Connected) ) { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + if( mdev->p_uuid ) { + mdev->p_uuid[UUID_FLAGS] &= ~((u64)2); + } + } + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + + mdf = mdev->bc->md.flags & ~(MDF_Consistent|MDF_PrimaryInd| + MDF_ConnectedInd|MDF_WasUpToDate| + MDF_PeerOutDated ); + + if (test_bit(CRASHED_PRIMARY,&mdev->flags) || + mdev->state.role == Primary || + ( mdev->state.pdsk < Inconsistent && + mdev->state.peer == Primary ) ) mdf |= MDF_PrimaryInd; + if (mdev->state.conn > WFReportParams) mdf |= MDF_ConnectedInd; + if (mdev->state.disk > Inconsistent) mdf |= MDF_Consistent; + if (mdev->state.disk > Outdated) mdf |= MDF_WasUpToDate; + if (mdev->state.pdsk <= Outdated && + mdev->state.pdsk >= Inconsistent) mdf |= MDF_PeerOutDated; + if( mdf != mdev->bc->md.flags) { + mdev->bc->md.flags = mdf; + drbd_md_mark_dirty(mdev); + } + dec_local(mdev); + } + + /* Inform userspace about the change... */ + drbd_bcast_state(mdev); + + /* Here we have the actions that are performed after a + state change. 
This function might sleep */ + + if( fp == Stonith && ns.susp ) { + // case1: The outdate peer handler is successfull: + // case2: The connection was established again: + if ( (os.pdsk > Outdated && ns.pdsk <= Outdated) || // case1 + (os.conn < Connected && ns.conn >= Connected) ) { + tl_clear(mdev); + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev,susp,0), + ChgStateVerbose | ScheduleAfter ); + spin_unlock_irq(&mdev->req_lock); + } + } + // Do not change the order of the if above and below... + if (os.conn != WFBitMapS && ns.conn == WFBitMapS) { + /* compare with drbd_make_request_common, + * wait_event and inc_ap_bio. + * Note: we may lose connection whilst waiting here. + * no worries though, should work out ok... */ + wait_event(mdev->misc_wait, + mdev->state.conn != WFBitMapS || + !atomic_read(&mdev->ap_bio_cnt)); + drbd_bm_lock(mdev); // { + drbd_send_bitmap(mdev); + drbd_bm_unlock(mdev); // } + } + + /* Lost contact to peer's copy of the data */ + if ( (os.pdsk>=Inconsistent && os.pdsk!=DUnknown && os.pdsk!=Outdated) && + (ns.pdskp_uuid ) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + } + if (inc_local(mdev)) { + if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) { + /* Only do it if we have not yet done it... */ + drbd_uuid_new_current(mdev); + } + if (ns.peer == Primary ) { + /* Note: The condition ns.peer == Primary implies + that we are connected. Otherwise it would + be ns.peer == Unknown. */ + /* Our peer lost its disk. + Not rotation into BitMap-UUID! A FullSync is + required after a primary detached from it disk! */ + u64 uuid; + INFO("Creating new current UUID [no BitMap]\n"); + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(mdev, Current, uuid); + } + dec_local(mdev); + } + } + + if( ns.pdsk < Inconsistent ) { + /* Diskless Peer becomes primary */ + if (os.peer == Secondary && ns.peer == Primary ) { + drbd_uuid_new_current(mdev); + } + /* Diskless Peer becomes secondary */ + if (os.peer == Primary && ns.peer == Secondary ) { + drbd_al_to_on_disk_bm(mdev); + } + } + + /* Last part of the attaching process ... */ + if ( os.disk == Attaching && ns.disk == Negotiating ) { + drbd_send_sizes(mdev); // to start sync... + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + + /* We want to pause/continue resync, tell peer. */ + if ( ( os.aftr_isp != ns.aftr_isp ) || + ( os.user_isp != ns.user_isp ) ) { + drbd_send_state(mdev); + } + + /* In case one of the isp bits got set, suspend other devices. */ + if ( ( !os.aftr_isp && !os.peer_isp && !os.user_isp) && + ( ns.aftr_isp || ns.peer_isp || ns.user_isp) ) { + suspend_other_sg(mdev); + } + + /* We are in the progress to start a full sync... */ + if ( ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) || + ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) ) { + + drbd_bm_lock(mdev); // racy... + + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_unlock(mdev); + + if (ns.conn == StartingSyncT) { + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev,conn,WFSyncUUID), + ChgStateVerbose | ScheduleAfter ); + spin_unlock_irq(&mdev->req_lock); + } else /* StartingSyncS */ { + drbd_start_resync(mdev,SyncSource); + } + } + + /* We are invalidating our self... */ + if ( os.conn < Connected && ns.conn < Connected && + os.disk > Inconsistent && ns.disk == Inconsistent ) { + drbd_bm_lock(mdev); // racy... 
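+	/* same sequence as in the StartingSync* branch above: first make
+	 * MDF_FullSync stick in the meta data, then set and write out the
+	 * whole bitmap, and only then clear the flag again. */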
+ + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_unlock(mdev); + } + + if ( os.disk > Diskless && ns.disk == Diskless ) { + /* since inc_local() only works as long as disk>=Inconsistent, + and it is Diskless here, local_cnt can only go down, it can + not increase... It will reach zero */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + drbd_free_bc(mdev->bc); mdev->bc = NULL; + lc_free(mdev->resync); mdev->resync = NULL; + lc_free(mdev->act_log); mdev->act_log = NULL; + } + + // A resync finished or aborted, wake paused devices... + if ( (os.conn > Connected && ns.conn <= Connected) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp) ) { + resume_next_sg(mdev); + } + + if ( os.conn != Disconnecting && ns.conn <= Disconnecting ) { + drbd_thread_stop_nowait(&mdev->receiver); + } + + // Upon network failure, we need to restart the receiver. + if ( os.conn > TearDown && + ns.conn <= TearDown && ns.conn >= Timeout) { + drbd_thread_restart_nowait(&mdev->receiver); + } + + if ( os.conn == StandAlone && ns.conn == Unconnected) { + drbd_thread_start(&mdev->receiver); + } + + if ( os.disk == Diskless && os.conn <= Disconnecting && + (ns.disk > Diskless || ns.conn >= Unconnected) ) { + if(!drbd_thread_start(&mdev->worker)) { + module_put(THIS_MODULE); + } + } + + /* FIXME what about Primary, Diskless, and then losing + * the connection? since we survive that "somehow", + * maybe we may not stop the worker yet, + * since that would call drbd_mdev_cleanup. + * after which we probably won't survive the next + * request from the upper layers ... BOOM again :( */ + if ( (os.disk > Diskless || os.conn > StandAlone) && + ns.disk == Diskless && ns.conn == StandAlone ) { + drbd_thread_stop_nowait(&mdev->worker); + } +} + + +STATIC int drbd_thread_setup(void* arg) +{ + struct Drbd_thread *thi = (struct Drbd_thread *) arg; + drbd_dev *mdev = thi->mdev; + int retval; + + daemonize("drbd_thread"); + D_ASSERT(get_t_state(thi) == Running); + D_ASSERT(thi->task == NULL); + spin_lock(&thi->t_lock); + thi->task = current; + smp_mb(); + spin_unlock(&thi->t_lock); + complete(&thi->startstop); // notify: thi->task is set. + + while(1) { + retval = thi->function(thi); + if(get_t_state(thi) != Restarting) break; + thi->t_state = Running; + } + + spin_lock(&thi->t_lock); + thi->task = NULL; + thi->t_state = None; + smp_mb(); + spin_unlock(&thi->t_lock); + + // THINK maybe two different completions? + complete(&thi->startstop); // notify: thi->task unset. + + return retval; +} + +STATIC void drbd_thread_init(drbd_dev *mdev, struct Drbd_thread *thi, + int (*func) (struct Drbd_thread *)) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = None; + thi->function = func; + thi->mdev = mdev; +} + +int drbd_thread_start(struct Drbd_thread *thi) +{ + int pid; + drbd_dev *mdev = thi->mdev; + + spin_lock(&thi->t_lock); + + /* INFO("drbd_thread_start: %s [%d]: %s %d -> Running\n", + current->comm, current->pid, + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE", + thi->t_state); */ + + if (thi->t_state == None) { + init_completion(&thi->startstop); + D_ASSERT(thi->task == NULL); + thi->t_state = Running; + spin_unlock(&thi->t_lock); + flush_signals(current); // otherw. 
may get -ERESTARTNOINTR + pid = kernel_thread(drbd_thread_setup, (void *) thi, CLONE_FS); + if (pid < 0) { + ERR("Couldn't start thread (%d)\n", pid); + return FALSE; + } + wait_for_completion(&thi->startstop); // waits until thi->task is set + D_ASSERT(thi->task); + D_ASSERT(get_t_state(thi) == Running); + } else { + spin_unlock(&thi->t_lock); + } + + return TRUE; +} + + +void _drbd_thread_stop(struct Drbd_thread *thi, int restart,int wait) +{ + drbd_dev *mdev = thi->mdev; + Drbd_thread_state ns = restart ? Restarting : Exiting; + + spin_lock(&thi->t_lock); + + /* INFO("drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n", + current->comm, current->pid, + thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */ + + if (thi->t_state == None) { + spin_unlock(&thi->t_lock); + if(restart) drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock(&thi->t_lock); + return; + } + + thi->t_state = ns; + smp_mb(); + if (thi->task != current) { + if(wait) init_completion(&thi->startstop); + force_sig(DRBD_SIGKILL,thi->task); + } else D_ASSERT(!wait); + } + spin_unlock(&thi->t_lock); + + if (wait) { + D_ASSERT(thi->task != current); + wait_for_completion(&thi->startstop); + spin_lock(&thi->t_lock); + D_ASSERT(thi->task == NULL); + D_ASSERT(thi->t_state == None); + spin_unlock(&thi->t_lock); + } +} + +/* the appropriate socket mutex must be held already */ +int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags) +{ + int sent,ok; + + ERR_IF(!h) return FALSE; + ERR_IF(!size) return FALSE; + + h->magic = BE_DRBD_MAGIC; + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size-sizeof(Drbd_Header)); + + dump_packet(mdev,sock,0,(void*)h, __FILE__, __LINE__); + sent = drbd_send(mdev,sock,h,size,msg_flags); + + ok = ( sent == size ); + if(!ok) { + ERR("short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + } + return ok; +} + +/* don't pass the socket. we may only look at it + * when we hold the appropriate socket mutex. + */ +int drbd_send_cmd(drbd_dev *mdev, int use_data_socket, + Drbd_Packet_Cmd cmd, Drbd_Header* h, size_t size) +{ + int ok = 0; + struct socket *sock; + + if (use_data_socket) { + down(&mdev->data.mutex); + sock = mdev->data.socket; + } else { + down(&mdev->meta.mutex); + sock = mdev->meta.socket; + } + + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... 
*/ + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); + } + + if (use_data_socket) { + up(&mdev->data.mutex); + } else + up(&mdev->meta.mutex); + return ok; +} + +int drbd_send_cmd2(drbd_dev *mdev, Drbd_Packet_Cmd cmd, char* data, + size_t size) +{ + Drbd_Header h; + int ok; + + h.magic = BE_DRBD_MAGIC; + h.command = cpu_to_be16(cmd); + h.length = cpu_to_be16(size); + + if (!drbd_get_data_sock(mdev)) + return 0; + + dump_packet(mdev,mdev->data.socket,0,(void*)&h, __FILE__, __LINE__); + + ok = ( sizeof(h) == drbd_send(mdev,mdev->data.socket,&h,sizeof(h),0) ); + ok = ok && ( size == drbd_send(mdev,mdev->data.socket,data,size,0) ); + + drbd_put_data_sock(mdev); + + return ok; +} + +int drbd_send_sync_param(drbd_dev *mdev, struct syncer_conf *sc) +{ + Drbd_SyncParam_Packet p; + + p.rate = cpu_to_be32(sc->rate); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,SyncParam,(Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_protocol(drbd_dev *mdev) +{ + Drbd_Protocol_Packet p; + + p.protocol = cpu_to_be32(mdev->net_conf->wire_protocol); + p.after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); + p.after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); + p.after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); + p.want_lose = cpu_to_be32(mdev->net_conf->want_lose); + p.two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportProtocol, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_uuids(drbd_dev *mdev) +{ + Drbd_GenCnt_Packet p; + int i; + u64 uuid_flags = 0; + + if(!inc_local_if_state(mdev,Negotiating)) return 1; // ok. + + for (i = Current; i < UUID_SIZE; i++) { + /* FIXME howto handle diskless ? */ + p.uuid[i] = mdev->bc + ? cpu_to_be64(mdev->bc->md.uuid[i]) + : 0; + } + + mdev->comm_bm_set = drbd_bm_total_weight(mdev); + p.uuid[UUID_SIZE] = cpu_to_be64(mdev->comm_bm_set); + uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; + uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 
2 : 0; + p.uuid[UUID_FLAGS] = cpu_to_be64(uuid_flags); + + dec_local(mdev); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportUUIDs, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_sync_uuid(drbd_dev *mdev, u64 val) +{ + Drbd_SyncUUID_Packet p; + + p.uuid = cpu_to_be64(val); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportSyncUUID, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_sizes(drbd_dev *mdev) +{ + Drbd_Sizes_Packet p; + sector_t d_size, u_size; + int q_order_type; + int ok; + + if(inc_local_if_state(mdev,Negotiating)) { + D_ASSERT(mdev->bc->backing_bdev); + d_size = drbd_get_max_capacity(mdev->bc); + u_size = mdev->bc->dc.disk_size; + q_order_type = drbd_queue_order_type(mdev); + p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); + dec_local(mdev); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + } + + p.d_size = cpu_to_be64(d_size); + p.u_size = cpu_to_be64(u_size); + p.c_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + p.max_segment_size = cpu_to_be32(mdev->rq_queue->max_segment_size); + p.queue_order_type = cpu_to_be32(q_order_type); + + ok = drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportSizes, + (Drbd_Header*)&p,sizeof(p)); + return ok; +} + +int drbd_send_state(drbd_dev *mdev) +{ + Drbd_State_Packet p; + + p.state = cpu_to_be32(mdev->state.i); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportState, + (Drbd_Header*)&p,sizeof(p)); +} + +STATIC int drbd_send_state_req(drbd_dev *mdev, drbd_state_t mask, drbd_state_t val) +{ + Drbd_Req_State_Packet p; + + p.mask = cpu_to_be32(mask.i); + p.val = cpu_to_be32(val.i); + + return drbd_send_cmd(mdev,USE_DATA_SOCKET,StateChgRequest, + (Drbd_Header*)&p,sizeof(p)); +} + +int drbd_send_sr_reply(drbd_dev *mdev, int retcode) +{ + Drbd_RqS_Reply_Packet p; + + p.retcode = cpu_to_be32(retcode); + + return drbd_send_cmd(mdev,USE_META_SOCKET,StateChgReply, + (Drbd_Header*)&p,sizeof(p)); +} + + +/* See the comment at receive_bitmap() */ +int _drbd_send_bitmap(drbd_dev *mdev) +{ + int want; + int ok=TRUE, bm_i=0; + size_t bm_words, num_words; + unsigned long *buffer; + Drbd_Header *p; + + ERR_IF(!mdev->bitmap) return FALSE; + + bm_words = drbd_bm_words(mdev); + p = vmalloc(PAGE_SIZE); // sleeps. cannot fail. + buffer = (unsigned long*)p->payload; + + if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) { + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + if (unlikely(mdev->state.disk <= Failed )) { + /* write_bm did fail! Leave full sync flag set in Meta Data + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + ERR("Failed to write bitmap to disk!\n"); + } + else { + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + } + } + + /* + * maybe TODO use some simple compression scheme, nowadays there are + * some such algorithms in the kernel anyways. 
+ */ + do { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + if (want) { + drbd_bm_get_lel(mdev, bm_i, num_words, buffer); + } + ok = _drbd_send_cmd(mdev,mdev->data.socket,ReportBitMap, + p, sizeof(*p) + want, 0); + bm_i += num_words; + } while (ok && want); + + vfree(p); + return ok; +} + +int drbd_send_bitmap(drbd_dev *mdev) +{ + int ok; + + if (!drbd_get_data_sock(mdev)) + return 0; + ok=_drbd_send_bitmap(mdev); + drbd_put_data_sock(mdev); + return ok; +} + +int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size) +{ + int ok; + Drbd_BarrierAck_Packet p; + + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); + + ok = drbd_send_cmd(mdev,USE_META_SOCKET,BarrierAck,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +/** + * _drbd_send_ack: + * This helper function expects the sector and block_id parameter already + * in big endian! + */ +STATIC int _drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + u64 sector, + u32 blksize, + u64 block_id) +{ + int ok; + Drbd_BlockAck_Packet p; + + p.sector = sector; + p.block_id = block_id; + p.blksize = blksize; + p.seq_num = cpu_to_be32(atomic_add_return(1,&mdev->packet_seq)); + + if (!mdev->meta.socket || mdev->state.conn < Connected) return FALSE; + ok=drbd_send_cmd(mdev,USE_META_SOCKET,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +int drbd_send_ack_dp(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + Drbd_Data_Packet *dp) +{ + const int header_size = sizeof(Drbd_Data_Packet) - sizeof(Drbd_Header); + int data_size = ((Drbd_Header*)dp)->length - header_size; + + return _drbd_send_ack(mdev,cmd,dp->sector,cpu_to_be32(data_size), + dp->block_id); +} + +int drbd_send_ack_rp(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + Drbd_BlockRequest_Packet *rp) +{ + return _drbd_send_ack(mdev,cmd,rp->sector,rp->blksize,rp->block_id); +} + +int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, struct Tl_epoch_entry *e) +{ + return _drbd_send_ack(mdev,cmd, + cpu_to_be64(e->sector), + cpu_to_be32(e->size), + e->block_id); +} + +int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id) +{ + int ok; + Drbd_BlockRequest_Packet p; + + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + /* FIXME BIO_RW_SYNC ? */ + + ok = drbd_send_cmd(mdev,USE_DATA_SOCKET,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +/* called on sndtimeo + * returns FALSE if we should retry, + * TRUE if we think connection is dead + */ +STATIC int we_should_drop_the_connection(drbd_dev *mdev, struct socket *sock) +{ + int drop_it; + // long elapsed = (long)(jiffies - mdev->last_received); + // DUMPLU(elapsed); // elapsed ignored for now. + + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || (volatile int)mdev->state.conn < Connected; + + if (drop_it) + return TRUE; + + drop_it = !--mdev->ko_count; + if ( !drop_it ) { + ERR("[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); + } + + return drop_it; /* && (mdev->state == Primary) */; +} + +/* The idea of sendpage seems to be to put some kind of reference + to the page into the skb, and to hand it over to the NIC. In + this process get_page() gets called. + + As soon as the page was really sent over the network put_page() + gets called by some part of the network layer. [ NIC driver? ] + + [ get_page() / put_page() increment/decrement the count. If count + reaches 0 the page will be freed. 
] + + This works nicely with pages from FSs. + But this means that in protocol A we might signal IO completion too early ! + + In order not to corrupt data during a resync we must make sure + that we do not reuse our own buffer pages (EEs) to early, therefore + we have the net_ee list. + + XFS seems to have problems, still, it submits pages with page_count == 0! + As a workaround, we disable sendpage on pages with page_count == 0 or PageSlab. +*/ +int _drbd_no_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + int ret; + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int sent,ok; + int len = size; + +#ifdef SHOW_SENDPAGE_USAGE + unsigned long now = jiffies; + static unsigned long total = 0; + static unsigned long fallback = 0; + static unsigned long last_rep = 0; + + /* report statistics every hour, + * if we had at least one fallback. + */ + ++total; + if (fallback && time_before(last_rep+3600*HZ, now)) { + last_rep = now; + printk(KERN_INFO DEVICE_NAME + ": sendpage() omitted: %lu/%lu\n", fallback, total); + } +#endif + + /* PARANOIA. if this ever triggers, + * something in the layers above us is really kaputt. + *one roundtrip later: + * doh. it triggered. so XFS _IS_ really kaputt ... + * oh well... + */ + if ( (page_count(page) < 1) || PageSlab(page) ) { + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set... + */ +#ifdef SHOW_SENDPAGE_USAGE + ++fallback; +#endif + sent = _drbd_no_send_page(mdev, page, offset, size); + if (likely(sent > 0)) len -= sent; + goto out; + } + + set_fs(KERNEL_DS); + do { + sent = mdev->data.socket->ops->sendpage(mdev->data.socket,page, + offset,len, + MSG_NOSIGNAL); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else + continue; + } + if (sent <= 0) { + WARN("%s: size=%d len=%d sent=%d\n", + __func__,(int)size,len,sent); + break; + } + len -= sent; + offset += sent; + // FIXME test "last_received" ... + } while(len > 0 /* THINK && mdev->cstate >= Connected*/); + set_fs(oldfs); + + out: + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} + +STATIC int _drbd_send_zc_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + __bio_for_each_segment(bvec, bio, i, 0) { + if (! 
_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, + bvec->bv_len) ) { + return 0; + } + } + + return 1; +} + +/* Used to send write requests + * Primary -> Peer (Data) + */ +int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req) +{ + int ok=1; + Drbd_Data_Packet p; + unsigned int dp_flags=0; + + if (!drbd_get_data_sock(mdev)) + return 0; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(Data); + p.head.length = cpu_to_be16(sizeof(p)-sizeof(Drbd_Header)+req->size); + + p.sector = cpu_to_be64(req->sector); + p.block_id = (unsigned long)req; + p.seq_num = cpu_to_be32( req->seq_num = + atomic_add_return(1,&mdev->packet_seq) ); + dp_flags = 0; + if(req->master_bio->bi_rw & BIO_RW_BARRIER) { + dp_flags |= DP_HARDBARRIER; + } + if(req->master_bio->bi_rw & BIO_RW_SYNC) { + dp_flags |= DP_RW_SYNC; + } + if(mdev->state.conn >= SyncSource && + mdev->state.conn <= PausedSyncT) { + dp_flags |= DP_MAY_SET_IN_SYNC; + } + + p.dp_flags = cpu_to_be32(dp_flags); + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + set_bit(UNPLUG_REMOTE,&mdev->flags); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if(ok) { + if(mdev->net_conf->wire_protocol == DRBD_PROT_A) { + ok = _drbd_send_bio(mdev,req->master_bio); + } else { + ok = _drbd_send_zc_bio(mdev,req->master_bio); + } + } + + drbd_put_data_sock(mdev); + return ok; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) Primary (DataReply) + * SyncSource -> SyncTarget (RSDataReply) + */ +int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e) +{ + int ok; + Drbd_Data_Packet p; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16( sizeof(p)-sizeof(Drbd_Header) + e->size); + + p.sector = cpu_to_be64(e->sector); + p.block_id = e->block_id; + /* p.seq_num = 0; No sequence numbers here.. */ + + /* Only called by our kernel thread. + * This one may be interupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to ioctl or module unload. + */ + if (!drbd_get_data_sock(mdev)) + return 0; + + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if (ok) ok = _drbd_send_zc_bio(mdev,e->private_bio); + + drbd_put_data_sock(mdev); + return ok; +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags) +{ +#if !HAVE_KERNEL_SENDMSG + mm_segment_t oldfs; + struct iovec iov; +#else + struct kvec iov; +#endif + struct msghdr msg; + int rv,sent=0; + + if (!sock) return -1000; + + // THINK if (signal_pending) return ... ? 
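+	/* What follows is a plain "send it all" loop: the buffer is wrapped in
+	 * a single io vector and pushed through sock_sendmsg()/kernel_sendmsg()
+	 * until everything is out.  -EAGAIN means the send timeout expired and
+	 * we_should_drop_the_connection() decides whether to retry or give up;
+	 * -EINTR only flushes the signal and retries; any other error breaks
+	 * out, forces the connection to BrokenPipe (or Timeout for -EAGAIN)
+	 * below, and the caller sees the short byte count that was sent. */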
+ + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = 0; + msg.msg_namelen = 0; +#if !HAVE_KERNEL_SENDMSG + msg.msg_iov = &iov; + msg.msg_iovlen = 1; +#endif + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + +#if !HAVE_KERNEL_SENDMSG + oldfs = get_fs(); + set_fs(KERNEL_DS); +#endif + + if (sock == mdev->data.socket) + mdev->ko_count = mdev->net_conf->ko_count; + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ +#if !HAVE_KERNEL_SENDMSG + rv = sock_sendmsg(sock, &msg, iov.iov_len ); +#else + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); +#endif + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(mdev,sock)) + break; + else + continue; + } + D_ASSERT(rv != 0); + if (rv == -EINTR ) { +#if 0 + /* FIXME this happens all the time. + * we don't care for now! + * eventually this should be sorted out be the proper + * use of the SIGNAL_ASENDER bit... */ + if (DRBD_ratelimit(5*HZ,5)) { + DBG("Got a signal in drbd_send(,%c,)!\n", + sock == mdev->meta.socket ? 'm' : 's'); + // dump_stack(); + } +#endif + flush_signals(current); + rv = 0; + } + if (rv < 0) break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while(sent < size); + +#if !HAVE_KERNEL_SENDMSG + set_fs(oldfs); +#endif + + if (rv <= 0) { + if (rv != -EAGAIN) { + ERR("%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? "msock" : "sock", + rv); + drbd_force_state(mdev, NS(conn,BrokenPipe)); + } else + drbd_force_state(mdev, NS(conn,Timeout)); + } + + return sent; +} + +STATIC int drbd_open(struct inode *inode, struct file *file) +{ + drbd_dev *mdev; + unsigned long flags; + int rv=0; + + mdev = minor_to_mdev(MINOR(inode->i_rdev)); + if(!mdev) return -ENODEV; + + spin_lock_irqsave(&mdev->req_lock,flags); + /* to have a stable mdev->state.role and no race with updating open_cnt */ + + if (mdev->state.role != Primary) { + if (file->f_mode & FMODE_WRITE) { + rv = -EROFS; + } else if (!allow_oos) { + rv = -EMEDIUMTYPE; + } + } + + if(!rv) mdev->open_cnt++; + spin_unlock_irqrestore(&mdev->req_lock,flags); + + return rv; +} + +STATIC int drbd_close(struct inode *inode, struct file *file) +{ + /* do not use *file (May be NULL, in case of a unmount :-) */ + drbd_dev *mdev; + + mdev = minor_to_mdev(MINOR(inode->i_rdev)); + if(!mdev) return -ENODEV; + + /* + printk(KERN_ERR DEVICE_NAME ": close(inode=%p,file=%p)" + "current=%p,minor=%d,wc=%d\n", inode, file, current, minor, + inode->i_writecount); + */ + + mdev->open_cnt--; + + return 0; +} + +STATIC void drbd_unplug_fn(request_queue_t *q) +{ + drbd_dev *mdev = q->queuedata; + + MTRACE(TraceTypeUnplug,TraceLvlSummary, + INFO("got unplugged ap_bio_count=%d\n", + atomic_read(&mdev->ap_bio_cnt)); + ); + + /* unplug FIRST */ + spin_lock_irq(q->queue_lock); + blk_remove_plug(q); + spin_unlock_irq(q->queue_lock); + + /* only if connected */ + spin_lock_irq(&mdev->req_lock); + if (mdev->state.pdsk >= Inconsistent && mdev->state.conn >= Connected) { + D_ASSERT(mdev->state.role == Primary); + if (test_and_clear_bit(UNPLUG_REMOTE,&mdev->flags)) { + /* add to the data.work queue, + * unless already queued. + * XXX this might be a good addition to drbd_queue_work + * anyways, to detect "double queuing" ... 
*/ + if (list_empty(&mdev->unplug_work.list)) + drbd_queue_work(&mdev->data.work,&mdev->unplug_work); + } + } + spin_unlock_irq(&mdev->req_lock); + + if(mdev->state.disk >= Inconsistent) drbd_kick_lo(mdev); +} + +void drbd_set_defaults(drbd_dev *mdev) +{ + mdev->sync_conf.after = DRBD_AFTER_DEF; + mdev->sync_conf.rate = DRBD_RATE_DEF; + mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; // 512 MB active set + mdev->state = (drbd_state_t){ { Secondary, + Unknown, + StandAlone, + Diskless, + DUnknown, + 0 } }; +} + +void drbd_init_set_defaults(drbd_dev *mdev) +{ + // the memset(,0,) did most of this + // note: only assignments, no allocation in here + +#ifdef PARANOIA + SET_MDEV_MAGIC(mdev); +#endif + + drbd_set_defaults(mdev); + + atomic_set(&mdev->ap_bio_cnt,0); + atomic_set(&mdev->ap_pending_cnt,0); + atomic_set(&mdev->rs_pending_cnt,0); + atomic_set(&mdev->unacked_cnt,0); + atomic_set(&mdev->local_cnt,0); + atomic_set(&mdev->net_cnt,0); + atomic_set(&mdev->packet_seq,0); + atomic_set(&mdev->pp_in_use, 0); + + init_MUTEX(&mdev->md_io_mutex); + init_MUTEX(&mdev->data.mutex); + init_MUTEX(&mdev->meta.mutex); + sema_init(&mdev->data.work.s,0); + sema_init(&mdev->meta.work.s,0); + + spin_lock_init(&mdev->data.work.q_lock); + spin_lock_init(&mdev->meta.work.q_lock); + + spin_lock_init(&mdev->al_lock); + spin_lock_init(&mdev->req_lock); + spin_lock_init(&mdev->peer_seq_lock); + + INIT_LIST_HEAD(&mdev->active_ee); + INIT_LIST_HEAD(&mdev->sync_ee); + INIT_LIST_HEAD(&mdev->done_ee); + INIT_LIST_HEAD(&mdev->read_ee); + INIT_LIST_HEAD(&mdev->net_ee); + INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); + INIT_LIST_HEAD(&mdev->resync_work.list); + INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->md_sync_work.list); + mdev->resync_work.cb = w_resync_inactive; + mdev->unplug_work.cb = w_send_write_hint; + mdev->md_sync_work.cb = w_md_sync; + init_timer(&mdev->resync_timer); + init_timer(&mdev->md_sync_timer); + mdev->resync_timer.function = resync_timer_fn; + mdev->resync_timer.data = (unsigned long) mdev; + mdev->md_sync_timer.function = md_sync_timer_fn; + mdev->md_sync_timer.data = (unsigned long) mdev; + + init_waitqueue_head(&mdev->misc_wait); + init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->ee_wait); + init_waitqueue_head(&mdev->al_wait); + init_waitqueue_head(&mdev->seq_wait); + + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + +#ifdef __arch_um__ + INFO("mdev = 0x%p\n",mdev); +#endif +} + +void drbd_mdev_cleanup(drbd_dev *mdev) +{ + /* I'd like to cleanup completely, and memset(,0,) it. + * but I'd have to reinit it. + * FIXME: do the right thing... + */ + + /* list of things that may still + * hold data of the previous config + + * act_log ** re-initialized in set_disk + * on_io_error + + * al_tr_cycle ** re-initialized in ... FIXME?? + * al_tr_number + * al_tr_pos + + * backing_bdev ** re-initialized in drbd_free_ll_dev + * lo_file + * md_bdev + * md_file + * md_index + + * ko_count ** re-initialized in set_net + + * last_received ** currently ignored + + * mbds_id ** re-initialized in ... FIXME?? + + * resync ** re-initialized in ... FIXME?? + + *** no re-init necessary (?) *** + * md_io_page + * this_bdev + + * vdisk ? + + * rq_queue ** FIXME ASSERT ?? 
+ * newest_barrier + * oldest_barrier + */ + + drbd_thread_stop(&mdev->receiver); + + /* no need to lock it, I'm the only thread alive */ + if ( mdev->epoch_size != 0) + ERR("epoch_size:%d\n",mdev->epoch_size); + mdev->al_writ_cnt = + mdev->bm_writ_cnt = + mdev->read_cnt = + mdev->recv_cnt = + mdev->send_cnt = + mdev->writ_cnt = + mdev->p_size = + mdev->rs_start = + mdev->rs_total = + mdev->rs_failed = + mdev->rs_mark_left = + mdev->rs_mark_time = 0; + D_ASSERT(mdev->net_conf == NULL); + drbd_set_my_capacity(mdev,0); + drbd_bm_resize(mdev,0); + + // just in case + drbd_free_resources(mdev); + + /* + * currently we drbd_init_ee only on module load, so + * we may do drbd_release_ee only on module unload! + */ + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->net_ee)); + D_ASSERT(list_empty(&mdev->resync_reads)); + D_ASSERT(list_empty(&mdev->data.work.q)); + D_ASSERT(list_empty(&mdev->meta.work.q)); + D_ASSERT(list_empty(&mdev->resync_work.list)); + D_ASSERT(list_empty(&mdev->unplug_work.list)); + +} + + +void drbd_destroy_mempools(void) +{ + struct page *page; + + while(drbd_pp_pool) { + page = drbd_pp_pool; + drbd_pp_pool = (struct page*)page_private(page); + __free_page(page); + drbd_pp_vacant--; + } + + /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + + if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); + if (drbd_request_mempool) mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache) kmem_cache_destroy(drbd_ee_cache); + if (drbd_request_cache) kmem_cache_destroy(drbd_request_cache); + + drbd_ee_mempool = NULL; + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + + return; +} + +int drbd_create_mempools(void) +{ + struct page *page; + const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; + int i; + + // prepare our caches and mempools + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_pp_pool = NULL; + + // caches + drbd_request_cache = kmem_cache_create( + "drbd_req_cache", sizeof(drbd_request_t), + 0, 0, NULL, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + "drbd_ee_cache", sizeof(struct Tl_epoch_entry), + 0, 0, NULL, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + // mempools + drbd_request_mempool = mempool_create( number, + mempool_alloc_slab, mempool_free_slab, drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + drbd_ee_mempool = mempool_create( number, + mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + // drbd's page pool + spin_lock_init(&drbd_pp_lock); + + for (i=0;i< number;i++) { + page = alloc_page(GFP_HIGHUSER); + if(!page) goto Enomem; + set_page_private(page,(unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + } + drbd_pp_vacant = number; + + return 0; + + Enomem: + drbd_destroy_mempools(); // in case we allocated some + return -ENOMEM; +} + +STATIC int drbd_notify_sys(struct notifier_block *this, unsigned long code, + void *unused) +{ + /* just so we have it. you never know what interessting things we + * might want to do here some day... 
+ */ + + return NOTIFY_DONE; +} + +STATIC struct notifier_block drbd_notifier = { + .notifier_call = drbd_notify_sys, +}; + + +STATIC void __exit drbd_cleanup(void) +{ + int i, rr; + + unregister_reboot_notifier(&drbd_notifier); + + drbd_nl_cleanup(); + + if (minor_table) { + if (drbd_proc) + remove_proc_entry("drbd",&proc_root); + i=minor_count; + while (i--) { + drbd_dev *mdev = minor_to_mdev(i); + struct gendisk **disk = &mdev->vdisk; + request_queue_t **q = &mdev->rq_queue; + + if(!mdev) continue; + drbd_free_resources(mdev); + + if (*disk) { + del_gendisk(*disk); + put_disk(*disk); + *disk = NULL; + } + if (*q) blk_put_queue(*q); + *q = NULL; + + D_ASSERT(mdev->open_cnt == 0); + if (mdev->this_bdev) bdput(mdev->this_bdev); + + tl_cleanup(mdev); + if (mdev->bitmap) drbd_bm_cleanup(mdev); + if (mdev->resync) lc_free(mdev->resync); + + rr = drbd_release_ee(mdev,&mdev->active_ee); + if(rr) ERR("%d EEs in active list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->sync_ee); + if(rr) ERR("%d EEs in sync list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->read_ee); + if(rr) ERR("%d EEs in read list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->done_ee); + if(rr) ERR("%d EEs in done list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->net_ee); + if(rr) ERR("%d EEs in net list found!\n",rr); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp,&mdev->data.work.q) { + DUMPP(lp); + } + }; + + if (mdev->md_io_page) + __free_page(mdev->md_io_page); + + if (mdev->md_io_tmpp) + __free_page(mdev->md_io_tmpp); + + if (mdev->act_log) lc_free(mdev->act_log); + + if(mdev->ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash_s = 0; + mdev->ee_hash = NULL; + } + if(mdev->tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = 0; + mdev->tl_hash = NULL; + } + if(mdev->app_reads_hash) { + kfree(mdev->app_reads_hash); + mdev->app_reads_hash = NULL; + } + if ( mdev->p_uuid ) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + } + } + drbd_destroy_mempools(); + } + + kfree(minor_table); + + if (unregister_blkdev(MAJOR_NR, DEVICE_NAME) != 0) + printk(KERN_ERR DEVICE_NAME": unregister of device failed\n"); + + printk(KERN_INFO DEVICE_NAME": module cleanup done.\n"); +} + +drbd_dev *drbd_new_device(int minor) +{ + drbd_dev *mdev = NULL; + struct gendisk *disk; + request_queue_t *q; + + mdev = kmalloc(sizeof(drbd_dev),GFP_KERNEL); + if(!mdev) goto Enomem; + memset(mdev,0,sizeof(drbd_dev )); + + mdev->minor = minor; + + drbd_init_set_defaults(mdev); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) goto Enomem; + mdev->rq_queue = q; + q->queuedata = mdev; + q->max_segment_size = DRBD_MAX_SEGMENT_SIZE; + + disk = alloc_disk(1); + if (!disk) goto Enomem; + mdev->vdisk = disk; + + set_disk_ro( disk, TRUE ); + + disk->queue = q; + disk->major = MAJOR_NR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, DEVICE_NAME "%d", minor); + disk->private_data = mdev; + add_disk(disk); + + mdev->this_bdev = bdget(MKDEV(MAJOR_NR,minor)); + // we have no partitions. we contain only ourselves. + mdev->this_bdev->bd_contains = mdev->this_bdev; + + blk_queue_make_request(q, drbd_make_request_26); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &mdev->req_lock; // needed since we use + // plugging on a queue, that actually has no requests! 
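+	// DRBD uses a make_request function (blk_queue_make_request above), so
+	// this queue never holds real struct requests and has no elevator; only
+	// the plugging machinery is reused.  That is why queue_lock must still
+	// point to a valid spinlock (req_lock) and unplug_fn is hooked up below.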
+ q->unplug_fn = drbd_unplug_fn; + + mdev->md_io_page = alloc_page(GFP_KERNEL); + if(!mdev->md_io_page) goto Enomem; + + if (drbd_bm_init(mdev)) goto Enomem; + // no need to lock access, we are still initializing the module. + if (!tl_init(mdev)) goto Enomem; + + mdev->app_reads_hash=kmalloc(APP_R_HSIZE*sizeof(void*),GFP_KERNEL); + if (!mdev->app_reads_hash) goto Enomem; + memset(mdev->app_reads_hash,0,APP_R_HSIZE*sizeof(void*)); + + return mdev; + + Enomem: + if(mdev) { + if(mdev->app_reads_hash) kfree(mdev->app_reads_hash); + if(mdev->md_io_page) __free_page(mdev->md_io_page); + kfree(mdev); + } + return NULL; +} + +int __init drbd_init(void) +{ + int err; + +#if 0 +// warning LGE "DEBUGGING" +/* I am too lazy to calculate this by hand -lge + */ +#define SZO(x) printk(KERN_ERR "sizeof(" #x ") = %d\n", sizeof(x)) + SZO(struct Drbd_Conf); + SZO(struct buffer_head); + SZO(Drbd_Polymorph_Packet); + SZO(struct drbd_socket); + SZO(struct bm_extent); + SZO(struct lc_element); + SZO(struct semaphore); + SZO(struct drbd_request); + SZO(struct bio); + SZO(wait_queue_head_t); + SZO(spinlock_t); + SZO(Drbd_Header); + SZO(Drbd_HandShake_Packet); + SZO(Drbd_Barrier_Packet); + SZO(Drbd_BarrierAck_Packet); + SZO(Drbd_SyncParam_Packet); + SZO(Drbd06_Parameter_P); + SZO(Drbd_Data_Packet); + SZO(Drbd_BlockAck_Packet); + printk(KERN_ERR "AL_EXTENTS_PT = %d\n",AL_EXTENTS_PT); + printk(KERN_ERR "DRBD_MAX_SECTORS = %llu\n",DRBD_MAX_SECTORS); + printk(KERN_ERR "DRBD_MAX_SECTORS_FLEX = %llu\n",DRBD_MAX_SECTORS_FLEX); +#define OOF(t,m) printk(KERN_ERR "offsetof("#t","#m") = %d\n", offsetof(t,m)) + OOF(struct Drbd_Conf,bitmap); + //OOF(struct drbd_bitmap,bm_set); + return -EBUSY; +#endif +#ifdef __arch_um__ + printk(KERN_INFO "drbd_module = 0x%p core = 0x%p\n", + THIS_MODULE,THIS_MODULE->module_core); +#endif + + if (sizeof(Drbd_HandShake_Packet) != 80) { + printk(KERN_ERR DEVICE_NAME + ": never change the size or layout of the HandShake packet.\n"); + return -EINVAL; + } + + if (1 > minor_count||minor_count > 255) { + printk(KERN_ERR DEVICE_NAME + ": invalid minor_count (%d)\n",minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = 8; +#endif + } + + if( (err = drbd_nl_init()) ) { + return err; + } + + err = register_blkdev(MAJOR_NR, DEVICE_NAME); + if (err) { + printk(KERN_ERR DEVICE_NAME + ": unable to register block device major %d\n", + MAJOR_NR); + return err; + } + + register_reboot_notifier(&drbd_notifier); + + /* + * allocate all necessary structs + */ + err = -ENOMEM; + + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; // play safe for drbd_cleanup + minor_table = kmalloc(sizeof(drbd_dev *)*minor_count,GFP_KERNEL); + if(!minor_table) goto Enomem; + memset(minor_table,0,sizeof(drbd_dev *)*minor_count); + + if ((err = drbd_create_mempools())) + goto Enomem; + +#if CONFIG_PROC_FS + /* + * register with procfs + */ + drbd_proc = create_proc_entry("drbd", S_IFREG | S_IRUGO , &proc_root); + + if (!drbd_proc) { + printk(KERN_ERR DEVICE_NAME": unable to register proc file\n"); + goto Enomem; + } + + drbd_proc->proc_fops = &drbd_proc_fops; + drbd_proc->owner = THIS_MODULE; +#else +# error "Currently drbd depends on the proc file system (CONFIG_PROC_FS)" +#endif + + printk(KERN_INFO DEVICE_NAME ": initialised. 
" + "Version: " REL_VERSION " (api:%d/proto:%d)\n", + API_VERSION,PRO_VERSION); + printk(KERN_INFO DEVICE_NAME ": %s\n", drbd_buildtag()); + printk(KERN_INFO DEVICE_NAME": registered as block device major %d\n", MAJOR_NR); + printk(KERN_INFO DEVICE_NAME": minor_table @ 0x%p\n", minor_table); + + return 0; // Success! + + Enomem: + drbd_cleanup(); + if (err == -ENOMEM) // currently always the case + printk(KERN_ERR DEVICE_NAME ": ran out of memory\n"); + else + printk(KERN_ERR DEVICE_NAME ": initialization failure\n"); + return err; +} + +void drbd_free_bc(struct drbd_backing_dev* bc) +{ + if(bc == NULL) return; + + BD_RELEASE(bc->backing_bdev); + BD_RELEASE(bc->md_bdev); + + fput(bc->lo_file); + fput(bc->md_file); + + kfree(bc); +} + +void drbd_free_sock(drbd_dev *mdev) +{ + if (mdev->data.socket) { + sock_release(mdev->data.socket); + mdev->data.socket = 0; + } + if (mdev->meta.socket) { + sock_release(mdev->meta.socket); + mdev->meta.socket = 0; + } +} + + +void drbd_free_resources(drbd_dev *mdev) +{ + if ( mdev->cram_hmac_tfm ) { + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + } + drbd_free_sock(mdev); + drbd_free_bc(mdev->bc); + mdev->bc=0; +} + +/*********************************/ +/* meta data management */ + +struct meta_data_on_disk { + u64 la_size; // last agreed size. + u64 uuid[UUID_SIZE]; // UUIDs. + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; // MDF + u32 magic; + u32 md_size_sect; + u32 al_offset; // offset to this block + u32 al_nr_extents; // important for restoring the AL + // `-- act_log->nr_elements <-- sync_conf.al_extents + u32 bm_offset; // offset to the bitmap, from here + u32 bm_bytes_per_bit; // BM_BLOCK_SIZE + u32 reserved_u32[4]; + +} __attribute((packed)); + +/** + * drbd_md_sync: + * Writes the meta data super block if the MD_DIRTY flag bit is set. + */ +void drbd_md_sync(drbd_dev *mdev) +{ + struct meta_data_on_disk * buffer; + sector_t sector; + int i; + + if (!test_and_clear_bit(MD_DIRTY,&mdev->flags)) return; + del_timer(&mdev->md_sync_timer); + + // We use here Failed and not Attaching because we try to write + // metadata even if we detach due to a disk failure! + if(!inc_local_if_state(mdev,Failed)) return; + + INFO("Writing meta data super block now.\n"); + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + memset(buffer,0,512); + + buffer->la_size=cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + for (i = Current; i < UUID_SIZE; i++) + buffer->uuid[i]=cpu_to_be64(mdev->bc->md.uuid[i]); + buffer->flags = cpu_to_be32(mdev->bc->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); + + buffer->md_size_sect = cpu_to_be32(mdev->bc->md.md_size_sect); + buffer->al_offset = cpu_to_be32(mdev->bc->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(mdev->bc->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(mdev->bc->md.bm_offset); + + D_ASSERT(drbd_md_ss__(mdev,mdev->bc) == mdev->bc->md.md_offset); + sector = mdev->bc->md.md_offset; + +#if 0 + /* FIXME sooner or later I'd like to use the MD_DIRTY flag everywhere, + * so we can avoid unneccessary md writes. + */ + ERR_IF (!test_bit(MD_DIRTY,&mdev->flags)) { + dump_stack(); + } +#endif + + if (drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) { + clear_bit(MD_DIRTY,&mdev->flags); + } else { + /* this was a try anyways ... 
*/ + ERR("meta data update failed!\n"); + + drbd_chk_io_error(mdev, 1, TRUE); + drbd_io_error(mdev, TRUE); + } + + // Update mdev->bc->md.la_size_sect, since we updated it on metadata. + mdev->bc->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); + + up(&mdev->md_io_mutex); + dec_local(mdev); +} + +/** + * drbd_md_read: + * @bdev: describes the backing storage and the meta-data storage + * Reads the meta data from bdev. Return 0 (NoError) on success, and an + * enum ret_codes in case something goes wrong. + * Currently only: MDIOError, MDInvalid. + */ +int drbd_md_read(drbd_dev *mdev, struct drbd_backing_dev *bdev) +{ + struct meta_data_on_disk * buffer; + int i,rv = NoError; + + if(!inc_local_if_state(mdev,Attaching)) return MDIOError; + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + + if ( ! drbd_md_sync_page_io(mdev,bdev,bdev->md.md_offset,READ) ) { + /* NOTE: cant do normal error processing here as this is + called BEFORE disk is attached */ + ERR("Error while reading metadata.\n"); + rv = MDIOError; + goto err; + } + + if(be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { + ERR("Error while reading metadata, magic not found.\n"); + rv = MDInvalid; + goto err; + } + if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { + ERR("unexpected al_offset: %d (expected %d)\n", + be32_to_cpu(buffer->al_offset), bdev->md.al_offset); + rv = MDInvalid; + goto err; + } + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { + ERR("unexpected bm_offset: %d (expected %d)\n", + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); + rv = MDInvalid; + goto err; + } + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { + ERR("unexpected md_size: %u (expected %u)\n", + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); + rv = MDInvalid; + goto err; + } + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + ERR("unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); + rv = MDInvalid; + goto err; + } + + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); + for (i = Current; i < UUID_SIZE; i++) + bdev->md.uuid[i]=be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + if (mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + /* FIXME if this ever happens when reading meta data, + * it possibly screws up reading of the activity log? + */ + + err: + up(&mdev->md_io_mutex); + dec_local(mdev); + + return rv; +} + +/** + * drbd_md_mark_dirty: + * Call this function if you change enything that should be written to + * the meta-data super block. This function sets MD_DIRTY, and starts a + * timer that ensures that within five seconds you have to call drbd_md_sync(). 
+ */
+void drbd_md_mark_dirty(drbd_dev *mdev)
+{
+	set_bit(MD_DIRTY,&mdev->flags);
+	mod_timer(&mdev->md_sync_timer,jiffies + 5*HZ );
+}
+
+
+STATIC void drbd_uuid_move_history(drbd_dev *mdev)
+{
+	int i;
+
+	for ( i=History_start ; i<History_end ; i++ ) {
+		mdev->bc->md.uuid[i+1] = mdev->bc->md.uuid[i];
+
+		MTRACE(TraceTypeUuid,TraceLvlAll,
+		       drbd_print_uuid(mdev,i+1);
+			);
+	}
+}
+
+void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val)
+{
+	if(idx == Current) {
+		if (mdev->state.role == Primary) {
+			val |= 1;
+		} else {
+			val &= ~((u64)1);
+		}
+	}
+
+	mdev->bc->md.uuid[idx] = val;
+
+	MTRACE(TraceTypeUuid,TraceLvlSummary,
+	       drbd_print_uuid(mdev,idx);
+		);
+
+	drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_uuid_set(drbd_dev *mdev, int idx, u64 val)
+{
+	if(mdev->bc->md.uuid[idx]) {
+		drbd_uuid_move_history(mdev);
+		mdev->bc->md.uuid[History_start]=mdev->bc->md.uuid[idx];
+		MTRACE(TraceTypeUuid,TraceLvlMetrics,
+		       drbd_print_uuid(mdev,History_start);
+			);
+	}
+	_drbd_uuid_set(mdev,idx,val);
+}
+
+void drbd_uuid_new_current(drbd_dev *mdev)
+{
+	INFO("Creating new current UUID\n");
+	D_ASSERT(mdev->bc->md.uuid[Bitmap] == 0);
+	mdev->bc->md.uuid[Bitmap] = mdev->bc->md.uuid[Current];
+	MTRACE(TraceTypeUuid,TraceLvlMetrics,
+	       drbd_print_uuid(mdev,Bitmap);
+		);
+
+	get_random_bytes(&mdev->bc->md.uuid[Current], sizeof(u64));
+	if (mdev->state.role == Primary) {
+		mdev->bc->md.uuid[Current] |= 1;
+	} else {
+		mdev->bc->md.uuid[Current] &= ~((u64)1);
+	}
+
+	MTRACE(TraceTypeUuid,TraceLvlSummary,
+	       drbd_print_uuid(mdev,Current);
+		);
+
+	drbd_md_mark_dirty(mdev);
+}
+
+void drbd_uuid_set_bm(drbd_dev *mdev, u64 val)
+{
+	if( mdev->bc->md.uuid[Bitmap]==0 && val==0 ) return;
+
+	if(val==0) {
+		drbd_uuid_move_history(mdev);
+		mdev->bc->md.uuid[History_start]=mdev->bc->md.uuid[Bitmap];
+		mdev->bc->md.uuid[Bitmap]=0;
+
+		MTRACE(TraceTypeUuid,TraceLvlMetrics,
+		       drbd_print_uuid(mdev,History_start);
+		       drbd_print_uuid(mdev,Bitmap);
+			);
+	} else {
+		if( mdev->bc->md.uuid[Bitmap] ) WARN("bm UUID already set");
+
+		mdev->bc->md.uuid[Bitmap] = val;
+		mdev->bc->md.uuid[Bitmap] &= ~((u64)1);
+
+		MTRACE(TraceTypeUuid,TraceLvlMetrics,
+		       drbd_print_uuid(mdev,Bitmap);
+			);
+	}
+	drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_md_set_flag(drbd_dev *mdev, int flag)
+{
+	MUST_HOLD(mdev->req_lock);
+	if ( (mdev->bc->md.flags & flag) != flag) {
+		drbd_md_mark_dirty(mdev);
+		mdev->bc->md.flags |= flag;
+	}
+}
+void drbd_md_clear_flag(drbd_dev *mdev, int flag)
+{
+	MUST_HOLD(mdev->req_lock);
+	if ( (mdev->bc->md.flags & flag) != 0 ) {
+		drbd_md_mark_dirty(mdev);
+		mdev->bc->md.flags &= ~flag;
+	}
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+	return ((bdev->md.flags & flag) != 0);
+}
+
+STATIC void md_sync_timer_fn(unsigned long data)
+{
+	drbd_dev* mdev = (drbd_dev*) data;
+
+	drbd_queue_work_front(&mdev->data.work,&mdev->md_sync_work);
+}
+
+STATIC int w_md_sync(drbd_dev *mdev, struct drbd_work *w, int unused)
+{
+	WARN("BUG! md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_md_sync(mdev);
+
+	return 1;
+}
+
+#ifdef DRBD_ENABLE_FAULTS
+// Fault insertion support including random number generator shamelessly
+// stolen from kernel/rcutorture.c
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801 /* prime */
+#define FAULT_RANDOM_ADD 479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */ +STATIC unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (--rsp->count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +STATIC char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + "Meta-data write", + "Meta-data read", + "Resync write", + "Resync read", + "Data write", + "Data read", + "Data read ahead", + }; + + return (type < DRBD_FAULT_MAX)? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(unsigned int type) +{ + static struct fault_random_state rrs = {0,0}; + + unsigned int rnd = ((_drbd_fault_random(&rrs) % 100) + 1); + unsigned int ret = (rnd <= fault_rate); + + if (ret) { + fault_count++; + + if (printk_ratelimit()) + printk(KERN_ALERT "Simulating %s failure\n", _drbd_fault_str(type)); + } + + return ret; +} +#endif + +#ifdef ENABLE_DYNAMIC_TRACE + +STATIC char *_drbd_uuid_str(unsigned int idx) { + static char *uuid_str[] = { + "Current", + "Bitmap", + "History_start", + "History_end", + "UUID_SIZE", + "UUID_FLAGS", + }; + + return (idx < EXT_UUID_SIZE) ? uuid_str[idx] : "*Unknown UUID index*"; +} + +/* Pretty print a UUID value */ +void +drbd_print_uuid(drbd_dev *mdev, unsigned int idx) { + INFO(" uuid[%s] now %016llX\n",_drbd_uuid_str(idx),mdev->bc->md.uuid[idx]); +} + + +/* + +drbd_print_buffer + +This routine dumps binary data to the debugging output. Can be +called at interrupt level. + +Arguments: + + prefix - String is output at the beginning of each line output + flags - Control operation of the routine. Currently defined + Flags are: + DBGPRINT_BUFFADDR; if set, each line starts with the + virtual address of the line being outupt. If clear, + each line starts with the offset from the beginning + of the buffer. + size - Indicates the size of each entry in the buffer. 
Supported + values are sizeof(char), sizeof(short) and sizeof(int) + buffer - Start address of buffer + buffer_va - Virtual address of start of buffer (normally the same + as Buffer, but having it separate allows it to hold + file address for example) + length - length of buffer + +*/ +void +drbd_print_buffer(const char *prefix,unsigned int flags,int size, + const void *buffer,const void *buffer_va, + unsigned int length) + +#define LINE_SIZE 16 +#define LINE_ENTRIES (int)(LINE_SIZE/size) +{ + const unsigned char *pstart; + const unsigned char *pstart_va; + const unsigned char *pend; + char bytes_str[LINE_SIZE*3+8],ascii_str[LINE_SIZE+8]; + char *pbytes=bytes_str,*pascii=ascii_str; + int offset=0; + long sizemask; + int field_width; + int index; + const unsigned char *pend_str; + const unsigned char *p; + int count; + + // verify size parameter + if (size != sizeof(char) && size != sizeof(short) && size != sizeof(int)) { + printk(KERN_DEBUG "drbd_print_buffer: ERROR invalid size %d\n", size); + return; + } + + sizemask = size-1; + field_width = size*2; + + // Adjust start/end to be on appropriate boundary for size + buffer = (const char *)((long)buffer & ~sizemask); + pend = (const unsigned char *)(((long)buffer + length + sizemask) & ~sizemask); + + if (flags & DBGPRINT_BUFFADDR) { + // Move start back to nearest multiple of line size if printing address + // This results in nicely formatted output with addresses being on + // line size (16) byte boundaries + pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1)); + } + else { + pstart = (const unsigned char *)buffer; + } + + // Set value of start VA to print if addresses asked for + pstart_va = (const unsigned char *)buffer_va - ((const unsigned char *)buffer-pstart); + + // Calculate end position to nicely align right hand side + pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1)); + + // Init strings + *pbytes = *pascii = '\0'; + + // Start at beginning of first line + p = pstart; + count=0; + + while (p < pend_str) { + if (p < (const unsigned char *)buffer || p >= pend) { + // Before start of buffer or after end- print spaces + pbytes += sprintf(pbytes,"%*c ",field_width,' '); + pascii += sprintf(pascii,"%*c",size,' '); + p += size; + } + else { + // Add hex and ascii to strings + int val; + switch (size) { + default: + case 1: + val = *(unsigned char *)p; + break; + case 2: + val = *(unsigned short *)p; + break; + case 4: + val = *(unsigned int *)p; + break; + } + + pbytes += sprintf(pbytes,"%0*x ",field_width,val); + + for (index = size; index; index--) { + *pascii++ = isprint(*p) ? *p : '.'; + p++; + } + } + + count++; + + if (count == LINE_ENTRIES || p >= pend_str) { + // Null terminate and print record + *pascii = '\0'; + printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n", + prefix, + (flags & DBGPRINT_BUFFADDR) + ? 
(long)pstart_va : (long)offset, + LINE_ENTRIES*(field_width+1),bytes_str, + LINE_SIZE,ascii_str); + + // Move onto next line + pstart_va += (p-pstart); + pstart = p; + count = 0; + offset+= LINE_SIZE; + + // Re-init strings + pbytes = bytes_str; + pascii = ascii_str; + *pbytes = *pascii = '\0'; + } + } +} + +#define PSM(A) \ +do { \ + if( mask.A ) { \ + int i = snprintf(p, len, " " #A "( %s )", \ + A##s_to_name(val.A)); \ + if (i >= len) return op; \ + p += i; \ + len -= i; \ + } \ +} while (0) + +STATIC char *dump_st(char *p, int len, drbd_state_t mask, drbd_state_t val) +{ + char *op=p; + *p = '\0'; + PSM(role); + PSM(peer); + PSM(conn); + PSM(disk); + PSM(pdsk); + + return op; +} + +#define INFOP(fmt, args...) \ +do { \ + if (trace_level >= TraceLvlAll) { \ + INFO("%s:%d: %s [%d] %s %s " fmt , \ + file, line, current->comm, current->pid, \ + sockname, recv?"<<<":">>>", \ + ## args ); \ + } \ + else { \ + INFO("%s %s " fmt, sockname, \ + recv?"<<<":">>>", \ + ## args ); \ + } \ +} while (0) + +char *_dump_block_id(u64 block_id, char *buff) { + if (is_syncer_block_id(block_id)) + strcpy(buff,"SyncerId"); + else + sprintf(buff,"%llx",block_id); + + return buff; +} + +void +_dump_packet(drbd_dev *mdev, struct socket *sock, + int recv, Drbd_Polymorph_Packet *p, char* file, int line) +{ + char *sockname = sock == mdev->meta.socket ? "meta" : "data"; + int cmd = (recv == 2) ? p->head.command : be16_to_cpu(p->head.command); + char tmp[300]; + drbd_state_t m,v; + + switch (cmd) { + case HandShake: + INFOP("%s (protocol %u)\n", cmdname(cmd), be32_to_cpu(p->HandShake.protocol_version)); + break; + + case ReportBitMap: /* don't report this */ + break; + + case Data: + INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd), + (unsigned long long)be64_to_cpu(p->Data.sector), + _dump_block_id(p->Data.block_id,tmp), + be32_to_cpu(p->Data.seq_num), + be32_to_cpu(p->Data.dp_flags) + ); + break; + + case DataReply: + case RSDataReply: + INFOP("%s (sector %llus, id %s)\n", cmdname(cmd), + (unsigned long long)be64_to_cpu(p->Data.sector), + _dump_block_id(p->Data.block_id,tmp) + ); + break; + + case RecvAck: + case WriteAck: + case RSWriteAck: + case DiscardAck: + case NegAck: + case NegRSDReply: + INFOP("%s (sector %llus, size %u, id %s, seq %u)\n", cmdname(cmd), + (long long)be64_to_cpu(p->BlockAck.sector), + be32_to_cpu(p->BlockAck.blksize), + _dump_block_id(p->BlockAck.block_id,tmp), + be32_to_cpu(p->BlockAck.seq_num) + ); + break; + + case DataRequest: + case RSDataRequest: + INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd), + (long long)be64_to_cpu(p->BlockRequest.sector), + be32_to_cpu(p->BlockRequest.blksize), + _dump_block_id(p->BlockRequest.block_id,tmp) + ); + break; + + case Barrier: + case BarrierAck: + INFOP("%s (barrier %u)\n", cmdname(cmd), p->Barrier.barrier); + break; + + case ReportUUIDs: + INFOP("%s Curr:%016llX, Bitmap:%016llX, HisSt:%016llX, HisEnd:%016llX\n", cmdname(cmd), + be64_to_cpu(p->GenCnt.uuid[Current]), + be64_to_cpu(p->GenCnt.uuid[Bitmap]), + be64_to_cpu(p->GenCnt.uuid[History_start]), + be64_to_cpu(p->GenCnt.uuid[History_end])); + break; + + case ReportSizes: + INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, max bio %x, q order %x)\n", cmdname(cmd), + (long long)(be64_to_cpu(p->Sizes.d_size)>>(20-9)), + (long long)(be64_to_cpu(p->Sizes.u_size)>>(20-9)), + (long long)(be64_to_cpu(p->Sizes.c_size)>>(20-9)), + be32_to_cpu(p->Sizes.max_segment_size), + be32_to_cpu(p->Sizes.queue_order_type)); + break; + + case ReportState: + v.i = be32_to_cpu(p->State.state); + m.i 
= 0xffffffff; + dump_st(tmp,sizeof(tmp),m,v); + INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp); + break; + + case StateChgRequest: + m.i = be32_to_cpu(p->ReqState.mask); + v.i = be32_to_cpu(p->ReqState.val); + dump_st(tmp,sizeof(tmp),m,v); + INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp); + break; + + case StateChgReply: + INFOP("%s (ret %x)\n", cmdname(cmd), + be32_to_cpu(p->RqSReply.retcode)); + break; + + case Ping: + case PingAck: + /* + * Dont trace pings at summary level + */ + if (trace_level < TraceLvlAll) + break; + /* fall through... */ + default: + INFOP("%s (%u)\n",cmdname(cmd), cmd); + break; + } +} + +// Debug routine to dump info about bio + +void _dump_bio(drbd_dev *mdev, struct bio *bio, int complete) +{ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lx" +#else +#define SECTOR_FORMAT "%lx" +#endif +#define SECTOR_SHIFT 9 + + unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT); + char *faddr = (char *)(lowaddr); + struct bio_vec *bvec; + int segno; + + INFO("%s %s Bio:%p - %soffset " SECTOR_FORMAT ", size %x\n", + complete? "<<<":">>>", + bio_rw(bio)==WRITE?"Write":"Read",bio, + complete? (drbd_bio_uptodate(bio)? "Success, ":"Failed, ") : "", + bio->bi_sector << SECTOR_SHIFT, + bio->bi_size); + + if (trace_level >= TraceLvlMetrics && + ((bio_rw(bio) == WRITE) ^ complete) ) { + printk(KERN_DEBUG " ind page offset length\n"); + __bio_for_each_segment(bvec, bio, segno, 0) { + printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n",segno, + bvec->bv_page, bvec->bv_offset, bvec->bv_len); + + if (trace_level >= TraceLvlAll) { + char *bvec_buf; + unsigned long flags; + + bvec_buf = bvec_kmap_irq(bvec, &flags); + + drbd_print_buffer(" ",DBGPRINT_BUFFADDR,1, + bvec_buf, + faddr, + (bvec->bv_len <= 0x80)? bvec->bv_len : 0x80); + + bvec_kunmap_irq(bvec_buf, &flags); + + if (bvec->bv_len > 0x40) + printk(KERN_DEBUG " ....\n"); + + faddr += bvec->bv_len; + } + } + } +} +#endif + +module_init(drbd_init) +module_exit(drbd_cleanup) diff -uprN linux-2.6.24/drivers/block/drbd/drbd_nl.c linux-2.6.24.ovz/drivers/block/drbd/drbd_nl.c --- linux-2.6.24/drivers/block/drbd/drbd_nl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_nl.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,1736 @@ +/* +-*- linux-c -*- + drbd_nl.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "drbd_int.h" + +/* see get_sb_bdev and bd_claim */ +char *drbd_d_holder = "Hands off! this is DRBD's data storage device."; +char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; + + +// Generate the tag_list to struct functions +#define PACKET(name, number, fields) \ +int name ## _from_tags (drbd_dev *mdev, unsigned short* tags, struct name * arg) \ +{ \ + int tag; \ + int dlen; \ + \ + while( (tag = *tags++) != TT_END ) { \ + dlen = *tags++; \ + switch( tag_number(tag) ) { \ + fields \ + default: \ + if( tag & T_MANDATORY ) { \ + ERR("Unknown tag: %d\n",tag_number(tag)); \ + return 0; \ + } \ + } \ + tags = (unsigned short*)((char*)tags + dlen); \ + } \ + return 1; \ +} +#define INTEGER(pn,pr,member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ + arg->member = *(int*)(tags); \ + break; +#define INT64(pn,pr,member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ + arg->member = *(u64*)(tags); \ + break; +#define BIT(pn,pr,member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ + arg->member = *(char*)(tags) ? 1 : 0; \ + break; +#define STRING(pn,pr,member,len) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ + arg->member ## _len = dlen; \ + memcpy(arg->member,tags,dlen); \ + break; +#include "linux/drbd_nl.h" + +// Generate the struct to tag_list functions +#define PACKET(name, number, fields) \ +unsigned short* \ +name ## _to_tags (drbd_dev *mdev, struct name * arg, unsigned short* tags) \ +{ \ + fields \ + return tags; \ +} + +#define INTEGER(pn,pr,member) \ + *tags++ = pn | pr | TT_INTEGER; \ + *tags++ = sizeof(int); \ + *(int*)tags = arg->member; \ + tags = (unsigned short*)((char*)tags+sizeof(int)); +#define INT64(pn,pr,member) \ + *tags++ = pn | pr | TT_INT64; \ + *tags++ = sizeof(u64); \ + *(u64*)tags = arg->member; \ + tags = (unsigned short*)((char*)tags+sizeof(u64)); +#define BIT(pn,pr,member) \ + *tags++ = pn | pr | TT_BIT; \ + *tags++ = sizeof(char); \ + *(char*)tags = arg->member; \ + tags = (unsigned short*)((char*)tags+sizeof(char)); +#define STRING(pn,pr,member,len) \ + *tags++ = pn | pr | TT_STRING; \ + *tags++ = arg->member ## _len; \ + memcpy(tags,arg->member, arg->member ## _len); \ + tags = (unsigned short*)((char*)tags + arg->member ## _len); +#include "linux/drbd_nl.h" + +extern void drbd_init_set_defaults(drbd_dev *mdev); +void drbd_bcast_ev_helper(drbd_dev *mdev, char* helper_name); +void drbd_nl_send_reply(struct cn_msg *, int); + +char *nl_packet_name(int packet_type) { +// Generate packet type strings +#define PACKET(name, number, fields) \ + [ P_ ## name ] = # name, +#define INTEGER Argh! +#define BIT Argh! +#define INT64 Argh! +#define STRING Argh! + + static char *nl_tag_name[P_nl_after_last_packet] = { +#include "linux/drbd_nl.h" + }; + + return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ? + nl_tag_name[packet_type] : "*Unknown*"; +} + +void nl_trace_packet(void *data) { + struct cn_msg *req = data; + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req*)req->data; + + printk(KERN_INFO DEVICE_NAME "%d: " + "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n", + nlp->drbd_minor, + nl_packet_name(nlp->packet_type), + nlp->packet_type, + req->seq, req->ack, req->len); +} + +void nl_trace_reply(void *data) { + struct cn_msg *req = data; + struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply*)req->data; + + printk(KERN_INFO DEVICE_NAME "%d: " + "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n", + nlp->minor, + nlp->packet_type==P_nl_after_last_packet? 
+ "Empty-Reply" : nl_packet_name(nlp->packet_type), + nlp->packet_type, + req->seq, req->ack, req->len); +} + +int drbd_khelper(drbd_dev *mdev, char* cmd) +{ + char mb[12]; + char *argv[] = {"/sbin/drbdadm", cmd, mb, NULL }; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL }; + + snprintf(mb,12,"minor-%d",mdev_to_minor(mdev)); + + drbd_bcast_ev_helper(mdev,cmd); + return call_usermodehelper("/sbin/drbdadm",argv,envp,1); +} + +drbd_disks_t drbd_try_outdate_peer(drbd_dev *mdev) +{ + int r; + drbd_disks_t nps; + enum fencing_policy fp; + + D_ASSERT(mdev->state.pdsk == DUnknown); + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + D_ASSERT( fp > DontCare ); + + if( fp == Stonith ) drbd_request_state(mdev,NS(susp,1)); + + r=drbd_khelper(mdev,"outdate-peer"); + + switch( (r>>8) & 0xff ) { + case 3: /* peer is inconsistent */ + nps = Inconsistent; + break; + case 4: /* peer is outdated */ + nps = Outdated; + break; + case 5: /* peer was down, we will(have) create(d) a new UUID anyways... */ + /* If we would be more strict, we would return DUnknown here. */ + nps = Outdated; + break; + case 6: /* Peer is primary, voluntarily outdate myself */ + WARN("Peer is primary, outdating myself.\n"); + nps = DUnknown; + drbd_request_state(mdev,NS(disk,Outdated)); + break; + case 7: + if( fp != Stonith ) { + ERR("outdate-peer() = 7 && fencing != Stonith !!!\n"); + } + nps = Outdated; + break; + default: + /* The script is broken ... */ + nps = DUnknown; + drbd_request_state(mdev,NS(disk,Outdated)); + ERR("outdate-peer helper broken, returned %d \n",(r>>8)&0xff); + return nps; + } + + INFO("outdate-peer helper returned %d \n",(r>>8)&0xff); + return nps; +} + + +int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force) +{ + int r=0,forced = 0, try=0; + drbd_state_t mask, val; + drbd_disks_t nps; + + if ( new_role == Primary ) { + request_ping(mdev); // Detect a dead peer ASAP + } + + mask.i = 0; mask.role = role_mask; + val.i = 0; val.role = new_role; + + while (try++ < 3) { + r = _drbd_request_state(mdev,mask,val,0); + if( r == SS_NoUpToDateDisk && force && + ( mdev->state.disk == Inconsistent || + mdev->state.disk == Outdated ) ) { + mask.disk = disk_mask; + val.disk = UpToDate; + forced = 1; + continue; + } + if ( r == SS_NothingToDo ) goto fail; + if ( r == SS_PrimaryNOP ) { + nps = drbd_try_outdate_peer(mdev); + + if ( force && nps > Outdated ) { + WARN("Forced into split brain situation!\n"); + nps = Outdated; + } + + mask.pdsk = disk_mask; + val.pdsk = nps; + + continue; + } + if( r == SS_TwoPrimaries ) { + // Maybe the peer is detected as dead very soon... + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); + if(try == 1) try++; // only a single retry in this case. + continue; + } + if ( r < SS_Success ) { + r = drbd_request_state(mdev,mask,val); // Be verbose. + if( r < SS_Success ) goto fail; + } + break; + } + + if(forced) WARN("Forced to conisder local data as UpToDate!\n"); + + drbd_sync_me(mdev); + + /* Wait until nothing is on the fly :) */ + if ( wait_event_interruptible( mdev->misc_wait, + atomic_read(&mdev->ap_pending_cnt) == 0 ) ) { + r = GotSignal; + goto fail; + } + + /* FIXME RACE here: if our direct user is not using bd_claim (i.e. + * not a filesystem) since cstate might still be >= Connected, new + * ap requests may come in and increase ap_pending_cnt again! + * but that means someone is misusing DRBD... 
+ * */ + + if (new_role == Secondary) { + set_disk_ro(mdev->vdisk, TRUE ); + } else { + if(inc_net(mdev)) { + mdev->net_conf->want_lose = 0; + dec_net(mdev); + } + set_disk_ro(mdev->vdisk, FALSE ); + /* why?? what for?? + mdev->this_bdev->bd_disk = mdev->vdisk; + */ + + if ( ( ( mdev->state.conn < Connected || + mdev->state.pdsk <= Failed ) && + mdev->bc->md.uuid[Bitmap] == 0) || forced ) { + drbd_uuid_new_current(mdev); + } + } + + if((new_role == Secondary) && inc_local(mdev) ) { + drbd_al_to_on_disk_bm(mdev); + dec_local(mdev); + } + + if (mdev->state.conn >= WFReportParams) { + /* if this was forced, we should consider sync */ + if(forced) drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + + drbd_md_sync(mdev); + + return r; + + fail: + return r; +} + + +STATIC int drbd_nl_primary(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct primary primary_args; + + memset(&primary_args, 0, sizeof(struct primary)); + if(!primary_from_tags(mdev,nlp->tag_list,&primary_args)) { + reply->ret_code=UnknownMandatoryTag; + return 0; + } + + reply->ret_code = drbd_set_role(mdev, Primary, primary_args.overwrite_peer); + + return 0; +} + +STATIC int drbd_nl_secondary(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_set_role(mdev, Secondary, 0); + + return 0; +} + +/* initializes the md.*_offset members, so we are able to find + * the on disk meta data */ +STATIC void drbd_md_set_sector_offsets(drbd_dev *mdev, + struct drbd_backing_dev *bdev) +{ + sector_t md_size_sect = 0; + switch(bdev->dc.meta_dev_idx) { + default: + /* v07 style fixed size indexed meta data */ + bdev->md.md_size_sect = MD_RESERVED_SECT; + bdev->md.md_offset = drbd_md_ss__(mdev,bdev); + bdev->md.al_offset = MD_AL_OFFSET; + bdev->md.bm_offset = MD_BM_OFFSET; + break; + case DRBD_MD_INDEX_FLEX_EXT: + /* just occupy the full device; unit: sectors */ + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); + bdev->md.md_offset = 0; + bdev->md.al_offset = MD_AL_OFFSET; + bdev->md.bm_offset = MD_BM_OFFSET; + break; + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + bdev->md.md_offset = drbd_md_ss__(mdev,bdev); + /* al size is still fixed */ + bdev->md.al_offset = -MD_AL_MAX_SIZE; + //LGE FIXME max size check missing. + /* we need (slightly less than) ~ this much bitmap sectors: */ + md_size_sect = drbd_get_capacity(bdev->backing_bdev); + md_size_sect = ALIGN(md_size_sect,BM_SECT_PER_EXT); + md_size_sect = BM_SECT_TO_EXT(md_size_sect); + md_size_sect = ALIGN(md_size_sect,8); + + /* plus the "drbd meta data super block", + * and the activity log; */ + md_size_sect += MD_BM_OFFSET; + + bdev->md.md_size_sect = md_size_sect; + /* bitmap offset is adjusted by 'super' block size */ + bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; + break; + } +} + +char* ppsize(char* buf, unsigned long long size) +{ + // Needs 9 bytes at max. + static char units[] = { 'K','M','G','T','P','E' }; + int base = 0; + while (size >= 10000 ) { + size = size >> 10; + base++; + } + sprintf(buf,"%lu %cB",(long)size,units[base]); + + return buf; +} + +/* You should call drbd_md_sync() after calling this. 
+ */ +int drbd_determin_dev_size(struct Drbd_Conf* mdev) +{ + sector_t prev_first_sect, prev_size; // previous meta location + sector_t la_size; + sector_t size; + char ppb[10]; + + int md_moved, la_size_changed; + int rv=0; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + prev_first_sect = drbd_md_first_sector(mdev->bc); + prev_size = mdev->bc->md.md_size_sect; + la_size = mdev->bc->md.la_size_sect; + + // TODO: should only be some assert here, not (re)init... + drbd_md_set_sector_offsets(mdev,mdev->bc); + + size = drbd_new_dev_size(mdev,mdev->bc); + + if( drbd_get_capacity(mdev->this_bdev) != size || + drbd_bm_capacity(mdev) != size ) { + int err; + err = drbd_bm_resize(mdev,size); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(mdev)>>1; + if (size == 0) { + ERR("OUT OF MEMORY! Could not allocate bitmap! Set device size => 0\n"); + } else { + /* FIXME this is problematic, + * if we in fact are smaller now! */ + ERR("BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = err; + } + // racy, see comments above. + drbd_set_my_capacity(mdev,size); + mdev->bc->md.la_size_sect = size; + INFO("size = %s (%llu KB)\n",ppsize(ppb,size>>1), + (unsigned long long)size>>1); + } + if (rv < 0) goto out; + + la_size_changed = (la_size != mdev->bc->md.la_size_sect); + + //LGE: flexible device size!! is this the right thing to test? + md_moved = prev_first_sect != drbd_md_first_sector(mdev->bc) + || prev_size != mdev->bc->md.md_size_sect; + + if ( md_moved ) { + WARN("Moving meta-data.\n"); + /* assert: (flexible) internal meta data */ + } + + if ( la_size_changed || md_moved ) { + if( inc_local_if_state(mdev,Attaching) ) { + drbd_al_shrink(mdev); // All extents inactive. + rv = drbd_bm_write(mdev); // write bitmap + // Write mdev->la_size to on disk. + drbd_md_mark_dirty(mdev); + dec_local(mdev); + } + } + out: + lc_unlock(mdev->act_log); + + return rv; +} + +sector_t +drbd_new_dev_size(struct Drbd_Conf* mdev, struct drbd_backing_dev *bdev) +{ + sector_t p_size = mdev->p_size; // partner's disk size. + sector_t la_size = bdev->md.la_size_sect; // last agreed size. + sector_t m_size; // my size + sector_t u_size = bdev->dc.disk_size; // size requested by user. + sector_t size=0; + + m_size = drbd_get_max_capacity(bdev); + + if(p_size && m_size) { + size=min_t(sector_t,p_size,m_size); + } else { + if(la_size) { + size=la_size; + if(m_size && m_size < size) size=m_size; + if(p_size && p_size < size) size=p_size; + } else { + if(m_size) size=m_size; + if(p_size) size=p_size; + } + } + + if(size == 0) { + ERR("Both nodes diskless!\n"); + } + + if(u_size) { + if(u_size<<1 > size) { + ERR("Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size, (unsigned long)size>>1); + } else { + size = u_size<<1; + } + } + + return size; +} + +/** + * drbd_check_al_size: + * checks that the al lru is of requested size, and if neccessary tries to + * allocate a new one. returns -EBUSY if current al lru is still used, + * -ENOMEM when allocation failed, and 0 on success. You should call + * drbd_md_sync() after you called this function. 
+ */ +STATIC int drbd_check_al_size(drbd_dev *mdev) +{ + struct lru_cache *n,*t; + struct lc_element *e; + unsigned int in_use; + int i; + + ERR_IF(mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + if ( mdev->act_log && + mdev->act_log->nr_elements == mdev->sync_conf.al_extents ) + return 0; + + in_use = 0; + t = mdev->act_log; + n = lc_alloc("act_log", mdev->sync_conf.al_extents, + sizeof(struct lc_element), mdev); + + if (n==NULL) { + ERR("Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&mdev->al_lock); + if (t) { + for (i=0; i < t->nr_elements; i++) { + e = lc_entry(t,i); + if (e->refcnt) + ERR("refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) { + mdev->act_log = n; + } + spin_unlock_irq(&mdev->al_lock); + if (in_use) { + ERR("Activity log still in use!\n"); + lc_free(n); + return -EBUSY; + } else { + if (t) lc_free(t); + } + drbd_md_mark_dirty(mdev); //we changed mdev->act_log->nr_elemens + return 0; +} + +void drbd_setup_queue_param(drbd_dev *mdev, unsigned int max_seg_s) +{ + request_queue_t * const q = mdev->rq_queue; + request_queue_t * const b = mdev->bc->backing_bdev->bd_disk->queue; + //unsigned int old_max_seg_s = q->max_segment_size; + + if (b->merge_bvec_fn && !mdev->bc->dc.use_bmbv) + max_seg_s = PAGE_SIZE; + + q->max_sectors = max_seg_s >> 9; + q->max_phys_segments = max_seg_s >> PAGE_SHIFT; + q->max_hw_segments = max_seg_s >> PAGE_SHIFT; + q->max_segment_size = max_seg_s; + q->hardsect_size = 512; + q->seg_boundary_mask = PAGE_SIZE-1; + blk_queue_stack_limits(q, b); + + // KERNEL BUG. in ll_rw_blk.c + // t->max_segment_size = min(t->max_segment_size,b->max_segment_size); + // should be + // t->max_segment_size = min_not_zero(...,...) + + // workaround here: + if(q->max_segment_size == 0) q->max_segment_size = max_seg_s; + + if(b->merge_bvec_fn) { + WARN("Backing device's merge_bvec_fn() = %p\n", + b->merge_bvec_fn); + } + INFO("max_segment_size ( = BIO size ) = %u\n", q->max_segment_size); + + if( q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + INFO("Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } +} + +/* does always return 0; + * interesting return code is in reply->ret_code */ +STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + enum ret_codes retcode; + struct drbd_backing_dev* nbc=NULL; // new_backing_conf + struct inode *inode, *inode2; + struct lru_cache* resync_lru = NULL; + drbd_state_t ns,os; + int rv; + + /* if you want to reconfigure, please tear down first */ + if (mdev->state.disk > Diskless) { + retcode=HaveDiskConfig; + goto fail; + } + + nbc = kmalloc(sizeof(struct drbd_backing_dev),GFP_KERNEL); + if(!nbc) { + retcode=KMallocFailed; + goto fail; + } + + if( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_local(mdev) ) { + memcpy(&nbc->dc,&mdev->bc->dc,sizeof(struct disk_conf)); + dec_local(mdev); + } else { + memset(&nbc->dc,0,sizeof(struct disk_conf)); + nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; + nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; + nbc->dc.fencing = DRBD_FENCING_DEF; + } + + if(!disk_conf_from_tags(mdev,nlp->tag_list,&nbc->dc)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + nbc->lo_file = NULL; + nbc->md_file = NULL; + + if ( nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode=LDMDInvalid; + goto fail; + } + + nbc->lo_file = 
filp_open(nbc->dc.backing_dev,O_RDWR,0); + if (!nbc->lo_file) { + retcode=LDNameInvalid; + goto fail; + } + + inode = nbc->lo_file->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode)) { + retcode=LDNoBlockDev; + goto fail; + } + + nbc->md_file = filp_open(nbc->dc.meta_dev,O_RDWR,0); + + if (!nbc->md_file) { + retcode=MDNameInvalid; + goto fail; + } + + inode2 = nbc->md_file->f_dentry->d_inode; + + if (!S_ISBLK(inode2->i_mode)) { + retcode=MDNoBlockDev; + goto fail; + } + + nbc->backing_bdev = inode->i_bdev; + if (BD_CLAIM(nbc->backing_bdev, mdev)) { + retcode=LDMounted; + goto fail; + } + + resync_lru = lc_alloc("resync",31, sizeof(struct bm_extent),mdev); + if(!resync_lru) { + retcode=KMallocFailed; + goto fail; + } + + nbc->md_bdev = inode2->i_bdev; + if (BD_CLAIM(nbc->md_bdev, + (nbc->dc.meta_dev_idx==DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx==DRBD_MD_INDEX_FLEX_INT) ? + (void *)mdev : (void*) drbd_m_holder )) { + retcode=MDMounted; + goto release_bdev_fail; + } + + if ( (nbc->backing_bdev==nbc->md_bdev) != + (nbc->dc.meta_dev_idx==DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx==DRBD_MD_INDEX_FLEX_INT) ) { + retcode=LDMDInvalid; + goto release_bdev2_fail; + } + + if ((drbd_get_capacity(nbc->backing_bdev)>>1) < nbc->dc.disk_size) { + retcode = LDDeviceTooSmall; + goto release_bdev2_fail; + } + +// warning LGE checks below no longer valid +// --- rewrite +#if 0 + if (drbd_get_capacity(nbc->backing_bdev) >= (sector_t)DRBD_MAX_SECTORS) { + retcode = LDDeviceTooLarge; + goto release_bdev2_fail; + } + + if ( nbc->dc.meta_dev_idx == -1 ) i = 1; + else i = nbc->dc.meta_dev_idx+1; + + /* for internal, we need to check agains <= (then we have a drbd with + * zero size, but meta data...) to be on the safe side, I require 32MB + * minimal data storage area for drbd with internal meta data (thats + * 160 total). if someone wants to use that small devices, she can use + * drbd 0.6 anyways... + * + * FIXME this is arbitrary and needs to be reconsidered as soon as we + * move to flexible size meta data. + */ + if( drbd_get_capacity(nbc->md_bdev) < 2*MD_RESERVED_SIZE*i + + (nbc->dc.meta_dev_idx == -1) ? (1<<16) : 0 ) + { + retcode = MDDeviceTooSmall; + goto release_bdev2_fail; + } +#endif +// -- up to here + + // Make sure the new disk is big enough + if (drbd_get_capacity(nbc->backing_bdev) < + drbd_get_capacity(mdev->this_bdev) ) { + retcode = LDDeviceTooSmall; + goto release_bdev2_fail; + } + + if((retcode = drbd_request_state(mdev,NS(disk,Attaching))) < SS_Success ) { + goto release_bdev2_fail; + } + + drbd_md_set_sector_offsets(mdev,nbc); + + retcode = drbd_md_read(mdev,nbc); + if ( retcode != NoError ) { + goto force_diskless; + } + + // Since we are diskless, fix the AL first... + if (drbd_check_al_size(mdev)) { + retcode = KMallocFailed; + goto force_diskless; + } + + // Prevent shrinking of consistent devices ! + if(drbd_md_test_flag(nbc,MDF_Consistent) && + drbd_new_dev_size(mdev,nbc) < nbc->md.la_size_sect) { + retcode = LDDeviceTooSmall; + goto force_diskless; + } + + if(!drbd_al_read_log(mdev,nbc)) { + retcode = MDIOError; + goto force_diskless; + } + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now mdev takes over responsibility, and the state engine should + * clean it up somewhere. 
*/ + D_ASSERT(mdev->bc == NULL); + mdev->bc = nbc; + mdev->resync = resync_lru; + nbc = NULL; + resync_lru = NULL; + + if(drbd_md_test_flag(mdev->bc,MDF_PrimaryInd)) { + set_bit(CRASHED_PRIMARY, &mdev->flags); + } else { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + } + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + mdev->read_cnt = 0; + mdev->writ_cnt = 0; + + drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + /* + * FIXME currently broken. + * drbd_set_recv_tcq(mdev,drbd_queue_order_type(mdev)==QUEUE_ORDERED_TAG); + */ + + /* If I am currently not Primary, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been Primary before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded Primary), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + if ( mdev->state.role != Primary && + drbd_md_test_flag(mdev->bc,MDF_PrimaryInd) && + !drbd_md_test_flag(mdev->bc,MDF_ConnectedInd) ) { + set_bit(USE_DEGR_WFC_T,&mdev->flags); + } + + drbd_bm_lock(mdev); // racy... + drbd_determin_dev_size(mdev); + + if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) { + INFO("Assuming that all blocks are out of sync (aka FullSync)\n"); + drbd_bm_set_all(mdev); + if (unlikely(drbd_bm_write(mdev) < 0)) { + retcode = MDIOError; + goto unlock_bm; + } + drbd_md_clear_flag(mdev,MDF_FullSync); + } else { + if (unlikely(drbd_bm_read(mdev) < 0)) { + retcode = MDIOError; + goto unlock_bm; + } + } + + if(test_bit(CRASHED_PRIMARY, &mdev->flags)) { + drbd_al_apply_to_bm(mdev); + drbd_al_to_on_disk_bm(mdev); + } + /* else { + FIXME wipe out on disk al! + } */ + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + ns.i = os.i; + /* If MDF_Consistent is not set go into inconsistent state, + otherwise investige MDF_WasUpToDate... + If MDF_WasUpToDate is not set go into Outdated disk state, + otherwise into Consistent state. + */ + if(drbd_md_test_flag(mdev->bc,MDF_Consistent)) { + if(drbd_md_test_flag(mdev->bc,MDF_WasUpToDate)) { + ns.disk = Consistent; + } else { + ns.disk = Outdated; + } + } else { + ns.disk = Inconsistent; + } + + if(drbd_md_test_flag(mdev->bc,MDF_PeerOutDated)) { + ns.pdsk = Outdated; + } + + if( ns.disk == Consistent && + ( ns.pdsk == Outdated || mdev->bc->dc.fencing == DontCare ) ) { + ns.disk = UpToDate; + } + + /* All tests on MDF_PrimaryInd, MDF_ConnectedInd, + MDF_Consistent and MDF_WasUpToDate must happen before + this point, because drbd_request_state() modifies these + flags. */ + + /* In case we are Connected postpone any desicion on the new disk + state after the negotiation phase. 
*/ + if(mdev->state.conn == Connected) { + mdev->new_state_tmp.i = ns.i; + ns.i = os.i; + ns.disk = Negotiating; + } + + rv = _drbd_set_state(mdev, ns, ChgStateVerbose); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + if (rv==SS_Success) after_state_ch(mdev,os,ns,ChgStateVerbose); + + if (rv < SS_Success) { + goto unlock_bm; + } + + drbd_bm_unlock(mdev); + drbd_md_sync(mdev); + + reply->ret_code = retcode; + return 0; + + unlock_bm: + drbd_bm_unlock(mdev); + force_diskless: + drbd_force_state(mdev,NS(disk,Diskless)); + drbd_md_sync(mdev); + release_bdev2_fail: + if (nbc) BD_RELEASE(nbc->md_bdev); + release_bdev_fail: + if (nbc) BD_RELEASE(nbc->backing_bdev); + fail: + if (nbc) { + if (nbc->lo_file) fput(nbc->lo_file); + if (nbc->md_file) fput(nbc->md_file); + kfree(nbc); + } + if (resync_lru) lc_free(resync_lru); + + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_detach(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + drbd_sync_me(mdev); + reply->ret_code = drbd_request_state(mdev,NS(disk,Diskless)); + + return 0; +} + +#define HMAC_NAME_L 20 + +STATIC int drbd_nl_net_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int i,ns; + enum ret_codes retcode; + struct net_conf *new_conf = NULL; + struct crypto_hash *tfm = NULL; + struct hlist_head *new_tl_hash = NULL; + struct hlist_head *new_ee_hash = NULL; + drbd_dev *odev; + char hmac_name[HMAC_NAME_L]; + + if (mdev->state.conn > StandAlone) { + retcode=HaveNetConfig; + goto fail; + } + + new_conf = kmalloc(sizeof(struct net_conf),GFP_KERNEL); + if(!new_conf) { + retcode=KMallocFailed; + goto fail; + } + + if( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_net(mdev)) { + memcpy(new_conf,mdev->net_conf,sizeof(struct net_conf)); + dec_local(mdev); + } else { + memset(new_conf,0,sizeof(struct net_conf)); + new_conf->timeout = DRBD_TIMEOUT_DEF; + new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; + new_conf->ping_int = DRBD_PING_INT_DEF; + new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; + new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; + new_conf->unplug_watermark= DRBD_UNPLUG_WATERMARK_DEF; + new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; + new_conf->ko_count = DRBD_KO_COUNT_DEF; + new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; + new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; + new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; + new_conf->want_lose = 0; + new_conf->two_primaries = 0; + new_conf->wire_protocol = DRBD_PROT_C; + new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; + new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; + } + + if (!net_conf_from_tags(mdev,nlp->tag_list,new_conf)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + if (new_conf->two_primaries && (new_conf->wire_protocol != DRBD_PROT_C)) { + retcode=ProtocolCRequired; + goto fail; + }; + + if( mdev->state.role == Primary && new_conf->want_lose ) { + retcode=DiscardNotAllowed; + goto fail; + } + +#define M_ADDR(A) (((struct sockaddr_in *)&A->my_addr)->sin_addr.s_addr) +#define M_PORT(A) (((struct sockaddr_in *)&A->my_addr)->sin_port) +#define O_ADDR(A) (((struct sockaddr_in *)&A->peer_addr)->sin_addr.s_addr) +#define O_PORT(A) (((struct sockaddr_in *)&A->peer_addr)->sin_port) + retcode = NoError; + for(i=0;inet_conf) && + M_PORT(new_conf) == M_PORT(odev->net_conf) ) { + retcode=LAAlreadyInUse; + } + if(O_ADDR(new_conf) == O_ADDR(odev->net_conf) && + O_PORT(new_conf) == O_PORT(odev->net_conf) ) { + retcode=OAAlreadyInUse; + } + dec_net(odev); + if(retcode != NoError) goto fail; + } + } 
+#undef M_ADDR +#undef M_PORT +#undef O_ADDR +#undef O_PORT + + if( new_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name,HMAC_NAME_L,"hmac(%s)",new_conf->cram_hmac_alg); + tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + tfm = NULL; + retcode=CRAMAlgNotAvail; + goto fail; + } + + if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) != CRYPTO_ALG_TYPE_HASH ) { + retcode=CRAMAlgNotDigest; + goto fail; + } + } + + + ns = new_conf->max_epoch_size/8; + if (mdev->tl_hash_s != ns) { + new_tl_hash=kmalloc(ns*sizeof(void*), GFP_KERNEL); + if(!new_tl_hash) { + retcode=KMallocFailed; + goto fail; + } + memset(new_tl_hash, 0, ns*sizeof(void*)); + } + + ns = new_conf->max_buffers/8; + if (new_conf->two_primaries && ( mdev->ee_hash_s != ns ) ) { + new_ee_hash=kmalloc(ns*sizeof(void*), GFP_KERNEL); + if(!new_ee_hash) { + retcode=KMallocFailed; + goto fail; + } + memset(new_ee_hash, 0, ns*sizeof(void*)); + } + + ((char*)new_conf->shared_secret)[SHARED_SECRET_MAX-1]=0; + +#if 0 +FIXME LGE + /* for the connection loss logic in drbd_recv + * I _need_ the resulting timeo in jiffies to be + * non-zero and different + * + * XXX maybe rather store the value scaled to jiffies? + * Note: MAX_SCHEDULE_TIMEOUT/HZ*HZ != MAX_SCHEDULE_TIMEOUT + * and HZ > 10; which is unlikely to change... + * Thus, if interrupted by a signal, + * sock_{send,recv}msg returns -EINTR, + * if the timeout expires, -EAGAIN. + */ + // unlikely: someone disabled the timeouts ... + // just put some huge values in there. + if (!new_conf->ping_int) + new_conf->ping_int = MAX_SCHEDULE_TIMEOUT/HZ; + if (!new_conf->timeout) + new_conf->timeout = MAX_SCHEDULE_TIMEOUT/HZ*10; + if (new_conf->ping_int*10 < new_conf->timeout) + new_conf->timeout = new_conf->ping_int*10/6; + if (new_conf->ping_int*10 == new_conf->timeout) + new_conf->ping_int = new_conf->ping_int+1; +#endif + + D_ASSERT(mdev->net_conf==NULL); + mdev->net_conf = new_conf; + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + + if(new_tl_hash) { + if (mdev->tl_hash) kfree(mdev->tl_hash); + mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; + mdev->tl_hash = new_tl_hash; + } + + if(new_ee_hash) { + if (mdev->ee_hash) kfree(mdev->ee_hash); + mdev->ee_hash_s = mdev->net_conf->max_buffers/8; + mdev->ee_hash = new_ee_hash; + } + + if ( mdev->cram_hmac_tfm ) { + crypto_free_hash(mdev->cram_hmac_tfm); + } + mdev->cram_hmac_tfm = tfm; + + retcode = drbd_request_state(mdev,NS(conn,Unconnected)); + + reply->ret_code = retcode; + return 0; + + fail: + if (tfm) crypto_free_hash(tfm); + if (new_tl_hash) kfree(new_tl_hash); + if (new_ee_hash) kfree(new_ee_hash); + if (new_conf) kfree(new_conf); + + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_disconnect(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + + retcode = _drbd_request_state(mdev,NS(conn,Disconnecting),0); // silently. + + if ( retcode == SS_NothingToDo ) goto done; + else if ( retcode == SS_AlreadyStandAlone ) goto done; + else if ( retcode == SS_PrimaryNOP ) { + // Our statche checking code wants to see the peer outdated. + retcode = drbd_request_state(mdev,NS2(conn,Disconnecting, + pdsk,Outdated)); + } else if (retcode == SS_CW_FailedByPeer) { + // The peer probabely wants to see us outdated. + retcode = _drbd_request_state(mdev,NS2(conn,Disconnecting, + disk,Outdated),0); + if( retcode == SS_CanNotOutdateDL ) { + // We are diskless and our peer wants to outdate us. 
+ // So, simply go away, and let the peer try to + // outdate us with its 'outdate-peer' handler later. + retcode = drbd_request_state(mdev,NS(conn,StandAlone)); + } + } + + if( retcode < SS_Success ) goto fail; + + if( wait_event_interruptible( mdev->misc_wait, + mdev->state.conn==StandAlone) ) { + retcode = GotSignal; + goto fail; + } + + done: + retcode = NoError; + fail: + drbd_md_sync(mdev); + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_resize(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct resize rs; + int retcode=NoError; + + memset(&rs, 0, sizeof(struct resize)); + if (!resize_from_tags(mdev,nlp->tag_list,&rs)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + if (mdev->state.conn > Connected) { + retcode = NoResizeDuringResync; + goto fail; + } + + if ( mdev->state.role == Secondary && + mdev->state.peer == Secondary) { + retcode = APrimaryNodeNeeded; + goto fail; + } + + if(!inc_local(mdev)) { + retcode = HaveNoDiskConfig; + goto fail; + } + + mdev->bc->dc.disk_size = (sector_t)rs.resize_size; + drbd_bm_lock(mdev); + drbd_determin_dev_size(mdev); + drbd_md_sync(mdev); + drbd_bm_unlock(mdev); + dec_local(mdev); + if (mdev->state.conn == Connected) { + drbd_send_uuids(mdev); // to start sync... + drbd_send_sizes(mdev); + } + + fail: + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_syncer_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode=NoError; + struct syncer_conf sc; + drbd_dev *odev; + int err; + + memcpy(&sc,&mdev->sync_conf,sizeof(struct syncer_conf)); + + if(nlp->flags & DRBD_NL_SET_DEFAULTS) { + sc.rate = DRBD_RATE_DEF; + sc.after = DRBD_AFTER_DEF; + sc.al_extents = DRBD_AL_EXTENTS_DEF; + } + + if (!syncer_conf_from_tags(mdev,nlp->tag_list,&sc)) { + retcode=UnknownMandatoryTag; + goto fail; + } + + if( sc.after != -1) { + if( sc.after < -1 || minor_to_mdev(sc.after) == NULL ) { + retcode=SyncAfterInvalid; + goto fail; + } + odev = minor_to_mdev(sc.after); // check against loops in + while(1) { + if( odev == mdev ) { + retcode=SyncAfterCycle; + goto fail; + } + if( odev->sync_conf.after == -1 ) break; // no cycles. 
+ odev = minor_to_mdev(odev->sync_conf.after); + } + } + + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; // arbitrary minimum +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if(sc.al_extents > AL_MAX) { + ERR("sc.al_extents > %d\n",AL_MAX); + sc.al_extents = AL_MAX; + } +#undef AL_MAX + + mdev->sync_conf = sc; + + if(inc_local(mdev)) { + err = drbd_check_al_size(mdev); + dec_local(mdev); + drbd_md_sync(mdev); + + if (err) { + retcode = KMallocFailed; + goto fail; + } + } + + if (mdev->state.conn >= Connected) + drbd_send_sync_param(mdev,&sc); + + drbd_alter_sa(mdev, sc.after); + + fail: + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_invalidate(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev,NS2(conn,StartingSyncT, + disk,Inconsistent)); + return 0; +} + +STATIC int drbd_nl_invalidate_peer(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + + reply->ret_code = drbd_request_state(mdev,NS2(conn,StartingSyncS, + pdsk,Inconsistent)); + + return 0; +} + +STATIC int drbd_nl_pause_sync(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode=NoError; + + if(drbd_request_state(mdev,NS(user_isp,1)) == SS_NothingToDo) + retcode = PauseFlagAlreadySet; + + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_resume_sync(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode=NoError; + + if(drbd_request_state(mdev,NS(user_isp,0)) == SS_NothingToDo) + retcode = PauseFlagAlreadyClear; + + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_suspend_io(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev,NS(susp,1)); + + return 0; +} + +STATIC int drbd_nl_resume_io(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev,NS(susp,0)); + return 0; +} + +STATIC int drbd_nl_outdate(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + drbd_state_t os,ns; + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if( mdev->state.disk < Outdated ) { + retcode = -999; + } else { + retcode = _drbd_set_state(_NS(mdev,disk,Outdated),ChgStateVerbose); + } + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + if (retcode==SS_Success) after_state_ch(mdev,os,ns, ChgStateVerbose); + + if( retcode == -999 ) { + retcode = DiskLowerThanOutdated; + goto fail; + } + + drbd_md_sync(mdev); + + fail: + reply->ret_code = retcode; + return 0; +} + +STATIC int drbd_nl_get_config(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if(inc_local(mdev)) { + tl = disk_conf_to_tags(mdev,&mdev->bc->dc,tl); + dec_local(mdev); + } + + if(inc_net(mdev)) { + tl = net_conf_to_tags(mdev,mdev->net_conf,tl); + dec_net(mdev); + } + tl = syncer_conf_to_tags(mdev,&mdev->sync_conf,tl); + + *tl++ = TT_END; /* Close the tag list */ + + return (int)((char*)tl - (char*)reply->tag_list); +} + +STATIC int drbd_nl_get_state(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + tl = get_state_to_tags(mdev,(struct get_state*)&mdev->state,tl); + *tl++ = TT_END; /* Close the tag list */ + + return (int)((char*)tl - 
(char*)reply->tag_list); +} + +STATIC int drbd_nl_get_uuids(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if(inc_local(mdev)) { + // This is a hand crafted add tag ;) + *tl++ = T_uuids; + *tl++ = UUID_SIZE*sizeof(u64); + memcpy(tl,mdev->bc->md.uuid,UUID_SIZE*sizeof(u64)); + tl=(unsigned short*)((char*)tl + UUID_SIZE*sizeof(u64)); + dec_local(mdev); + *tl++ = T_uuids_flags; + *tl++ = sizeof(int); + memcpy(tl,&mdev->bc->md.flags,sizeof(int)); + tl=(unsigned short*)((char*)tl + sizeof(int)); + } + *tl++ = TT_END; /* Close the tag list */ + + return (int)((char*)tl - (char*)reply->tag_list); +} + + +STATIC int drbd_nl_get_timeout_flag(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + // This is a hand crafted add tag ;) + *tl++ = T_use_degraded; + *tl++ = sizeof(char); + *((char*)tl) = test_bit(USE_DEGR_WFC_T,&mdev->flags) ? 1 : 0 ; + tl=(unsigned short*)((char*)tl + sizeof(char)); + *tl++ = TT_END; + + return (int)((char*)tl - (char*)reply->tag_list); +} + +STATIC drbd_dev *ensure_mdev(struct drbd_nl_cfg_req *nlp) +{ + drbd_dev *mdev; + + mdev = minor_to_mdev(nlp->drbd_minor); + + if(!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { + mdev = drbd_new_device(nlp->drbd_minor); + + spin_lock_irq(&drbd_pp_lock); + if( minor_table[nlp->drbd_minor] == NULL) { + minor_table[nlp->drbd_minor] = mdev; + mdev = NULL; + } + spin_unlock_irq(&drbd_pp_lock); + + if(mdev) { + if(mdev->app_reads_hash) kfree(mdev->app_reads_hash); + if(mdev->md_io_page) __free_page(mdev->md_io_page); + kfree(mdev); + mdev = NULL; + } + + mdev = minor_to_mdev(nlp->drbd_minor); + } + + return mdev; +} + +struct cn_handler_struct { + int (*function)(drbd_dev *, + struct drbd_nl_cfg_req *, + struct drbd_nl_cfg_reply* ); + int reply_body_size; +}; + +static struct cn_handler_struct cnd_table[] = { + [ P_primary ] = { &drbd_nl_primary, 0 }, + [ P_secondary ] = { &drbd_nl_secondary, 0 }, + [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, + [ P_detach ] = { &drbd_nl_detach, 0 }, + [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, + [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, + [ P_resize ] = { &drbd_nl_resize, 0 }, + [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, + [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, + [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, + [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, + [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, + [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, + [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, + [ P_outdate ] = { &drbd_nl_outdate, 0 }, + [ P_get_config ] = { &drbd_nl_get_config, + sizeof(struct syncer_conf_tag_len_struct) + + sizeof(struct disk_conf_tag_len_struct) + + sizeof(struct net_conf_tag_len_struct) }, + [ P_get_state ] = { &drbd_nl_get_state, + sizeof(struct get_state_tag_len_struct) }, + [ P_get_uuids ] = { &drbd_nl_get_uuids, + sizeof(struct get_uuids_tag_len_struct) }, + [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, + sizeof(struct get_timeout_flag_tag_len_struct)}, + +}; + +void drbd_connector_callback(void *data) +{ + struct cn_msg *req = data; + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req*)req->data; + struct cn_handler_struct *cm; + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply* reply; + drbd_dev *mdev; + int retcode,rr; + int reply_size = sizeof(struct cn_msg) + + sizeof(struct drbd_nl_cfg_reply) + + sizeof(short int); + + if(!try_module_get(THIS_MODULE)) { + 
printk(KERN_ERR DEVICE_NAME "try_module_get() failed!\n"); + return; + } + + if( !(mdev = ensure_mdev(nlp)) ) { + retcode=MinorNotKnown; + goto fail; + } + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_packet(data);); + + if( nlp->packet_type >= P_nl_after_last_packet ) { + retcode=UnknownNetLinkPacket; + goto fail; + } + + cm = cnd_table + nlp->packet_type; + reply_size += cm->reply_body_size; + + if( !(cn_reply = kmalloc(reply_size,GFP_KERNEL)) ) { + retcode=KMallocFailed; + goto fail; + } + reply = (struct drbd_nl_cfg_reply*) cn_reply->data; + + reply->packet_type = cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; + reply->minor = nlp->drbd_minor; + reply->ret_code = NoError; // Might by modified by cm->function. + // reply->tag_list; might be modified by cm->fucntion. + + rr = cm->function(mdev,nlp,reply); + + cn_reply->id = req->id; + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; + cn_reply->flags = 0; + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply);); + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); + if(rr && rr != -ESRCH) { + printk(KERN_INFO DEVICE_NAME " cn_netlink_send()=%d\n",rr); + } + kfree(cn_reply); + module_put(THIS_MODULE); + return; + fail: + drbd_nl_send_reply(req, retcode); + module_put(THIS_MODULE); +} + +atomic_t drbd_nl_seq = ATOMIC_INIT(2); // two. + +void drbd_bcast_state(drbd_dev *mdev) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct get_state_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply* reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + unsigned short *tl = reply->tag_list; + + // WARN("drbd_bcast_state() got called\n"); + + tl = get_state_to_tags(mdev,(struct get_state*)&mdev->state,tl); + *tl++ = TT_END; /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); + cn_reply->ack = 0; // not used here. + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char*)tl - (char*)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_get_state; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NoError; + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply);); + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); +} + +void drbd_bcast_ev_helper(drbd_dev *mdev, char* helper_name) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct call_helper_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply* reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + unsigned short *tl = reply->tag_list; + int str_len; + + // WARN("drbd_bcast_state() got called\n"); + + str_len = strlen(helper_name)+1; + *tl++ = T_helper; + *tl++ = str_len; + memcpy(tl,helper_name,str_len); + tl=(unsigned short*)((char*)tl + str_len); + *tl++ = TT_END; /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); + cn_reply->ack = 0; // not used here. 
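Editor's illustration (not part of the patch): drbd_bcast_state() and drbd_bcast_ev_helper() hand-craft the netlink payload as a flat tag list: a 16-bit tag number, a 16-bit payload length, the payload bytes, repeated until a terminating TT_END, with cn_reply->len derived from the pointer distance back to the start of the list. The minimal user-space sketch below shows only that layout; the tag values here are placeholders, the real tag numbers and TT_END come from DRBD's tag headers.

#include <stdio.h>
#include <string.h>

#define EX_TT_END    0xffff     /* list terminator -- placeholder value */
#define EX_T_helper  0x0040     /* example tag number -- placeholder value */

/* Append one string-valued tag: 16-bit tag, 16-bit length, payload bytes. */
static unsigned short *put_string_tag(unsigned short *tl, unsigned short tag,
                                      const char *s)
{
        unsigned short len = (unsigned short)(strlen(s) + 1);

        *tl++ = tag;
        *tl++ = len;
        memcpy(tl, s, len);
        return (unsigned short *)((char *)tl + len);
}

int main(void)
{
        unsigned short tag_list[64];
        unsigned short *tl = tag_list;

        tl = put_string_tag(tl, EX_T_helper, "split-brain");
        *tl++ = EX_TT_END;      /* close the tag list */

        /* the reply length is the byte distance from the start of the list,
         * just like cn_reply->len is derived in the patch */
        printf("tag list payload: %d bytes\n",
               (int)((char *)tl - (char *)tag_list));
        return 0;
}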
+ cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char*)tl - (char*)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_call_helper; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NoError; + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply);); + + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); +} + +#ifdef NETLINK_ROUTE6 +int __init cn_init(void); +void __exit cn_fini(void); +#endif + +int __init drbd_nl_init() +{ + static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD }; + int err; + +#ifdef NETLINK_ROUTE6 + /* pre 2.6.16 */ + err = cn_init(); + if(err) return err; +#endif + err = cn_add_callback(&cn_id_drbd,"cn_drbd",&drbd_connector_callback); + if(err) { + printk(KERN_ERR DEVICE_NAME "cn_drbd failed to register\n"); + return err; + } + + return 0; +} + +void drbd_nl_cleanup() +{ + static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD }; + + cn_del_callback(&cn_id_drbd); + +#ifdef NETLINK_ROUTE6 + /* pre 2.6.16 */ + cn_fini(); +#endif +} + +void drbd_nl_send_reply( struct cn_msg *req, + int ret_code) +{ + char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply* reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + int rr; + + cn_reply->id = req->id; + + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply); + cn_reply->flags = 0; + + reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; + reply->ret_code = ret_code; + + TRACE(TraceTypeNl, TraceLvlSummary, nl_trace_reply(cn_reply);); + + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); + if(rr && rr != -ESRCH) { + printk(KERN_INFO DEVICE_NAME " cn_netlink_send()=%d\n",rr); + } +} + diff -uprN linux-2.6.24/drivers/block/drbd/drbd_proc.c linux-2.6.24.ovz/drivers/block/drbd/drbd_proc.c --- linux-2.6.24/drivers/block/drbd/drbd_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_proc.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,267 @@ +/* +-*- linux-c -*- + drbd_proc.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" +#include "lru_cache.h" /* for lc_sprintf_stats */ + +STATIC int drbd_proc_open(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +STATIC void drbd_syncer_progress(struct Drbd_Conf* mdev, struct seq_file *seq) +{ + unsigned long res , db, dt, dbdt, rt, rs_left; + + /* the whole sector_div thingy was wrong (did overflow, + * did not use correctly typed parameters), and is not even + * neccessary as long as rs_total and drbd_bm_total_weight + * are both unsigned long. + * + * this is to break it at compile time when we change that + * (we may feel 4TB maximum storage per drbd is not enough) + */ + typecheck(unsigned long, mdev->rs_total); + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (rs_left > mdev->rs_total) { + /* doh. logic bug somewhere. + * for now, just try to prevent in-kernel buffer overflow. + */ + ERR("logic bug? rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + rs_left, mdev->rs_total, mdev->rs_failed); + res = 1000; + } else { + res = (rs_left >> 10)*1000/((mdev->rs_total >> 10) + 1); + } + { + int i, y = res/50, x = 20-y; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + res = 1000L - res; + seq_printf(seq,"sync'ed:%3lu.%lu%% ", res / 10, res % 10); + /* if more than 1 GB display in MB */ + if (mdev->rs_total > 0x100000L) { + seq_printf(seq,"(%lu/%lu)M\n\t", + (unsigned long) Bit2KB(rs_left) >> 10, + (unsigned long) Bit2KB(mdev->rs_total) >> 10 ); + } else { + seq_printf(seq,"(%lu/%lu)K\n\t", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(mdev->rs_total) ); + } + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = (jiffies - mdev->rs_mark_time) / HZ; + + if (dt > 20) { + /* if we made no update to rs_mark_time for too long, + * we are stalled. show that. 
*/ + seq_printf(seq, "stalled\n"); + return; + } + + if (!dt) dt++; + db = mdev->rs_mark_left - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " speed: %ld,%03ld", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " speed: %ld", dbdt); + + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total - rs_left; + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " (%ld,%03ld)", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " (%ld)", dbdt); + + seq_printf(seq," K/sec\n"); +} + +#if 0 +STATIC void resync_dump_detail(struct seq_file *seq, struct lc_element * e) +{ + struct bm_extent *bme = (struct bm_extent *)e; + + seq_printf(seq,"%5d %s %s\n",bme->rs_left, + bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", + bme->flags & BME_LOCKED ? "LOCKED" : "------" + ); +} +#endif + +STATIC int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i,hole=0; + const char *sn; + drbd_dev *mdev; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d)\n%s\n", + API_VERSION,PRO_VERSION, drbd_buildtag()); + + /* + cs .. connection state + st .. node state (local/remote) + ld .. local data consistentency + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + pe .. pending (waiting for ack) + ua .. unack'd (still need to send ack) + al .. access log write count + */ + + for (i = 0; i < minor_count; i++) { + mdev = minor_to_mdev(i); + if(!mdev) { + hole=1; + continue; + } + if( hole ) { + hole=0; + seq_printf( seq, "\n"); + } + + sn = conns_to_name(mdev->state.conn); + + if ( mdev->state.conn == StandAlone && + mdev->state.disk == Diskless) { + seq_printf( seq, "%2d: cs:Unconfigured\n", i); + } else { + seq_printf( seq, + "%2d: cs:%s st:%s/%s ds:%s/%s %c %c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d\n", + i, sn, + roles_to_name(mdev->state.role), + roles_to_name(mdev->state.peer), + disks_to_name(mdev->state.disk), + disks_to_name(mdev->state.pdsk), + (mdev->net_conf == NULL ? ' ' : + (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), + mdev->state.susp ? 's' : 'r', + mdev->state.aftr_isp ? 'a' : '-', + mdev->state.peer_isp ? 'p' : '-', + mdev->state.user_isp ? 
'u' : '-', + mdev->send_cnt/2, + mdev->recv_cnt/2, + mdev->writ_cnt/2, + mdev->read_cnt/2, + mdev->al_writ_cnt, + mdev->bm_writ_cnt, + atomic_read(&mdev->local_cnt), + atomic_read(&mdev->ap_pending_cnt) + + atomic_read(&mdev->rs_pending_cnt), + atomic_read(&mdev->unacked_cnt), + atomic_read(&mdev->ap_bio_cnt) + ); + } + if ( mdev->state.conn == SyncSource || + mdev->state.conn == SyncTarget ) { + drbd_syncer_progress(mdev,seq); + } + if(mdev->resync) { + lc_printf_stats(seq,mdev->resync); + } + if(mdev->act_log) { + lc_printf_stats(seq,mdev->act_log); + } +#if 0 + if(mdev->resync) { + lc_dump(mdev->resync,seq,"rs_left", + resync_dump_detail); + } +#endif + + } + + return 0; +} + +STATIC int drbd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_seq_show, PDE(inode)->data); +} + +/* PROC FS stuff end */ diff -uprN linux-2.6.24/drivers/block/drbd/drbd_receiver.c linux-2.6.24.ovz/drivers/block/drbd/drbd_receiver.c --- linux-2.6.24/drivers/block/drbd/drbd_receiver.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_receiver.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,3401 @@ +/* +-*- linux-c -*- + drbd_receiver.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include +#include "drbd_int.h" +#include "drbd_req.h" + +#if defined(__arch_um__) && !defined(HAVE_UML_TO_VIRT) +static inline void *to_virt(unsigned long phys) +{ + return((void *) uml_physmem + phys); +} +#endif + +#ifdef DBG_ASSERTS +void drbd_assert_breakpoint(drbd_dev *mdev, char *exp, + char *file, int line) +{ + ERR("ASSERT( %s ) in %s:%d\n", exp, file, line); +} +#endif + + +#if 0 +#define CHECK_LIST_LIMIT 1000 +void check_list(drbd_dev *mdev,struct list_head *list,char *t) +{ + struct list_head *le,*la; + int forward=0,backward=0; + + le=list; + do { + la=le; + le=le->next; + if( le->prev != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + mdev_to_minor(mdev),t); + break; + } + if( forward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s forward > 1000\n", + mdev_to_minor(mdev),t); + break; + } + } while(le != list); + + le=list; + do { + la=le; + le=le->prev; + if( le->next != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + mdev_to_minor(mdev),t); + break; + } + if( backward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s backward > 1000\n", + mdev_to_minor(mdev),t); + break; + } + } while(le != list); + + if(forward != backward) { + printk(KERN_ERR DEVICE_NAME "%d: forward=%d, backward=%d\n", + mdev_to_minor(mdev),forward,backward); + } +} +#endif + +#define GFP_TRY ( __GFP_HIGHMEM | __GFP_NOWARN ) + +/** + * drbd_bp_alloc: Returns a page. Fails only if a signal comes in. + */ +STATIC struct page * drbd_pp_alloc(drbd_dev *mdev, unsigned int gfp_mask) +{ + unsigned long flags=0; + struct page *page; + DEFINE_WAIT(wait); + + /* FIXME Add some usefull watermark again to "kick_lo", if pages get + * used up too quickly. The watermark that had been in place here did + * not make sense. + */ + + spin_lock_irqsave(&drbd_pp_lock,flags); + /* This lock needs to lock out irq because we might call drdb_pp_free() + from IRQ context. + FIXME but why irq _save_ ? + this is only called from drbd_alloc_ee, + and that is strictly process context! */ + if ( (page = drbd_pp_pool) ) { + drbd_pp_pool = (struct page*)page_private(page); + drbd_pp_vacant--; + } + spin_unlock_irqrestore(&drbd_pp_lock,flags); + if (page) goto got_page; + + drbd_kick_lo(mdev); + + for (;;) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + /* try the pool again, maybe the drbd_kick_lo set some free */ + spin_lock_irqsave(&drbd_pp_lock,flags); + if ( (page = drbd_pp_pool) ) { + drbd_pp_pool = (struct page*)page_private(page); + drbd_pp_vacant--; + } + spin_unlock_irqrestore(&drbd_pp_lock,flags); + + if (page) break; + + /* hm. pool was empty. try to allocate from kernel. + * don't wait, if none is available, though. + */ + if ( atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers ) { + if( (page = alloc_page(GFP_TRY)) ) + break; + } + + /* doh. still no page. + * either used up the configured maximum number, + * or we are low on memory. + * wait for someone to return a page into the pool. + * unless, of course, someone signalled us. 
+ */ + if (signal_pending(current)) { + WARN("drbd_pp_alloc interrupted!\n"); + finish_wait(&drbd_pp_wait, &wait); + return NULL; + } + drbd_kick_lo(mdev); + schedule(); + } + finish_wait(&drbd_pp_wait, &wait); + + got_page: + atomic_inc(&mdev->pp_in_use); + return page; +} + +STATIC void drbd_pp_free(drbd_dev *mdev,struct page *page) +{ + unsigned long flags=0; + int free_it; + + spin_lock_irqsave(&drbd_pp_lock,flags); + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { + free_it = 1; + } else { + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + drbd_pp_vacant++; + free_it = 0; + } + spin_unlock_irqrestore(&drbd_pp_lock,flags); + + atomic_dec(&mdev->pp_in_use); + + if(free_it) __free_page(page); + + /* + * FIXME + * typically there are no waiters. + * we should try to avoid any unnecessary call to wake_up. + */ + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + drbd_free_ee() + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() + drbd_ee_fix_bhs() + drbd_process_done_ee() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct Tl_epoch_entry* drbd_alloc_ee(drbd_dev *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + unsigned int gfp_mask) +{ + struct Tl_epoch_entry* e; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + unsigned int ds; + int i; + + e = mempool_alloc(drbd_ee_mempool, gfp_mask); + if (!e) return NULL; + + bio = bio_alloc(GFP_KERNEL, div_ceil(data_size,PAGE_SIZE)); + if (!bio) goto fail1; + + bio->bi_bdev = mdev->bc->backing_bdev; + bio->bi_sector = sector; + + ds = data_size; + while(ds) { + page = drbd_pp_alloc(mdev, gfp_mask); + if (!page) goto fail2; + if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { + drbd_pp_free(mdev,page); + goto fail2; + break; + } + ds -= min_t(int, ds, PAGE_SIZE); + } + + D_ASSERT( data_size == bio->bi_size); + + bio->bi_private = e; + e->mdev = mdev; + e->sector = sector; + e->size = bio->bi_size; + + e->private_bio = bio; + e->block_id = id; + INIT_HLIST_NODE(&e->colision); + e->barrier_nr = 0; + e->barrier_nr2 = 0; + e->flags = 0; + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("allocated EE sec=%llus size=%u ee=%p\n", + (unsigned long long)sector,data_size,e); + ); + + return e; + + fail2: + __bio_for_each_segment(bvec, bio, i, 0) { + drbd_pp_free(mdev,bvec->bv_page); + } + bio_put(bio); + fail1: + mempool_free(e, drbd_ee_mempool); + + return NULL; +} + +void drbd_free_ee(drbd_dev *mdev, struct Tl_epoch_entry* e) +{ + struct bio *bio=e->private_bio; + struct bio_vec *bvec; + int i; + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("Free EE sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + + __bio_for_each_segment(bvec, bio, i, 0) { + drbd_pp_free(mdev,bvec->bv_page); + } + + bio_put(bio); + + D_ASSERT(hlist_unhashed(&e->colision)); + + mempool_free(e, drbd_ee_mempool); +} + +/* currently on module unload only */ +int drbd_release_ee(drbd_dev *mdev,struct list_head* list) +{ + int count=0; + struct Tl_epoch_entry* e; + struct list_head *le; + + spin_lock_irq(&mdev->req_lock); + while(!list_empty(list)) { + le = list->next; + e = list_entry(le, struct Tl_epoch_entry, w.list); + drbd_free_ee(mdev,e); + count++; + } + spin_unlock_irq(&mdev->req_lock); + + return count; +} + + +STATIC void reclaim_net_ee(drbd_dev *mdev) +{ + struct Tl_epoch_entry *e; + struct list_head *le,*tle; + + /* The EEs are always appended to the end of the list. 
Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_safe(le, tle, &mdev->net_ee) { + e = list_entry(le, struct Tl_epoch_entry, w.list); + if( drbd_bio_has_active_page(e->private_bio) ) break; + list_del(le); + drbd_free_ee(mdev,e); + } +} + + +/* + * This function is called from _asender only_ + * but see also comments in _req_mod(,barrier_acked) + * and receive_Barrier_no_tcq. + * + * Move entries from net_ee to done_ee, if ready. + * Grab done_ee, call all callbacks, free the entries. + * The callbacks typically send out ACKs. + */ +STATIC int drbd_process_done_ee(drbd_dev *mdev) +{ + LIST_HEAD(work_list); + struct Tl_epoch_entry *e, *t; + int ok=1; + int do_clear_bit = test_bit(WRITE_ACK_PENDING,&mdev->flags); + + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev); + list_splice_init(&mdev->done_ee,&work_list); + spin_unlock_irq(&mdev->req_lock); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_discard_ack. + * all ignore the last argument. + */ + list_for_each_entry_safe(e, t, &work_list, w.list) { + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("Process EE on done_ee sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + // list_del not necessary, next/prev members not touched + if (e->w.cb(mdev,&e->w,0) == 0) ok = 0; + drbd_free_ee(mdev,e); + } + if (do_clear_bit) + clear_bit(WRITE_ACK_PENDING,&mdev->flags); + wake_up(&mdev->ee_wait); + + return ok; +} + + + +/* clean-up helper for drbd_disconnect */ +void _drbd_clear_done_ee(drbd_dev *mdev) +{ + struct list_head *le; + struct Tl_epoch_entry *e; + int n = 0; + + MUST_HOLD(&mdev->req_lock); + + reclaim_net_ee(mdev); + + while(!list_empty(&mdev->done_ee)) { + le = mdev->done_ee.next; + list_del(le); + e = list_entry(le, struct Tl_epoch_entry, w.list); + if(mdev->net_conf->wire_protocol == DRBD_PROT_C || + is_syncer_block_id(e->block_id)) { + ++n; + } + if(!hlist_unhashed(&e->colision)) hlist_del_init(&e->colision); + drbd_free_ee(mdev,e); + } + + sub_unacked(mdev, n); +} + +void _drbd_wait_ee_list_empty(drbd_dev *mdev,struct list_head *head) +{ + DEFINE_WAIT(wait); + MUST_HOLD(&mdev->req_lock); + + /* avoids spin_lock/unlock and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&mdev->ee_wait,&wait,TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + drbd_kick_lo(mdev); + schedule(); + finish_wait(&mdev->ee_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } +} + +void drbd_wait_ee_list_empty(drbd_dev *mdev,struct list_head *head) +{ + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev, head); + spin_unlock_irq(&mdev->req_lock); +} + +STATIC struct socket* drbd_accept(drbd_dev *mdev,struct socket* sock) +{ + struct socket *newsock; + int err = 0; + + err = sock->ops->listen(sock, 5); + if (err) + goto out; + + if (sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock)) + goto out; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + err = newsock->ops->accept(sock, newsock, 0); + if (err < 0) + goto out_release; + + return newsock; + + out_release: + sock_release(newsock); + out: + if(err != -EAGAIN && err != -EINTR) + ERR("accept failed! 
%d\n", err); + return 0; +} + +STATIC int drbd_recv_short(drbd_dev *mdev, struct socket *sock, + void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); + + set_fs(oldfs); + + return rv; +} + +int drbd_recv(drbd_dev *mdev,void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + for(;;) { + rv = sock_recvmsg(mdev->data.socket,&msg,size,msg.msg_flags); + if (rv == size) break; + + /* Note: + * ECONNRESET other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ + + if (rv < 0) { + if (rv == -ECONNRESET) + INFO("sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + ERR("sock_recvmsg returned %d\n",rv); + break; + } else if (rv == 0) { + INFO("sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + // D_ASSERT(signal_pending(current)); + break; + } + }; + + set_fs(oldfs); + + if(rv != size) drbd_force_state(mdev,NS(conn,BrokenPipe)); + + return rv; +} + +STATIC struct socket *drbd_try_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock; + struct sockaddr_in src_in; + + err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + if(!inc_net(mdev)) return NULL; + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + + /* explicitly bind to the configured IP as source IP + for the outgoing connections. + This is needed for multihomed hosts and to be + able to use lo: interfaces for drbd. + Make sure to use 0 as portnumber, so linux selects + a free one dynamically. 
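+ (A source port of 0 in the sockaddr means the kernel picks a free
+ ephemeral port; only the source address is pinned here.)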
+ */ + memcpy (&src_in, &(mdev->net_conf->my_addr), sizeof(struct sockaddr_in)); + src_in.sin_port = 0; + + err = sock->ops->bind(sock, + (struct sockaddr * ) &src_in, + sizeof (struct sockaddr_in)); + if (err) { + ERR("Unable to bind source sock (%d)\n", err); + sock_release(sock); + sock = NULL; + dec_net(mdev); + return sock; + } + + err = sock->ops->connect(sock, + (struct sockaddr *)mdev->net_conf->peer_addr, + mdev->net_conf->peer_addr_len, 0); + + if (err) { + sock_release(sock); + sock = NULL; + } + + dec_net(mdev); + return sock; +} + +STATIC struct socket *drbd_wait_for_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock,*sock2; + + err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock2); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + if(!inc_net(mdev)) return NULL; + + sock2->sk->sk_reuse = 1; /* SO_REUSEADDR */ + sock2->sk->sk_rcvtimeo = + sock2->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + + err = sock2->ops->bind(sock2, + (struct sockaddr *) mdev->net_conf->my_addr, + mdev->net_conf->my_addr_len); + dec_net(mdev); + + if (err) { + ERR("Unable to bind sock2 (%d)\n", err); + sock_release(sock2); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return NULL; + } + + sock = drbd_accept(mdev,sock2); + sock_release(sock2); + + return sock; +} + +STATIC int drbd_do_handshake(drbd_dev *mdev); +STATIC int drbd_do_auth(drbd_dev *mdev); + +STATIC int drbd_send_fp(drbd_dev *mdev,struct socket *sock,Drbd_Packet_Cmd cmd) +{ + Drbd_Header *h = (Drbd_Header *) &mdev->data.sbuf.head; + + return _drbd_send_cmd(mdev,sock,cmd,h,sizeof(*h),0); +} + +STATIC Drbd_Packet_Cmd drbd_recv_fp(drbd_dev *mdev,struct socket *sock) +{ + Drbd_Header *h = (Drbd_Header *) &mdev->data.sbuf.head; + int rr; + + rr = drbd_recv_short(mdev, sock, h, sizeof(*h)); + + if( rr==sizeof(*h) && h->magic==BE_DRBD_MAGIC ) { + return be16_to_cpu(h->command); + } + + return 0xffff; +} + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +int drbd_connect(drbd_dev *mdev) +{ + struct socket *s, *sock,*msock; + int try,h; + + D_ASSERT(mdev->state.conn >= Unconnected); + D_ASSERT(!mdev->data.socket); + + if(drbd_request_state(mdev,NS(conn,WFConnection)) < SS_Success ) return 0; + clear_bit(DISCARD_CONCURRENT, &mdev->flags); + + sock = NULL; + msock = NULL; + //printk() + do { + for(try=0;;) { // 3 tries, this should take less than a second! 
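+ /* Roughly, both nodes run this loop concurrently, alternating between
+  * connect() attempts (drbd_try_connect) and accept() (drbd_wait_for_connect).
+  * A socket we initiated gets HandShakeS if we still need the data socket,
+  * HandShakeM if we still need the meta socket; an accepted socket is
+  * classified by the handshake packet the peer sent on it. The side whose
+  * accepted socket carries HandShakeM sets DISCARD_CONCURRENT, which is
+  * used later to break ties on concurrent writes. */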
+ s=drbd_try_connect(mdev); + if(s || ++try >= 3 ) break; + // give the other side time to call bind() & listen() + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + } + + if(s) { + if( !sock ) { + if( drbd_send_fp(mdev, s, HandShakeS) ) { + sock = s; + s = NULL; + } + } else if( !msock ) { + if( drbd_send_fp(mdev, s, HandShakeM) ) { + msock = s; + s = NULL; + } + } else { + ERR("Logic error in drbd_connect()\n"); + return -1; + } + if(s) { + ERR("Error during sending initial packet.\n"); + sock_release(s); + } + } + + if(sock && msock) break; + + s=drbd_wait_for_connect(mdev); + if(s) { + switch(drbd_recv_fp(mdev,s)) { + case HandShakeS: + if(sock) sock_release(sock); + sock = s; + break; + case HandShakeM: + if(msock) sock_release(msock); + msock = s; + set_bit(DISCARD_CONCURRENT, &mdev->flags); + break; + default: + WARN("Error receiving initial packet\n"); + sock_release(s); + } + } + + if(mdev->state.conn <= Disconnecting) return -1; + if(signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&mdev->receiver) == Exiting) { + if(sock) sock_release(sock); + if(msock) sock_release(msock); + return -1; + } + } + + } while( !sock || !msock ); + + msock->sk->sk_reuse=1; /* SO_REUSEADDR */ + sock->sk->sk_reuse=1; /* SO_REUSEADDR */ + + sock->sk->sk_allocation = GFP_NOIO; + msock->sk->sk_allocation = GFP_NOIO; + + sock->sk->sk_priority=TC_PRIO_BULK; + // FIXME fold to limits. should be done in drbd_ioctl + sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_rcvbuf = mdev->net_conf->sndbuf_size; + /* NOT YET ... + * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the HandShake timeout, wich is hardcoded for now: */ + sock->sk->sk_sndtimeo = + sock->sk->sk_rcvtimeo = 2*HZ; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK; + + msock->sk->sk_priority=TC_PRIO_INTERACTIVE; + msock->sk->sk_sndbuf = 2*32767; + msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; + + if(drbd_request_state(mdev,NS(conn,WFReportParams)) < SS_Success) return 0; + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); + if (h <= 0) return h; + + if ( mdev->cram_hmac_tfm ) { + if (!drbd_do_auth(mdev)) { + ERR("Authentication of peer failed\n"); + return 0; + } + } + + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + atomic_set(&mdev->packet_seq,0); + mdev->peer_seq=0; + + drbd_thread_start(&mdev->asender); + + drbd_send_protocol(mdev); + drbd_send_sync_param(mdev,&mdev->sync_conf); + drbd_send_sizes(mdev); + drbd_send_uuids(mdev); + drbd_send_state(mdev); + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + + return 1; +} + +STATIC int drbd_recv_header(drbd_dev *mdev, Drbd_Header *h) +{ + int r; + + r = drbd_recv(mdev,h,sizeof(*h)); + + if (unlikely( r != sizeof(*h) )) { + ERR("short read expecting header on sock: r=%d\n",r); + return FALSE; + }; + h->command = be16_to_cpu(h->command); + h->length = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? 
on data m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + return FALSE; + } + mdev->last_received = jiffies; + + return TRUE; +} + +#if 0 +STATIC int receive_Barrier_tcq(drbd_dev *mdev, Drbd_Header* h) +{ + int rv; + int epoch_size=0; + Drbd_Barrier_Packet *p = (Drbd_Barrier_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + spin_lock_irq(&mdev->ee_lock); + if(list_empty(&mdev->active_ee)) { + epoch_size = mdev->epoch_size; + mdev->epoch_size = 0; + } else if (mdev->last_write_w_barrier) { + mdev->last_write_w_barrier->barrier_nr2 = be32_to_cpu(p->barrier); + } else { + mdev->next_barrier_nr = be32_to_cpu(p->barrier); + } + spin_unlock_irq(&mdev->ee_lock); + + if(epoch_size) { + rv = drbd_send_b_ack(mdev, p->barrier, epoch_size); + dec_unacked(mdev); + } + + return rv; +} +#endif + +STATIC int receive_Barrier_no_tcq(drbd_dev *mdev, Drbd_Header* h) +{ + int rv; + int epoch_size; + Drbd_Barrier_Packet *p = (Drbd_Barrier_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + if (mdev->net_conf->wire_protocol != DRBD_PROT_C) + drbd_kick_lo(mdev); + + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev,&mdev->active_ee); + epoch_size = mdev->epoch_size; + mdev->epoch_size = 0; + spin_unlock_irq(&mdev->req_lock); + + /* FIXME CAUTION! receiver thread sending via msock. + * to make sure this BarrierAck will not be received before the asender + * had a chance to send all the write acks corresponding to this epoch, + * wait_for that bit to clear... */ + set_bit(WRITE_ACK_PENDING,&mdev->flags); + wake_asender(mdev); + rv = wait_event_interruptible(mdev->ee_wait, + !test_bit(WRITE_ACK_PENDING,&mdev->flags)); + + if (rv == 0 && mdev->state.conn >= Connected) + rv = drbd_send_b_ack(mdev, p->barrier, epoch_size); + else + rv = 0; + dec_unacked(mdev); + + return rv; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +STATIC struct Tl_epoch_entry * +read_in_block(drbd_dev *mdev, u64 id, sector_t sector, int data_size) +{ + struct Tl_epoch_entry *e; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int ds,i,rr; + + e = drbd_alloc_ee(mdev,id,sector,data_size,GFP_KERNEL); + if(!e) return 0; + bio = e->private_bio; + ds = data_size; + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + rr = drbd_recv(mdev,kmap(page),min_t(int,ds,PAGE_SIZE)); + kunmap(page); + if( rr != min_t(int,ds,PAGE_SIZE) ) { + drbd_free_ee(mdev,e); + WARN("short read receiving data: read %d expected %d\n", + rr, min_t(int,ds,PAGE_SIZE)); + return 0; + } + ds -= rr; + } + + mdev->recv_cnt+=data_size>>9; + return e; +} + +/* drbd_drain_block() just takes a data block out of the socket input + * buffer and discards ist. 
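+ * It is used when the payload cannot be written locally (no disk, or a
+ * local IO error): the data must still be read from the socket so the
+ * stream stays aligned with the packet headers, and the caller then
+ * answers with a NegAck.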
+ */ +STATIC int +drbd_drain_block(drbd_dev *mdev, int data_size) +{ + struct page *page; + int rr, rv=1; + void* data; + + page = drbd_pp_alloc(mdev, GFP_KERNEL); + + data=kmap(page); + while(data_size) { + rr = drbd_recv(mdev,data,min_t(int,data_size,PAGE_SIZE)); + if( rr != min_t(int,data_size,PAGE_SIZE) ) { + rv = 0; + WARN("short read receiving data: read %d expected %d\n", + rr, min_t(int,data_size,PAGE_SIZE)); + goto out; + } + + data_size -= rr; + } + kunmap(page); + out: + drbd_pp_free(mdev,page); + return rv; +} + +/* kick lower level device, if we have more than (arbitrary number) + * reference counts on it, which typically are locally submitted io + * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ +static void maybe_kick_lo(drbd_dev *mdev) +{ + if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark ) { + /* FIXME hysteresis ?? */ + drbd_kick_lo(mdev); + } +} + +STATIC int recv_dless_read(drbd_dev *mdev, drbd_request_t *req, + sector_t sector, int data_size) +{ + struct bio_vec *bvec; + struct bio *bio; + int rr,i,expect; + + bio = req->master_bio; + D_ASSERT( sector == bio->bi_sector ); + + bio_for_each_segment(bvec, bio, i) { + expect = min_t(int,data_size,bvec->bv_len); + rr=drbd_recv(mdev, + kmap(bvec->bv_page)+bvec->bv_offset, + expect); + kunmap(bvec->bv_page); + if (rr != expect) { + WARN("short read receiving data reply: read %d expected %d\n", + rr, expect); + return 0; + } + data_size -= rr; + } + + D_ASSERT(data_size == 0); + /* FIXME recv_cnt accounting ?? */ + return 1; +} + +/* e_end_resync_block() is called via + * drbd_process_done_ee() by asender only */ +STATIC int e_end_resync_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = e->sector; + int ok; + + D_ASSERT(hlist_unhashed(&e->colision)); + + if (likely( drbd_bio_uptodate(e->private_bio) )) { + drbd_set_in_sync(mdev, sector, e->size); + ok = drbd_send_ack(mdev,RSWriteAck,e); + } else { + // Record failure to sync + drbd_rs_failed_io(mdev, sector, e->size); + + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev, FALSE); + } + dec_unacked(mdev); + + return ok; +} + +STATIC int recv_resync_read(drbd_dev *mdev,sector_t sector, int data_size) +{ + struct Tl_epoch_entry *e; + + e = read_in_block(mdev,ID_SYNCER,sector,data_size); + if(!e) return FALSE; + + dec_rs_pending(mdev); + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->w.cb = e_end_resync_block; + + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list,&mdev->sync_ee); + spin_unlock_irq(&mdev->req_lock); + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("submit EE (RS)WRITE sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + drbd_generic_make_request(WRITE,DRBD_FAULT_RS_WR,e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; +} + +STATIC int receive_DataReply(drbd_dev *mdev,Drbd_Header* h) +{ + drbd_request_t *req; + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, + * and no more than DRBD_MAX_SEGMENT_SIZE. + * is this too restrictive? 
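+ * The checks below enforce exactly that: a non-zero size, a multiple of
+ * 512 byte (data_size & 0x1ff must be 0), and an upper bound of
+ * DRBD_MAX_SEGMENT_SIZE.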
*/ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev,p->block_id, sector); + spin_unlock_irq(&mdev->req_lock); + if (unlikely(!req)) { + ERR("Got a corrupt block_id/sector pair(1).\n"); + return FALSE; + } + + /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + ok = recv_dless_read(mdev,req,sector,data_size); + + if (ok) req_mod(req, data_received, 0); + /* else: nothing. handled from drbd_disconnect... + * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return ok; +} + +STATIC int receive_RSDataReply(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, + * and no more than DRBD_MAX_SEGMENT_SIZE. + * is this too restrictive? */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + if(inc_local(mdev)) { + /* data is submitted to disk within recv_resync_read. + * corresponding dec_local done below on error, + * or in drbd_endio_write_sec. */ + /* FIXME paranoia: + * verify that the corresponding bit is set. + * in case we are Primary SyncTarget, + * verify there are no pending write request to that area. + */ + ok = recv_resync_read(mdev,sector,data_size); + if (!ok) dec_local(mdev); + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write resync data to local disk.\n"); + + ok = drbd_drain_block(mdev,data_size); + + drbd_send_ack_dp(mdev,NegAck,p); + } + + return ok; +} + +/* e_end_block() is called via drbd_process_done_ee(). + * this means this function only runs in the asender thread + * + * for a broken example implementation of the TCQ barrier version of + * e_end_block see older revisions... + */ +STATIC int e_end_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = e->sector; + // unsigned int epoch_size; + int ok=1,pcmd; + + if(mdev->net_conf->wire_protocol == DRBD_PROT_C) { + if(likely(drbd_bio_uptodate(e->private_bio))) { + pcmd = (mdev->state.conn >= SyncSource && + mdev->state.conn <= PausedSyncT && + e->flags & EE_MAY_SET_IN_SYNC) ? + RSWriteAck : WriteAck; + ok &= drbd_send_ack(mdev,pcmd,e); + if(pcmd==RSWriteAck) + drbd_set_in_sync(mdev,sector,e->size); + } else { + /* FIXME I think we should send a NegAck regardless of + * which protocol is in effect. + * In which case we would need to make sure that any + * NegAck is sent. basically that means that drbd_process_done_ee + * may not list_del() the ee before this callback did run... + * maybe even move the list_del(e) in here... */ + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev, FALSE); + /* we expect it to be marked out of sync anyways... + * maybe assert this? 
*/ + } + dec_unacked(mdev); + } else if(unlikely(!drbd_bio_uptodate(e->private_bio))) { + ok = drbd_io_error(mdev, FALSE); + } + + /* we delete from the conflict detection hash _after_ we sent out the + * WriteAck / NegAck, to get the sequence number right. */ + if (mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + } else { + D_ASSERT(hlist_unhashed(&e->colision)); + } + + return ok; +} + +STATIC int e_send_discard_ack(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok=1; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + ok = drbd_send_ack(mdev,DiscardAck,e); + + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + + dec_unacked(mdev); + + return ok; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a Data packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking Data packets. + * + * In case packet_seq is larger than mdev->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update mdev->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * FIXME verify that atomic_t guarantees 32bit wrap around, + * otherwise we have to play tricks with << ... + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). */ +static int drbd_wait_peer_seq(drbd_dev *mdev, const u32 packet_seq) +{ + DEFINE_WAIT(wait); + int ret = 0; + spin_lock(&mdev->peer_seq_lock); + for (;;) { + prepare_to_wait(&mdev->seq_wait,&wait,TASK_INTERRUPTIBLE); + if (seq_le(packet_seq,mdev->peer_seq+1)) + break; + spin_unlock(&mdev->peer_seq_lock); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + schedule(); + spin_lock(&mdev->peer_seq_lock); + } + finish_wait(&mdev->seq_wait, &wait); + if (mdev->peer_seq+1 == packet_seq) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + return ret; +} + +// mirrored write +STATIC int receive_Data(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + struct Tl_epoch_entry *e; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + int header_size, data_size; + unsigned int barrier_nr = 0; + unsigned int epoch_size = 0; + u32 dp_flags; + + // FIXME merge this code dups into some helper function + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + if(!inc_local(mdev)) { + /* data is submitted to disk at the end of this function. + * corresponding dec_local done either below (on error), + * or in drbd_endio_write_sec. 
*/ + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write mirrored data block to local disk.\n"); + spin_lock(&mdev->peer_seq_lock); + if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + + drbd_send_ack_dp(mdev,NegAck,p); + mdev->epoch_size++; // spin lock ? + return drbd_drain_block(mdev,data_size); + } + + sector = be64_to_cpu(p->sector); + e = read_in_block(mdev,p->block_id,sector,data_size); + if (!e) { + dec_local(mdev); + return FALSE; + } + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->w.cb = e_end_block; + + dp_flags = be32_to_cpu(p->dp_flags); + if ( dp_flags & DP_HARDBARRIER ) { + e->private_bio->bi_rw |= BIO_RW_BARRIER; + } + if ( dp_flags & DP_RW_SYNC ) { + e->private_bio->bi_rw |= BIO_RW_SYNC; + } + if ( dp_flags & DP_MAY_SET_IN_SYNC ) { + e->flags |= EE_MAY_SET_IN_SYNC; + } + + /* I'm the receiver, I do hold a net_cnt reference. */ + if (!mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + } else { + /* don't get the req_lock yet, + * we may sleep in drbd_wait_peer_seq */ + const sector_t sector = e->sector; + const int size = e->size; + const int discard = test_bit(DISCARD_CONCURRENT,&mdev->flags); + DEFINE_WAIT(wait); + drbd_request_t *i; + struct hlist_node *n; + struct hlist_head *slot; + int first; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + BUG_ON(mdev->ee_hash == NULL); + BUG_ON(mdev->tl_hash == NULL); + + /* conflict detection and handling: + * 1. wait on the sequence number, + * in case this data packet overtook ACK packets. + * 2. check our hash tables for conflicting requests. + * we only need to walk the tl_hash, since an ee can not + * have a conflict with an other ee: on the submitting + * node, the corresponding req had already been conflicting, + * and a conflicting req is never sent. + * + * Note: for two_primaries, we are protocol C, + * so there cannot be any request that is DONE + * but still on the transfer log. + * + * unconditionally add to the ee_hash. + * + * if no conflicting request is found: + * submit. + * + * if any conflicting request is found that has not yet been acked, + * AND I have the "discard concurrent writes" flag: + * queue (via done_ee) the DiscardAck; OUT. + * + * if any conflicting request is found: + * block the receiver, waiting on misc_wait + * until no more conflicting requests are there, + * or we get interrupted (disconnect). + * + * we do not just write after local io completion of those + * requests, but only after req is done completely, i.e. + * we wait for the DiscardAck to arrive! + * + * then proceed normally, i.e. submit. + */ + if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) + goto out_interrupted; + + spin_lock_irq(&mdev->req_lock); + + hlist_add_head(&e->colision,ee_hash_slot(mdev,sector)); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev,sector); + first = 1; + for(;;) { + int have_unacked = 0; + int have_conflict = 0; + prepare_to_wait(&mdev->misc_wait,&wait,TASK_INTERRUPTIBLE); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + if (first) { + /* only ALERT on first iteration, + * we may be woken up early... */ + ALERT("%s[%u] Concurrent local write detected!" 
+ " new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + } + if (i->rq_state & RQ_NET_PENDING) ++have_unacked; + ++have_conflict; + } + } +#undef OVERLAPS + if (!have_conflict) break; + + /* Discard Ack only for the _first_ iteration */ + if (first && discard && have_unacked) { + ALERT("Concurrent write! [DISCARD BY FLAG] sec=%llus\n", + (unsigned long long)sector); + inc_unacked(mdev); + mdev->epoch_size++; + e->w.cb = e_send_discard_ack; + list_add_tail(&e->w.list,&mdev->done_ee); + + spin_unlock_irq(&mdev->req_lock); + + /* we could probably send that DiscardAck ourselves, + * but I don't like the receiver using the msock */ + + dec_local(mdev); + wake_asender(mdev); + finish_wait(&mdev->misc_wait, &wait); + return TRUE; + } + + if (signal_pending(current)) { + hlist_del_init(&e->colision); + + spin_unlock_irq(&mdev->req_lock); + + finish_wait(&mdev->misc_wait, &wait); + goto out_interrupted; + } + + spin_unlock_irq(&mdev->req_lock); + if (first) { + first = 0; + ALERT("Concurrent write! [W AFTERWARDS] " + "sec=%llus\n",(unsigned long long)sector); + } else if (discard) { + /* we had none on the first iteration. + * there must be none now. */ + D_ASSERT(have_unacked == 0); + } + schedule(); + spin_lock_irq(&mdev->req_lock); + } + finish_wait(&mdev->misc_wait, &wait); + } + + /* when using TCQ: + * note that, when using tagged command queuing, we may + * have more than one reorder domain "active" at a time. + * + * THINK: + * do we have any guarantees that we get the completion + * events of the different reorder domains in order? + * or does the api only "guarantee" that the events + * _happened_ in order, but eventually the completion + * callbacks are shuffeled again? + * + * note that I wonder about the order in which the + * callbacks are run, I am reasonable confident that the + * actual completion happens in order. + * + * - can it happen that the tagged write completion is + * called even though not all of the writes before it + * have run their completion callback? + * - can it happen that some completion callback of some + * write after the tagged one is run, even though the + * callback of the tagged one itself is still pending? + * + * if this can happen, we either need to drop our "debug + * assertion" about the epoch size and just trust our code + * and the layers below us (nah, won't do that). + * + * or we need to replace the "active_ee" list by some sort + * of "transfer log" on the receiving side, too, which + * uses epoch counters per reorder domain. + */ + + /* when using tcq: + * if we got a barrier packet before, but at that time the active_ee + * was not yet empty, we just "remembered" this barrier request. + * + * if this is the first data packet since that barrier, maybe meanwhile + * all previously active writes have been completed? + * if so, send the b_ack right now + * (though, maybe rather move it into the e_end_block callback, + * where it would be sent as soon as possible). + * + * otherwise, tag the write with the barrier number, so it + * will trigger the b_ack before its own ack. 
+ */ + if (mdev->next_barrier_nr) { + /* only when using TCQ */ + if (list_empty(&mdev->active_ee)) { + barrier_nr = mdev->next_barrier_nr; + epoch_size = mdev->epoch_size; + mdev->epoch_size = 0; + } else { + e->barrier_nr = mdev->next_barrier_nr; + } + e->private_bio->bi_rw |= BIO_RW_BARRIER; + mdev->next_barrier_nr = 0; + } + list_add(&e->w.list,&mdev->active_ee); + spin_unlock_irq(&mdev->req_lock); + + if (barrier_nr) { + /* only when using TCQ + * maybe rather move it into the e_end_block callback, + * where it would be sent as soon as possible). + */ + (void)drbd_send_b_ack(mdev, cpu_to_be32(barrier_nr), epoch_size); + } + + switch(mdev->net_conf->wire_protocol) { + case DRBD_PROT_C: + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + break; + case DRBD_PROT_B: + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(mdev, RecvAck, e); + break; + case DRBD_PROT_A: + // nothing to do + break; + } + + if(mdev->state.pdsk == Diskless) { + // In case we have the only disk of the cluster, + drbd_set_out_of_sync(mdev,e->sector,e->size); + e->flags |= EE_CALL_AL_COMPLETE_IO; + drbd_al_begin_io(mdev, e->sector); + } + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("submit EE (DATA)WRITE sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + /* FIXME drbd_al_begin_io in case we have two primaries... */ + drbd_generic_make_request(WRITE,DRBD_FAULT_DT_WR,e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; + + out_interrupted: + /* yes, the epoch_size now is imbalanced. + * but we drop the connection anyways, so we don't have a chance to + * receive a barrier... atomic_inc(&mdev->epoch_size); */ + dec_local(mdev); + drbd_free_ee(mdev,e); + return FALSE; +} + +STATIC int receive_DataRequest(drbd_dev *mdev,Drbd_Header *h) +{ + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct Tl_epoch_entry *e; + int size; + unsigned int fault_type; + Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector,size); + return FALSE; + } + if ( sector + (size>>9) > capacity) { + ERR("%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector,size); + return FALSE; + } + + if(!inc_local_if_state(mdev, UpToDate)) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not satisfy peer's read request, no local data.\n"); + drbd_send_ack_rp(mdev,h->command == DataRequest ? NegDReply : + NegRSDReply ,p); + return TRUE; + } + + e = drbd_alloc_ee(mdev,p->block_id,sector,size,GFP_KERNEL); + if (!e) { + dec_local(mdev); + return FALSE; + } + + e->private_bio->bi_end_io = drbd_endio_read_sec; + + switch (h->command) { + case DataRequest: + e->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + break; + case RSDataRequest: + e->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* Eventually this should become asynchrously. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... 
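+ * Roughly: instead of calling drbd_rs_begin_io() here, queue e->w on
+ * mdev->data.work and let the worker do drbd_rs_begin_io() plus the
+ * drbd_generic_make_request() submission, so the receiver can keep
+ * reading packets meanwhile.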
+ */ + if (!drbd_rs_begin_io(mdev,sector)) { + /* we have been interrupted, + * probably connection lost! */ + D_ASSERT(signal_pending(current)); + dec_local(mdev); + drbd_free_ee(mdev,e); + return 0; + } + break; + default:; /* avoid compiler warning */ + fault_type = DRBD_FAULT_MAX; + } + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list,&mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + inc_unacked(mdev); + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("submit EE READ sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + /* FIXME actually, it could be a READA originating from the peer ... */ + drbd_generic_make_request(READ,fault_type,e->private_bio); + maybe_kick_lo(mdev); + + return TRUE; +} + +STATIC int drbd_asb_recover_0p(drbd_dev *mdev) +{ + int self, peer, rv=-100; + unsigned long ch_self, ch_peer; + + self = mdev->bc->md.uuid[Bitmap] & 1; + peer = mdev->p_uuid[Bitmap] & 1; + + ch_peer = mdev->p_uuid[UUID_SIZE]; + ch_self = mdev->comm_bm_set; + + switch ( mdev->net_conf->after_sb_0p ) { + case Consensus: + case DiscardSecondary: + case CallHelper: + ERR("Configuration error.\n"); + break; + case Disconnect: + break; + case DiscardYoungerPri: + if (self == 0 && peer == 1) { rv = -1; break; } + if (self == 1 && peer == 0) { rv = 1; break; } + /* Else fall through to one of the other strategies... */ + case DiscardOlderPri: + if (self == 0 && peer == 1) { rv = 1; break; } + if (self == 1 && peer == 0) { rv = -1; break; } + /* Else fall through to one of the other strategies... */ + WARN("Discard younger/older primary did not found a decision\n" + "Using discard-least-changes instead\n"); + case DiscardZeroChg: + if( ch_peer == 0 && ch_self == 0) { + rv=test_bit(DISCARD_CONCURRENT,&mdev->flags) ? -1 : 1; + break; + } else { + if ( ch_peer == 0 ) { rv = 1; break; } + if ( ch_self == 0 ) { rv = -1; break; } + } + if( mdev->net_conf->after_sb_0p == DiscardZeroChg ) break; + case DiscardLeastChg: + if ( ch_self < ch_peer ) rv = -1; + else if ( ch_self > ch_peer ) rv = 1; + else /* ( ch_self == ch_peer ) */ { + // Well, then use something else. + rv=test_bit(DISCARD_CONCURRENT,&mdev->flags) ? -1 : 1; + } + break; + case DiscardLocal: + rv = -1; + break; + case DiscardRemote: + rv = 1; + } + + return rv; +} + +STATIC int drbd_asb_recover_1p(drbd_dev *mdev) +{ + int self, peer, hg, rv=-100; + + self = mdev->bc->md.uuid[Bitmap] & 1; + peer = mdev->p_uuid[Bitmap] & 1; + + switch ( mdev->net_conf->after_sb_1p ) { + case DiscardYoungerPri: + case DiscardOlderPri: + case DiscardLeastChg: + case DiscardLocal: + case DiscardRemote: + ERR("Configuration error.\n"); + break; + case Disconnect: + break; + case Consensus: + hg = drbd_asb_recover_0p(mdev); + if( hg == -1 && mdev->state.role==Secondary) rv=hg; + if( hg == 1 && mdev->state.role==Primary) rv=hg; + break; + case Violently: + rv = drbd_asb_recover_0p(mdev); + break; + case DiscardSecondary: + return mdev->state.role==Primary ? 
1 : -1; + case CallHelper: + hg = drbd_asb_recover_0p(mdev); + if( hg == -1 && mdev->state.role==Primary) { + self = drbd_set_role(mdev,Secondary,0); + if (self != SS_Success) { + drbd_khelper(mdev,"pri-lost-after-sb"); + } else { + WARN("Sucessfully gave up primary role.\n"); + rv = hg; + } + } else rv = hg; + } + + return rv; +} + +STATIC int drbd_asb_recover_2p(drbd_dev *mdev) +{ + int self, peer, hg, rv=-100; + + self = mdev->bc->md.uuid[Bitmap] & 1; + peer = mdev->p_uuid[Bitmap] & 1; + + switch ( mdev->net_conf->after_sb_2p ) { + case DiscardYoungerPri: + case DiscardOlderPri: + case DiscardLeastChg: + case DiscardLocal: + case DiscardRemote: + case Consensus: + case DiscardSecondary: + ERR("Configuration error.\n"); + break; + case Violently: + rv = drbd_asb_recover_0p(mdev); + break; + case Disconnect: + break; + case CallHelper: + hg = drbd_asb_recover_0p(mdev); + if( hg == -1 ) { + self = drbd_set_role(mdev,Secondary,0); + if (self != SS_Success) { + drbd_khelper(mdev,"pri-lost-after-sb"); + } else { + WARN("Sucessfully gave up primary role.\n"); + rv = hg; + } + } else rv = hg; + } + + return rv; +} + +STATIC void drbd_uuid_dump(drbd_dev *mdev,char* text,u64* uuid) +{ + INFO("%s %016llX:%016llX:%016llX:%016llX\n", + text, + uuid[Current], + uuid[Bitmap], + uuid[History_start], + uuid[History_end]); +} + +/* + 100 after split brain try auto recover + 2 SyncSource set BitMap + 1 SyncSource use BitMap + 0 no Sync + -1 SyncTarget use BitMap + -2 SyncTarget set BitMap + -100 after split brain, disconnect +-1000 unrelated data + */ +STATIC int drbd_uuid_compare(drbd_dev *mdev, int *rule_nr) +{ + u64 self, peer; + int i,j; + + self = mdev->bc->md.uuid[Current] & ~((u64)1); + peer = mdev->p_uuid[Current] & ~((u64)1); + + *rule_nr = 1; + if (self == UUID_JUST_CREATED && + peer == UUID_JUST_CREATED) return 0; + + *rule_nr = 2; + if (self == UUID_JUST_CREATED && + peer != UUID_JUST_CREATED) return -2; + + *rule_nr = 3; + if (self != UUID_JUST_CREATED && + peer == UUID_JUST_CREATED) return 2; + + *rule_nr = 4; + if (self == peer) { // Common power [off|failure] + int rct,dc; // roles at crash time + + rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + + ( mdev->p_uuid[UUID_FLAGS] & 2 ); + // lowest bit is set when we were primary + // next bit (weight 2) is set when peer was primary + + MTRACE(TraceTypeUuid,TraceLvlMetrics, DUMPI(rct); ); + + switch(rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(DISCARD_CONCURRENT,&mdev->flags); + MTRACE(TraceTypeUuid,TraceLvlMetrics, DUMPI(dc); ); + return dc ? 
-1 : 1; + } + } + + *rule_nr = 5; + peer = mdev->p_uuid[Bitmap] & ~((u64)1); + if (self == peer) return -1; + + *rule_nr = 6; + for ( i=History_start ; i<=History_end ; i++ ) { + peer = mdev->p_uuid[i] & ~((u64)1); + if (self == peer) return -2; + } + + *rule_nr = 7; + self = mdev->bc->md.uuid[Bitmap] & ~((u64)1); + peer = mdev->p_uuid[Current] & ~((u64)1); + if (self == peer) return 1; + + *rule_nr = 8; + for ( i=History_start ; i<=History_end ; i++ ) { + self = mdev->bc->md.uuid[i] & ~((u64)1); + if (self == peer) return 2; + } + + *rule_nr = 9; + self = mdev->bc->md.uuid[Bitmap] & ~((u64)1); + peer = mdev->p_uuid[Bitmap] & ~((u64)1); + if (self == peer) return 100; + + *rule_nr = 10; + for ( i=History_start ; i<=History_end ; i++ ) { + self = mdev->p_uuid[i] & ~((u64)1); + for ( j=History_start ; j<=History_end ; j++ ) { + peer = mdev->p_uuid[j] & ~((u64)1); + if (self == peer) return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + conn_mask (-1) on failure. + */ +STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, + drbd_disks_t peer_disk) +{ + int hg,rule_nr; + drbd_conns_t rv = conn_mask; + drbd_disks_t mydisk; + + mydisk = mdev->state.disk; + if( mydisk == Negotiating ) mydisk = mdev->new_state_tmp.disk; + + hg = drbd_uuid_compare(mdev,&rule_nr); + + MTRACE(TraceTypeUuid,TraceLvlSummary, + INFO("drbd_sync_handshake:\n"); + drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); + drbd_uuid_dump(mdev,"peer",mdev->p_uuid); + INFO("uuid_compare()=%d by rule %d\n",hg,rule_nr); + ); + + if (hg == 100) { + int pcount = (mdev->state.role==Primary) + (peer_role==Primary); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(mdev); + break; + case 1: + hg = drbd_asb_recover_1p(mdev); + break; + case 2: + hg = drbd_asb_recover_2p(mdev); + break; + } + if ( abs(hg) < 100 ) { + WARN("Split-Brain detected, %d primaries, automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? "peer":"this"); + } + } + + if ( hg == -100 ) { + if(mdev->net_conf->want_lose && !(mdev->p_uuid[UUID_FLAGS]&1)){ + hg = -1; + } + if(!mdev->net_conf->want_lose && (mdev->p_uuid[UUID_FLAGS]&1)){ + hg = 1; + } + + if ( abs(hg) < 100 ) { + WARN("Split-Brain detected, manually solved. Sync from %s node\n", + (hg < 0) ? "peer":"this"); + } + } + + if (abs(hg) < 100) { + // This is needed in case someone does an invalidate on an + // disconnected node. This has priority. + if(mydisk==Inconsistent && peer_disk>Inconsistent) hg=-1; + if(mydisk>Inconsistent && peer_disk==Inconsistent) hg= 1; + } + + if (hg == -1000) { + ALERT("Unrelated data, dropping connection!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + } + + if (hg == -100) { + ALERT("Split-Brain detected, dropping connection!\n"); + drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); + drbd_uuid_dump(mdev,"peer",mdev->p_uuid); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + } + + if (hg > 0 && mydisk <= Inconsistent ) { + ERR("I shall become SyncSource, but I am inconsistent!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + } + + if (hg < 0 && // by intention we do not use mydisk here. 
+ mdev->state.role == Primary && mdev->state.disk >= Consistent ) { + switch(mdev->net_conf->rr_conflict) { + case CallHelper: + drbd_khelper(mdev,"pri-lost"); + // fall through + case Disconnect: + ERR("I shall become SyncTarget, but I am primary!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return conn_mask; + case Violently: + WARN("Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (abs(hg) >= 2) { + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + + drbd_bm_set_all(mdev); + + if (unlikely(drbd_bm_write(mdev) < 0)) { + return conn_mask; + } + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + } + + if (hg > 0) { // become sync source. + rv = WFBitMapS; + } else if (hg < 0) { // become sync target + drbd_uuid_set(mdev,Current,mdev->p_uuid[Bitmap]); + rv = WFBitMapT; + } else { + rv = Connected; + if(drbd_bm_total_weight(mdev)) { + INFO("No resync, but bits in bitmap!\n"); + } + } + + return rv; +} + +/* returns 1 if invalid */ +STATIC int cmp_after_sb(enum after_sb_handler peer, enum after_sb_handler self) +{ + // DiscardRemote - DiscardLocal is valid + if( (peer == DiscardRemote && self == DiscardLocal) || + (self == DiscardRemote && peer == DiscardLocal) ) return 0; + + // any other things with DiscardRemote or DiscardLocal are invalid + if( peer == DiscardRemote || peer == DiscardLocal || + self == DiscardRemote || self == DiscardLocal ) return 1; + + // everything else is valid if they are equal on both sides. + if( peer == self ) return 0; + + // everything es is invalid. + return 1; +} + +STATIC int receive_protocol(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Protocol_Packet *p = (Drbd_Protocol_Packet*)h; + + int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_want_lose, p_two_primaries; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_want_lose = be32_to_cpu(p->want_lose); + p_two_primaries = be32_to_cpu(p->two_primaries); + + if( p_proto != mdev->net_conf->wire_protocol) { + ERR("incompatible communication protocols\n"); + goto disconnect; + } + + if( cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p) ) { + ERR("incompatible after-sb-0pri settings\n"); + goto disconnect; + } + + if( cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p) ) { + ERR("incompatible after-sb-1pri settings\n"); + goto disconnect; + } + + if( cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p) ) { + ERR("incompatible after-sb-2pri settings\n"); + goto disconnect; + } + + if( p_want_lose && mdev->net_conf->want_lose ) { + ERR("both sides have the 'want_lose' flag set\n"); + goto disconnect; + } + + if( p_two_primaries != mdev->net_conf->two_primaries ) { + ERR("incompatible setting of the two-primaries options\n"); + goto disconnect; + } + + return TRUE; + + disconnect: + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; +} + +STATIC int receive_SyncParam(drbd_dev *mdev,Drbd_Header *h) +{ + int ok = TRUE; + Drbd_SyncParam_Packet *p = (Drbd_SyncParam_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + // XXX harmless race with ioctl ... 
+ mdev->sync_conf.rate = be32_to_cpu(p->rate); + + return ok; +} + +STATIC void drbd_setup_order_type(drbd_dev *mdev, int peer) +{ +#if 0 + int self = drbd_queue_order_type(mdev); + int type; + + static char *order_txt[] = { + [QUEUE_ORDERED_NONE] = "none - oldIDE", + [QUEUE_ORDERED_FLUSH] = "flush - IDE", + [QUEUE_ORDERED_TAG] = "tag - TCQ", + }; + + if(self == QUEUE_ORDERED_NONE || + peer == QUEUE_ORDERED_NONE) { + type = QUEUE_ORDERED_NONE; + } else if (self == QUEUE_ORDERED_FLUSH || + peer == QUEUE_ORDERED_FLUSH) { + type = QUEUE_ORDERED_FLUSH; + } else if(self == QUEUE_ORDERED_TAG || + peer == QUEUE_ORDERED_TAG) { + type = QUEUE_ORDERED_TAG; + } else { + D_ASSERT(0); + type = QUEUE_ORDERED_NONE; + } + + if (type != self ) { + INFO("Exposing an order type of '%s' to the kernel\n", + order_txt[type]); + blk_queue_ordered(mdev->rq_queue,type); + } +#endif +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(drbd_dev *mdev, const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) return; + d = (a > b) ? (a - b) : (b - a); + if ( d > (a>>3) || d > (b>>3)) { + WARN("Considerable difference in %s: %llus vs. %llus\n", s, + (unsigned long long)a, (unsigned long long)b); + } +} + +STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Sizes_Packet *p = (Drbd_Sizes_Packet*)h; + unsigned int max_seg_s; + sector_t p_size, p_usize, my_usize; + drbd_conns_t nconn; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_size=be64_to_cpu(p->d_size); + p_usize=be64_to_cpu(p->u_size); + + if(p_size == 0 && mdev->state.disk == Diskless ) { + ERR("some backing storage is needed\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; + } + +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + if(inc_local(mdev)) { + warn_if_differ_considerably(mdev, "lower level device sizes", + p_size, drbd_get_capacity(mdev->bc->backing_bdev)); + warn_if_differ_considerably(mdev, "user requested size", + p_usize, mdev->bc->dc.disk_size); + + if (mdev->state.conn == WFReportParams) { + /* this is first connect, or an otherwise expected + param exchange. choose the minimum */ + p_usize = min_not_zero(mdev->bc->dc.disk_size, p_usize); + } + + my_usize = mdev->bc->dc.disk_size; + + if( mdev->bc->dc.disk_size != p_usize ) { + mdev->bc->dc.disk_size = p_usize; + INFO("Peer sets u_size to %lu KB\n", + (unsigned long)mdev->bc->dc.disk_size); + } + + // Never shrink a device with usable data. + if(drbd_new_dev_size(mdev,mdev->bc) < + drbd_get_capacity(mdev->this_bdev) && + mdev->state.disk >= Outdated ) { + dec_local(mdev); + ERR("The peer's disk size is too small!\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + mdev->bc->dc.disk_size = my_usize; + return FALSE; + } + dec_local(mdev); + } +#undef min_not_zero + + mdev->p_size=p_size; + if(inc_local(mdev)) { + drbd_bm_lock(mdev); // { + /* + * you may get a flip-flop connection established/connection loss, + * in case both really have different usize uppon first connect! + * try to solve it thus: + ***/ + + drbd_determin_dev_size(mdev); + drbd_bm_unlock(mdev); // } + dec_local(mdev); + } else { + // I am diskless, need to accept the peer's size. 
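+ /* Without a backing device there is nothing to negotiate against,
+  * so the peer's d_size is adopted as our capacity as-is. */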
+ drbd_set_my_capacity(mdev,p_size); + } + + if (mdev->p_uuid && mdev->state.conn <= Connected && inc_local(mdev)) { + nconn=drbd_sync_handshake(mdev,mdev->state.peer,mdev->state.pdsk); + dec_local(mdev); + + if(nconn == conn_mask) return FALSE; + + if(drbd_request_state(mdev,NS(conn,nconn)) < SS_Success) { + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; + } + } + + if(inc_local(mdev)) { + max_seg_s = be32_to_cpu(p->max_segment_size); + if( max_seg_s != mdev->rq_queue->max_segment_size ) { + drbd_setup_queue_param(mdev, max_seg_s); + } + + drbd_setup_order_type(mdev,be32_to_cpu(p->queue_order_type)); + dec_local(mdev); + } + + if (mdev->state.conn > WFReportParams ) { + if( be64_to_cpu(p->c_size) != + drbd_get_capacity(mdev->this_bdev) ) { + // we have different sizes, probabely peer + // needs to know my new size... + drbd_send_sizes(mdev); + } + } + + return TRUE; +} + +STATIC int receive_uuids(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_GenCnt_Packet *p = (Drbd_GenCnt_Packet*)h; + u64 *p_uuid; + int i; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_uuid = kmalloc(sizeof(u64)*EXT_UUID_SIZE, GFP_KERNEL); + + for (i = Current; i < EXT_UUID_SIZE; i++) { + p_uuid[i] = be64_to_cpu(p->uuid[i]); + } + + if ( mdev->p_uuid ) kfree(mdev->p_uuid); + mdev->p_uuid = p_uuid; + + return TRUE; +} + +/** + * convert_state: + * Switches the view of the state. + */ +STATIC drbd_state_t convert_state(drbd_state_t ps) +{ + drbd_state_t ms; + + static drbd_conns_t c_tab[] = { + [Connected] = Connected, + + [StartingSyncS] = StartingSyncT, + [StartingSyncT] = StartingSyncS, + [Disconnecting] = TearDown, // NetworkFailure, + + [conn_mask] = conn_mask, + }; + + ms.i = ps.i; + + ms.conn = c_tab[ps.conn]; + ms.peer = ps.role; + ms.role = ps.peer; + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = ( ps.aftr_isp | ps.user_isp ); + + return ms; +} + +STATIC int receive_req_state(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Req_State_Packet *p = (Drbd_Req_State_Packet*)h; + drbd_state_t mask,val; + int rv; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(DISCARD_CONCURRENT,&mdev->flags)) drbd_state_lock(mdev); + + mask = convert_state(mask); + val = convert_state(val); + + rv = drbd_change_state(mdev,ChgStateVerbose,mask,val); + + if (test_bit(DISCARD_CONCURRENT,&mdev->flags)) drbd_state_unlock(mdev); + + drbd_send_sr_reply(mdev,rv); + drbd_md_sync(mdev); + + return TRUE; +} + +STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_State_Packet *p = (Drbd_State_Packet*)h; + drbd_conns_t nconn; + drbd_state_t os,ns,peer_state; + int rv; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + nconn = mdev->state.conn; + if (nconn == WFReportParams ) nconn = Connected; + + peer_state.i = be32_to_cpu(p->state); + + if (mdev->p_uuid && mdev->state.conn <= Connected && + inc_local_if_state(mdev,Negotiating) && + peer_state.disk >= Negotiating) { + nconn=drbd_sync_handshake(mdev,peer_state.role,peer_state.disk); + dec_local(mdev); + + if(nconn == conn_mask) return FALSE; + } + + if (mdev->state.conn > WFReportParams ) { + if( nconn > Connected && peer_state.conn <= Connected) { + // we want resync, peer has not yet decided to sync... 
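+ /* ... so re-send our UUIDs and current state; the peer will then run
+  * drbd_sync_handshake() with up to date data and should arrive at the
+  * same resync decision. */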
+ drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + else if (nconn == Connected && peer_state.disk == Negotiating) { + // peer is waiting for us to respond... + drbd_send_state(mdev); + } + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + ns.i = mdev->state.i; + ns.conn = nconn; + ns.peer = peer_state.role; + ns.pdsk = peer_state.disk; + ns.peer_isp = ( peer_state.aftr_isp | peer_state.user_isp ); + if((nconn == Connected || nconn == WFBitMapS) && + ns.disk == Negotiating ) ns.disk = UpToDate; + if((nconn == Connected || nconn == WFBitMapT) && + ns.pdsk == Negotiating ) ns.pdsk = UpToDate; + rv = _drbd_set_state(mdev,ns,ChgStateVerbose | ChgStateHard); + spin_unlock_irq(&mdev->req_lock); + if (rv==SS_Success) { + after_state_ch(mdev,os,ns,ChgStateVerbose | ChgStateHard); + } + + if(rv < SS_Success) { + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; + } + + mdev->net_conf->want_lose = 0; + + /* FIXME assertion for (gencounts do not diverge) */ + drbd_md_sync(mdev); // update connected indicator, la_size, ... + + return TRUE; +} + +STATIC int receive_sync_uuid(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_SyncUUID_Packet *p = (Drbd_SyncUUID_Packet*)h; + + wait_event( mdev->misc_wait, + mdev->state.conn < Connected || mdev->state.conn == WFSyncUUID); + + // D_ASSERT( mdev->state.conn == WFSyncUUID ); + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + _drbd_uuid_set(mdev,Current,be64_to_cpu(p->uuid)); + _drbd_uuid_set(mdev,Bitmap,0UL); + + drbd_start_resync(mdev,SyncTarget); + + return TRUE; +} + +/* Since we are processing the bitfild from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we suceessfully received it. */ +STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) +{ + size_t bm_words, bm_i, want, num_words; + unsigned long *buffer; + int ok=FALSE; + + drbd_bm_lock(mdev); // { + + bm_words = drbd_bm_words(mdev); + bm_i = 0; + buffer = vmalloc(BM_PACKET_WORDS*sizeof(long)); + + while (1) { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + ERR_IF(want != h->length) goto out; + if (want==0) break; + if (drbd_recv(mdev, buffer, want) != want) + goto out; + + drbd_bm_merge_lel(mdev, bm_i, num_words, buffer); + bm_i += num_words; + + if (!drbd_recv_header(mdev,h)) + goto out; + D_ASSERT(h->command == ReportBitMap); + } + + clear_bit(CRASHED_PRIMARY, &mdev->flags); // md_write() is in drbd_start_resync. + if (mdev->state.conn == WFBitMapS) { + drbd_start_resync(mdev,SyncSource); + } else if (mdev->state.conn == WFBitMapT) { + ok = drbd_send_bitmap(mdev); + if (!ok) goto out; + ok = drbd_request_state(mdev,NS(conn,WFSyncUUID)); + D_ASSERT( ok == SS_Success ); + } else { + ERR("unexpected cstate (%s) in receive_bitmap\n", + conns_to_name(mdev->state.conn)); + } + + // We just started resync. Now we can be sure that local disk IO is okay. + + /* no, actually we can't. failures happen asynchronously, anytime. + * we can never be sure. disk may have failed while we where busy shaking hands... 
+ */ + + ok=TRUE; + out: + drbd_bm_unlock(mdev); // } + vfree(buffer); + return ok; +} + +STATIC int receive_skip(drbd_dev *mdev,Drbd_Header *h) +{ + // TODO zero copy sink :) + static char sink[128]; + int size,want,r; + + WARN("skipping unknown optional packet type %d, l: %d!\n", + h->command, h->length ); + + size = h->length; + while (size > 0) { + want = min_t(int,size,sizeof(sink)); + r = drbd_recv(mdev,sink,want); + ERR_IF(r < 0) break; + size -= r; + } + return (size == 0); +} + +STATIC int receive_UnplugRemote(drbd_dev *mdev, Drbd_Header *h) +{ + if (mdev->state.disk >= Inconsistent) drbd_kick_lo(mdev); + return TRUE; // cannot fail. +} + +typedef int (*drbd_cmd_handler_f)(drbd_dev*,Drbd_Header*); + +static drbd_cmd_handler_f drbd_default_handler[] = { + [Data] = receive_Data, + [DataReply] = receive_DataReply, + [RSDataReply] = receive_RSDataReply, + [RecvAck] = NULL, // via msock: got_RecvAck, + [WriteAck] = NULL, // via msock: got_WriteAck, + [Barrier] = receive_Barrier_no_tcq, + [BarrierAck] = NULL, // via msock: got_BarrierAck, + [ReportBitMap] = receive_bitmap, + [Ping] = NULL, // via msock: got_Ping, + [PingAck] = NULL, // via msock: got_PingAck, + [UnplugRemote] = receive_UnplugRemote, + [DataRequest] = receive_DataRequest, + [RSDataRequest] = receive_DataRequest, //receive_RSDataRequest, + [SyncParam] = receive_SyncParam, + [ReportProtocol] = receive_protocol, + [ReportUUIDs] = receive_uuids, + [ReportSizes] = receive_sizes, + [ReportState] = receive_state, + [StateChgRequest] = receive_req_state, + [ReportSyncUUID] = receive_sync_uuid, +}; + +static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; +static drbd_cmd_handler_f *drbd_opt_cmd_handler = NULL; + +#if 0 + /* FIXME lge thinks the implementation of barrier handling via + * tcq is currently broken */ +void drbd_set_recv_tcq(drbd_dev * mdev, int tcq_enabled) +{ +// warning LGE "FIXME make drbd_cmd_handler a member of mdev" + if(tcq_enabled && + drbd_default_handler[Barrier] != receive_Barrier_tcq) { + INFO("Enabling TCQ for barrier processing on backend.\n"); + drbd_default_handler[Barrier] = receive_Barrier_tcq; + } + + if(!tcq_enabled && + drbd_default_handler[Barrier] != receive_Barrier_usual) { + INFO("Using conventional (non TCQ) barrier processing" + " on backend.\n"); + drbd_default_handler[Barrier] = receive_Barrier_usual; + } +} +#endif + +STATIC void drbdd(drbd_dev *mdev) +{ + drbd_cmd_handler_f handler; + Drbd_Header *header = &mdev->data.rbuf.head; + + while (get_t_state(&mdev->receiver) == Running) { + if (!drbd_recv_header(mdev,header)) + break; + + if (header->command < MAX_CMD) + handler = drbd_cmd_handler[header->command]; + else if (MayIgnore < header->command && header->command < MAX_OPT_CMD) + handler = drbd_opt_cmd_handler[header->command-MayIgnore]; + else if (header->command > MAX_OPT_CMD) + handler = receive_skip; + else + handler = NULL; + + if (unlikely(!handler)) { + ERR("unknown packet type %d, l: %d!\n", + header->command, header->length); + drbd_force_state(mdev,NS(conn,ProtocolError)); + break; + } + if (unlikely(!handler(mdev,header))) { + ERR("error receiving %s, l: %d!\n", + cmdname(header->command), header->length); + drbd_force_state(mdev,NS(conn,ProtocolError)); + break; + } + + dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__); + } +} + +/* FIXME how should freeze-io be handled? 
*/ +STATIC void drbd_fail_pending_reads(drbd_dev *mdev) +{ + struct hlist_head *slot; + struct hlist_node *n; + drbd_request_t * req; + struct list_head *le; + LIST_HEAD(workset); + int i; + + /* + * Application READ requests + */ + spin_lock_irq(&mdev->req_lock); + for(i=0;iapp_reads_hash+i; + hlist_for_each_entry(req, n, slot, colision) { + list_add(&req->w.list, &workset); + } + } + memset(mdev->app_reads_hash,0,APP_R_HSIZE*sizeof(void*)); + + while(!list_empty(&workset)) { + le = workset.next; + req = list_entry(le, drbd_request_t, w.list); + list_del(le); + + _req_mod(req, connection_lost_while_pending, 0); + } + spin_unlock_irq(&mdev->req_lock); +} + +STATIC void drbd_disconnect(drbd_dev *mdev) +{ + struct drbd_work prev_work_done; + enum fencing_policy fp; + drbd_state_t os,ns; + int rv=SS_UnknownError; + + D_ASSERT(mdev->state.conn < Connected); + /* FIXME verify that: + * the state change magic prevents us from becoming >= Connected again + * while we are still cleaning up. + */ + + /* asender does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&mdev->asender); + + fp = DontCare; + if(inc_local(mdev)) { + fp = mdev->bc->dc.fencing; + dec_local(mdev); + } + + down(&mdev->data.mutex); + drbd_free_sock(mdev); + up(&mdev->data.mutex); + + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev,&mdev->active_ee); + _drbd_wait_ee_list_empty(mdev,&mdev->sync_ee); + _drbd_clear_done_ee(mdev); + _drbd_wait_ee_list_empty(mdev,&mdev->read_ee); + reclaim_net_ee(mdev); + spin_unlock_irq(&mdev->req_lock); + + /* FIXME: fail pending reads? + * when we are configured for freeze io, + * we could retry them once we un-freeze. */ + drbd_fail_pending_reads(mdev); + + /* We do not have data structures that would allow us to + get the rs_pending_cnt down to 0 again. + * On SyncTarget we do not have any data structures describing + the pending RSDataRequest's we have sent. + * On SyncSource there is no data structure that tracks + the RSDataReply blocks that we sent to the SyncTarget. + And no, it is not the sum of the reference counts in the + resync_LRU. The resync_LRU tracks the whole operation including + the disk-IO, while the rs_pending_cnt only tracks the blocks + on the fly. */ + drbd_rs_cancel_all(mdev); + mdev->rs_total=0; + mdev->rs_failed=0; + atomic_set(&mdev->rs_pending_cnt,0); + wake_up(&mdev->misc_wait); + + /* make sure syncer is stopped and w_resume_next_sg queued */ + del_timer_sync(&mdev->resync_timer); + set_bit(STOP_SYNC_TIMER,&mdev->flags); + resync_timer_fn((unsigned long)mdev); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + set_bit(WORK_PENDING,&mdev->flags); + prev_work_done.cb = w_prev_work_done; + drbd_queue_work(&mdev->data.work,&prev_work_done); + wait_event(mdev->misc_wait, !test_bit(WORK_PENDING,&mdev->flags)); + + if ( mdev->p_uuid ) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + } + + /* queue cleanup for the worker. 
+ * FIXME this should go into after_state_ch */ + if (!mdev->state.susp) + tl_clear(mdev); + + INFO("Connection closed\n"); + + drbd_md_sync(mdev); + + if ( mdev->state.role == Primary ) { + if( fp >= Resource && + mdev->state.pdsk >= DUnknown ) { + drbd_disks_t nps = drbd_try_outdate_peer(mdev); + drbd_request_state(mdev,NS(pdsk,nps)); + } + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if ( os.conn >= Unconnected ) { + // Do not restart in case we are Disconnecting + ns = os; + ns.conn = Unconnected; + rv=_drbd_set_state(mdev,ns,ChgStateVerbose); + } + spin_unlock_irq(&mdev->req_lock); + if (rv == SS_Success) { + after_state_ch(mdev,os,ns,ChgStateVerbose); + } + + if(os.conn == Disconnecting) { + wait_event( mdev->misc_wait,atomic_read(&mdev->net_cnt) == 0 ); + if(mdev->ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + } + + if(mdev->tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + } + if(mdev->cram_hmac_tfm) { + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + } + kfree(mdev->net_conf); + mdev->net_conf=NULL; + drbd_request_state(mdev, NS(conn,StandAlone)); + } + + /* they do trigger all the time. + * hm. why won't tcp release the page references, + * we already released the socket!? + D_ASSERT(atomic_read(&mdev->pp_in_use) == 0); + D_ASSERT(list_empty(&mdev->net_ee)); + */ + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + mdev->epoch_size = 0; +} + +/* + * we hereby assure that we always support the drbd dialects + * PRO_VERSION and (PRO_VERSION -1), allowing for rolling upgrades + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. + */ +int drbd_send_handshake(drbd_dev *mdev) +{ + // ASSERT current == mdev->receiver ... + Drbd_HandShake_Packet *p = &mdev->data.sbuf.HandShake; + int ok; + + if (down_interruptible(&mdev->data.mutex)) { + ERR("interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + /* FIXME do we need to verify this here? */ + if (mdev->data.socket == NULL) { + up(&mdev->data.mutex); + return 0; + } + + memset(p,0,sizeof(*p)); + p->protocol_version = cpu_to_be32(PRO_VERSION); + ok = _drbd_send_cmd( mdev, mdev->data.socket, HandShake, + (Drbd_Header *)p, sizeof(*p), 0 ); + up(&mdev->data.mutex); + return ok; +} + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +STATIC int drbd_do_handshake(drbd_dev *mdev) +{ + // ASSERT current == mdev->receiver ... 
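
A stand-alone sketch (ordinary userspace C, not driver code) of the version check this function is about to perform: the handshake packet carries the protocol version in network byte order, the exact local dialect is accepted, one version ahead is tolerated with a warning, and anything else is treated as incompatible. MY_PRO_VERSION and its value are assumptions made for the example only.

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #define MY_PRO_VERSION 86u          // assumed local dialect number

    // returns 1 if the peer's dialect is acceptable, -1 if incompatible,
    // mirroring the "please go standalone" convention documented above
    static int check_peer_version(uint32_t wire_version_be)
    {
        uint32_t peer = ntohl(wire_version_be);   // packet field is big endian

        if (peer == MY_PRO_VERSION)
            return 1;
        if (peer == MY_PRO_VERSION + 1) {
            printf("peer speaks %u, consider upgrading this node\n", peer);
            return 1;
        }
        printf("incompatible dialects: mine %u, peer %u\n",
               MY_PRO_VERSION, peer);
        return -1;
    }

    int main(void)
    {
        printf("%d\n", check_peer_version(htonl(MY_PRO_VERSION)));      // 1
        printf("%d\n", check_peer_version(htonl(MY_PRO_VERSION + 2)));  // -1
        return 0;
    }
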
+ Drbd_HandShake_Packet *p = &mdev->data.rbuf.HandShake; + const int expect = sizeof(Drbd_HandShake_Packet)-sizeof(Drbd_Header); + int rv; + + rv = drbd_send_handshake(mdev); + if (!rv) goto break_c_loop; + + rv = drbd_recv_header(mdev,&p->head); + if (!rv) goto break_c_loop; + + if (p->head.command != HandShake) { + ERR( "expected HandShake packet, received: %s (0x%04x)\n", + cmdname(p->head.command), p->head.command ); + return -1; + } + + if (p->head.length != expect) { + ERR( "expected HandShake length: %u, received: %u\n", + expect, p->head.length ); + return -1; + } + + rv = drbd_recv(mdev, &p->head.payload, expect); + + if (rv != expect) { + ERR("short read receiving handshake packet: l=%u\n", rv); + return 0; + } + + dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__); + + p->protocol_version = be32_to_cpu(p->protocol_version); + + if ( p->protocol_version == PRO_VERSION || + p->protocol_version == (PRO_VERSION+1) ) { + if (p->protocol_version == (PRO_VERSION+1)) { + WARN( "You should upgrade me! " + "Peer wants protocol version: %u\n", + p->protocol_version ); + } + INFO( "Handshake successful: DRBD Network Protocol version %u\n", + PRO_VERSION ); + } /* else if ( p->protocol_version == (PRO_VERSION-1) ) { + // not yet; but next time :) + INFO( "Handshake successful: DRBD Protocol version %u\n", + (PRO_VERSION-1) ); + ... do some remapping of defaults and jump tables here ... + } */ else { + ERR( "incompatible DRBD dialects: " + "I support %u, peer wants %u\n", + PRO_VERSION, p->protocol_version ); + return -1; + } + + return 1; + + break_c_loop: + WARN( "My msock connect got accepted onto peer's sock!\n"); + /* In case a tcp connection set-up takes longer than + connect-int, we might get into the situation that this + node's msock gets connected to the peer's sock! + + To break out of this endless loop behaviour, we need to + wait unti the peer's msock connect tries are over. (1 Second) + + Additionally we wait connect-int/2 to hit with our next + connect try exactly in the peer's window of expectation. */ + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ + (mdev->net_conf->try_connect_int*HZ)/2); + + return 0; +} + +#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) +STATIC int drbd_do_auth(drbd_dev *mdev) +{ + ERR( "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + ERR( "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + return 0; +} +#else +#define CHALLENGE_LEN 64 +STATIC int drbd_do_auth(drbd_dev *mdev) +{ + char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... 
*/ + struct scatterlist sg; + char *response = NULL; + char *right_response = NULL; + char *peers_ch = NULL; + Drbd_Header p; + unsigned int key_len = strlen(mdev->net_conf->shared_secret); + unsigned int resp_size; + struct hash_desc desc; + int rv; + + desc.tfm=mdev->cram_hmac_tfm; + desc.flags=0; + + rv = crypto_hash_setkey(mdev->cram_hmac_tfm, + (u8*)mdev->net_conf->shared_secret, key_len); + if(rv) { + ERR("crypto_hash_setkey() failed with %d\n",rv); + rv = 0; + goto fail; + } + + get_random_bytes(my_challenge, CHALLENGE_LEN); + + rv = drbd_send_cmd2(mdev,AuthChallenge,my_challenge,CHALLENGE_LEN); + if (!rv) goto fail; + + rv = drbd_recv_header(mdev,&p); + if (!rv) goto fail; + + if (p.command != AuthChallenge) { + ERR( "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(p.command), p.command ); + rv = 0; + goto fail; + } + + if (p.length > CHALLENGE_LEN*2 ) { + ERR( "expected AuthChallenge payload too big.\n"); + rv = 0; + goto fail; + } + + peers_ch = kmalloc(p.length,GFP_KERNEL); + if(peers_ch == NULL) { + ERR("kmalloc of peers_ch failed\n"); + rv = 0; + goto fail; + } + + rv = drbd_recv(mdev, peers_ch, p.length); + + if (rv != p.length) { + ERR("short read AuthChallenge: l=%u\n", rv); + rv = 0; + goto fail; + } + + resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); + response = kmalloc(resp_size,GFP_KERNEL); + if(response == NULL) { + ERR("kmalloc of response failed\n"); + rv = 0; + goto fail; + } + + sg.page = virt_to_page(peers_ch); + sg.offset = offset_in_page(peers_ch); + sg.length = p.length; + + rv = crypto_hash_digest(&desc, &sg, sg.length, response); + if(rv) { + ERR( "crypto_hash_digest() failed with %d\n",rv); + rv = 0; + goto fail; + } + + rv = drbd_send_cmd2(mdev,AuthResponse,response,resp_size); + if (!rv) goto fail; + + rv = drbd_recv_header(mdev,&p); + if (!rv) goto fail; + + if (p.command != AuthResponse) { + ERR( "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(p.command), p.command ); + rv = 0; + goto fail; + } + + if (p.length != resp_size ) { + ERR( "expected AuthResponse payload of wrong size\n" ); + rv = 0; + goto fail; + } + + rv = drbd_recv(mdev, response , resp_size); + + if (rv != resp_size) { + ERR("short read receiving AuthResponse: l=%u\n", rv); + rv = 0; + goto fail; + } + + right_response = kmalloc(resp_size,GFP_KERNEL); + if(response == NULL) { + ERR("kmalloc of right_response failed\n"); + rv = 0; + goto fail; + } + + sg.page = virt_to_page(my_challenge); + sg.offset = offset_in_page(my_challenge); + sg.length = CHALLENGE_LEN; + + rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); + if(rv) { + ERR( "crypto_hash_digest() failed with %d\n",rv); + rv = 0; + goto fail; + } + + rv = ! 
memcmp(response,right_response,resp_size); + + if(rv) { + INFO("Peer authenticated using %d bytes of '%s' HMAC\n", + resp_size,mdev->net_conf->cram_hmac_alg); + } + + fail: + if(peers_ch) kfree(peers_ch); + if(response) kfree(response); + if(right_response) kfree(right_response); + + return rv; +} +#endif + +int drbdd_init(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + int minor = mdev_to_minor(mdev); + int h; + + sprintf(current->comm, "drbd%d_receiver", minor); + + INFO("receiver (re)started\n"); + + do { + h = drbd_connect(mdev); + if (h == 0) { + drbd_disconnect(mdev); + schedule_timeout(HZ); + } + if( h < 0 ) { + WARN("Discarding network configuration.\n"); + drbd_force_state(mdev,NS(conn,Disconnecting)); + } + } while ( h == 0 ); + + if( h > 0 ) { + if(inc_net(mdev)) { + drbdd(mdev); + dec_net(mdev); + } + } + + drbd_disconnect(mdev); + + // Ensure that the thread state fits to our connection state. + if( mdev->state.conn == Unconnected ) { + ERR_IF( mdev->receiver.t_state != Restarting ) + drbd_thread_restart_nowait(&mdev->receiver); + } else if( mdev->state.conn == StandAlone ) { + ERR_IF( mdev->receiver.t_state != Exiting ) + drbd_thread_stop_nowait(&mdev->receiver); + } + + INFO("receiver terminated\n"); + return 0; +} + +/* ********* acknowledge sender ******** */ + +STATIC int got_RqSReply(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_RqS_Reply_Packet *p = (Drbd_RqS_Reply_Packet*)h; + + int retcode = be32_to_cpu(p->retcode); + + if(retcode >= SS_Success) { + set_bit(CL_ST_CHG_SUCCESS,&mdev->flags); + } else { + set_bit(CL_ST_CHG_FAIL,&mdev->flags); + ERR("Requested state change failed by peer: %s\n", + set_st_err_name(retcode)); + } + wake_up(&mdev->state_wait); + + return TRUE; +} + +STATIC int got_Ping(drbd_dev *mdev, Drbd_Header* h) +{ + return drbd_send_ping_ack(mdev); + +} + +STATIC int got_PingAck(drbd_dev *mdev, Drbd_Header* h) +{ + // restore idle timeout + mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + return TRUE; +} + +STATIC int got_BlockAck(drbd_dev *mdev, Drbd_Header* h) +{ + drbd_request_t *req; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + update_peer_seq(mdev,be32_to_cpu(p->seq_num)); + + if( is_syncer_block_id(p->block_id)) { + drbd_set_in_sync(mdev,sector,blksize); + dec_rs_pending(mdev); + } else { + spin_lock_irq(&mdev->req_lock); + req = _ack_id_to_req(mdev, p->block_id, sector); + + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + ERR("Got a corrupt block_id/sector pair(2).\n"); + return FALSE; + } + + switch (be16_to_cpu(h->command)) { + case RSWriteAck: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + _req_mod(req,write_acked_by_peer_and_sis,0); + break; + case WriteAck: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + _req_mod(req,write_acked_by_peer,0); + break; + case RecvAck: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_A); + _req_mod(req,recv_acked_by_peer,0); + break; + case DiscardAck: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + ALERT("Got DiscardAck packet %llus +%u!" 
+ " DRBD is not a random data generator!\n", + (unsigned long long)req->sector, req->size); + _req_mod(req, conflict_discarded_by_peer, 0); + break; + default: + D_ASSERT(0); + } + spin_unlock_irq(&mdev->req_lock); + } + /* dec_ap_pending is handled within _req_mod */ + + return TRUE; +} + +STATIC int got_NegAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + drbd_request_t *req; + + if (DRBD_ratelimit(5*HZ,5)) + WARN("Got NegAck packet. Peer is in troubles?\n"); + + update_peer_seq(mdev,be32_to_cpu(p->seq_num)); + + if(is_syncer_block_id(p->block_id)) { + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); + + dec_rs_pending(mdev); + + drbd_rs_failed_io(mdev, sector, size); + } else { + req = _ack_id_to_req(mdev, p->block_id, sector); + + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + ERR("Got a corrupt block_id/sector pair(2).\n"); + return FALSE; + } + + req_mod(req, neg_acked, 0); + } + + return TRUE; +} + +STATIC int got_NegDReply(drbd_dev *mdev, Drbd_Header* h) +{ + drbd_request_t *req; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev,p->block_id, sector); + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + ERR("Got a corrupt block_id/sector pair(3).\n"); + return FALSE; + } + + /* FIXME explicitly warn if protocol != C */ + + ERR("Got NegDReply; Sector %llus, len %u; Fail original request.\n", + (unsigned long long)sector,be32_to_cpu(p->blksize)); + + _req_mod(req, neg_acked, 0); + spin_unlock_irq(&mdev->req_lock); + +// warning LGE "ugly and wrong" + drbd_khelper(mdev,"pri-on-incon-degr"); + + return TRUE; +} + +STATIC int got_NegRSDReply(drbd_dev *mdev, Drbd_Header* h) +{ + sector_t sector; + int size; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + D_ASSERT(p->block_id == ID_SYNCER); + + dec_rs_pending(mdev); + + drbd_rs_complete_io(mdev,sector); + + drbd_rs_failed_io(mdev, sector, size); + + return TRUE; +} + +STATIC int got_BarrierAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BarrierAck_Packet *p = (Drbd_BarrierAck_Packet*)h; + + tl_release(mdev,p->barrier,be32_to_cpu(p->set_size)); + dec_ap_pending(mdev); + + return TRUE; +} + +struct asender_cmd { + size_t pkt_size; + int (*process)(drbd_dev *mdev, Drbd_Header* h); +}; + +int drbd_asender(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + Drbd_Header *h = &mdev->meta.rbuf.head; + + int rv,len; + void *buf = h; + int received = 0; + int expect = sizeof(Drbd_Header); + int cmd = -1; + int empty; + + static struct asender_cmd asender_tbl[] = { + [Ping] ={ sizeof(Drbd_Header), got_Ping }, + [PingAck] ={ sizeof(Drbd_Header), got_PingAck }, + [RecvAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [WriteAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [RSWriteAck]={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [DiscardAck]={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [NegAck] ={ sizeof(Drbd_BlockAck_Packet), got_NegAck }, + [NegDReply] ={ sizeof(Drbd_BlockAck_Packet), got_NegDReply }, + [NegRSDReply]={sizeof(Drbd_BlockAck_Packet), got_NegRSDReply}, + [BarrierAck]={ sizeof(Drbd_BarrierAck_Packet),got_BarrierAck }, + [StateChgReply]={sizeof(Drbd_RqS_Reply_Packet),got_RqSReply }, + }; + + sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); + + current->policy = SCHED_RR; 
/* Make this a realtime task! */ + current->rt_priority = 2; /* more important than all other tasks */ + + while (get_t_state(thi) == Running) { + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto err; + mdev->meta.socket->sk->sk_rcvtimeo = + mdev->net_conf->ping_timeo*HZ/10; + } + + while(1) { + if (!drbd_process_done_ee(mdev)) { + ERR("process_done_ee() = NOT_OK\n"); + goto err; + } + set_bit(SIGNAL_ASENDER, &mdev->flags); + spin_lock_irq(&mdev->req_lock); + empty = list_empty(&mdev->done_ee); + spin_unlock_irq(&mdev->req_lock); + if(empty) break; + clear_bit(SIGNAL_ASENDER, &mdev->flags); + flush_signals(current); + } + drbd_tcp_flush(mdev->meta.socket); + + rv = drbd_recv_short(mdev, mdev->meta.socket, + buf,expect-received); + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + flush_signals(current); + + drbd_tcp_cork(mdev->meta.socket); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + ERR("meta connection shut down by peer.\n"); + goto err; + } else if (rv == -EAGAIN) { + if( mdev->meta.socket->sk->sk_rcvtimeo == + mdev->net_conf->ping_timeo*HZ/10 ) { + ERR("PingAck did not arrive in time.\n"); + goto err; + } + set_bit(SEND_PING,&mdev->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + ERR("sock_recvmsg returned %d\n", rv); + goto err; + } + + if (received == expect && cmd == -1 ) { + cmd = be16_to_cpu(h->command); + len = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? on meta m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto err; + } + expect = asender_tbl[cmd].pkt_size; + ERR_IF(len != expect-sizeof(Drbd_Header)) { + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + DUMPI(expect); + } + } + if(received == expect) { + D_ASSERT(cmd != -1); + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + if(!asender_tbl[cmd].process(mdev,h)) goto err; + + buf = h; + received = 0; + expect = sizeof(Drbd_Header); + cmd = -1; + } + } //while + + if(0) { + err: + clear_bit(SIGNAL_ASENDER, &mdev->flags); + if (mdev->state.conn >= Connected) + drbd_force_state(mdev,NS(conn,NetworkFailure)); + } + + D_ASSERT(mdev->state.conn < Connected); + INFO("asender terminated\n"); + + return 0; +} diff -uprN linux-2.6.24/drivers/block/drbd/drbd_req.c linux-2.6.24.ovz/drivers/block/drbd/drbd_req.c --- linux-2.6.24/drivers/block/drbd/drbd_req.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_req.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,1187 @@ +/* +-*- linux-c -*- + drbd_req.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include "drbd_int.h" +#include "drbd_req.h" + +//#define VERBOSE_REQUEST_CODE +#if defined(VERBOSE_REQUEST_CODE) || defined(ENABLE_DYNAMIC_TRACE) +void _print_req_mod(drbd_request_t *req,drbd_req_event_t what) +{ + drbd_dev *mdev = req->mdev; + const int rw = (req->master_bio == NULL || + bio_data_dir(req->master_bio) == WRITE) ? + 'W' : 'R'; + + static const char *rq_event_names[] = { + [created] = "created", + [to_be_send] = "to_be_send", + [to_be_submitted] = "to_be_submitted", + [queue_for_net_write] = "queue_for_net_write", + [queue_for_net_read] = "queue_for_net_read", + [send_canceled] = "send_canceled", + [send_failed] = "send_failed", + [handed_over_to_network] = "handed_over_to_network", + [connection_lost_while_pending] = "connection_lost_while_pending", + [recv_acked_by_peer] = "recv_acked_by_peer", + [write_acked_by_peer] = "write_acked_by_peer", + [neg_acked] = "neg_acked", + [conflict_discarded_by_peer] = "conflict_discarded_by_peer", + [barrier_acked] = "barrier_acked", + [data_received] = "data_received", + [read_completed_with_error] = "read_completed_with_error", + [write_completed_with_error] = "write_completed_with_error", + [completed_ok] = "completed_ok", + }; + + INFO("_req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]); +} + +void _print_rq_state(drbd_request_t *req, const char *txt) +{ + const unsigned long s = req->rq_state; + drbd_dev *mdev = req->mdev; + const int rw = (req->master_bio == NULL || + bio_data_dir(req->master_bio) == WRITE) ? + 'W' : 'R'; + + INFO("%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n", + txt, req, rw, + s & RQ_LOCAL_PENDING ? 'p' : '-', + s & RQ_LOCAL_COMPLETED ? 'c' : '-', + s & RQ_LOCAL_OK ? 'o' : '-', + s & RQ_NET_PENDING ? 'p' : '-', + s & RQ_NET_QUEUED ? 'q' : '-', + s & RQ_NET_SENT ? 's' : '-', + s & RQ_NET_DONE ? 'd' : '-', + s & RQ_NET_OK ? 'o' : '-', + req->epoch, + (unsigned long long)req->sector, + req->size, + conns_to_name(mdev->state.conn)); +} + +# ifdef ENABLE_DYNAMIC_TRACE +# define print_rq_state(R,T) MTRACE(TraceTypeRq,TraceLvlMetrics,_print_rq_state(R,T);) +# define print_req_mod(T,W) MTRACE(TraceTypeRq,TraceLvlMetrics,_print_req_mod(T,W);) +# else +# define print_rq_state(R,T) _print_rq_state(R,T) +# define print_req_mod(T,W) _print_req_mod(T,W) +# endif + +#else +#define print_rq_state(R,T) +#define print_req_mod(T,W) +#endif + +static void _req_is_done(drbd_dev *mdev, drbd_request_t *req, const int rw) +{ + const unsigned long s = req->rq_state; + /* if it was a write, we may have to set the corresponding + * bit(s) out-of-sync first. If it had a local part, we need to + * release the reference to the activity log. */ + if (rw == WRITE) { + /* remove it from the transfer log. + * well, only if it had been there in the first + * place... if it had not (local only or conflicting + * and never sent), it should still be "empty" as + * initialised in drbd_req_new(), so we can list_del() it + * here unconditionally */ + list_del(&req->tl_requests); + /* Set out-of-sync unless both OK flags are set + * (local only or remote failed). 
+ * Other places where we set out-of-sync: + * READ with local io-error */ + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) { + drbd_set_out_of_sync(mdev,req->sector,req->size); + } + + if( (s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && + (s & RQ_NET_SIS) ) { + drbd_set_in_sync(mdev,req->sector,req->size); + } + + /* one might be tempted to move the drbd_al_complete_io + * to the local io completion callback drbd_endio_pri. + * but, if this was a mirror write, we may only + * drbd_al_complete_io after this is RQ_NET_DONE, + * otherwise the extent could be dropped from the al + * before it has actually been written on the peer. + * if we crash before our peer knows about the request, + * but after the extent has been dropped from the al, + * we would forget to resync the corresponding extent. + */ + if (s & RQ_LOCAL_MASK) { + if (inc_local_if_state(mdev,Failed)) { + drbd_al_complete_io(mdev, req->sector); + dec_local(mdev); + } else { + WARN("Should have called drbd_al_complete_io(, %llu), " + "but my Disk seems to have failed:(\n", req->sector); + } + } + } + + /* if it was a local io error, we want to notify our + * peer about that, and see if we need to + * detach the disk and stuff. + * to avoid allocating some special work + * struct, reuse the request. */ + + /* THINK + * why do we do this not when we detect the error, + * but delay it until it is "done", i.e. possibly + * until the next barrier ack? */ + + if (rw == WRITE && + (( s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { + if (!(req->w.list.next == LIST_POISON1 || + list_empty(&req->w.list))) { + /* DEBUG ASSERT only; if this triggers, we + * probably corrupt the worker list here */ + DUMPP(req->w.list.next); + DUMPP(req->w.list.prev); + } + req->w.cb = w_io_error; + drbd_queue_work(&mdev->data.work, &req->w); + /* drbd_req_free() is done in w_io_error */ + } else { + drbd_req_free(req); + } +} + +static void _about_to_complete_local_write(drbd_dev *mdev, drbd_request_t *req) +{ + const unsigned long s = req->rq_state; + drbd_request_t *i; + struct Tl_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; + + /* before we can signal completion to the upper layers, + * we may need to close the current epoch */ + if (req->epoch == mdev->newest_barrier->br_number) + set_bit(ISSUE_BARRIER,&mdev->flags); + + /* we need to do the conflict detection stuff, + * if we have the ee_hash (two_primaries) and + * this has been on the network */ + if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { + const sector_t sector = req->sector; + const int size = req->size; + + /* ASSERT: + * there must be no conflicting requests, since + * they must have been failed on the spot */ +#define OVERLAPS overlaps(sector, size, i->sector, i->size) + slot = tl_hash_slot(mdev,sector); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + ALERT("LOGIC BUG: completed: %p %llus +%u; other: %p %llus +%u\n", + req, (unsigned long long)sector, size, + i, (unsigned long long)i->sector, i->size); + } + } + + /* maybe "wake" those conflicting epoch entries + * that wait for this request to finish. + * + * currently, there can be only _one_ such ee + * (well, or some more, which would be pending + * DiscardAck not yet sent by the asender...), + * since we block the receiver thread upon the + * first conflict detection, which will wait on + * misc_wait. maybe we want to assert that? + * + * anyways, if we found one, + * we just have to do a wake_up. 
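
The OVERLAPS macro used in these hash walks is an interval intersection test on (sector, byte size) pairs; a minimal stand-alone sketch, assuming the usual half-open ranges and 512-byte sectors (the real macro is defined by the driver, not reproduced here):

    #include <stdio.h>

    typedef unsigned long long sector_t;

    // two ranges, each given as a start sector plus a size in bytes,
    // intersect iff each one starts before the other one ends
    static int ranges_overlap(sector_t s1, int size1, sector_t s2, int size2)
    {
        return s1 < s2 + (size2 >> 9) && s2 < s1 + (size1 >> 9);
    }

    int main(void)
    {
        // 4 KiB at sector 262144 vs 4 KiB at sector 262148: overlap
        printf("%d\n", ranges_overlap(262144ULL, 4096, 262148ULL, 4096));  // 1
        // 4 KiB at sector 262144 vs 4 KiB at sector 262152: disjoint
        printf("%d\n", ranges_overlap(262144ULL, 4096, 262152ULL, 4096));  // 0
        return 0;
    }
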
*/ +#undef OVERLAPS +#define OVERLAPS overlaps(sector, size, e->sector, e->size) + slot = ee_hash_slot(mdev,req->sector); + hlist_for_each_entry(e, n, slot, colision) { + if (OVERLAPS) { + wake_up(&mdev->misc_wait); + break; + } + } + } +#undef OVERLAPS +} + +static void _complete_master_bio(drbd_dev *mdev, drbd_request_t *req, int error) +{ + dump_bio(mdev,req->master_bio,1); + bio_endio(req->master_bio, req->master_bio->bi_size, error); + req->master_bio = NULL; + dec_ap_bio(mdev); +} + +void _req_may_be_done(drbd_request_t *req, int error) +{ + const unsigned long s = req->rq_state; + drbd_dev *mdev = req->mdev; + int rw; + + print_rq_state(req, "_req_may_be_done"); + MUST_HOLD(&mdev->req_lock) + + if (s & RQ_NET_PENDING) return; + if (s & RQ_LOCAL_PENDING) return; + + if (req->master_bio) { + /* this is data_received (remote read) + * or protocol C WriteAck + * or protocol B RecvAck + * or protocol A "handed_over_to_network" (SendAck) + * or canceled or failed, + * or killed from the transfer log due to connection loss. + */ + + /* + * figure out whether to report success or failure. + * + * report success when at least one of the operations suceeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + */ + int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + rw = bio_data_dir(req->master_bio); + + /* remove the request from the conflict detection + * respective block_id verification hash */ + if (!hlist_unhashed(&req->colision)) hlist_del(&req->colision); + else D_ASSERT((s & RQ_NET_MASK) == 0); + + if (rw == WRITE) { + /* for writes we need to do some extra housekeeping */ + _about_to_complete_local_write(mdev,req); + } + + /* FIXME not yet implemented... + * in case we got "suspended" (on_disconnect: freeze io) + * we may not yet complete the request... + * though, this is probably best handled elsewhere by not + * walking the transfer log until "unfreeze", so we won't end + * up here anyways during the freeze ... + * then again, if it is a READ, it is not in the TL at all. + * is it still leagal to complete a READ during freeze? */ + + _complete_master_bio(mdev,req, + ok ? 0 : ( error ? error : -EIO ) ); + } else { + /* only WRITE requests can end up here without a master_bio */ + rw = WRITE; + } + + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { + /* this is disconnected (local only) operation, + * or protocol C WriteAck, + * or protocol A or B BarrierAck, + * or killed from the transfer log due to connection loss. */ + _req_is_done(mdev,req,rw); + } + /* else: network part and not DONE yet. that is + * protocol A or B, barrier ack still pending... */ +} + +/* + * checks whether there was an overlapping request + * or ee already registered. + * + * if so, return 1, in which case this request is completed on the spot, + * without ever being submitted or send. + * + * return 0 if it is ok to submit this request. + * + * NOTE: + * paranoia: assume something above us is broken, and issues different write + * requests for the same block simultaneously... + * + * To ensure these won't be reordered differently on both nodes, resulting in + * diverging data sets, we discard the later one(s). Not that this is supposed + * to happen, but this is the rationale why we also have to check for + * conflicting requests with local origin, and why we have to do so regardless + * of whether we allowed multiple primaries. 
+ * + * BTW, in case we only have one primary, the ee_hash is empty anyways, and the + * second hlist_for_each_entry becomes a noop. This is even simpler than to + * grab a reference on the net_conf, and check for the two_primaries flag... + */ +STATIC int _req_conflicts(drbd_request_t *req) +{ + drbd_dev *mdev = req->mdev; + const sector_t sector = req->sector; + const int size = req->size; + drbd_request_t *i; + struct Tl_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; + + MUST_HOLD(&mdev->req_lock); + D_ASSERT(hlist_unhashed(&req->colision)); + + /* FIXME should this inc_net/dec_net + * rather be done in drbd_make_request_common? */ + if (!inc_net(mdev)) + return 0; + + /* BUG_ON */ + ERR_IF (mdev->tl_hash_s == 0) + goto out_no_conflict; + BUG_ON(mdev->tl_hash == NULL); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev,sector); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + ALERT("%s[%u] Concurrent local write detected!" + " [DISCARD L] new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + goto out_conflict; + } + } + + if(mdev->ee_hash_s) { + /* now, check for overlapping requests with remote origin */ + BUG_ON(mdev->ee_hash == NULL); +#undef OVERLAPS +#define OVERLAPS overlaps(e->sector, e->size, sector, size) + slot = ee_hash_slot(mdev,sector); + hlist_for_each_entry(e, n, slot, colision) { + if (OVERLAPS) { + ALERT("%s[%u] Concurrent remote write detected!" + " [DISCARD L] new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)e->sector, e->size); + goto out_conflict; + } + } + } +#undef OVERLAPS + + out_no_conflict: + /* this is like it should be, and what we expected. + * our users do behave after all... */ + dec_net(mdev); + return 0; + + out_conflict: + dec_net(mdev); + return 1; +} + +/* obviously this could be coded as many single functions + * instead of one huge switch, + * or by putting the code directly in the respective locations + * (as it has been before). + * + * but having it this way + * enforces that it is all in this one place, where it is easier to audit, + * it makes it obvious that whatever "event" "happens" to a request should + * happen "atomically" within the req_lock, + * and it enforces that we have to think in a very structured manner + * about the "events" that may happen to a request during its life time ... + * + * Though I think it is likely that we break this again into many + * static inline void _req_mod_ ## what (req) ... + */ +void _req_mod(drbd_request_t *req, drbd_req_event_t what, int error) +{ + drbd_dev *mdev = req->mdev; + MUST_HOLD(&mdev->req_lock); + + if (error && ( bio_rw(req->master_bio) != READA ) ) { + ERR("got an _req_mod() errno of %d\n",error); + } + + print_req_mod(req,what); + + switch(what) { + default: + ERR("LOGIC BUG in %s:%u\n", __FILE__ , __LINE__ ); + return; + + /* does not happen... 
+ * initialization done in drbd_req_new + case created: + break; + */ + + case to_be_send: /* via network */ + /* reached via drbd_make_request_common + * and from FIXME w_read_retry_remote */ + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + break; + + case to_be_submitted: /* locally */ + /* reached via drbd_make_request_common */ + D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); + req->rq_state |= RQ_LOCAL_PENDING; + break; + + /* FIXME these *_completed_* are basically the same. + * can probably be merged with some if (what == xy) */ + + case completed_ok: + if (bio_data_dir(req->private_bio) == WRITE) + mdev->writ_cnt += req->size>>9; + else + mdev->read_cnt += req->size>>9; + + bio_put(req->private_bio); + req->private_bio = NULL; + + req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + req->rq_state &= ~RQ_LOCAL_PENDING; + + _req_may_be_done(req,error); + dec_local(mdev); + break; + + case write_completed_with_error: + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + bio_put(req->private_bio); + req->private_bio = NULL; + ALERT("Local WRITE failed sec=%llus size=%u\n", + (unsigned long long)req->sector, req->size); + /* and now: check how to handle local io error. + * FIXME see comment below in read_completed_with_error */ + __drbd_chk_io_error(mdev,FALSE); + _req_may_be_done(req,error); + dec_local(mdev); + break; + + case read_completed_with_error: + if (bio_rw(req->master_bio) != READA) { + drbd_set_out_of_sync(mdev,req->sector,req->size); + } + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + bio_put(req->private_bio); + req->private_bio = NULL; + dec_local(mdev); + if (bio_rw(req->master_bio) == READA) { + /* it is legal to fail READA */ + _req_may_be_done(req,error); + break; + } + /* else */ + ALERT("Local READ failed sec=%llus size=%u\n", + (unsigned long long)req->sector, req->size); + /* _req_mod(req,to_be_send); oops, recursion in static inline */ + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + + /* and now: check how to handle local io error. + * + * FIXME we should not handle WRITE and READ io errors + * the same. When we retry the READ, and then write + * the answer, that might suceed because modern drives + * would relocate the sectors. We'd need to keep our + * private bio then, and round the offset and size so + * we get back enough data to be able to clear the bits again. + */ + __drbd_chk_io_error(mdev,FALSE); + /* fall through: _req_mod(req,queue_for_net_read); */ + + case queue_for_net_read: + /* READ or READA, and + * no local disk, + * or target area marked as invalid, + * or just got an io-error. */ + /* from drbd_make_request_common + * or from bio_endio during read io-error recovery */ + + /* so we can verify the handle in the answer packet + * corresponding hlist_del is in _req_may_be_done() */ + hlist_add_head(&req->colision, ar_hash_slot(mdev,req->sector)); + + set_bit(UNPLUG_REMOTE,&mdev->flags); /* why? */ + + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = (req->rq_state & RQ_LOCAL_MASK) + ? w_read_retry_remote + : w_send_read_req; + drbd_queue_work(&mdev->data.work, &req->w); + break; + + case queue_for_net_write: + /* assert something? 
*/ + /* from drbd_make_request_common only */ + + hlist_add_head(&req->colision,tl_hash_slot(mdev,req->sector)); + /* corresponding hlist_del is in _req_may_be_done() */ + + /* NOTE + * In case the req ended up on the transfer log before being + * queued on the worker, it could lead to this request being + * missed during cleanup after connection loss. + * So we have to do both operations here, + * within the same lock that protects the transfer log. + * + * _req_add_to_epoch(req); this has to be after the + * _maybe_start_new_epoch(req); which happened in + * drbd_make_request_common, because we now may set the bit + * again ourselves to close the current epoch. + * + * Add req to the (now) current epoch (barrier). */ + + /* see drbd_make_request_common just after it grabs the req_lock */ + D_ASSERT(test_bit(ISSUE_BARRIER, &mdev->flags) == 0); + + req->epoch = mdev->newest_barrier->br_number; + list_add_tail(&req->tl_requests,&mdev->newest_barrier->requests); + + /* mark the current epoch as closed, + * in case it outgrew the limit */ + if( ++mdev->newest_barrier->n_req >= mdev->net_conf->max_epoch_size ) + set_bit(ISSUE_BARRIER,&mdev->flags); + + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = w_send_dblock; + drbd_queue_work(&mdev->data.work, &req->w); + break; + + /* FIXME + * to implement freeze-io, + * we may not finish the request just yet. + */ + case send_canceled: + /* for the request, this is the same thing */ + case send_failed: + /* real cleanup will be done from tl_clear. just update flags so + * it is no longer marked as on the worker queue */ + req->rq_state &= ~RQ_NET_QUEUED; + /* if we did it right, tl_clear should be scheduled only after this, + * so this should not be necessary! */ + _req_may_be_done(req,error); + break; + + case handed_over_to_network: + /* assert something? */ + if ( bio_data_dir(req->master_bio) == WRITE && + mdev->net_conf->wire_protocol == DRBD_PROT_A ) { + /* this is what is dangerous about protocol A: + * pretend it was sucessfully written on the peer. + * FIXME in case we get a local io-error in + * protocol != C, we might want to defer comletion + * until we get the barrier ack, and send a NegAck + * in case the other node had an io-error, too... + * That way we would at least not report "success" + * if it was not written at all. */ + if (req->rq_state & RQ_NET_PENDING) { + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= RQ_NET_OK; + } /* else: neg-ack was faster... */ + /* it is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss */ + } + req->rq_state &= ~RQ_NET_QUEUED; + req->rq_state |= RQ_NET_SENT; + /* because _drbd_send_zc_bio could sleep, and may want to + * dereference the bio even after the "write_acked_by_peer" and + * "completed_ok" events came in, once we return from + * _drbd_send_zc_bio (drbd_send_dblock), we have to check + * whether it is done already, and end it. */ + _req_may_be_done(req,error); + break; + + case connection_lost_while_pending: + /* transfer log cleanup after connection loss */ + /* assert something? */ + if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + req->rq_state |= RQ_NET_DONE; + /* if it is still queued, we may not complete it here. + * it will be canceled soon. + * FIXME we should change the code so this can not happen. 
*/ + if (!(req->rq_state & RQ_NET_QUEUED)) + _req_may_be_done(req,error); + break; + + case write_acked_by_peer_and_sis: + req->rq_state |= RQ_NET_SIS; + case conflict_discarded_by_peer: + /* interesstingly, this is the same thing! */ + case write_acked_by_peer: + /* assert something? */ + /* protocol C; successfully written on peer */ + req->rq_state |= RQ_NET_DONE; + /* rest is the same as for: */ + case recv_acked_by_peer: + /* protocol B; pretends to be sucessfully written on peer. + * see also notes above in handed_over_to_network about + * protocol != C */ + req->rq_state |= RQ_NET_OK; + D_ASSERT(req->rq_state & RQ_NET_PENDING); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + if (req->rq_state & RQ_NET_SENT) + _req_may_be_done(req,error); + /* else: done by handed_over_to_network */ + break; + + case neg_acked: + /* assert something? */ + if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + /* FIXME THINK! is it DONE now, or is it not? */ + req->rq_state |= RQ_NET_DONE; + if (req->rq_state & RQ_NET_SENT) + _req_may_be_done(req,error); + /* else: done by handed_over_to_network */ + break; + + case barrier_acked: + /* can even happen for protocol C, + * when local io is still pending. + * in which case it does nothing. */ + if (req->rq_state & RQ_NET_PENDING) { + /* barrier came in before all requests have been acked. + * this is bad, because if the connection is lost now, + * we won't be able to clean them up... */ + const unsigned long s = req->rq_state; + INFO("%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n", + "FIXME", req, + /* in fact, it can only be a WRITE, but anyways */ + bio_data_dir(req->master_bio) == WRITE ? 'W' : 'R', + s & RQ_LOCAL_PENDING ? 'p' : '-', + s & RQ_LOCAL_COMPLETED ? 'c' : '-', + s & RQ_LOCAL_OK ? 'o' : '-', + s & RQ_NET_PENDING ? 'p' : '-', + s & RQ_NET_QUEUED ? 'q' : '-', + s & RQ_NET_SENT ? 's' : '-', + s & RQ_NET_DONE ? 'd' : '-', + s & RQ_NET_OK ? 'o' : '-', + req->epoch, + (unsigned long long)req->sector, + req->size, + conns_to_name(mdev->state.conn)); + } + D_ASSERT(req->rq_state & RQ_NET_SENT); + req->rq_state |= RQ_NET_DONE; + _req_may_be_done(req,error); + break; + + case data_received: + D_ASSERT(req->rq_state & RQ_NET_PENDING); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); + /* can it happen that we receive the DataReply + * before the send DataRequest function returns? */ + if (req->rq_state & RQ_NET_SENT) + _req_may_be_done(req,error); + /* else: done by handed_over_to_network */ + break; + }; +} + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. + * since size may be bigger than BM_BLOCK_SIZE, + * we may need to check several bits. 
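
To make the "several bits" remark concrete, a stand-alone sketch of the sector-to-bitmap-bit arithmetic performed by the function below; the 4 KiB-per-bit granularity is an assumption for the example, the real constants (BM_BLOCK_SIZE, BM_SECT_TO_BIT) come from the driver headers:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    #define SECTORS_PER_BIT 8ULL              // assumed: one bit covers 4 KiB
    #define SECT_TO_BIT(s)  ((s) / SECTORS_PER_BIT)

    int main(void)
    {
        sector_t sector = 262269ULL;          // first sector of the request
        int size = 16384;                     // 16 KiB, i.e. 32 sectors
        sector_t esector = sector + (size >> 9) - 1;   // last sector touched

        sector_t sbnr = SECT_TO_BIT(sector);  // first bitmap bit to test
        sector_t ebnr = SECT_TO_BIT(esector); // last bitmap bit to test

        // a read may be served locally only if none of these bits is set
        printf("check bits %llu..%llu\n", sbnr, ebnr);   // 32783..32787
        return 0;
    }
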
+ */ +STATIC int drbd_may_do_local_read(drbd_dev *mdev, sector_t sector, int size) +{ + unsigned long sbnr,ebnr,bnr; + sector_t esector, nr_sectors; + + if (mdev->state.disk == UpToDate) return 1; + if (mdev->state.disk >= Outdated) return 0; + if (mdev->state.disk < Inconsistent) return 0; + // state.disk == Inconsistent We will have a look at the BitMap + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + D_ASSERT(sector < nr_sectors); + D_ASSERT(esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + for (bnr = sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_test_bit(mdev,bnr)) return 0; + } + return 1; +} + +/* + * general note: + * looking at the state (conn, disk, susp, pdsk) outside of the spinlock that + * protects the state changes is inherently racy. + * + * FIXME verify this rationale why we may do so anyways: + * + * I think it "should" be like this: + * as soon as we have a "ap_bio_cnt" reference we may test for "bad" states, + * because the transition from "bad" to "good" states may only happen while no + * application request is on the fly, so once we are positive about a "bad" + * state, we know it won't get better during the lifetime of this request. + * + * In case we think we are ok, but "asynchronously" some interrupt or other thread + * marks some operation as impossible, we are still ok, since we would just try + * anyways, and then see that it does not work there and then. + */ + +STATIC int +drbd_make_request_common(drbd_dev *mdev, int rw, int size, + sector_t sector, struct bio *bio) +{ + struct drbd_barrier *b = NULL; + drbd_request_t *req; + int local, remote; + int err = -EIO; + + /* allocate outside of all locks; get a "reference count" (ap_bio_cnt) + * to avoid races with the disconnect/reconnect code. */ + inc_ap_bio(mdev); + req = drbd_req_new(mdev,bio); + if (!req) { + dec_ap_bio(mdev); + /* only pass the error to the upper layers. + * if user cannot handle io errors, thats not our business. */ + ERR("could not kmalloc() req\n"); + bio_endio(bio, bio->bi_size, -ENOMEM); + return 0; + } + + dump_bio(mdev,bio,0); + + local = inc_local(mdev); + if (!local) { + bio_put(req->private_bio); /* or we get a bio leak */ + req->private_bio = NULL; + } + if (rw == WRITE) { + remote = 1; + } else { + /* READ || READA */ + if (local) { + if (!drbd_may_do_local_read(mdev,sector,size)) { + /* we could kick the syncer to + * sync this extent asap, wait for + * it, then continue locally. + * Or just issue the request remotely. + */ + /* FIXME + * I think we have a RACE here. We request + * something from the peer, then later some + * write starts ... and finished *before* + * the answer to the read comes in, because + * the ACK for the WRITE goes over + * meta-socket ... + * Maybe we need to properly lock reads + * against the syncer, too. But if we have + * some user issuing writes on an area that + * he has pending reads on, _he_ is really + * broke anyways, and would get "undefined + * results" on _any_ io stack, even just the + * local io stack. + */ + +/* XXX SHARED DISK mode + * think this over again for two primaries */ + + local = 0; + bio_put(req->private_bio); + req->private_bio = NULL; + dec_local(mdev); + } + } + remote = !local && mdev->state.pdsk >= UpToDate;//Consistent; + } + + /* If we have a disk, but a READA request is mapped to remote, + * we are Primary, Inconsistent, SyncTarget. + * Just fail that READA request right here. + * + * THINK: maybe fail all READA when not local? 
+ * or make this configurable... + * if network is slow, READA won't do any good. + */ + if (rw == READA && mdev->state.disk >= Inconsistent && !local) { + err = -EWOULDBLOCK; + goto fail_and_free_req; + } + + /* For WRITES going to the local disk, grab a reference on the target extent. + * This waits for any resync activity in the corresponding resync + * extent to finish, and, if necessary, pulls in the target extent into + * the activity log, which involves further disk io because of transactional + * on-disk meta data updates. */ + if (rw == WRITE && local) + drbd_al_begin_io(mdev, sector); + + remote = remote && (mdev->state.pdsk == UpToDate || + ( mdev->state.pdsk == Inconsistent && + mdev->state.conn >= Connected ) ); + + if (!(local || remote)) { + ERR("IO ERROR: neither local nor remote disk\n"); + goto fail_and_free_req; + } + + /* For WRITE request, we have to make sure that we have an + * unused_spare_barrier, in case we need to start a new epoch. + * I try to be smart and avoid to pre-allocate always "just in case", + * but there is a race between testing the bit and pointer outside the + * spinlock, and grabbing the spinlock. + * if we lost that race, we retry. */ + if (rw == WRITE && remote && + mdev->unused_spare_barrier == NULL && + test_bit(ISSUE_BARRIER,&mdev->flags)) + { + allocate_barrier: + b = kmalloc(sizeof(struct drbd_barrier),GFP_NOIO); + if(!b) { + ERR("Failed to alloc barrier."); + err = -ENOMEM; + goto fail_and_free_req; + } + } + + /* GOOD, everything prepared, grab the spin_lock */ + spin_lock_irq(&mdev->req_lock); + + /* FIXME race with drbd_disconnect and tl_clear? */ + if (remote) { + remote = (mdev->state.pdsk == UpToDate || + ( mdev->state.pdsk == Inconsistent && + mdev->state.conn >= Connected ) ); + if (!remote) { + WARN("lost connection while grabbing the req_lock!\n"); + } + if (!(local || remote)) { + ERR("IO ERROR: neither local nor remote disk\n"); + spin_unlock_irq(&mdev->req_lock); + goto fail_and_free_req; + } + } + + if (b && mdev->unused_spare_barrier == NULL) { + mdev->unused_spare_barrier = b; + b = NULL; + } + if (rw == WRITE && remote && + mdev->unused_spare_barrier == NULL && + test_bit(ISSUE_BARRIER,&mdev->flags)) { + /* someone closed the current epoch + * while we were grabbing the spinlock */ + spin_unlock_irq(&mdev->req_lock); + goto allocate_barrier; + } + + + /* _maybe_start_new_epoch(mdev); + * If we need to generate a write barrier packet, we have to add the + * new epoch (barrier) object, and queue the barrier packet for sending, + * and queue the req's data after it _within the same lock_, otherwise + * we have race conditions were the reorder domains could be mixed up. + * + * Even read requests may start a new epoch and queue the corresponding + * barrier packet. To get the write ordering right, we only have to + * make sure that, if this is a write request and it triggered a + * barrier packet, this request is queued within the same spinlock. */ + if (remote && mdev->unused_spare_barrier && + test_and_clear_bit(ISSUE_BARRIER,&mdev->flags)) { + struct drbd_barrier *b = mdev->unused_spare_barrier; + b = _tl_add_barrier(mdev,b); + mdev->unused_spare_barrier = NULL; + b->w.cb = w_send_barrier; + /* inc_ap_pending done here, so we won't + * get imbalanced on connection loss. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in tl_clear. 
*/ + inc_ap_pending(mdev); + drbd_queue_work(&mdev->data.work, &b->w); + } else { + D_ASSERT(!(remote && rw == WRITE && + test_bit(ISSUE_BARRIER,&mdev->flags))); + } + + /* NOTE + * Actually, 'local' may be wrong here already, since we may have failed + * to write to the meta data, and may become wrong anytime because of + * local io-error for some other request, which would lead to us + * "detaching" the local disk. + * + * 'remote' may become wrong any time because the network could fail. + * + * This is a harmless race condition, though, since it is handled + * correctly at the appropriate places; so it just deferres the failure + * of the respective operation. + */ + + /* mark them early for readability. + * this just sets some state flags. */ + if (remote) _req_mod(req, to_be_send, 0); + if (local) _req_mod(req, to_be_submitted, 0); + + /* check this request on the colison detection hash tables. + * if we have a conflict, just complete it here. + * THINK do we want to check reads, too? (I don't think so...) */ + if (rw == WRITE && _req_conflicts(req)) { + /* this is a conflicting request. + * even though it may have been only _partially_ + * overlapping with one of the currently pending requests, + * without even submitting or sending it, we will + * pretend that it was successfully served right now. + */ + if (local) { + bio_put(req->private_bio); + req->private_bio = NULL; + drbd_al_complete_io(mdev, req->sector); + dec_local(mdev); + local = 0; + } + if (remote) dec_ap_pending(mdev); + dump_bio(mdev,req->master_bio,1); + /* THINK: do we want to fail it (-EIO), or pretend success? */ + bio_endio(req->master_bio, req->master_bio->bi_size, 0); + req->master_bio = NULL; + dec_ap_bio(mdev); + drbd_req_free(req); + local = remote = 0; + } + + /* NOTE remote first: to get the concurrent write detection right, + * we must register the request before start of local IO. */ + if (remote) { + /* either WRITE and Connected, + * or READ, and no local disk, + * or READ, but not in sync. + */ + if (rw == WRITE) _req_mod(req,queue_for_net_write, 0); + else _req_mod(req,queue_for_net_read, 0); + } + spin_unlock_irq(&mdev->req_lock); + if (b) kfree(b); /* if someone else has beaten us to it... */ + + if (local) { + /* FIXME what ref count do we have to ensure the backing_bdev + * was not detached below us? */ + req->private_bio->bi_bdev = mdev->bc->backing_bdev; + + if (FAULT_ACTIVE(rw==WRITE ? DRBD_FAULT_DT_WR : + ( rw==READ ? DRBD_FAULT_DT_RD : + DRBD_FAULT_DT_RA ) )) + bio_endio(req->private_bio, req->private_bio->bi_size, -EIO); + else + generic_make_request(req->private_bio); + } + + /* we need to plug ALWAYS since we possibly need to kick lo_dev. + * we plug after submit, so we won't miss an unplug event */ + drbd_plug_device(mdev); + + return 0; + + fail_and_free_req: + if (b) kfree(b); + bio_endio(bio, bio->bi_size, err); + drbd_req_free(req); + return 0; +} + +/* helper function for drbd_make_request + * if we can determine just by the mdev (state) that this request will fail, + * return 1 + * otherwise return 0 + */ +static int drbd_fail_request_early(drbd_dev* mdev, int is_write) +{ + // Unconfigured + if (mdev->state.conn == Disconnecting && + mdev->state.disk == Diskless) + return 1; + + if (mdev->state.role != Primary && + ( !allow_oos || is_write) ) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("Process %s[%u] tried to %s; since we are not in Primary state, we cannot allow this\n", + current->comm, current->pid, is_write ? 
"WRITE" : "READ"); + } + return 1; + } + + /* + * Paranoia: we might have been primary, but sync target, or + * even diskless, then lost the connection. + * This should have been handled (panic? suspend?) somehwere + * else. But maybe it was not, so check again here. + * Caution: as long as we do not have a read/write lock on mdev, + * to serialize state changes, this is racy, since we may lose + * the connection *after* we test for the cstate. + */ + if ( mdev->state.disk < UpToDate && + mdev->state.conn < Connected) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("Sorry, I have no access to good data anymore.\n"); + } + /* + * FIXME suspend, loop waiting on cstate wait? + */ + return 1; + } + + return 0; +} + +int drbd_make_request_26(request_queue_t *q, struct bio *bio) +{ + unsigned int s_enr,e_enr; + struct Drbd_Conf* mdev = (drbd_dev*) q->queuedata; + + if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { + bio_endio(bio, bio->bi_size, -EPERM); + return 0; + } + + /* Currently our BARRIER code is disabled. */ + if(unlikely(bio_barrier(bio))) { + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } + + /* + * what we "blindly" assume: + */ + D_ASSERT(bio->bi_size > 0); + D_ASSERT( (bio->bi_size & 0x1ff) == 0); + // D_ASSERT(bio->bi_size <= q->max_segment_size); // wrong. + D_ASSERT(bio->bi_idx == 0); + +#if 1 + /* to make some things easier, force allignment of requests within the + * granularity of our hash tables */ + s_enr = bio->bi_sector >> HT_SHIFT; + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; +#else + /* when not using two primaries (and not being as paranoid as lge), + * actually there is no need to be as strict. + * only force allignment within AL_EXTENT boundaries */ + s_enr = bio->bi_sector >> (AL_EXTENT_SIZE_B-9); + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> (AL_EXTENT_SIZE_B-9); + D_ASSERT(e_enr >= s_enr); +#endif + + if(unlikely(s_enr != e_enr)) { + /* This bio crosses some boundary, so we have to split it. + * [So far, only XFS is known to do this...] */ + struct bio_pair *bp; +#if 1 + /* works for the "do not cross hash slot boundaries" case + * e.g. sector 262269, size 4096 + * s_enr = 262269 >> 6 = 4097 + * e_enr = (262269+8-1) >> 6 = 4098 + * HT_SHIFT = 6 + * sps = 64, mask = 63 + * first_sectors = 64 - (262269 & 63) = 3 + */ + const sector_t sect = bio->bi_sector; + const int sps = 1<bi_sector); +#endif + drbd_make_request_26(q,&bp->bio1); + drbd_make_request_26(q,&bp->bio2); + bio_pair_release(bp); + return 0; + } + + return drbd_make_request_common(mdev,bio_rw(bio),bio->bi_size, + bio->bi_sector,bio); +} + +/* This is called by bio_add_page(). With this function we reduce + * the number of BIOs that span over multiple AL_EXTENTs. + * + * we do the calculation within the lower 32bit of the byte offsets, + * since we don't care for actual offset, but only check whether it + * would cross "activity log extent" boundaries. + * + * As long as the BIO is emtpy we have to allow at least one bvec, + * regardless of size and offset. so the resulting bio may still + * cross extent boundaries. those are dealt with (bio_split) in + * drbd_make_request_26. 
+ */ +/* FIXME for two_primaries, + * we should use DRBD_MAX_SEGMENT_SIZE instead of AL_EXTENT_SIZE */ +int drbd_merge_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *bvec) +{ + struct Drbd_Conf* mdev = (drbd_dev*) q->queuedata; + unsigned int bio_offset = (unsigned int)bio->bi_sector << 9; // 32 bit + unsigned int bio_size = bio->bi_size; + int limit, backing_limit; + +#if 1 + limit = DRBD_MAX_SEGMENT_SIZE - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); +#else + limit = AL_EXTENT_SIZE - ((bio_offset & (AL_EXTENT_SIZE-1)) + bio_size); +#endif + if (limit < 0) limit = 0; + if (bio_size == 0) { + if (limit <= bvec->bv_len) limit = bvec->bv_len; + } else if (limit && inc_local(mdev)) { + request_queue_t * const b = mdev->bc->backing_bdev->bd_disk->queue; + if(b->merge_bvec_fn && mdev->bc->dc.use_bmbv) { + backing_limit = b->merge_bvec_fn(b,bio,bvec); + limit = min(limit,backing_limit); + } + dec_local(mdev); + } + return limit; +} diff -uprN linux-2.6.24/drivers/block/drbd/drbd_req.h linux-2.6.24.ovz/drivers/block/drbd/drbd_req.h --- linux-2.6.24/drivers/block/drbd/drbd_req.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_req.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,311 @@ +/* + drbd_req.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2006-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2006-2007, Lars Ellenberg . + Copyright (C) 2006-2007, Philipp Reisner . + + DRBD is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + DRBD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_REQ_H +#define _DRBD_REQ_H + +#include +#include + +#include +#include +#include "drbd_int.h" + +/* The request callbacks will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers, + and by the receiver and worker in kernel-thread context. + Try to get the locking right :) */ + +/* + * Objects of type drbd_request_t do only exist on a Primary node, and are + * associated with IO requests originating from the block layer above us. + * + * There are quite a few things that may happen to a drbd request + * during its lifetime. + * + * It will be created. + * It will be marked with the intention to be + * submitted to local disk and/or + * send via the network. + * + * It has to be placed on the transfer log and other housekeeping lists, + * In case we have a network connection. + * FIXME I believe that for consistency we should place even READ requests + * on these lists, so we can moan when we detect that the other node is + * writing to an area that we currently read from (when this happens, our + * users are broken). + * + * It may be identified as a concurrent (write) request + * and be handled accordingly. + * + * It may me handed over to the local disk subsystem. + * It may be completed by the local disk subsystem, + * either sucessfully or with io-error. 
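Editor's note: the hash-slot arithmetic in drbd_make_request_26() above comes with a worked example in its comment (sector 262269, size 4096, HT_SHIFT 6). That example can be re-checked outside the kernel. The standalone snippet below assumes HT_SHIFT is 6, as in the comment, and simply recomputes s_enr, e_enr and first_sectors; it is an arithmetic check, not DRBD code.

#include <stdio.h>
#include <stdint.h>

#define HT_SHIFT 6  /* assumed, the value used in the comment's example */

int main(void)
{
	uint64_t sector = 262269;   /* start sector of the bio */
	unsigned int size = 4096;   /* bio size in bytes */

	uint64_t s_enr = sector >> HT_SHIFT;
	uint64_t e_enr = (sector + (size >> 9) - 1) >> HT_SHIFT;

	const int sps  = 1 << HT_SHIFT;  /* sectors per hash slot */
	const int mask = sps - 1;
	uint64_t first_sectors = sps - (sector & mask);

	printf("s_enr=%llu e_enr=%llu first_sectors=%llu\n",
	       (unsigned long long)s_enr, (unsigned long long)e_enr,
	       (unsigned long long)first_sectors);

	/* the first fragment ends exactly on the slot boundary */
	printf("boundary ok: %d\n", ((sector + first_sectors) & mask) == 0);
	return 0;
}

The printed values match the comment: s_enr 4097, e_enr 4098, first_sectors 3, and sector 262272 is the first sector of the next slot.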
+ * In case it is a READ request, and it failed locally, + * it may be retried remotely. + * + * It may be queued for sending. + * It may be handed over to the network stack, + * which may fail. + * It may be acknowledged by the "peer" according to the wire_protocol in use. + * this may be a negative ack. + * It may receive a faked ack when the network connection is lost and the + * transfer log is cleaned up. + * Sending may be canceled due to network connection loss. + * When it finally has outlived its time, + * corresponding dirty bits in the resync-bitmap may be cleared or set, + * it will be destroyed, + * and completion will be signalled to the originator, + * with or without "success". + * + * See also documentation/drbd-request-state-overview.dot + * (dot -Tps2 documentation/drbd-request-state-overview.dot | display -) + */ + +typedef enum { + created, + to_be_send, + to_be_submitted, + + /* XXX yes, now I am inconsistent... + * these two are not "events" but "actions" + * oh, well... */ + queue_for_net_write, + queue_for_net_read, + + send_canceled, + send_failed, + handed_over_to_network, + connection_lost_while_pending, + recv_acked_by_peer, + write_acked_by_peer, + write_acked_by_peer_and_sis, // and set_in_sync + conflict_discarded_by_peer, + neg_acked, + barrier_acked, /* in protocol A and B */ + data_received, /* (remote read) */ + + read_completed_with_error, + write_completed_with_error, + completed_ok, +} drbd_req_event_t; + +/* encoding of request states for now. we don't actually need that many bits. + * we don't need to do atomic bit operations either, since most of the time we + * need to look at the connection state and/or manipulate some lists at the + * same time, so we should hold the request lock anyways. + */ +enum drbd_req_state_bits { + /* 210 + * 000: no local possible + * 001: to be submitted + * UNUSED, we could map: 011: submitted, completion still pending + * 110: completed ok + * 010: completed with error + */ + __RQ_LOCAL_PENDING, + __RQ_LOCAL_COMPLETED, + __RQ_LOCAL_OK, + + /* 76543 + * 00000: no network possible + * 00001: to be send + * 00011: to be send, on worker queue + * 00101: sent, expecting recv_ack (B) or write_ack (C) + * 11101: sent, + * recv_ack (B) or implicit "ack" (A), + * still waiting for the barrier ack. + * master_bio may already be completed and invalidated. + * 11100: write_acked (C), + * data_received (for remote read, any protocol) + * or finally the barrier ack has arrived (B,A)... + * request can be freed + * 01100: neg-acked (write, protocol C) + * or neg-d-acked (read, any protocol) + * or killed from the transfer log + * during cleanup after connection loss + * request can be freed + * 01000: canceled or send failed... + * request can be freed + */ + + /* if "SENT" is not set, yet, this can still fail or be canceled. + * if "SENT" is set already, we still wait for an Ack packet. + * when cleared, the master_bio may be completed. + * in (B,A) the request object may still linger on the transaction log + * until the corresponding barrier ack comes in */ + __RQ_NET_PENDING, + + /* If it is QUEUED, and it is a WRITE, it is also registered in the + * transfer log. Currently we need this flag to avoid conflicts between + * worker canceling the request and tl_clear_barrier killing it from + * transfer log. We should restructure the code so this conflict does + * no longer occur. */ + __RQ_NET_QUEUED, + + /* well, actually only "handed over to the network stack" */ + __RQ_NET_SENT, + + /* when set, the request may be freed. 
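Editor's note: the bit tables above are easier to follow next to concrete numbers. The small model below assumes the flag layout implied by the enum (three local bits, then the network bits starting at bit 3) and reprints a few of the documented rows in the same "76543" column order; it is only an illustration, the authoritative masks are the RQ_* defines that follow below.

#include <stdio.h>

enum {	/* assumed to mirror drbd_req_state_bits: local bits first, then net */
	B_LOCAL_PENDING, B_LOCAL_COMPLETED, B_LOCAL_OK,
	B_NET_PENDING, B_NET_QUEUED, B_NET_SENT, B_NET_DONE, B_NET_OK,
};
#define F(b) (1UL << (b))

static void print_net_bits(const char *tag, unsigned long v)
{
	printf("%-22s", tag);
	for (int b = 7; b >= 3; b--)	/* columns "76543", as in the comment */
		putchar(v & F(b) ? '1' : '0');
	putchar('\n');
}

int main(void)
{
	unsigned long local_mask = (F(B_LOCAL_OK) << 1) - 1;
	unsigned long net_mask   = ((F(B_NET_OK) << 1) - 1) & ~local_mask;

	printf("local mask 0x%02lx, net mask 0x%02lx\n", local_mask, net_mask);

	/* "00001: to be send" */
	print_net_bits("queued for net:", F(B_NET_PENDING));
	/* "00101: sent, expecting recv_ack (B) or write_ack (C)" */
	print_net_bits("sent, awaiting ack:", F(B_NET_PENDING) | F(B_NET_SENT));
	/* "11100: write_acked (C) ... request can be freed" */
	print_net_bits("acked and done:", F(B_NET_SENT) | F(B_NET_DONE) | F(B_NET_OK));
	return 0;
}

With this layout the masks come out as 0x07 and 0xf8, matching the comments next to RQ_LOCAL_MASK and RQ_NET_MASK.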
+ * in (C) this happens when WriteAck is received, + * in (B,A) when the corresponding BarrierAck is received */ + __RQ_NET_DONE, + + /* whether or not we know (C) or pretend (B,A) that the write + * was successfully written on the peer. + */ + __RQ_NET_OK, + + /* peer called drbd_set_in_sync() for this write */ + __RQ_NET_SIS, +}; + +#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) +#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) +#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) + +#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ + +#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) +#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) +#define RQ_NET_SENT (1UL << __RQ_NET_SENT) +#define RQ_NET_DONE (1UL << __RQ_NET_DONE) +#define RQ_NET_OK (1UL << __RQ_NET_OK) +#define RQ_NET_SIS (1UL << __RQ_NET_SIS) + +#define RQ_NET_MASK (((RQ_NET_OK << 1)-1) & ~RQ_LOCAL_MASK) /* 0xf8 */ + +/* epoch entries */ +static inline struct hlist_head* ee_hash_slot(drbd_dev *mdev, sector_t sector) +{ + BUG_ON(mdev->ee_hash_s == 0); + return mdev->ee_hash + ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); +} + +/* transfer log (drbd_request objects) */ +static inline struct hlist_head* tl_hash_slot(drbd_dev *mdev, sector_t sector) +{ + BUG_ON(mdev->tl_hash_s == 0); + return mdev->tl_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); +} + +/* when we receive the answer for a read request, + * verify that we actually know about it */ +static inline drbd_request_t* _ack_id_to_req(drbd_dev *mdev,u64 id, sector_t sector) +{ + struct hlist_head *slot = tl_hash_slot(mdev,sector); + struct hlist_node *n; + drbd_request_t * req; + + hlist_for_each_entry(req, n, slot, colision) { + if ((unsigned long)req == (unsigned long)id) { + if (req->sector != sector) { + ERR("_ack_id_to_req: found req %p but it has " + "wrong sector (%llus versus %llus)\n", req, + (unsigned long long)req->sector, + (unsigned long long)sector); + break; + } + return req; + } + } + ERR("_ack_id_to_req: failed to find req %p, sector %llus in list\n", + (void*)(unsigned long)id, (unsigned long long)sector); + return NULL; +} + +/* application reads (drbd_request objects) */ +static struct hlist_head* ar_hash_slot(drbd_dev *mdev, sector_t sector) +{ + return mdev->app_reads_hash + + ((unsigned int)(sector) % APP_R_HSIZE); +} + +/* when we receive the answer for a read request, + * verify that we actually know about it */ +static inline drbd_request_t* _ar_id_to_req(drbd_dev *mdev,u64 id, sector_t sector) +{ + struct hlist_head *slot = ar_hash_slot(mdev,sector); + struct hlist_node *n; + drbd_request_t * req; + + hlist_for_each_entry(req, n, slot, colision) { + if ((unsigned long)req == (unsigned long)id) { + D_ASSERT(req->sector == sector); + return req; + } + } + return NULL; +} + +static inline drbd_request_t* drbd_req_new(drbd_dev *mdev, struct bio *bio_src) +{ + struct bio *bio; + drbd_request_t *req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (likely(req)) { + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? 
*/ + + req->rq_state = 0; + req->mdev = mdev; + req->master_bio = bio_src; + req->private_bio = bio; + req->epoch = 0; + req->sector = bio->bi_sector; + req->size = bio->bi_size; + INIT_HLIST_NODE(&req->colision); + INIT_LIST_HEAD(&req->tl_requests); + + bio->bi_private = req; + bio->bi_end_io = drbd_endio_pri; + bio->bi_next = 0; + } + return req; +} + +static inline void drbd_req_free(drbd_request_t *req) +{ + mempool_free(req,drbd_request_mempool); +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !( ( s1 + (l1>>9) <= s2 ) || ( s1 >= s2 + (l2>>9) ) ); +} + +/* aparently too large to be inlined... + * moved to drbd_req.c */ +extern void _req_may_be_done(drbd_request_t *req, int error); +extern void _req_mod(drbd_request_t *req, drbd_req_event_t what, int error); + +/* If you need it irqsave, do it your self! */ +static inline void req_mod(drbd_request_t *req, drbd_req_event_t what, int error) +{ + drbd_dev *mdev = req->mdev; + spin_lock_irq(&mdev->req_lock); + _req_mod(req,what,error); + spin_unlock_irq(&mdev->req_lock); +} +#endif diff -uprN linux-2.6.24/drivers/block/drbd/drbd_strings.c linux-2.6.24.ovz/drivers/block/drbd/drbd_strings.c --- linux-2.6.24/drivers/block/drbd/drbd_strings.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_strings.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,105 @@ +/* + drbd.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2007, Philipp Reisner . + Copyright (C) 2003-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
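Editor's note: the overlaps() helper defined a little further up takes start sectors but byte lengths, converting the lengths with a right shift by 9. A standalone check of that sector/byte mix follows; the function body is copied verbatim from the header above, the test values are arbitrary.

#include <stdio.h>

typedef unsigned long long sector_t;

/* same expression as in drbd_req.h: s1,s2 are sectors, l1,l2 are byte lengths */
static int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
	return !((s1 + (l1 >> 9) <= s2) || (s1 >= s2 + (l2 >> 9)));
}

int main(void)
{
	/* 4 KiB at sector 0 covers sectors 0..7 */
	printf("%d\n", overlaps(0, 4096, 8, 4096));  /* 0: back to back, no overlap */
	printf("%d\n", overlaps(0, 4096, 7, 512));   /* 1: last sector is shared    */
	printf("%d\n", overlaps(16, 512, 0, 4096));  /* 0: disjoint                 */
	return 0;
}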
+ +*/ + +#include + +static const char *drbd_conn_s_names[] = { + [StandAlone] = "StandAlone", + [Disconnecting] = "Disconnecting", + [Unconnected] = "Unconnected", + [Timeout] = "Timeout", + [BrokenPipe] = "BrokenPipe", + [NetworkFailure] = "NetworkFailure", + [ProtocolError] = "ProtocolError", + [WFConnection] = "WFConnection", + [WFReportParams] = "WFReportParams", + [TearDown] = "TearDown", + [Connected] = "Connected", + [StartingSyncS] = "StartingSyncS", + [StartingSyncT] = "StartingSyncT", + [WFBitMapS] = "WFBitMapS", + [WFBitMapT] = "WFBitMapT", + [WFSyncUUID] = "WFSyncUUID", + [SyncSource] = "SyncSource", + [SyncTarget] = "SyncTarget", + [PausedSyncS] = "PausedSyncS", + [PausedSyncT] = "PausedSyncT" +}; + +static const char *drbd_role_s_names[] = { + [Primary] = "Primary", + [Secondary] = "Secondary", + [Unknown] = "Unknown" +}; + +static const char *drbd_disk_s_names[] = { + [Diskless] = "Diskless", + [Attaching] = "Attaching", + [Failed] = "Failed", + [Negotiating] = "Negotiating", + [Inconsistent] = "Inconsistent", + [Outdated] = "Outdated", + [DUnknown] = "DUnknown", + [Consistent] = "Consistent", + [UpToDate] = "UpToDate", +}; + +static const char *drbd_state_sw_errors[] = { + [-SS_TwoPrimaries] = "Multiple primaries not allowed by config", + [-SS_NoUpToDateDisk] = + "Refusing to be Primary without at least one UpToDate disk", + [-SS_BothInconsistent] = "Refusing to be inconsistent on both nodes", + [-SS_SyncingDiskless] = "Refusing to be syncing and diskless", + [-SS_ConnectedOutdates] = "Refusing to be Outdated while Connected", + [-SS_PrimaryNOP] = "Refusing to be Primary while peer is not outdated", + [-SS_ResyncRunning] = "Can not start resync since it is already active", + [-SS_AlreadyStandAlone] = "Can not disconnect a StandAlone device", + [-SS_CW_FailedByPeer] = "State changed was refused by peer node", + [-SS_CanNotOutdateDL] = "Can not outdate a diskless device", + [-SS_DeviceInUse] = "Device is held open by someone" +}; + +const char* conns_to_name(drbd_conns_t s) { + /* enums are unsigned... */ + return s > PausedSyncT ? "TOO_LARGE" + : drbd_conn_s_names[s]; +} + +const char* roles_to_name(drbd_role_t s) { + return s > Secondary ? "TOO_LARGE" + : drbd_role_s_names[s]; +} + +const char* disks_to_name(drbd_disks_t s) { + return s > UpToDate ? "TOO_LARGE" + : drbd_disk_s_names[s]; +} + +const char* set_st_err_name(set_st_err_t err) { + return err < SS_DeviceInUse ? "TOO_SMALL" : + err > SS_TwoPrimaries ? "TOO_LARGE" + : drbd_state_sw_errors[-err]; +} diff -uprN linux-2.6.24/drivers/block/drbd/drbd_worker.c linux-2.6.24.ovz/drivers/block/drbd/drbd_worker.c --- linux-2.6.24/drivers/block/drbd/drbd_worker.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/drbd_worker.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,979 @@ +/* +-*- linux-c -*- + drbd_worker.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2007, Philipp Reisner . + Copyright (C) 2002-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include // for the page_count macro on RH/Fedora +#include +#include + +#include +#include "drbd_int.h" +#include "drbd_req.h" + +/* I choose to have all block layer end_io handlers defined here. + + * For all these callbacks, note the follwing: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. + * Try to get the locking right :) + * + */ + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +int drbd_md_io_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if (bio->bi_size) return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +int drbd_endio_read_sec(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + e = bio->bi_private; + mdev = e->mdev; + + /* We are called each time a part of the bio is finished, but + * we are only interested when the whole bio is finished, therefore + * return as long as bio->bio_size is positive. */ + if (bio->bi_size) return 1; + + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->req_lock,flags); + mdev->read_cnt += e->size >> 9; + list_del(&e->w.list); + if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->req_lock,flags); + + drbd_chk_io_error(mdev,error,FALSE); + drbd_queue_work(&mdev->data.work,&e->w); + dec_local(mdev); + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("Moved EE (READ) to worker sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + return 0; +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +int drbd_endio_write_sec(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + drbd_dev *mdev; + int do_wake; + int is_syncer_req; + + e = bio->bi_private; + mdev = e->mdev; + + // see above + if (bio->bi_size) return 1; + + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->req_lock,flags); + mdev->writ_cnt += e->size >> 9; + is_syncer_req = is_syncer_block_id(e->block_id); + list_del(&e->w.list); /* has been on active_ee or sync_ee */ + list_add_tail(&e->w.list,&mdev->done_ee); + + /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, + * neither did we wake possibly waiting conflicting requests. + * done from "drbd_process_done_ee" within the appropriate w.cb + * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ + + if(!is_syncer_req) mdev->epoch_size++; + + do_wake = is_syncer_req + ? 
list_empty(&mdev->sync_ee) + : list_empty(&mdev->active_ee); + + if (error) __drbd_chk_io_error(mdev,FALSE); + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if(is_syncer_req) drbd_rs_complete_io(mdev,e->sector); + + if (do_wake) wake_up(&mdev->ee_wait); + + if(e->flags & EE_CALL_AL_COMPLETE_IO) drbd_al_complete_io(mdev,e->sector); + + wake_asender(mdev); + dec_local(mdev); + + MTRACE(TraceTypeEE,TraceLvlAll, + INFO("Moved EE (WRITE) to done_ee sec=%llus size=%u ee=%p\n", + (unsigned long long)e->sector,e->size,e); + ); + return 0; +} + +/* read, readA or write requests on Primary comming from drbd_make_request + */ +int drbd_endio_pri(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags; + drbd_request_t *req=bio->bi_private; + drbd_dev *mdev = req->mdev; + drbd_req_event_t what; + + // see above + if (bio->bi_size) return 1; + + /* to avoid recursion in _req_mod */ + what = error + ? (bio_data_dir(bio) == WRITE) + ? write_completed_with_error + : read_completed_with_error + : completed_ok; + spin_lock_irqsave(&mdev->req_lock,flags); + _req_mod(req, what, error); + spin_unlock_irqrestore(&mdev->req_lock,flags); + return 0; +} + +int w_io_error(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + drbd_request_t *req = (drbd_request_t*)w; + int ok; + + /* FIXME send a "set_out_of_sync" packet to the peer + * in the PassOn case... + * in the Detach (or Panic) case, we (try to) send + * a "we are diskless" param packet anyways, and the peer + * will then set the FullSync bit in the meta data ... + */ + D_ASSERT(mdev->bc->dc.on_io_error != PassOn); + + /* the only way this callback is scheduled is from _req_may_be_done, + * when it is done and had a local write error, see comments there */ + drbd_req_free(req); + + if(unlikely(cancel)) return 1; + + ok = drbd_io_error(mdev, FALSE); + if(unlikely(!ok)) ERR("Sending in w_io_error() failed\n"); + return ok; +} + +int w_read_retry_remote(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + drbd_request_t *req = (drbd_request_t*)w; + + spin_lock_irq(&mdev->req_lock); + if ( cancel || + mdev->state.conn < Connected || + mdev->state.pdsk <= Inconsistent ) { + _req_mod(req, send_canceled, 0); /* FIXME freeze? ... */ + spin_unlock_irq(&mdev->req_lock); + drbd_khelper(mdev,"pri-on-incon-degr"); /* FIXME REALLY? */ + ALERT("WE ARE LOST. Local IO failure, no peer.\n"); + return 1; + } + spin_unlock_irq(&mdev->req_lock); + + /* FIXME this is ugly. we should not detach for read io-error, + * but try to WRITE the DataReply to the failed location, + * to give the disk the chance to relocate that block */ + drbd_io_error(mdev,FALSE); /* tries to schedule a detach and notifies peer */ + return w_send_read_req(mdev,w,0); +} + +int w_resync_inactive(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + ERR_IF(cancel) return 1; + ERR("resync inactive, but callback triggered??\n"); + return 1; // Simply ignore this! 
+} + +void resync_timer_fn(unsigned long data) +{ + unsigned long flags; + drbd_dev* mdev = (drbd_dev*) data; + int queue; + + spin_lock_irqsave(&mdev->req_lock,flags); + + if(likely(!test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags))) { + queue=1; + mdev->resync_work.cb = w_make_resync_request; + } else { + queue=0; + mdev->resync_work.cb = w_resync_inactive; + } + + spin_unlock_irqrestore(&mdev->req_lock,flags); + + /* harmless race: list_empty outside data.work.q_lock */ + if(list_empty(&mdev->resync_work.list) && queue) { + drbd_queue_work(&mdev->data.work,&mdev->resync_work); + } +} + +#define SLEEP_TIME (HZ/10) + +int w_make_resync_request(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + int max_segment_size = mdev->rq_queue->max_segment_size; + int number,i,size; + int align; + + PARANOIA_BUG_ON(w != &mdev->resync_work); + + if(unlikely(cancel)) return 1; + + if(unlikely(mdev->state.conn < Connected)) { + ERR("Confused in w_make_resync_request()! cstate < Connected"); + return 0; + } + + if (mdev->state.conn != SyncTarget) { + ERR("%s in w_make_resync_request\n", conns_to_name(mdev->state.conn)); + } + + number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + + if (atomic_read(&mdev->rs_pending_cnt)>number) { + goto requeue; + } + number -= atomic_read(&mdev->rs_pending_cnt); + + for(i=0;iresync_work.cb = w_resync_inactive; + return 1; + } + + sector = BM_BIT_TO_SECT(bit); + + if (drbd_try_rs_begin_io(mdev, sector)) { + drbd_bm_set_find(mdev,bit); + goto requeue; + } + + if (unlikely(drbd_bm_test_bit(mdev,bit) == 0 )) { + //INFO("Block got synced while in drbd_rs_begin_io()\n"); + drbd_rs_complete_io(mdev,sector); + goto next_sector; + } + +#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE + /* try to find some adjacent bits. + * we stop if we have already the maximum req size. + * + * Aditionally always align bigger requests, in order to + * be prepared for all stripe sizes of software RAIDs. + * + * we _do_ care about the agreed-uppon q->max_segment_size + * here, as splitting up the requests on the other side is more + * difficult. the consequence is, that on lvm and md and other + * "indirect" devices, this is dead code, since + * q->max_segment_size will be PAGE_SIZE. + */ + align=1; + for (;;) { + if (size + BM_BLOCK_SIZE > max_segment_size) + break; + + // Be always aligned + if (sector & ((1<<(align+3))-1) ) + break; + + // do not cross extent boundaries + if (( (bit+1) & BM_BLOCKS_PER_BM_EXT_MASK ) == 0) + break; + /* now, is it actually dirty, after all? + * caution, drbd_bm_test_bit is tri-state for some + * obscure reason; ( b == 0 ) would get the out-of-band + * only accidentally right because of the "oddly sized" + * adjustment below */ + if ( drbd_bm_test_bit(mdev,bit+1) != 1 ) + break; + bit++; + size += BM_BLOCK_SIZE; + if( (BM_BLOCK_SIZE< BM_BLOCK_SIZE) + drbd_bm_set_find(mdev,bit+1); +#endif + + /* adjust very last sectors, in case we are oddly sized */ + if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; + inc_rs_pending(mdev); + if(!drbd_send_drequest(mdev,RSDataRequest, + sector,size,ID_SYNCER)) { + ERR("drbd_send_drequest() failed, aborting...\n"); + dec_rs_pending(mdev); + return 0; + } + } + + if(drbd_bm_rs_done(mdev)) { + /* last syncer _request_ was sent, + * but the RSDataReply not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. 
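Editor's note: the `number` computed at the top of w_make_resync_request() is how many resync requests may be issued per timer tick for the configured rate. A rough standalone check of that formula is below; it uses the SLEEP_TIME definition above and assumes BM_BLOCK_SIZE is 4 KiB (one bitmap bit per 4 KiB block), while HZ itself cancels out.

#include <stdio.h>

#define HZ		250		/* any value works, it cancels out        */
#define SLEEP_TIME	(HZ/10)		/* resync timer period: 100 ms             */
#define BM_BLOCK_SIZE	4096		/* assumed: one bitmap bit covers 4 KiB    */

int main(void)
{
	int rate = 10240;	/* sync_conf.rate in KiB/s */

	int number = SLEEP_TIME * rate / ((BM_BLOCK_SIZE / 1024) * HZ);

	printf("%d requests of %d KiB per %d ms tick -> %d KiB/s\n",
	       number, BM_BLOCK_SIZE / 1024, 1000 * SLEEP_TIME / HZ,
	       number * (BM_BLOCK_SIZE / 1024) * HZ / SLEEP_TIME);
	return 0;
}

For a 10240 KiB/s rate this prints 256 requests per 100 ms tick, i.e. the ceiling is simply rate/40 requests per tick for 4 KiB blocks, before rs_pending_cnt throttling is subtracted.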
+ * until then resync "work" is "inactive" ... + */ + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + +int w_resync_finished(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + kfree(w); + + drbd_bm_lock(mdev); + drbd_resync_finished(mdev); + drbd_bm_unlock(mdev); + + return 1; +} + +int drbd_resync_finished(drbd_dev* mdev) +{ + unsigned long db,dt,dbdt; + int dstate, pdstate; + struct drbd_work *w; + + // Remove all elements from the resync LRU. Since future actions + // might set bits in the (main) bitmap, then the entries in the + // resync LRU would be wrong. + if(drbd_rs_del_all(mdev)) { + // In case this is not possible now, most probabely because + // there are RSDataReply Packets lingering on the worker's + // queue (or even the read operations for those packets + // is not finished by now). Retry in 100ms. + + drbd_kick_lo(mdev); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); + if(w) { + w->cb = w_resync_finished; + drbd_queue_work(&mdev->data.work,w); + return 1; + } + ERR("Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); + } + + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total; + dbdt = Bit2KB(db/dt); + mdev->rs_paused /= HZ; + INFO("Resync done (total %lu sec; paused %lu sec; %lu K/sec)\n", + dt + mdev->rs_paused, mdev->rs_paused, dbdt); + + D_ASSERT((drbd_bm_total_weight(mdev)-mdev->rs_failed) == 0); + + if (mdev->rs_failed) { + INFO(" %lu failed blocks\n",mdev->rs_failed); + + if (mdev->state.conn == SyncTarget || + mdev->state.conn == PausedSyncT) { + dstate = Inconsistent; + pdstate = UpToDate; + } else { + dstate = UpToDate; + pdstate = Inconsistent; + } + } else { + dstate = pdstate = UpToDate; + + if (mdev->state.conn == SyncTarget || + mdev->state.conn == PausedSyncT) { + if( mdev->p_uuid ) { + int i; + for ( i=Bitmap ; i<=History_end ; i++ ) { + _drbd_uuid_set(mdev,i,mdev->p_uuid[i]); + } + drbd_uuid_set(mdev,Bitmap,mdev->bc->md.uuid[Current]); + _drbd_uuid_set(mdev,Current,mdev->p_uuid[Current]); + } else { + ERR("mdev->p_uuid is NULL! BUG\n"); + } + } + + drbd_uuid_set_bm(mdev,0UL); + + if ( mdev->p_uuid ) { + // Now the two UUID sets are equal, update what we + // know of the peer. + int i; + for ( i=Current ; i<=History_end ; i++ ) { + mdev->p_uuid[i]=mdev->bc->md.uuid[i]; + } + } + } + + mdev->rs_total = 0; + mdev->rs_failed = 0; + mdev->rs_paused = 0; + + if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags)) { + WARN("Writing the whole bitmap, due to failed kmalloc\n"); + drbd_bm_write(mdev); + } + + drbd_request_state(mdev,NS3(conn,Connected, + disk,dstate, + pdsk,pdstate)); + + drbd_md_sync(mdev); + + return 1; +} + +/** + * w_e_end_data_req: Send the answer (DataReply) in response to a DataRequest. + */ +int w_e_end_data_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok; + + if(unlikely(cancel)) { + drbd_free_ee(mdev,e); + dec_unacked(mdev); + return 1; + } + + if(likely(drbd_bio_uptodate(e->private_bio))) { + ok=drbd_send_block(mdev, DataReply, e); + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegDReply. 
sector=%llus.\n", + (unsigned long long)e->sector); + + ok=drbd_send_ack(mdev,NegDReply,e); + + /* FIXME we should not detach for read io-errors, in particular + * not now: when the peer asked us for our data, we are likely + * the only remaining disk... */ + drbd_io_error(mdev,FALSE); + } + + dec_unacked(mdev); + + spin_lock_irq(&mdev->req_lock); + if( drbd_bio_has_active_page(e->private_bio) ) { + /* This might happen if sendpage() has not finished */ + list_add_tail(&e->w.list,&mdev->net_ee); + } else { + drbd_free_ee(mdev,e); + } + spin_unlock_irq(&mdev->req_lock); + + if(unlikely(!ok)) ERR("drbd_send_block() failed\n"); + return ok; +} + +/** + * w_e_end_rsdata_req: Send the answer (RSDataReply) to a RSDataRequest. + */ +int w_e_end_rsdata_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok; + + if(unlikely(cancel)) { + drbd_free_ee(mdev,e); + dec_unacked(mdev); + return 1; + } + + drbd_rs_complete_io(mdev,e->sector); + + if(likely(drbd_bio_uptodate(e->private_bio))) { + if (likely( mdev->state.pdsk >= Inconsistent )) { + inc_rs_pending(mdev); + ok=drbd_send_block(mdev, RSDataReply, e); + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Not sending RSDataReply, partner DISKLESS!\n"); + ok=1; + } + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegRSDReply. sector %llus.\n", + (unsigned long long)e->sector); + + ok=drbd_send_ack(mdev,NegRSDReply,e); + + drbd_io_error(mdev, FALSE); + + // update resync data with failure + drbd_rs_failed_io(mdev, e->sector, e->size); + } + + dec_unacked(mdev); + + spin_lock_irq(&mdev->req_lock); + if( drbd_bio_has_active_page(e->private_bio) ) { + /* This might happen if sendpage() has not finished */ + list_add_tail(&e->w.list,&mdev->net_ee); + } else { + drbd_free_ee(mdev,e); + } + spin_unlock_irq(&mdev->req_lock); + + if(unlikely(!ok)) ERR("drbd_send_block() failed\n"); + return ok; +} + +int w_prev_work_done(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + clear_bit(WORK_PENDING,&mdev->flags); + wake_up(&mdev->misc_wait); + return 1; +} + +int w_send_barrier(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_barrier *b = (struct drbd_barrier *)w; + Drbd_Barrier_Packet *p = &mdev->data.sbuf.Barrier; + int ok=1; + + /* really avoid racing with tl_clear. w.cb may have been referenced + * just before it was reassigned and requeued, so double check that. + * actually, this race was harmless, since we only try to send the + * barrier packet here, and otherwise do nothing with the object. + * but compare with the head of w_clear_epoch */ + spin_lock_irq(&mdev->req_lock); + if (w->cb != w_send_barrier || mdev->state.conn < Connected) + cancel = 1; + spin_unlock_irq(&mdev->req_lock); + if (cancel) + return 1; + + if (!drbd_get_data_sock(mdev)) + return 0; + p->barrier = b->br_number; + /* inc_ap_pending was done where this was queued. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in w_clear_epoch. */ + ok = _drbd_send_cmd(mdev,mdev->data.socket,Barrier,(Drbd_Header*)p,sizeof(*p),0); + drbd_put_data_sock(mdev); + + return ok; +} + +int w_send_write_hint(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + if (cancel) return 1; + return drbd_send_short_cmd(mdev,UnplugRemote); +} + +/** + * w_send_dblock: Send a mirrored write request. 
+ */ +int w_send_dblock(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + drbd_request_t *req = (drbd_request_t *)w; + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled, 0); + return 1; + } + + ok = drbd_send_dblock(mdev,req); + req_mod(req,ok ? handed_over_to_network : send_failed, 0); + + return ok; +} + +/** + * w_send_read_req: Send a read requests. + */ +int w_send_read_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + drbd_request_t *req = (drbd_request_t *)w; + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled, 0); + return 1; + } + + ok = drbd_send_drequest(mdev, DataRequest, req->sector, req->size, + (unsigned long)req); + + if(ok) { + req_mod(req, handed_over_to_network, 0); + } else { + /* ?? we set Timeout or BrokenPipe in drbd_send() */ + if (mdev->state.conn >= Connected) + drbd_force_state(mdev,NS(conn,NetworkFailure)); + /* req_mod(req, send_failed); we should not fail it here, + * we might have to "freeze" on disconnect. + * handled by req_mod(req, connection_lost_while_pending); + * in drbd_fail_pending_reads soon enough. */ + } + + return ok; +} + +STATIC void drbd_global_lock(void) +{ + drbd_dev *mdev; + int i; + + local_irq_disable(); + for (i=0; i < minor_count; i++) { + if(!(mdev = minor_to_mdev(i))) continue; + spin_lock(&mdev->req_lock); + } +} + +STATIC void drbd_global_unlock(void) +{ + drbd_dev *mdev; + int i; + + for (i=0; i < minor_count; i++) { + if(!(mdev = minor_to_mdev(i))) continue; + spin_unlock(&mdev->req_lock); + } + local_irq_enable(); +} + +STATIC int _drbd_may_sync_now(drbd_dev *mdev) +{ + drbd_dev *odev = mdev; + + while(1) { + if( odev->sync_conf.after == -1 ) return 1; + odev = minor_to_mdev(odev->sync_conf.after); + ERR_IF(!odev) return 1; + if( (odev->state.conn >= SyncSource && + odev->state.conn <= PausedSyncT) || + odev->state.aftr_isp || odev->state.peer_isp || + odev->state.user_isp ) return 0; + } +} + +/** + * _drbd_pause_after: + * Finds all devices that may not resync now, and causes them to + * pause their resynchronisation. + * Called from process context only ( ioctl and after_state_ch ). + */ +STATIC int _drbd_pause_after(drbd_dev *mdev) +{ + drbd_dev *odev; + int i, rv = 0; + + for (i=0; i < minor_count; i++) { + if( !(odev = minor_to_mdev(i)) ) continue; + if (! _drbd_may_sync_now(odev)) { + rv |= ( _drbd_set_state(_NS(odev,aftr_isp,1), + ChgStateHard|ScheduleAfter) + != SS_NothingToDo ) ; + } + } + + return rv; +} + +/** + * _drbd_resume_next: + * Finds all devices that can resume resynchronisation + * process, and causes them to resume. + * Called from process context only ( ioctl and worker ). 
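Editor's note: _drbd_may_sync_now() and _drbd_pause_after() above implement the "resync after" ordering: a device may resync only if nothing it is ordered after is itself resyncing, otherwise its aftr_isp flag gets set. The standalone model below captures just that chain walk; the device table, field names and the three-minor example are invented for illustration, and it ignores the peer/user suspend flags the real code also checks.

#include <stdio.h>

struct dev {
	int after;	/* index of the device to resync after, -1 for none  */
	int resyncing;	/* is this device currently SyncSource/SyncTarget?   */
	int paused;	/* plays the role of aftr_isp                        */
};

static struct dev devs[] = {
	{ .after = -1, .resyncing = 1 },	/* minor 0: busy resyncing   */
	{ .after =  0, .resyncing = 1 },	/* minor 1: ordered after 0  */
	{ .after =  1, .resyncing = 0 },	/* minor 2: ordered after 1  */
};
#define NDEV ((int)(sizeof(devs)/sizeof(devs[0])))

static int may_sync_now(int minor)
{
	for (;;) {
		if (devs[minor].after == -1)
			return 1;		/* end of the chain: allowed   */
		minor = devs[minor].after;
		if (devs[minor].resyncing)
			return 0;		/* someone ahead of us is busy */
	}
}

static void pause_after(void)
{
	for (int i = 0; i < NDEV; i++)
		devs[i].paused = !may_sync_now(i);
}

int main(void)
{
	pause_after();
	for (int i = 0; i < NDEV; i++)
		printf("minor %d: %s\n", i, devs[i].paused ? "paused" : "may sync");
	return 0;
}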
+ */ +STATIC int _drbd_resume_next(drbd_dev *mdev) +{ + drbd_dev *odev; + int i, rv = 0; + + for (i=0; i < minor_count; i++) { + if( !(odev = minor_to_mdev(i)) ) continue; + if ( odev->state.aftr_isp ) { + if (_drbd_may_sync_now(odev)) { + rv |= ( _drbd_set_state(_NS(odev,aftr_isp,0), + ChgStateHard|ScheduleAfter) + != SS_NothingToDo ) ; + } + } + } + return rv; +} + +void resume_next_sg(drbd_dev* mdev) +{ + drbd_global_lock(); + _drbd_resume_next(mdev); + drbd_global_unlock(); +} + +void suspend_other_sg(drbd_dev* mdev) +{ + drbd_global_lock(); + _drbd_pause_after(mdev); + drbd_global_unlock(); +} + +void drbd_alter_sa(drbd_dev *mdev, int na) +{ + int changes; + + drbd_global_lock(); + mdev->sync_conf.after = na; + + do { + changes = _drbd_pause_after(mdev); + changes |= _drbd_resume_next(mdev); + } while (changes); + + drbd_global_unlock(); +} + +/** + * drbd_start_resync: + * @side: Either SyncSource or SyncTarget + * Start the resync process. Called from process context only, + * either ioctl or drbd_receiver. + * Note, this function might bring you directly into one of the + * PausedSync* states. + */ +void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side) +{ + drbd_state_t os,ns; + int r=0; + + MTRACE(TraceTypeResync, TraceLvlSummary, + INFO("Resync starting: side=%s\n", + side==SyncTarget?"SyncTarget":"SyncSource"); + ); + + /* In case a previous resync run was aborted by an IO error... */ + drbd_rs_cancel_all(mdev); + + if(side == SyncTarget) { + drbd_bm_reset_find(mdev); + } else /* side == SyncSource */ { + u64 uuid; + + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(mdev, Bitmap, uuid); + drbd_send_sync_uuid(mdev,uuid); + + D_ASSERT(mdev->state.disk == UpToDate); + } + + drbd_global_lock(); + ns = os = mdev->state; + + ns.aftr_isp = !_drbd_may_sync_now(mdev); + + ns.conn = side; + + if(side == SyncTarget) { + ns.disk = Inconsistent; + } else /* side == SyncSource */ { + ns.pdsk = Inconsistent; + } + + r = _drbd_set_state(mdev,ns,ChgStateVerbose); + ns = mdev->state; + + if ( r == SS_Success ) { + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_failed = 0; + mdev->rs_paused = 0; + mdev->rs_start = + mdev->rs_mark_time = jiffies; + _drbd_pause_after(mdev); + } + drbd_global_unlock(); + + if ( r == SS_Success ) { + after_state_ch(mdev,os,ns,ChgStateVerbose); + + INFO("Began resync as %s (will sync %lu KB [%lu bits set]).\n", + conns_to_name(ns.conn), + (unsigned long) mdev->rs_total << (BM_BLOCK_SIZE_B-10), + (unsigned long) mdev->rs_total); + + if ( mdev->rs_total == 0 ) { + drbd_resync_finished(mdev); + return; + } + + if( ns.conn == SyncTarget ) { + D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags)); + mod_timer(&mdev->resync_timer,jiffies); + } + + drbd_md_sync(mdev); + } +} + +int drbd_worker(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + struct drbd_work *w = 0; + LIST_HEAD(work_list); + int intr=0,i; + + sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); + + while (get_t_state(thi) == Running) { + + if(down_trylock(&mdev->data.work.s)) { + down(&mdev->data.mutex); + if(mdev->data.socket)drbd_tcp_flush(mdev->data.socket); + up(&mdev->data.mutex); + + intr = down_interruptible(&mdev->data.work.s); + + down(&mdev->data.mutex); + if(mdev->data.socket) drbd_tcp_cork(mdev->data.socket); + up(&mdev->data.mutex); + } + + if (intr) { + D_ASSERT(intr == -EINTR); + flush_signals(current); + ERR_IF (get_t_state(thi) == Running) + continue; + break; + } + + if (get_t_state(thi) != Running) break; + /* With this break, we 
have done a down() but not consumed + the entry from the list. The cleanup code takes care of + this... */ + + w = 0; + spin_lock_irq(&mdev->data.work.q_lock); + ERR_IF(list_empty(&mdev->data.work.q)) { + /* something terribly wrong in our logic. + * we were able to down() the semaphore, + * but the list is empty... doh. + * + * what is the best thing to do now? + * try again from scratch, restarting the receiver, + * asender, whatnot? could break even more ugly, + * e.g. when we are primary, but no good local data. + * + * I'll try to get away just starting over this loop. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + continue; + } + w = list_entry(mdev->data.work.q.next,struct drbd_work,list); + list_del_init(&w->list); + spin_unlock_irq(&mdev->data.work.q_lock); + + if(!w->cb(mdev,w, mdev->state.conn < Connected )) { + //WARN("worker: a callback failed! \n"); + if (mdev->state.conn >= Connected) + drbd_force_state(mdev,NS(conn,NetworkFailure)); + } + } + + spin_lock_irq(&mdev->data.work.q_lock); + i = 0; + while (!list_empty(&mdev->data.work.q)) { + list_splice_init(&mdev->data.work.q,&work_list); + spin_unlock_irq(&mdev->data.work.q_lock); + + while(!list_empty(&work_list)) { + w = list_entry(work_list.next, struct drbd_work,list); + list_del_init(&w->list); + w->cb(mdev,w,1); + i++; /* dead debugging code */ + } + + spin_lock_irq(&mdev->data.work.q_lock); + } + sema_init(&mdev->data.work.s,0); + /* DANGEROUS race: if someone did queue his work within the spinlock, + * but up() ed outside the spinlock, we could get an up() on the + * semaphore without corresponding list entry. + * So don't do that. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + /* FIXME verify that there absolutely can not be any more work + * on the queue now... + * if so, the comment above is no longer true, but historic + * from the times when the worker did not live as long as the + * device.. */ + + D_ASSERT( mdev->state.disk == Diskless && mdev->state.conn == StandAlone ); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + + INFO("worker terminated\n"); + + return 0; +} diff -uprN linux-2.6.24/drivers/block/drbd/lru_cache.c linux-2.6.24.ovz/drivers/block/drbd/lru_cache.c --- linux-2.6.24/drivers/block/drbd/lru_cache.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/lru_cache.c 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,370 @@ +/* +-*- linux-c -*- + lru_cache.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2007, Philipp Reisner . + Copyright (C) 2003-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include // for memset +#include // for seq_printf +#include "lru_cache.h" + +#define STATIC static + +// this is developers aid only! 
+#define PARANOIA_ENTRY() BUG_ON(test_and_set_bit(__LC_PARANOIA,&lc->flags)) +#define PARANOIA_LEAVE() do { clear_bit(__LC_PARANOIA,&lc->flags); smp_mb__after_clear_bit(); } while (0) +#define RETURN(x...) do { PARANOIA_LEAVE(); return x ; } while (0) + +/** + * lc_alloc: allocates memory for @e_count objects of @e_size bytes plus the + * struct lru_cache, and the hash table slots. + * returns pointer to a newly initialized lru_cache object with said parameters. + */ +struct lru_cache* lc_alloc(const char *name, unsigned int e_count, + size_t e_size, void *private_p) +{ + unsigned long bytes; + struct lru_cache *lc; + struct lc_element *e; + int i; + + BUG_ON(!e_count); + e_size = max(sizeof(struct lc_element),e_size); + bytes = e_size+sizeof(struct hlist_head); + bytes *= e_count; + bytes += sizeof(struct lru_cache); + lc = vmalloc(bytes); + memset(lc, 0, bytes); + if (lc) { + INIT_LIST_HEAD(&lc->in_use); + INIT_LIST_HEAD(&lc->lru); + INIT_LIST_HEAD(&lc->free); + lc->element_size = e_size; + lc->nr_elements = e_count; + lc->new_number = -1; + lc->lc_private = private_p; + lc->name = name; + for(i=0;ilc_number = LC_FREE; + list_add(&e->list,&lc->free); + // memset(,0,) did the rest of init for us + } + } + return lc; +} + +/** + * lc_free: Frees memory allocated by lc_alloc. + * @lc: The lru_cache object + */ +void lc_free(struct lru_cache* lc) +{ + vfree(lc); +} + +size_t lc_printf_stats(struct seq_file *seq, struct lru_cache* lc) +{ + /* NOTE: + * total calls to lc_get are + * starving + hits + misses + * misses include "dirty" count (update from an other thread in progress) + * and "changed", when this in fact lead to an successful update of the cache. + */ + return seq_printf(seq,"\t%s: used:%u/%u " + "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", + lc->name, lc->used, lc->nr_elements, + lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); +} + +static unsigned int lc_hash_fn(struct lru_cache* lc, unsigned int enr) +{ + return enr % lc->nr_elements; +} + + +/** + * lc_find: Returns the pointer to an element, if the element is present + * in the hash table. In case it is not this function returns NULL. + * @lc: The lru_cache object + * @enr: element number + */ +struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr) +{ + struct hlist_node *n; + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + hlist_for_each_entry(e, n, lc->slot + lc_hash_fn(lc, enr), colision) { + if (e->lc_number == enr) return e; + } + return NULL; +} + +STATIC struct lc_element * lc_evict(struct lru_cache* lc) +{ + struct list_head *n; + struct lc_element *e; + + if (list_empty(&lc->lru)) return 0; + + n=lc->lru.prev; + e=list_entry(n, struct lc_element,list); + + list_del(&e->list); + hlist_del(&e->colision); + return e; +} + +/** + * lc_del: Removes an element from the cache (and therefore adds the + * element's storage to the free list) + * + * @lc: The lru_cache object + * @e: The element to remove + */ +void lc_del(struct lru_cache* lc, struct lc_element *e) +{ + // FIXME what to do with refcnt != 0 ? 
+#define PARANOIA_ENTRY() BUG_ON(test_and_set_bit(__LC_PARANOIA,&lc->flags))
+#define PARANOIA_LEAVE() do { clear_bit(__LC_PARANOIA,&lc->flags); smp_mb__after_clear_bit(); } while (0)
+#define RETURN(x...) do { PARANOIA_LEAVE(); return x ; } while (0)
+
+/**
+ * lc_alloc: allocates memory for @e_count objects of @e_size bytes plus the
+ * struct lru_cache, and the hash table slots.
+ * returns pointer to a newly initialized lru_cache object with said parameters.
+ */
+struct lru_cache* lc_alloc(const char *name, unsigned int e_count,
+ size_t e_size, void *private_p)
+{
+ unsigned long bytes;
+ struct lru_cache *lc;
+ struct lc_element *e;
+ int i;
+
+ BUG_ON(!e_count);
+ e_size = max(sizeof(struct lc_element),e_size);
+ bytes = e_size+sizeof(struct hlist_head);
+ bytes *= e_count;
+ bytes += sizeof(struct lru_cache);
+ lc = vmalloc(bytes);
+ if (lc) {
+ memset(lc, 0, bytes);
+ INIT_LIST_HEAD(&lc->in_use);
+ INIT_LIST_HEAD(&lc->lru);
+ INIT_LIST_HEAD(&lc->free);
+ lc->element_size = e_size;
+ lc->nr_elements = e_count;
+ lc->new_number = -1;
+ lc->lc_private = private_p;
+ lc->name = name;
+ for(i=0;i<e_count;i++) {
+ e = lc_entry(lc,i);
+ e->lc_number = LC_FREE;
+ list_add(&e->list,&lc->free);
+ // memset(,0,) did the rest of init for us
+ }
+ }
+ return lc;
+}
+
+/**
+ * lc_free: Frees memory allocated by lc_alloc.
+ * @lc: The lru_cache object
+ */
+void lc_free(struct lru_cache* lc)
+{
+ vfree(lc);
+}
+
+size_t lc_printf_stats(struct seq_file *seq, struct lru_cache* lc)
+{
+ /* NOTE:
+ * total calls to lc_get are
+ * starving + hits + misses
+ * misses include "dirty" count (update from another thread in progress)
+ * and "changed", when this in fact led to a successful update of the cache.
+ */
+ return seq_printf(seq,"\t%s: used:%u/%u "
+ "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
+ lc->name, lc->used, lc->nr_elements,
+ lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
+}
+
+static unsigned int lc_hash_fn(struct lru_cache* lc, unsigned int enr)
+{
+ return enr % lc->nr_elements;
+}
+
+
+/**
+ * lc_find: Returns the pointer to an element, if the element is present
+ * in the hash table. In case it is not this function returns NULL.
+ * @lc: The lru_cache object
+ * @enr: element number
+ */
+struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr)
+{
+ struct hlist_node *n;
+ struct lc_element *e;
+
+ BUG_ON(!lc);
+ BUG_ON(!lc->nr_elements);
+ hlist_for_each_entry(e, n, lc->slot + lc_hash_fn(lc, enr), colision) {
+ if (e->lc_number == enr) return e;
+ }
+ return NULL;
+}
+
+STATIC struct lc_element * lc_evict(struct lru_cache* lc)
+{
+ struct list_head *n;
+ struct lc_element *e;
+
+ if (list_empty(&lc->lru)) return 0;
+
+ n=lc->lru.prev;
+ e=list_entry(n, struct lc_element,list);
+
+ list_del(&e->list);
+ hlist_del(&e->colision);
+ return e;
+}
+
+/**
+ * lc_del: Removes an element from the cache (and therefore adds the
+ * element's storage to the free list)
+ *
+ * @lc: The lru_cache object
+ * @e: The element to remove
+ */
+void lc_del(struct lru_cache* lc, struct lc_element *e)
+{
+ // FIXME what to do with refcnt != 0 ?
+ PARANOIA_ENTRY(); + BUG_ON(e->refcnt); + list_del(&e->list); + hlist_del_init(&e->colision); + e->lc_number = LC_FREE; + e->refcnt = 0; + list_add(&e->list,&lc->free); + RETURN(); +} + +STATIC struct lc_element* lc_get_unused_element(struct lru_cache* lc) +{ + struct list_head *n; + + if (list_empty(&lc->free)) return lc_evict(lc); + + n=lc->free.next; + list_del(n); + return list_entry(n, struct lc_element,list); +} + +STATIC int lc_unused_element_available(struct lru_cache* lc) +{ + if (!list_empty(&lc->free)) return 1; // something on the free list + if (!list_empty(&lc->lru)) return 1; // something to evict + + return 0; +} + + +/** + * lc_get: Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes eviced from + * the cache. In either case, the user is notified so he is able to e.g. keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL if the requested element number was not in the cache, and no unused + * element could be recycled + * pointer to the element with the REQUESTED element number + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number. + * In this case, the cache is marked dirty, and the returned element + * pointer is removed from the lru list and hash collision chains. + * The user now should do whatever houskeeping is necessary. Then he + * needs to call lc_element_changed(lc,element_pointer), to finish the + * change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. + * + * @lc: The lru_cache object + * @enr: element number + */ +struct lc_element* lc_get(struct lru_cache* lc, unsigned int enr) +{ + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + + PARANOIA_ENTRY(); + if ( lc->flags & LC_STARVING ) { + ++lc->starving; + RETURN(NULL); + } + + e = lc_find(lc, enr); + if (e) { + ++lc->hits; + if( e->refcnt++ == 0) lc->used++; + list_move(&e->list,&lc->in_use); // Not evictable... + RETURN(e); + } + + ++lc->misses; + + /* In case there is nothing available and we can not kick out + * the LRU element, we have to wait ... + */ + if(!lc_unused_element_available(lc)) { + __set_bit(__LC_STARVING,&lc->flags); + RETURN(NULL); + } + + /* it was not present in the cache, find an unused element, + * which then is replaced. + * we need to update the cache; serialize on lc->flags & LC_DIRTY + */ + if (test_and_set_bit(__LC_DIRTY,&lc->flags)) { + ++lc->dirty; + RETURN(NULL); + } + + e = lc_get_unused_element(lc); + BUG_ON(!e); + + clear_bit(__LC_STARVING,&lc->flags); + BUG_ON(++e->refcnt != 1); + lc->used++; + + lc->changing_element = e; + lc->new_number = enr; + + RETURN(e); +} + +/* similar to lc_get, + * but only gets a new reference on an existing element. + * you either get the requested element, or NULL. + */ +struct lc_element* lc_try_get(struct lru_cache* lc, unsigned int enr) +{ + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + + PARANOIA_ENTRY(); + if ( lc->flags & LC_STARVING ) { + ++lc->starving; + RETURN(NULL); + } + + e = lc_find(lc, enr); + if (e) { + ++lc->hits; + if( e->refcnt++ == 0) lc->used++; + list_move(&e->list,&lc->in_use); // Not evictable... 
+ } + RETURN(e); +} + +void lc_changed(struct lru_cache* lc, struct lc_element* e) +{ + PARANOIA_ENTRY(); + BUG_ON(e != lc->changing_element); + ++lc->changed; + e->lc_number = lc->new_number; + list_add(&e->list,&lc->in_use); + hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc, lc->new_number) ); + lc->changing_element = NULL; + lc->new_number = -1; + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); + PARANOIA_LEAVE(); +} + + +unsigned int lc_put(struct lru_cache* lc, struct lc_element* e) +{ + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + BUG_ON(!e); + + PARANOIA_ENTRY(); + BUG_ON(e->refcnt == 0); + BUG_ON(e == lc->changing_element); + if ( --e->refcnt == 0) { + list_move(&e->list,&lc->lru); // move it to the front of LRU. + lc->used--; + clear_bit(__LC_STARVING,&lc->flags); + smp_mb__after_clear_bit(); + } + RETURN(e->refcnt); +} + + +/** + * lc_set: Sets an element in the cache. You might use this function to + * setup the cache. It is expected that the elements are properly initialized. + * @lc: The lru_cache object + * @enr: element number + * @index: The elements' position in the cache + */ +void lc_set(struct lru_cache* lc, unsigned int enr, int index) +{ + struct lc_element *e; + + if ( index < 0 || index >= lc->nr_elements ) return; + + e = lc_entry(lc,index); + e->lc_number = enr; + + hlist_del_init(&e->colision); + hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc,enr) ); + list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); +} + +#if 0 +/** + * lc_dump: Dump a complete LRU cache to seq in textual form. + */ +void lc_dump(struct lru_cache* lc, struct seq_file *seq, char* utext, + void (*detail) (struct seq_file *, struct lc_element *) ) +{ + unsigned int nr_elements = lc->nr_elements; + struct lc_element *e; + int i; + + seq_printf(seq,"\tnn: lc_number refcnt %s\n ",utext); + for(i=0;ilc_number == LC_FREE ) { + seq_printf(seq,"\t%2d: FREE\n",i ); + } else { + seq_printf(seq,"\t%2d: %4u %4u ", i, + e->lc_number, + e->refcnt ); + detail(seq,e); + } + } +} + +#endif diff -uprN linux-2.6.24/drivers/block/drbd/lru_cache.h linux-2.6.24.ovz/drivers/block/drbd/lru_cache.h --- linux-2.6.24/drivers/block/drbd/lru_cache.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/block/drbd/lru_cache.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,147 @@ +/* +-*- linux-c -*- + lru_cache.c + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2007, Philipp Reisner . + Copyright (C) 2003-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +/* + The lru_cache describes a big set of objects that are addressed + by an index number (=lc_number). Only a small fraction of this set + is present in the cache. 
+ (You set the size of the cache during lc_alloc) + Once created, the api consists of + lc_find(,nr) -- finds the object with the given number, if present + lc_get(,nr) -- finds the object and increases the usage count + if not present, actions are taken to make sure that + the cache is updated, the user is notified of this by a callback. + Return value is NULL in this case. + As soon as the user informs the cache that it has been updated, + the next lc_get on that very object number will be successfull. + lc_put(,lc_element*) + -- decreases the usage count of this object, and returns the new value. + + NOTE: It is the USERS responsibility to make sure that calls do not happen concurrently. + */ + +#ifndef LRU_CACHE_H +#define LRU_CACHE_H + +#include +#ifndef HLIST_HEAD_INIT +# include "hlist.h" +#endif + +#include + +/* FIXME + * I want these structs opaque outside of lru_cache.c + */ + +struct lc_element { + struct hlist_node colision; + struct list_head list; // LRU list or free list + unsigned int refcnt; + unsigned int lc_number; +}; + +struct lru_cache { + struct list_head lru; + struct list_head free; + struct list_head in_use; + size_t element_size; + unsigned int nr_elements; + unsigned int new_number; + + /* here may or may not be a pad... */ + + unsigned int used; + unsigned long flags; + unsigned long hits, misses, starving, dirty, changed; + struct lc_element *changing_element; // just for paranoia + + void *lc_private; + const char *name; + + struct hlist_head slot[0]; + // hash colision chains here, then element storage. +}; + + +// flag-bits for lru_cache +enum { + __LC_PARANOIA, + __LC_DIRTY, + __LC_STARVING, +}; +#define LC_PARANOIA (1<<__LC_PARANOIA) +#define LC_DIRTY (1<<__LC_DIRTY) +#define LC_STARVING (1<<__LC_STARVING) + +extern struct lru_cache* lc_alloc(const char *name, unsigned int e_count, + size_t e_size, void *private_p); +extern void lc_free(struct lru_cache* lc); +extern void lc_set (struct lru_cache* lc, unsigned int enr, int index); +extern void lc_del (struct lru_cache* lc, struct lc_element *element); + +extern struct lc_element* lc_try_get(struct lru_cache* lc, unsigned int enr); +extern struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr); +extern struct lc_element* lc_get (struct lru_cache* lc, unsigned int enr); +extern unsigned int lc_put (struct lru_cache* lc, struct lc_element* e); +extern void lc_changed(struct lru_cache* lc, struct lc_element* e); + +struct seq_file; +extern size_t lc_printf_stats(struct seq_file *seq, struct lru_cache* lc); + +void lc_dump(struct lru_cache* lc, struct seq_file *seq, char* utext, + void (*detail) (struct seq_file *, struct lc_element *) ); + +/* This can be used to stop lc_get from changing the set of active elements. + * Note that the reference counts and order on the lru list may still change. + * returns true if we aquired the lock. 
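Editor's note: the caller-side contract described above (lc_get() either hits, hands back a recycled element that still carries its old number and must then be committed with lc_changed(), or returns NULL when the cache is starving or an update is already in flight) can be modelled in a few lines of plain C. The sketch below only illustrates that protocol; it uses a linear scan and a use counter instead of the real hash table and LRU lists, and none of its names are the actual lru_cache implementation.

#include <stdio.h>

#define N_ELEMENTS 4
#define NR_FREE   (-1)

struct element { int number; unsigned refcnt; unsigned last_use; };

static struct element cache[N_ELEMENTS];
static unsigned use_clock;
static int changing = -1;	/* index of the element being relabelled, -1 if none */

/* simplified lc_get(): return a slot for @nr, possibly recycling an unused one */
static struct element *model_get(int nr)
{
	int victim = -1;

	for (int i = 0; i < N_ELEMENTS; i++) {
		if (cache[i].number == nr) {		/* hit */
			cache[i].refcnt++;
			cache[i].last_use = ++use_clock;
			return &cache[i];
		}
	}
	if (changing != -1)				/* "dirty": an update is in flight */
		return NULL;
	for (int i = 0; i < N_ELEMENTS; i++)		/* miss: least recently used, unpinned */
		if (cache[i].refcnt == 0 &&
		    (victim < 0 || cache[i].last_use < cache[victim].last_use))
			victim = i;
	if (victim < 0)					/* "starving": everything is pinned */
		return NULL;
	changing = victim;
	cache[victim].refcnt = 1;
	return &cache[victim];				/* still carries its OLD number */
}

/* simplified lc_changed(): caller did its housekeeping, commit the new number */
static void model_changed(struct element *e, int nr)
{
	e->number = nr;
	e->last_use = ++use_clock;
	changing = -1;
}

static void model_put(struct element *e) { if (e->refcnt) e->refcnt--; }

int main(void)
{
	for (int i = 0; i < N_ELEMENTS; i++) cache[i].number = NR_FREE;

	struct element *e = model_get(7);
	if (e && e->number != 7) {			/* recycled: update it, then commit */
		printf("recycled a slot, old number %d\n", e->number);
		model_changed(e, 7);
	}
	model_put(e);

	e = model_get(7);				/* second time round it is a plain hit */
	printf("hit: number %d, refcnt %u\n", e->number, e->refcnt);
	model_put(e);
	return 0;
}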
+ */ +static inline int lc_try_lock(struct lru_cache* lc) +{ + return !test_and_set_bit(__LC_DIRTY,&lc->flags); +} + +static inline void lc_unlock(struct lru_cache* lc) +{ + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); +} + +static inline int lc_is_used(struct lru_cache* lc, unsigned int enr) +{ + struct lc_element* e = lc_find(lc,enr); + return (e && e->refcnt); +} + +#define LC_FREE (-1U) + +#define lc_e_base(lc) ((char*) ( (lc)->slot + (lc)->nr_elements ) ) +#define lc_entry(lc,i) ((struct lc_element*) \ + (lc_e_base(lc) + (i)*(lc)->element_size)) +#define lc_index_of(lc,e) (((char*)(e) - lc_e_base(lc))/(lc)->element_size) + +#endif diff -uprN linux-2.6.24/drivers/char/drm/drm_stub.c linux-2.6.24.ovz/drivers/char/drm/drm_stub.c --- linux-2.6.24/drivers/char/drm/drm_stub.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/char/drm/drm_stub.c 2008-03-25 18:53:59.000000000 -0500 @@ -218,6 +218,7 @@ int drm_get_dev(struct pci_dev *pdev, co if (ret) goto err_g1; + pci_set_master(pdev); if ((ret = drm_fill_in_dev(dev, pdev, ent, driver))) { printk(KERN_ERR "DRM: Fill_in_dev failed.\n"); goto err_g2; diff -uprN linux-2.6.24/drivers/char/drm/drm_vm.c linux-2.6.24.ovz/drivers/char/drm/drm_vm.c --- linux-2.6.24/drivers/char/drm/drm_vm.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/char/drm/drm_vm.c 2008-03-25 18:53:59.000000000 -0500 @@ -506,6 +506,7 @@ static int drm_mmap_dma(struct file *fil vma->vm_ops = &drm_vm_dma_ops; vma->vm_flags |= VM_RESERVED; /* Don't swap */ + vma->vm_flags |= VM_DONTEXPAND; vma->vm_file = filp; /* Needed for drm_vm_open() */ drm_vm_open_locked(vma); @@ -655,6 +656,7 @@ static int drm_mmap_locked(struct file * return -EINVAL; /* This should never happen. */ } vma->vm_flags |= VM_RESERVED; /* Don't swap */ + vma->vm_flags |= VM_DONTEXPAND; vma->vm_file = filp; /* Needed for drm_vm_open() */ drm_vm_open_locked(vma); diff -uprN linux-2.6.24/drivers/char/mspec.c linux-2.6.24.ovz/drivers/char/mspec.c --- linux-2.6.24/drivers/char/mspec.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/char/mspec.c 2008-03-25 18:53:59.000000000 -0500 @@ -283,7 +283,7 @@ mspec_mmap(struct file *file, struct vm_ vdata->refcnt = ATOMIC_INIT(1); vma->vm_private_data = vdata; - vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP); + vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff -uprN linux-2.6.24/drivers/char/pty.c linux-2.6.24.ovz/drivers/char/pty.c --- linux-2.6.24/drivers/char/pty.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/char/pty.c 2008-03-25 18:53:59.000000000 -0500 @@ -29,16 +29,30 @@ #include #include +#include + /* These are global because they are accessed in tty_io.c */ #ifdef CONFIG_UNIX98_PTYS struct tty_driver *ptm_driver; -static struct tty_driver *pts_driver; +struct tty_driver *pts_driver; +EXPORT_SYMBOL(ptm_driver); +EXPORT_SYMBOL(pts_driver); + +void prepare_pty(void) +{ +#ifdef CONFIG_VE + get_ve0()->ptm_driver = ptm_driver; + /* don't clean ptm_driver and co. 
here, they are used in vecalls.c */ +#endif +} #endif static void pty_close(struct tty_struct * tty, struct file * filp) { if (!tty) return; + + ub_pty_uncharge(tty); if (tty->driver->subtype == PTY_TYPE_MASTER) { if (tty->count > 1) printk("master pty_close: count = %d!!\n", tty->count); @@ -58,8 +72,12 @@ static void pty_close(struct tty_struct if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS - if (tty->driver == ptm_driver) + if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { + struct ve_struct *old_env; + old_env = set_exec_env(tty->owner_env); devpts_pty_kill(tty->index); + (void)set_exec_env(old_env); + } #endif tty_vhangup(tty->link); } @@ -209,6 +227,10 @@ static int pty_open(struct tty_struct *t if (tty->link->count != 1) goto out; + retval = -ENOMEM; + if (ub_pty_charge(tty)) + goto out; + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); @@ -236,7 +258,9 @@ static const struct tty_operations pty_o /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS -static struct tty_driver *pty_driver, *pty_slave_driver; +struct tty_driver *pty_driver, *pty_slave_driver; +EXPORT_SYMBOL(pty_driver); +EXPORT_SYMBOL(pty_slave_driver); static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) @@ -426,6 +450,7 @@ static void __init unix98_pty_init(void) pty_table[1].data = &ptm_driver->refcount; register_sysctl_table(pty_root_table); + prepare_pty(); } #else static inline void unix98_pty_init(void) { } diff -uprN linux-2.6.24/drivers/char/sysrq.c linux-2.6.24.ovz/drivers/char/sysrq.c --- linux-2.6.24/drivers/char/sysrq.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/char/sysrq.c 2008-03-25 18:53:59.000000000 -0500 @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include @@ -199,8 +201,14 @@ static struct sysrq_key_op sysrq_showloc static void sysrq_handle_showregs(int key, struct tty_struct *tty) { struct pt_regs *regs = get_irq_regs(); + + bust_spinlocks(1); if (regs) show_regs(regs); + bust_spinlocks(0); +#if defined(__i386__) || defined(__x86_64__) + smp_nmi_call_function(smp_show_regs, NULL, 1); +#endif } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, @@ -235,6 +243,7 @@ static struct sysrq_key_op sysrq_showsta static void sysrq_handle_showmem(int key, struct tty_struct *tty) { show_mem(); + show_slab_info(); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, @@ -250,7 +259,7 @@ static void send_sig_all(int sig) { struct task_struct *p; - for_each_process(p) { + for_each_process_all(p) { if (p->mm && !is_global_init(p)) /* Not swapper, init nor kernel thread */ force_sig(sig, p); @@ -313,7 +322,267 @@ static struct sysrq_key_op sysrq_unrt_op /* Key Operations table and lock */ static DEFINE_SPINLOCK(sysrq_key_table_lock); -static struct sysrq_key_op *sysrq_key_table[36] = { +#define SYSRQ_KEY_TABLE_LENGTH 37 +static struct sysrq_key_op **sysrq_key_table; +static struct sysrq_key_op *sysrq_default_key_table[]; + +#ifdef CONFIG_SYSRQ_DEBUG +#define SYSRQ_NAMELEN_MAX 64 +#define SYSRQ_DUMP_LINES 32 + +static struct sysrq_key_op *sysrq_debug_key_table[]; +static struct sysrq_key_op *sysrq_input_key_table[]; +static unsigned long *dump_address; +static int orig_console_loglevel; +static void (*sysrq_input_return)(char *) = NULL; + +static void dump_mem(void) +{ + unsigned long value[4]; + mm_segment_t 
old_fs; + int line, err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = 0; + + for (line = 0; line < SYSRQ_DUMP_LINES; line++) { + err |= __get_user(value[0], dump_address++); + err |= __get_user(value[1], dump_address++); + err |= __get_user(value[2], dump_address++); + err |= __get_user(value[3], dump_address++); + if (err) { + printk("Invalid address %p\n", dump_address - 4); + break; + } +#if BITS_PER_LONG == 32 + printk("0x%p: %08lx %08lx %08lx %08lx\n", + dump_address - 4, + value[0], value[1], value[2], value[3]); +#else + printk("0x%p: %016lx %016lx %016lx %016lx\n", + dump_address - 4, + value[0], value[1], value[2], value[3]); +#endif + } + set_fs(old_fs); +} + +static void write_mem(unsigned long val) +{ + mm_segment_t old_fs; + unsigned long old_val; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (__get_user(old_val, dump_address)) { + printk("Invalid address %p\n", dump_address); + goto out; + } + +#if BITS_PER_LONG == 32 + printk("Changing [%p] from %08lx to %08lx\n", + dump_address, old_val, val); +#else + printk("Changing [%p] from %016lx to %016lx\n", + dump_address, old_val, val); +#endif + __put_user(val, dump_address); +out: + set_fs(old_fs); +} + +static void handle_read(int key, struct tty_struct *tty) +{ + static int pos; + static int upper_case; + static char str[SYSRQ_NAMELEN_MAX]; + + if (key == 0) { + /* actually 0 is not shift only... */ + upper_case = 1; + return; + } + + if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) { + /* enter */ + sysrq_key_table = sysrq_debug_key_table; + str[pos] = '\0'; + pos = upper_case = 0; + printk("\n"); + if (sysrq_input_return == NULL) + printk("No return handler!!!\n"); + else + sysrq_input_return(str); + return; + }; + + /* check for alowed symbols */ + if (key == '-') { + if (upper_case) + key = '_'; + goto correct; + }; + if (key >= 'a' && key <= 'z') { + if (upper_case) + key = key - 'a' + 'A'; + goto correct; + }; + if (key >= '0' && key <= '9') + goto correct; + + upper_case = 0; + return; + +correct: + str[pos] = key; + printk("%c", (char)key); + pos++; + upper_case = 0; +} + +static struct sysrq_key_op input_read = { + .handler = handle_read, + .help_msg = "", + .action_msg = NULL, +}; + +static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = { + [0 ... 
SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read, +}; + +static void return_dump_mem(char *str) +{ + unsigned long address; + char *end; + + address = simple_strtoul(str, &end, 0); + if (*end != '\0') { + printk("Bad address [%s]\n", str); + return; + } + + dump_address = (unsigned long *)address; + dump_mem(); +} + +static void handle_dump_mem(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_dump_mem; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_dump_mem = { + .handler = handle_dump_mem, + .help_msg = "Dump", + .action_msg = "Enter address:", +}; + +static void return_resolve(char *str) +{ + unsigned long address; + + address = kallsyms_lookup_name(str); + printk("%s : %lx\n", str, address); + if (address) { + dump_address = (unsigned long *)address; + printk("Now you can dump it via X\n"); + } +} + +static void handle_resolve(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_resolve; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_resolve = { + .handler = handle_resolve, + .help_msg = "Resolve", + .action_msg = "Enter symbol name:", +}; + +static void return_write_mem(char *str) +{ + unsigned long address; + unsigned long value; + char *end; + + address = simple_strtoul(str, &end, 0); + if (*end != '-') { + printk("Bad address in %s\n", str); + return; + } + value = simple_strtoul(end + 1, &end, 0); + if (*end != '\0') { + printk("Bad value in %s\n", str); + return; + } + + dump_address = (unsigned long *)address; + write_mem(value); +} + +static void handle_write_mem(int key, struct tty_struct *tty) +{ + sysrq_input_return = return_write_mem; + sysrq_key_table = sysrq_input_key_table; +} + +static struct sysrq_key_op debug_write_mem = { + .handler = handle_write_mem, + .help_msg = "Writemem", + .action_msg = "Enter address-value:", +}; + +static void handle_next(int key, struct tty_struct *tty) +{ + dump_mem(); +} + +static struct sysrq_key_op debug_next = { + .handler = handle_next, + .help_msg = "neXt", + .action_msg = "continuing", +}; + +static void handle_quit(int key, struct tty_struct *tty) +{ + sysrq_key_table = sysrq_default_key_table; + console_loglevel = orig_console_loglevel; +} + +static struct sysrq_key_op debug_quit = { + .handler = handle_quit, + .help_msg = "Quit", + .action_msg = "Tnahk you for using debugger", +}; + +static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = { + [13] = &debug_dump_mem, /* d */ + [26] = &debug_quit, /* q */ + [27] = &debug_resolve, /* r */ + [32] = &debug_write_mem, /* w */ + [33] = &debug_next, /* x */ +}; + +static void sysrq_handle_debug(int key, struct tty_struct *tty) +{ + orig_console_loglevel = console_loglevel; + console_loglevel = 8; + sysrq_key_table = sysrq_debug_key_table; + printk("Welcome sysrq debugging mode\n" + "Press H for help\n"); +} + +static struct sysrq_key_op sysrq_debug_op = { + .handler = sysrq_handle_debug, + .help_msg = "debuG", + .action_msg = "Select desired action", +}; +#endif + +static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = { &sysrq_loglevel_op, /* 0 */ &sysrq_loglevel_op, /* 1 */ &sysrq_loglevel_op, /* 2 */ @@ -336,7 +605,11 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_term_op, /* e */ &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ +#ifdef CONFIG_SYSRQ_DEBUG + &sysrq_debug_op, /* g */ +#else NULL, /* g */ +#endif NULL, /* h */ &sysrq_kill_op, /* i */ NULL, /* j */ @@ -358,9 +631,12 @@ static struct sysrq_key_op *sysrq_key_ta /* x: 
May be registered on ppc/powerpc for xmon */ NULL, /* x */ NULL, /* y */ - NULL /* z */ + NULL, /* z */ + NULL, /* for debugger */ }; +static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table; + /* key2index calculation, -1 on invalid index */ static int sysrq_key_table_key2index(int key) { @@ -370,6 +646,10 @@ static int sysrq_key_table_key2index(int retval = key - '0'; else if ((key >= 'a') && (key <= 'z')) retval = key + 10 - 'a'; +#ifdef CONFIG_SYSRQ_DEBUG + else if (key == 0 || key == 0x0d || key == '-') + retval = SYSRQ_KEY_TABLE_LENGTH - 1; +#endif else retval = -1; return retval; @@ -411,7 +691,6 @@ void __handle_sysrq(int key, struct tty_ spin_lock_irqsave(&sysrq_key_table_lock, flags); orig_log_level = console_loglevel; console_loglevel = 7; - printk(KERN_INFO "SysRq : "); op_p = __sysrq_get_key_op(key); if (op_p) { @@ -420,16 +699,17 @@ void __handle_sysrq(int key, struct tty_ * should not) and is the invoked operation enabled? */ if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { - printk("%s\n", op_p->action_msg); + if (op_p->action_msg) + printk("%s\n", op_p->action_msg); console_loglevel = orig_log_level; op_p->handler(key, tty); } else { printk("This sysrq operation is disabled.\n"); } } else { - printk("HELP : "); + printk("SysRq HELP : "); /* Only print the help msg once per handler */ - for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { + for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) { if (sysrq_key_table[i]) { int j; diff -uprN linux-2.6.24/drivers/char/tty_io.c linux-2.6.24.ovz/drivers/char/tty_io.c --- linux-2.6.24/drivers/char/tty_io.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/char/tty_io.c 2008-03-25 18:53:59.000000000 -0500 @@ -94,6 +94,8 @@ #include #include #include +#include +#include #include #include @@ -104,6 +106,7 @@ #include #include +#include #undef TTY_DEBUG_HANGUP @@ -128,6 +131,7 @@ EXPORT_SYMBOL(tty_std_termios); into this file */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ +EXPORT_SYMBOL(tty_drivers); /* Mutex to protect creating and releasing a tty. 
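The slot numbers used in sysrq_debug_key_table[] above follow directly from the extended sysrq_key_table_key2index(): '0'-'9' map to 0-9, 'a'-'z' to key - 'a' + 10, and (under CONFIG_SYSRQ_DEBUG) Shift, Enter and '-' all land in the extra last slot, SYSRQ_KEY_TABLE_LENGTH - 1 = 36, which sysrq_input_key_table[] fills with the input_read handler. Worked out for the debug table:

	/*
	 * 'd' - 'a' + 10 = 13  -> debug_dump_mem
	 * 'q' - 'a' + 10 = 26  -> debug_quit
	 * 'r' - 'a' + 10 = 27  -> debug_resolve
	 * 'w' - 'a' + 10 = 32  -> debug_write_mem
	 * 'x' - 'a' + 10 = 33  -> debug_next
	 */
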
This is shared with vt.c for deeply disgusting hack reasons */ @@ -138,6 +142,15 @@ EXPORT_SYMBOL(tty_mutex); extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ extern int pty_limit; /* Config limit on Unix98 ptys */ static DEFINE_IDR(allocated_ptys); +#ifdef CONFIG_VE +#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys)) +#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env()) +#define ve_ptm_driver (get_exec_env()->ptm_driver) +#else +#define __ve_allocated_ptys(ve) allocated_ptys +#define ve_allocated_ptys allocated_ptys +#define ve_ptm_driver ptm_driver +#endif static DECLARE_MUTEX(allocated_ptys_lock); static int ptmx_open(struct inode *, struct file *); #endif @@ -172,9 +185,20 @@ static void proc_set_tty(struct task_str * Locking: none */ +void prepare_tty(void) +{ +#ifdef CONFIG_VE + get_ve0()->allocated_ptys = &allocated_ptys; + /* + * in this case, tty_register_driver() setups + * owner_env correctly right from the bootup + */ +#endif +} + static struct tty_struct *alloc_tty_struct(void) { - return kzalloc(sizeof(struct tty_struct), GFP_KERNEL); + return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC); } static void tty_buffer_free_all(struct tty_struct *); @@ -1148,9 +1172,29 @@ static struct tty_driver *get_tty_driver if (device < base || device >= base + p->num) continue; *index = device - base; - return p; +#ifdef CONFIG_VE + if (in_interrupt()) + goto found; + if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR +#ifdef CONFIG_UNIX98_PTYS + && (p->majormajor>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) && + (p->majormajor>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) +#endif + ) + goto found; + if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env())) + goto found; + if (!ve_accessible_strict(p->owner_env, get_exec_env())) + continue; +#endif + goto found; } return NULL; + +found: + return p; } /** @@ -1999,13 +2043,21 @@ static void tty_line_name(struct tty_dri */ static int init_dev(struct tty_driver *driver, int idx, - struct tty_struct **ret_tty) + struct tty_struct *i_tty, struct tty_struct **ret_tty) { struct tty_struct *tty, *o_tty; struct ktermios *tp, **tp_loc, *o_tp, **o_tp_loc; struct ktermios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; + struct ve_struct * owner; int retval = 0; + owner = driver->owner_env; + + if (i_tty) { + tty = i_tty; + goto fast_track; + } + /* check whether we're reopening an existing tty */ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { tty = devpts_get_tty(idx); @@ -2054,6 +2106,7 @@ static int init_dev(struct tty_driver *d tty->driver = driver; tty->index = idx; tty_line_name(driver, idx, tty->name); + tty->owner_env = owner; if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { tp_loc = &tty->termios; @@ -2064,14 +2117,14 @@ static int init_dev(struct tty_driver *d } if (!*tp_loc) { - tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL); + tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!tp) goto free_mem_out; *tp = driver->init_termios; } if (!*ltp_loc) { - ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL); + ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!ltp) goto free_mem_out; } @@ -2084,6 +2137,7 @@ static int init_dev(struct tty_driver *d o_tty->driver = driver->other; o_tty->index = idx; tty_line_name(driver->other, idx, o_tty->name); + o_tty->owner_env = owner; if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { o_tp_loc = &o_tty->termios; @@ -2094,14 +2148,14 @@ static int init_dev(struct tty_driver *d } if (!*o_tp_loc) { - o_tp = kmalloc(sizeof(struct ktermios), 
GFP_KERNEL); + o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!o_tp) goto free_mem_out; *o_tp = driver->other->init_termios; } if (!*o_ltp_loc) { - o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL); + o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); if (!o_ltp) goto free_mem_out; } @@ -2118,6 +2172,10 @@ static int init_dev(struct tty_driver *d *o_ltp_loc = o_ltp; o_tty->termios = *o_tp_loc; o_tty->termios_locked = *o_ltp_loc; +#ifdef CONFIG_VE + if (driver->other->refcount == 0) + (void)get_ve(owner); +#endif driver->other->refcount++; if (driver->subtype == PTY_TYPE_MASTER) o_tty->count++; @@ -2142,6 +2200,10 @@ static int init_dev(struct tty_driver *d *ltp_loc = ltp; tty->termios = *tp_loc; tty->termios_locked = *ltp_loc; +#ifdef CONFIG_VE + if (driver->refcount == 0) + (void)get_ve(owner); +#endif /* Compatibility until drivers always set this */ tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios); tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios); @@ -2266,7 +2328,8 @@ static void release_one_tty(struct tty_s tty->magic = 0; tty->driver->refcount--; - + if (tty->driver->refcount == 0) + put_ve(tty->owner_env); file_list_lock(); list_del_init(&tty->tty_files); file_list_unlock(); @@ -2312,7 +2375,10 @@ static void release_dev(struct file * fi int idx; char buf[64]; unsigned long flags; - +#ifdef CONFIG_UNIX98_PTYS + struct idr *idr_alloced; +#endif + tty = (struct tty_struct *)filp->private_data; if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "release_dev")) return; @@ -2326,6 +2392,9 @@ static void release_dev(struct file * fi tty->driver->subtype == PTY_TYPE_MASTER); devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; o_tty = tty->link; +#ifdef CONFIG_UNIX98_PTYS + idr_alloced = &__ve_allocated_ptys(tty->owner_env); +#endif #ifdef TTY_PARANOIA_CHECK if (idx < 0 || idx >= tty->driver->num) { @@ -2572,7 +2641,7 @@ static void release_dev(struct file * fi /* Make this pty number available for reallocation */ if (devpts) { down(&allocated_ptys_lock); - idr_remove(&allocated_ptys, idx); + idr_remove(idr_alloced, idx); up(&allocated_ptys_lock); } #endif @@ -2602,7 +2671,7 @@ static void release_dev(struct file * fi static int tty_open(struct inode * inode, struct file * filp) { - struct tty_struct *tty; + struct tty_struct *tty, *c_tty; int noctty, retval; struct tty_driver *driver; int index; @@ -2615,6 +2684,7 @@ retry_open: noctty = filp->f_flags & O_NOCTTY; index = -1; retval = 0; + c_tty = NULL; mutex_lock(&tty_mutex); @@ -2626,6 +2696,7 @@ retry_open: } driver = tty->driver; index = tty->index; + c_tty = tty; filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ goto got_driver; @@ -2633,6 +2704,12 @@ retry_open: #ifdef CONFIG_VT if (device == MKDEV(TTY_MAJOR,0)) { extern struct tty_driver *console_driver; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = console_driver; index = fg_console; noctty = 1; @@ -2640,6 +2717,12 @@ retry_open: } #endif if (device == MKDEV(TTYAUX_MAJOR,1)) { +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + mutex_unlock(&tty_mutex); + return -ENODEV; + } +#endif driver = console_device(&index); if (driver) { /* Don't let /dev/console block */ @@ -2657,7 +2740,7 @@ retry_open: return -ENODEV; } got_driver: - retval = init_dev(driver, index, &tty); + retval = init_dev(driver, index, c_tty, &tty); mutex_unlock(&tty_mutex); if (retval) return retval; @@ -2738,11 +2821,11 @@ static int 
ptmx_open(struct inode * inod /* find a device that is not in use. */ down(&allocated_ptys_lock); - if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { + if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { up(&allocated_ptys_lock); return -ENOMEM; } - idr_ret = idr_get_new(&allocated_ptys, NULL, &index); + idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index); if (idr_ret < 0) { up(&allocated_ptys_lock); if (idr_ret == -EAGAIN) @@ -2750,14 +2833,14 @@ static int ptmx_open(struct inode * inod return -EIO; } if (index >= pty_limit) { - idr_remove(&allocated_ptys, index); + idr_remove(&ve_allocated_ptys, index); up(&allocated_ptys_lock); return -EIO; } up(&allocated_ptys_lock); mutex_lock(&tty_mutex); - retval = init_dev(ptm_driver, index, &tty); + retval = init_dev(ve_ptm_driver, index, NULL, &tty); mutex_unlock(&tty_mutex); if (retval) @@ -2772,7 +2855,7 @@ static int ptmx_open(struct inode * inod goto out1; check_tty_count(tty, "tty_open"); - retval = ptm_driver->open(tty, filp); + retval = ve_ptm_driver->open(tty, filp); if (!retval) { tty_audit_opening(); return 0; @@ -2782,7 +2865,7 @@ out1: return retval; out: down(&allocated_ptys_lock); - idr_remove(&allocated_ptys, index); + idr_remove(&ve_allocated_ptys, index); up(&allocated_ptys_lock); return retval; } @@ -2988,6 +3071,8 @@ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!ve_is_super(get_exec_env())) + return -EACCES; if (file->f_op->write == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); @@ -3536,7 +3621,7 @@ void __do_SAK(struct tty_struct *tty) /* Now kill any processes that happen to have the * tty open. */ - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): task_session_nr(p)==tty->session\n", @@ -3568,7 +3653,7 @@ void __do_SAK(struct tty_struct *tty) spin_unlock(&p->files->file_lock); } task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); #endif } @@ -3914,6 +3999,7 @@ int tty_register_driver(struct tty_drive driver->put_char = tty_default_put_char; mutex_lock(&tty_mutex); + driver->owner_env = get_exec_env(); list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); @@ -4107,6 +4193,44 @@ static int __init tty_init(void) vty_init(); #endif + prepare_tty(); return 0; } module_init(tty_init); + +#ifdef CONFIG_UNIX98_PTYS +struct class *init_ve_tty_class(void) +{ + struct class * ve_tty_class; + struct class_device * ve_ptmx_dev_class; + + ve_tty_class = class_create(THIS_MODULE, "tty"); + if (IS_ERR(ve_tty_class)) + return ve_tty_class; + + ve_ptmx_dev_class = class_device_create(ve_tty_class, NULL, + MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); + if (IS_ERR(ve_ptmx_dev_class)) { + class_destroy(ve_tty_class); + return (struct class *)ve_ptmx_dev_class; + } + + return ve_tty_class; +} + +void fini_ve_tty_class(struct class *ve_tty_class) +{ + class_device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2)); + class_destroy(ve_tty_class); +} +#else +struct class *init_ve_tty_class(void) +{ + return NULL; +} +void fini_ve_tty_class(struct class *ve_tty_class) +{ +} +#endif +EXPORT_SYMBOL(init_ve_tty_class); +EXPORT_SYMBOL(fini_ve_tty_class); diff -uprN linux-2.6.24/drivers/firmware/dmi_scan.c linux-2.6.24.ovz/drivers/firmware/dmi_scan.c --- linux-2.6.24/drivers/firmware/dmi_scan.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/firmware/dmi_scan.c 2008-03-25 18:53:59.000000000 -0500 @@ -469,12 +469,3 @@ int 
dmi_get_year(int field) return year; } - -/** - * dmi_get_slot - return dmi_ident[slot] - * @slot: index into dmi_ident[] - */ -char *dmi_get_slot(int slot) -{ - return(dmi_ident[slot]); -} diff -uprN linux-2.6.24/drivers/media/video/cx23885/cx23885-cards.c linux-2.6.24.ovz/drivers/media/video/cx23885/cx23885-cards.c --- linux-2.6.24/drivers/media/video/cx23885/cx23885-cards.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/media/video/cx23885/cx23885-cards.c 2008-03-25 18:53:59.000000000 -0500 @@ -138,6 +138,10 @@ struct cx23885_subid cx23885_subids[] = .card = CX23885_BOARD_HAUPPAUGE_HVR1800, },{ .subvendor = 0x0070, + .subdevice = 0x7809, + .card = CX23885_BOARD_HAUPPAUGE_HVR1800, + },{ + .subvendor = 0x0070, .subdevice = 0x7911, .card = CX23885_BOARD_HAUPPAUGE_HVR1250, },{ diff -uprN linux-2.6.24/drivers/net/Makefile linux-2.6.24.ovz/drivers/net/Makefile --- linux-2.6.24/drivers/net/Makefile 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,10 @@ gianfar_driver-objs := gianfar.o \ obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o ucc_geth_driver-objs := ucc_geth.o ucc_geth_mii.o ucc_geth_ethtool.o +obj-$(CONFIG_VE_NETDEV) += vznetdev.o +vznetdev-objs := open_vznet.o venet_core.o +obj-$(CONFIG_VE_ETHDEV) += vzethdev.o + # # link order important here # diff -uprN linux-2.6.24/drivers/net/forcedeth.c linux-2.6.24.ovz/drivers/net/forcedeth.c --- linux-2.6.24/drivers/net/forcedeth.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/forcedeth.c 2008-03-25 18:53:59.000000000 -0500 @@ -5593,35 +5593,35 @@ static struct pci_device_id pci_tbl[] = }, { /* MCP77 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_32), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP77 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_33), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP77 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_34), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP77 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_35), - .driver_data = 
DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP79 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_36), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP79 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_37), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP79 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_38), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, { /* MCP79 Ethernet Controller */ PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_39), - .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT, + .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, }, {0,}, }; diff -uprN linux-2.6.24/drivers/net/loopback.c linux-2.6.24.ovz/drivers/net/loopback.c --- linux-2.6.24/drivers/net/loopback.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/loopback.c 2008-03-25 18:53:59.000000000 -0500 @@ -136,6 +136,12 @@ static int loopback_xmit(struct sk_buff { struct pcpu_lstats *pcpu_lstats, *lb_stats; +#ifdef CONFIG_VE + if (unlikely(get_exec_env()->disable_net)) { + kfree_skb(skb); + return 0; + } +#endif skb_orphan(skb); skb->protocol = eth_type_trans(skb,dev); @@ -242,7 +248,8 @@ static void loopback_setup(struct net_de | NETIF_F_NO_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX - | NETIF_F_NETNS_LOCAL; + | NETIF_F_NETNS_LOCAL + | NETIF_F_VIRTUAL; dev->ethtool_ops = &loopback_ethtool_ops; dev->header_ops = ð_header_ops; dev->init = loopback_dev_init; diff -uprN linux-2.6.24/drivers/net/open_vznet.c linux-2.6.24.ovz/drivers/net/open_vznet.c --- linux-2.6.24/drivers/net/open_vznet.c 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6.24.ovz/drivers/net/open_vznet.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,244 @@ +/* + * open_vznet.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Virtual Networking device used to change VE ownership on packets + */ + +#include +#include +#include + +#include +#include +#include +#include + +void veip_stop(struct ve_struct *ve) +{ + struct list_head *p, *tmp; + + write_lock_irq(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_safe(p, tmp, &ve->veip->ip_lh) { + struct ip_entry_struct *ptr; + ptr = list_entry(p, struct ip_entry_struct, ve_list); + ptr->active_env = NULL; + list_del(&ptr->ve_list); + list_del(&ptr->ip_hash); + kfree(ptr); + } + veip_put(ve->veip); + ve->veip = NULL; + if (!ve_is_super(ve)) + module_put(THIS_MODULE); +unlock: + write_unlock_irq(&veip_hash_lock); +} + +int veip_start(struct ve_struct *ve) +{ + int err, get; + + err = 0; + write_lock_irq(&veip_hash_lock); + get = ve->veip == NULL; + ve->veip = veip_findcreate(ve->veid); + if (ve->veip == NULL) + err = -ENOMEM; + write_unlock_irq(&veip_hash_lock); + if (err == 0 && get && !ve_is_super(ve)) + __module_get(THIS_MODULE); + return err; +} + +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry, *found; + int err; + + entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + + if (ve->veip == NULL) { + /* This can happen if we load venet AFTER ve was started */ + err = veip_start(ve); + if (err < 0) + goto out; + } + + write_lock_irq(&veip_hash_lock); + err = -EADDRINUSE; + found = venet_entry_lookup(addr); + if (found != NULL) + goto out_unlock; + + entry->active_env = ve; + entry->addr = *addr; + ip_entry_hash(entry, ve->veip); + + err = 0; + entry = NULL; +out_unlock: + write_unlock_irq(&veip_hash_lock); +out: + if (entry != NULL) + kfree(entry); + return err; +} + +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr) +{ + struct ip_entry_struct *found; + int err; + + err = -EADDRNOTAVAIL; + write_lock_irq(&veip_hash_lock); + found = venet_entry_lookup(addr); + if (found == NULL) + goto out; + if (found->active_env->veid != veid) + goto out; + + err = 0; + found->active_env = NULL; + + list_del(&found->ip_hash); + list_del(&found->ve_list); + kfree(found); +out: + write_unlock_irq(&veip_hash_lock); + return err; +} + +static int skb_extract_addr(struct sk_buff *skb, + struct ve_addr_struct *addr, int dir) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + addr->family = AF_INET; + addr->key[0] = 0; + addr->key[1] = 0; + addr->key[2] = 0; + addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr); + return 0; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case __constant_htons(ETH_P_IPV6): + addr->family = AF_INET6; + memcpy(&addr->key, dir ? 
+ ipv6_hdr(skb)->daddr.s6_addr32 : + ipv6_hdr(skb)->saddr.s6_addr32, + sizeof(addr->key)); + return 0; +#endif + } + + return -EAFNOSUPPORT; +} + +static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir) +{ + struct ip_entry_struct *entry; + struct ve_addr_struct addr; + + if (skb_extract_addr(skb, &addr, dir) < 0) + return NULL; + + entry = venet_entry_lookup(&addr); + if (entry == NULL) + return NULL; + + return entry->active_env; +} + +int venet_change_skb_owner(struct sk_buff *skb) +{ + struct ve_struct *ve, *ve_old; + + ve_old = skb->owner_env; + + read_lock(&veip_hash_lock); + if (!ve_is_super(ve_old)) { + /* from VE to host */ + ve = venet_find_ve(skb, 0); + if (ve == NULL) + goto out_drop; + if (!ve_accessible_strict(ve, ve_old)) + goto out_source; + skb->owner_env = get_ve0(); + } else { + /* from host to VE */ + ve = venet_find_ve(skb, 1); + if (ve == NULL) + goto out_drop; + skb->owner_env = ve; + } + read_unlock(&veip_hash_lock); + + return 0; + +out_drop: + read_unlock(&veip_hash_lock); + return -ESRCH; + +out_source: + read_unlock(&veip_hash_lock); + if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) { + printk(KERN_WARNING "Dropped packet, source wrong " + "veid=%u src-IP=%u.%u.%u.%u " + "dst-IP=%u.%u.%u.%u\n", + skb->owner_env->veid, + NIPQUAD(ip_hdr(skb)->saddr), + NIPQUAD(ip_hdr(skb)->daddr)); + } + return -EACCES; +} + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct ip_entry_struct *entry; + char s[40]; + + p = (struct list_head *)v; + if (p == ip_entry_hash_table) { + seq_puts(m, "Version: 2.5\n"); + return 0; + } + entry = list_entry(p, struct ip_entry_struct, ip_hash); + veaddr_print(s, sizeof(s), &entry->addr); + seq_printf(m, "%39s %10u\n", s, 0); + return 0; +} +#endif + +__exit void veip_cleanup(void) +{ + int i; + + write_lock_irq(&veip_hash_lock); + for (i = 0; i < VEIP_HASH_SZ; i++) + while (!list_empty(ip_entry_hash_table + i)) { + struct ip_entry_struct *entry; + + entry = list_first_entry(ip_entry_hash_table + i, + struct ip_entry_struct, ip_hash); + list_del(&entry->ip_hash); + kfree(entry); + } + write_unlock_irq(&veip_hash_lock); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); +MODULE_LICENSE("GPL v2"); diff -uprN linux-2.6.24/drivers/net/sky2.c linux-2.6.24.ovz/drivers/net/sky2.c --- linux-2.6.24/drivers/net/sky2.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/sky2.c 2008-03-25 18:53:59.000000000 -0500 @@ -621,6 +621,7 @@ static void sky2_phy_power(struct sky2_h static const u32 phy_power[] = { PCI_Y2_PHY1_POWD, PCI_Y2_PHY2_POWD }; static const u32 coma_mode[] = { PCI_Y2_PHY1_COMA, PCI_Y2_PHY2_COMA }; + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); reg1 = sky2_pci_read32(hw, PCI_DEV_REG1); /* Turn on/off phy power saving */ if (onoff) @@ -632,7 +633,8 @@ static void sky2_phy_power(struct sky2_h reg1 |= coma_mode[port]; sky2_pci_write32(hw, PCI_DEV_REG1, reg1); - reg1 = sky2_pci_read32(hw, PCI_DEV_REG1); + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); + sky2_pci_read32(hw, PCI_DEV_REG1); udelay(100); } @@ -1412,6 +1414,7 @@ static int sky2_up(struct net_device *de imask |= portirq_msk[port]; sky2_write32(hw, B0_IMSK, imask); + sky2_set_multicast(dev); return 0; err_out: @@ -2426,6 +2429,7 @@ static void sky2_hw_intr(struct sky2_hw if (status & (Y2_IS_MST_ERR | Y2_IS_IRQ_STAT)) { u16 pci_err; + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); pci_err = sky2_pci_read16(hw, PCI_STATUS); if 
(net_ratelimit()) dev_err(&pdev->dev, "PCI hardware error (0x%x)\n", @@ -2433,12 +2437,14 @@ static void sky2_hw_intr(struct sky2_hw sky2_pci_write16(hw, PCI_STATUS, pci_err | PCI_STATUS_ERROR_BITS); + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); } if (status & Y2_IS_PCI_EXP) { /* PCI-Express uncorrectable Error occurred */ u32 err; + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); err = sky2_read32(hw, Y2_CFG_AER + PCI_ERR_UNCOR_STATUS); sky2_write32(hw, Y2_CFG_AER + PCI_ERR_UNCOR_STATUS, 0xfffffffful); @@ -2446,6 +2452,7 @@ static void sky2_hw_intr(struct sky2_hw dev_err(&pdev->dev, "PCI Express error (0x%x)\n", err); sky2_read32(hw, Y2_CFG_AER + PCI_ERR_UNCOR_STATUS); + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); } if (status & Y2_HWE_L1_MASK) @@ -2811,6 +2818,7 @@ static void sky2_reset(struct sky2_hw *h } sky2_power_on(hw); + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); for (i = 0; i < hw->ports; i++) { sky2_write8(hw, SK_REG(i, GMAC_LINK_CTRL), GMLC_RST_SET); @@ -3533,8 +3541,6 @@ static int sky2_set_ringparam(struct net err = sky2_up(dev); if (err) dev_close(dev); - else - sky2_set_multicast(dev); } return err; @@ -4368,8 +4374,6 @@ static int sky2_resume(struct pci_dev *p dev_close(dev); goto out; } - - sky2_set_multicast(dev); } } diff -uprN linux-2.6.24/drivers/net/tun.c linux-2.6.24.ovz/drivers/net/tun.c --- linux-2.6.24/drivers/net/tun.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/tun.c 2008-03-25 18:53:59.000000000 -0500 @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,7 @@ #include #include +#include #ifdef TUN_DEBUG static int debug; @@ -73,15 +75,18 @@ static int debug; /* Network device part of the driver */ -static LIST_HEAD(tun_dev_list); +LIST_HEAD(tun_dev_list); +EXPORT_SYMBOL(tun_dev_list); + static const struct ethtool_ops tun_ethtool_ops; /* Net device open. */ -static int tun_net_open(struct net_device *dev) +int tun_net_open(struct net_device *dev) { netif_start_queue(dev); return 0; } +EXPORT_SYMBOL(tun_net_open); /* Net device close. */ static int tun_net_close(struct net_device *dev) @@ -94,6 +99,9 @@ static int tun_net_close(struct net_devi static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); +#if 0 + struct user_beancounter *ub; +#endif DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len); @@ -118,6 +126,24 @@ static int tun_net_xmit(struct sk_buff * } } + /* + * XXX this code is broken: + * See comment in dev_queue_xmit + */ +#if 0 + ub = netdev_bc(dev)->exec_ub; + if (ub && (skb_bc(skb)->charged == 0)) { + unsigned long charge; + charge = skb_charge_fullsize(skb); + if (charge_beancounter(ub, UB_OTHERSOCKBUF, charge, 1)) + goto drop; + get_beancounter(ub); + skb_bc(skb)->ub = ub; + skb_bc(skb)->charged = charge; + skb_bc(skb)->resource = UB_OTHERSOCKBUF; + } +#endif + /* Queue packet */ skb_queue_tail(&tun->readq, skb); dev->trans_start = jiffies; @@ -184,7 +210,7 @@ tun_net_change_mtu(struct net_device *de } /* Initialize net device. 
*/ -static void tun_net_init(struct net_device *dev) +void tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -216,6 +242,7 @@ static void tun_net_init(struct net_devi break; } } +EXPORT_SYMBOL(tun_net_init); /* Character device part */ @@ -415,11 +442,13 @@ static ssize_t tun_chr_aio_read(struct k DBG(KERN_DEBUG "%s: tun_chr_readv: accepted: %s\n", tun->dev->name, print_mac(mac, addr)); ret = tun_put_user(tun, skb, (struct iovec *) iv, len); + /* skb will be uncharged in kfree_skb() */ kfree_skb(skb); break; } else { DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %s\n", tun->dev->name, print_mac(mac, addr)); + /* skb will be uncharged in kfree_skb() */ kfree_skb(skb); continue; } @@ -431,7 +460,7 @@ static ssize_t tun_chr_aio_read(struct k return ret; } -static void tun_setup(struct net_device *dev) +void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); @@ -446,7 +475,9 @@ static void tun_setup(struct net_device dev->stop = tun_net_close; dev->ethtool_ops = &tun_ethtool_ops; dev->destructor = free_netdev; + dev->features |= NETIF_F_VIRTUAL; } +EXPORT_SYMBOL(tun_setup); static struct tun_struct *tun_get_by_name(const char *name) { @@ -454,8 +485,9 @@ static struct tun_struct *tun_get_by_nam ASSERT_RTNL(); list_for_each_entry(tun, &tun_dev_list, list) { - if (!strncmp(tun->dev->name, name, IFNAMSIZ)) - return tun; + if (ve_accessible_strict(tun->dev->owner_env, get_exec_env()) && + !strncmp(tun->dev->name, name, IFNAMSIZ)) + return tun; } return NULL; @@ -463,6 +495,7 @@ static struct tun_struct *tun_get_by_nam static int tun_set_iff(struct file *file, struct ifreq *ifr) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct tun_struct *tun; struct net_device *dev; int err; @@ -477,10 +510,11 @@ static int tun_set_iff(struct file *file current->euid != tun->owner) || (tun->group != -1 && current->egid != tun->group)) && - !capable(CAP_NET_ADMIN)) + !capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) return -EPERM; } - else if (__dev_get_by_name(&init_net, ifr->ifr_name)) + else if (__dev_get_by_name(net, ifr->ifr_name)) return -EINVAL; else { char *name; @@ -488,7 +522,7 @@ static int tun_set_iff(struct file *file err = -EINVAL; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; /* Set dev type */ @@ -510,6 +544,7 @@ static int tun_set_iff(struct file *file tun_setup); if (!dev) return -ENOMEM; + dev->nd_net = net; tun = netdev_priv(dev); tun->dev = dev; @@ -546,6 +581,7 @@ static int tun_set_iff(struct file *file file->private_data = tun; tun->attached = 1; + tun->bind_file = file; strcpy(ifr->ifr_name, tun->dev->name); return 0; @@ -603,6 +639,9 @@ static int tun_chr_ioctl(struct inode *i break; case TUNSETPERSIST: + /* prohibit persist mode inside VE */ + if (!ve_is_super(get_exec_env())) + return -EPERM; /* Disable/Enable persist mode */ if (arg) tun->flags |= TUN_PERSIST; @@ -734,12 +773,13 @@ static int tun_chr_fasync(int fd, struct return 0; } -static int tun_chr_open(struct inode *inode, struct file * file) +int tun_chr_open(struct inode *inode, struct file * file) { DBG1(KERN_INFO "tunX: tun_chr_open\n"); file->private_data = NULL; return 0; } +EXPORT_SYMBOL(tun_chr_open); static int tun_chr_close(struct inode *inode, struct file *file) { diff -uprN linux-2.6.24/drivers/net/venet_core.c linux-2.6.24.ovz/drivers/net/venet_core.c --- linux-2.6.24/drivers/net/venet_core.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/venet_core.c 
2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,826 @@ +/* + * venet_core.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +/* + * Common part for Virtuozzo virtual network devices + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. */ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include +#include + +struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; +rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(veip_lh); + +struct venet_stats { + struct net_device_stats stats; + struct net_device_stats *real_stats; +}; + +static inline struct net_device_stats * +venet_stats(struct net_device *dev, int cpu) +{ + struct venet_stats *stats; + stats = (struct venet_stats*)dev->priv; + return per_cpu_ptr(stats->real_stats, cpu); +} + + +#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) + +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) +{ + list_add(&entry->ip_hash, + ip_entry_hash_table + + ip_entry_hash_function(entry->addr.key[3])); + list_add(&entry->ve_list, &veip->ip_lh); +} + +void veip_put(struct veip_struct *veip) +{ + if (!list_empty(&veip->ip_lh)) + return; + if (!list_empty(&veip->src_lh)) + return; + if (!list_empty(&veip->dst_lh)) + return; + + list_del(&veip->list); + kfree(veip); +} + +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr) +{ + struct ip_entry_struct *entry; + + list_for_each_entry (entry, ip_entry_hash_table + + ip_entry_hash_function(addr->key[3]), ip_hash) + if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) + return entry; + return NULL; +} + +struct veip_struct *veip_find(envid_t veid) +{ + struct veip_struct *ptr; + + list_for_each_entry(ptr, &veip_lh, list) { + if (ptr->veid != veid) + continue; + return ptr; + } + return NULL; +} + +struct veip_struct *veip_findcreate(envid_t veid) +{ + struct veip_struct *ptr; + + ptr = veip_find(veid); + if (ptr != NULL) + return ptr; + + ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); + if (ptr == NULL) + return NULL; + memset(ptr, 0, sizeof(struct veip_struct)); + INIT_LIST_HEAD(&ptr->ip_lh); + INIT_LIST_HEAD(&ptr->src_lh); + INIT_LIST_HEAD(&ptr->dst_lh); + ptr->veid = veid; + list_add(&ptr->list, &veip_lh); + return ptr; +} + +static int convert_sockaddr(struct sockaddr *addr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + + switch (addr->sa_family) { + case AF_INET: { + struct sockaddr_in *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in)) + break; + + err = 0; + sin = (struct sockaddr_in *)addr; + veaddr->family = AF_INET; + veaddr->key[0] = 0; + veaddr->key[1] = 0; + veaddr->key[2] = 0; + veaddr->key[3] = sin->sin_addr.s_addr; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin; + + err = -EINVAL; + if (addrlen != sizeof(struct sockaddr_in6)) + break; + + err = 0; + sin = (struct sockaddr_in6 *)addr; + veaddr->family = AF_INET6; + memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key)); + break; + } + default: + err = -EAFNOSUPPORT; + } + return err; +} + +int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr) +{ + int err; + char addr[MAX_SOCK_ADDR]; + + err = move_addr_to_kernel(uaddr, addrlen, 
&addr); + if (err < 0) + goto out; + + err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr); +out: + return err; +} + +void veaddr_print(char *str, int len, struct ve_addr_struct *a) +{ + if (a->family == AF_INET) + snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3])); + else + snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x", + ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF, + ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF, + ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF, + ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF + ); +} + +/* + * Device functions + */ + +static int venet_open(struct net_device *dev) +{ + if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE)) + return -EBUSY; + return 0; +} + +static int venet_close(struct net_device *master) +{ + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); + return 0; +} + +static void venet_destructor(struct net_device *dev) +{ + struct venet_stats *stats = (struct venet_stats *)dev->priv; + if (stats == NULL) + return; + free_percpu(stats->real_stats); + kfree(stats); + dev->priv = NULL; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). + */ +static int venet_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + int length; + + stats = venet_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + struct iphdr *iph; + iph = ip_hdr(skb); + if (MULTICAST(iph->daddr)) + goto outf; + } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h; + ip6h = ipv6_hdr(skb); + if (ipv6_addr_is_multicast(&ip6h->daddr)) + goto outf; + skb_orphan(skb); + } else { + goto outf; + } + + if (venet_change_skb_owner(skb) < 0) + goto outf; + + if (unlikely(skb->owner_env->disable_net)) + goto outf; + + rcv = skb->owner_env->_venet_dev; + if (!rcv) + /* VE going down */ + goto outf; + + dev_hold(rcv); + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + dev_put(rcv); + goto outf; + } + + skb->pkt_type = PACKET_HOST; + skb->dev = rcv; + + skb_reset_mac_header(skb); + memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); + + dst_release(skb->dst); + skb->dst = NULL; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + + rcv_stats = venet_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + dev_put(rcv); + } + + return 0; + +outf: + kfree_skb(skb); + ++stats->tx_dropped; + return 0; +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct venet_stats *stats; + + stats = (struct venet_stats *)dev->priv; + memset(&stats->stats, 0, sizeof(struct net_device_stats)); + for (i=0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = venet_stats(dev, i); + stats->stats.rx_bytes += dev_stats->rx_bytes; + stats->stats.tx_bytes += dev_stats->tx_bytes; + stats->stats.rx_packets += dev_stats->rx_packets; + stats->stats.tx_packets += dev_stats->tx_packets; + } + + return &stats->stats; +} + +/* Initialize the rest of the LOOPBACK device. 
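The address handling in these drivers (skb_extract_addr() earlier in open_vznet.c, and convert_sockaddr()/veaddr_print() above) relies on a single packing convention for struct ve_addr_struct: an IPv4 address lives in key[3] with key[0]..key[2] zeroed, an IPv6 address fills all four words, and ip_entry_hash_function() always hashes key[3]. A hedged sketch of a lookup built on that convention; the helper name is invented and the returned pointer is used without taking a VE reference, so it is illustrative only:

	/* Hypothetical helper: which VE, if any, owns this IPv4 address? */
	static struct ve_struct *ve_owning_ipv4(__be32 ip)
	{
		struct ve_addr_struct addr = {
			.family	= AF_INET,
			.key	= { 0, 0, 0, ip },	/* same packing as convert_sockaddr() */
		};
		struct ip_entry_struct *entry;
		struct ve_struct *ve = NULL;

		read_lock(&veip_hash_lock);
		entry = venet_entry_lookup(&addr);
		if (entry != NULL)
			ve = entry->active_env;		/* may be NULL for a stale entry */
		read_unlock(&veip_hash_lock);
		return ve;
	}
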
*/ +int venet_init_dev(struct net_device *dev) +{ + struct venet_stats *stats; + + dev->hard_start_xmit = venet_xmit; + stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL); + if (stats == NULL) + goto fail; + stats->real_stats = alloc_percpu(struct net_device_stats); + if (stats->real_stats == NULL) + goto fail_free; + dev->priv = stats; + + dev->get_stats = get_stats; + dev->open = venet_open; + dev->stop = venet_close; + dev->destructor = venet_destructor; + + /* + * Fill in the generic fields of the device structure. + */ + dev->type = ARPHRD_VOID; + dev->hard_header_len = ETH_HLEN; + dev->mtu = 1500; /* eth_mtu */ + dev->tx_queue_len = 0; + + memset(dev->broadcast, 0xFF, ETH_ALEN); + + /* New-style flags. */ + dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; + return 0; + +fail_free: + kfree(stats); +fail: + return -ENOMEM; +} + +static int +venet_set_op(struct net_device *dev, u32 data, + int (*fop)(struct net_device *, u32)) +{ + + struct ve_struct *ve; + int ret = 0; + + read_lock(&ve_list_lock); + for_each_ve(ve) { + struct ve_struct *ve_old; + + ve_old = set_exec_env(ve); + read_lock(&dev_base_lock); + for_each_netdev(ve->ve_ns->net_ns, dev) { + if (dev->hard_start_xmit == venet_xmit) + ret = fop(dev, data); + } + read_unlock(&dev_base_lock); + set_exec_env(ve_old); + + if (ret < 0) + break; + } + read_unlock(&ve_list_lock); + return ret; +} + +static unsigned long common_features; + +static int venet_op_set_sg(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_SG; + else + common_features &= ~NETIF_F_SG; + + return venet_set_op(dev, data, ethtool_op_set_sg); +} + +static int venet_op_set_tx_csum(struct net_device *dev, u32 data) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + if (data) + common_features |= NETIF_F_IP_CSUM; + else + common_features &= ~NETIF_F_IP_CSUM; + + return venet_set_op(dev, data, ethtool_op_set_tx_csum); +} + +#define venet_op_set_rx_csum venet_op_set_tx_csum + +static struct ethtool_ops venet_ethtool_ops = { + .get_sg = ethtool_op_get_sg, + .set_sg = venet_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = venet_op_set_tx_csum, + .get_rx_csum = ethtool_op_get_tx_csum, + .set_rx_csum = venet_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, +}; + +static void venet_setup(struct net_device *dev) +{ + dev->init = venet_init_dev; + /* + * No other features, as they are: + * - checksumming is required, and nobody else will done our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX | + NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED; + + dev->features |= common_features; + + SET_ETHTOOL_OPS(dev, &venet_ethtool_ops); +} + +#ifdef CONFIG_PROC_FS +static int veinfo_seq_show(struct seq_file *m, void *v) +{ + struct ve_struct *ve; + struct ip_entry_struct *entry; + + ve = list_entry((struct list_head *)v, struct ve_struct, ve_list); + + seq_printf(m, "%10u %5u %5u", ve->veid, + ve->class_id, atomic_read(&ve->pcounter)); + read_lock(&veip_hash_lock); + if (ve->veip == NULL) + goto unlock; + list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) { + char addr[40]; + + if (entry->active_env == NULL) + continue; + + veaddr_print(addr, sizeof(addr), &entry->addr); + if (entry->addr.family == AF_INET) + seq_printf(m, " %15s", addr); + else + seq_printf(m, " %39s", addr); + } +unlock: + read_unlock(&veip_hash_lock); + seq_putc(m, '\n'); + return 0; +} + +static void *ve_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ve_struct 
*curve; + struct list_head *entry; + loff_t l; + + curve = get_exec_env(); + read_lock(&ve_list_lock); + if (!ve_is_super(curve)) { + if (*pos != 0) + return NULL; + return curve; + } + + l = *pos; + list_for_each(entry, &ve_list_head) { + if (l == 0) + return entry; + l--; + } + return NULL; +} + +static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *entry; + + entry = (struct list_head *)v; + if (!ve_is_super(get_exec_env())) + return NULL; + (*pos)++; + return entry->next == &ve_list_head ? NULL : entry->next; +} + +static void ve_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_list_lock); +} + + +static struct seq_operations veinfo_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = veinfo_seq_show, +}; + +static int veinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veinfo_seq_op); +} + +static struct file_operations proc_veinfo_operations = { + .open = veinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *veip_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; + struct list_head *p; + int i; + + l = *pos; + write_lock_irq(&veip_hash_lock); + if (l == 0) + return ip_entry_hash_table; + for (i = 0; i < VEIP_HASH_SZ; i++) { + list_for_each(p, ip_entry_hash_table + i) { + if (--l == 0) + return p; + } + } + return NULL; +} + +static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *p; + + p = (struct list_head *)v; + while (1) { + p = p->next; + if (p < ip_entry_hash_table || + p >= ip_entry_hash_table + VEIP_HASH_SZ) { + (*pos)++; + return p; + } + if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) + return NULL; + } + return NULL; +} + +static void veip_seq_stop(struct seq_file *m, void *v) +{ + write_unlock_irq(&veip_hash_lock); +} + +static struct seq_operations veip_seq_op = { + .start = veip_seq_start, + .next = veip_seq_next, + .stop = veip_seq_stop, + .show = veip_seq_show, +}; + +static int veip_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &veip_seq_op); +} + +static struct file_operations proc_veip_operations = { + .open = veip_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr, + int addrlen) +{ + int err; + struct ve_struct *ve; + struct ve_addr_struct addr; + + err = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + + err = sockaddr_to_veaddr(uaddr, addrlen, &addr); + if (err < 0) + goto out; + + switch (op) + { + case VE_IP_ADD: + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veip_entry_add(ve, &addr); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_IP_DEL: + err = veip_entry_del(veid, &addr); + break; + default: + err = -EINVAL; + } + +out: + return err; +} + +int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VENETCTL_VE_IP_MAP: { + struct vzctl_ve_ip_map s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); + break; + } + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch(cmd) { + case VENETCTL_COMPAT_VE_IP_MAP: { + struct compat_vzctl_ve_ip_map cs; + + err = -EFAULT; + if 
(copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr), + cs.addrlen); + break; + } + default: + err = venet_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo venetcalls = { + .type = VENETCTLTYPE, + .ioctl = venet_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_venet_ioctl, +#endif + .owner = THIS_MODULE, +}; + +int venet_dev_start(struct ve_struct *ve) +{ + struct net_device *dev_venet; + int err; + + dev_venet = alloc_netdev(0, "venet%d", venet_setup); + if (!dev_venet) + return -ENOMEM; + dev_venet->nd_net = ve->ve_ns->net_ns; + err = dev_alloc_name(dev_venet, dev_venet->name); + if (err<0) + goto err; + if ((err = register_netdev(dev_venet)) != 0) + goto err; + ve->_venet_dev = dev_venet; + return 0; +err: + free_netdev(dev_venet); + printk(KERN_ERR "VENET initialization error err=%d\n", err); + return err; +} + +static int venet_start(void *data) +{ + struct ve_struct *env; + int err; + + env = (struct ve_struct *)data; + if (env->veip) + return -EEXIST; + + err = veip_start(env); + if (err != 0) + return err; + + err = venet_dev_start(env); + if (err) + goto err_free; + return 0; + +err_free: + veip_stop(env); + return err; +} + +static void venet_stop(void *data) +{ + struct ve_struct *env; + struct net_device *dev; + + env = (struct ve_struct *)data; + veip_stop(env); + + dev = env->_venet_dev; + if (dev == NULL) + return; + + unregister_netdev(dev); + env->_venet_dev = NULL; + free_netdev(dev); +} + +static struct ve_hook venet_ve_hook = { + .init = venet_start, + .fini = venet_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int venet_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; +#endif + int i, err; + + if (get_ve0()->_venet_dev != NULL) + return -EEXIST; + + for (i = 0; i < VEIP_HASH_SZ; i++) + INIT_LIST_HEAD(ip_entry_hash_table + i); + + err = venet_start(get_ve0()); + if (err) + return err; + +#ifdef CONFIG_PROC_FS + de = create_proc_glob_entry_mod("vz/veinfo", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de) + de->proc_fops = &proc_veinfo_operations; + else + printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); + + de = create_proc_entry_mod("vz/veip", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de) + de->proc_fops = &proc_veip_operations; + else + printk(KERN_WARNING "venet: can't make veip proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &venet_ve_hook); + vzioctl_register(&venetcalls); + return 0; +} + +__exit void venet_exit(void) +{ + vzioctl_unregister(&venetcalls); + ve_hook_unregister(&venet_ve_hook); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("vz/veip", NULL); + remove_proc_entry("vz/veinfo", NULL); +#endif + venet_stop(get_ve0()); + veip_cleanup(); +} + +module_init(venet_init); +module_exit(venet_exit); diff -uprN linux-2.6.24/drivers/net/vzethdev.c linux-2.6.24.ovz/drivers/net/vzethdev.c --- linux-2.6.24/drivers/net/vzethdev.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/vzethdev.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,729 @@ +/* + * veth.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
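For illustration, a user-space caller of the VENETCTL_VE_IP_MAP ioctl handled by real_ve_ip_map() above might look roughly like the sketch below. The /dev/vzctl node, the header name and the helper itself are assumptions made for the example; only the structure fields, the command and the VE_IP_ADD/VE_IP_DEL operations come from this patch.

#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <linux/vzcalluser.h>   /* assumed to declare VENETCTL_VE_IP_MAP,
                                   VE_IP_ADD and struct vzctl_ve_ip_map */

/* Hypothetical helper: map one IPv4 address into a running VE. */
static int ve_ip_add(unsigned int veid, const char *ip)
{
        struct sockaddr_in sin;
        struct vzctl_ve_ip_map m;
        int fd, err;

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        if (inet_pton(AF_INET, ip, &sin.sin_addr) != 1)
                return -1;

        memset(&m, 0, sizeof(m));
        m.veid = veid;
        m.op = VE_IP_ADD;                       /* VE_IP_DEL removes the mapping */
        m.addr = (struct sockaddr *)&sin;
        m.addrlen = sizeof(sin);

        fd = open("/dev/vzctl", O_RDWR);        /* assumed control device node */
        if (fd < 0)
                return -1;
        err = ioctl(fd, VENETCTL_VE_IP_MAP, &m);
        close(fd);
        return err;
}

On the kernel side the request is gated by CAP_SETVEID, the sockaddr is converted with sockaddr_to_veaddr(), and the entry is added under the target VE's op_sem only while that VE is running.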
+ * + */ + +/* + * Virtual ethernet device used to change VE ownership on packets + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. */ +#include /* For ARPHRD_ETHER */ +#include +#include +#include +#include + +#include +#include +#include +#include + +struct veth_struct +{ + struct net_device_stats stats; + struct net_device *pair; + struct list_head hwaddr_list; + struct net_device_stats *real_stats; + int allow_mac_change; +}; + +static LIST_HEAD(veth_hwaddr_list); +static DEFINE_RWLOCK(ve_hwaddr_lock); +static DECLARE_MUTEX(hwaddr_sem); + +#define veth_from_netdev(dev) \ + ((struct veth_struct *)(netdev_priv(dev))) +static inline struct net_device * veth_to_netdev(struct veth_struct *veth) +{ + return (struct net_device *)((char *)veth - ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST)); +} + +static inline struct net_device_stats * +veth_stats(struct net_device *dev, int cpuid) +{ + return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid); +} + +struct net_device * veth_dev_start(char *dev_addr, char *name); + +struct veth_struct *hwaddr_entry_lookup(char *name) +{ + struct veth_struct *entry; + + list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) { + BUG_ON(entry->pair == NULL); + if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0) + return entry; + } + return NULL; +} + +int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name, + char *dev_addr_ve, char *name_ve) +{ + struct net_device *dev_ve; + struct net_device *dev_ve0; + struct ve_struct *old_env; + char dev_name[IFNAMSIZ]; + int err; + + down(&hwaddr_sem); + + if (name[0] == '\0') + snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid); + else { + memcpy(dev_name, name, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve0 = veth_dev_start(dev_addr, dev_name); + if (IS_ERR(dev_ve0)) { + err = PTR_ERR(dev_ve0); + goto err; + } + + old_env = set_exec_env(ve); + if (name_ve[0] == '\0') + sprintf(dev_name, "eth%%d"); + else { + memcpy(dev_name, name_ve, IFNAMSIZ - 1); + dev_name[IFNAMSIZ - 1] = '\0'; + } + dev_ve = veth_dev_start(dev_addr_ve, dev_name); + if (IS_ERR(dev_ve)) { + err = PTR_ERR(dev_ve); + goto err_ve; + } + set_exec_env(old_env); + veth_from_netdev(dev_ve)->pair = dev_ve0; + veth_from_netdev(dev_ve0)->pair = dev_ve; + + write_lock(&ve_hwaddr_lock); + list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list); + write_unlock(&ve_hwaddr_lock); + + up(&hwaddr_sem); + return 0; + +err_ve: + set_exec_env(old_env); + unregister_netdev(dev_ve0); +err: + up(&hwaddr_sem); + return err; +} + +void veth_pair_del(struct ve_struct *env, struct veth_struct *entry) +{ + struct net_device *dev; + struct ve_struct *old_env; + + write_lock(&ve_hwaddr_lock); + list_del(&entry->hwaddr_list); + write_unlock(&ve_hwaddr_lock); + + dev = entry->pair; + BUG_ON(entry->pair == NULL); + + veth_from_netdev(dev)->pair = NULL; + entry->pair = NULL; + rtnl_lock(); + old_env = set_exec_env(dev->owner_env); + dev_close(dev); + + /* + * Now device from VE0 does not send or receive anything, + * i.e. dev->hard_start_xmit won't be called. 
+ */ + set_exec_env(env); + unregister_netdevice(veth_to_netdev(entry)); + set_exec_env(dev->owner_env); + unregister_netdevice(dev); + set_exec_env(old_env); + rtnl_unlock(); +} + +int veth_entry_del(struct ve_struct *ve, char *name) +{ + struct veth_struct *found; + int err; + + err = -ENODEV; + down(&hwaddr_sem); + found = hwaddr_entry_lookup(name); + if (found == NULL) + goto out; + if (veth_to_netdev(found)->owner_env != ve) + goto out; + + err = 0; + veth_pair_del(ve, found); + +out: + up(&hwaddr_sem); + return err; +} + +int veth_allow_change_mac(envid_t veid, char *name, int allow) +{ + struct ve_struct *ve; + struct veth_struct *found; + int err; + + err = -ESRCH; + ve = get_ve_by_id(veid); + if (!ve) + return err; + + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_ve; + err = -ENODEV; + down(&hwaddr_sem); + found = hwaddr_entry_lookup(name); + if (found == NULL) + goto out_sem; + if (veth_to_netdev(found)->owner_env != ve) + goto out_sem; + + err = 0; + found->allow_mac_change = allow; + +out_sem: + up(&hwaddr_sem); +out_ve: + up_read(&ve->op_sem); + put_ve(ve); + return err; +} + +/* + * Device functions + */ + +static int veth_open(struct net_device *dev) +{ + return 0; +} + +static int veth_close(struct net_device *master) +{ + return 0; +} + +static void veth_destructor(struct net_device *dev) +{ + free_percpu(veth_from_netdev(dev)->real_stats); + free_netdev(dev); +} + +static struct net_device_stats *get_stats(struct net_device *dev) +{ + int i; + struct net_device_stats *stats; + + stats = &veth_from_netdev(dev)->stats; + memset(stats, 0, sizeof(struct net_device_stats)); + for (i = 0; i < NR_CPUS; i++) { + struct net_device_stats *dev_stats; + + if (!cpu_possible(i)) + continue; + dev_stats = veth_stats(dev, i); + stats->rx_bytes += dev_stats->rx_bytes; + stats->tx_bytes += dev_stats->tx_bytes; + stats->rx_packets += dev_stats->rx_packets; + stats->tx_packets += dev_stats->tx_packets; + } + + return stats; +} + +/* + * The higher levels take care of making this non-reentrant (it's + * called with bh's disabled). 
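In the veth_xmit() path below, each frame is re-owned by the receiving VE (skb->owner_env) and injected with netif_rx() on the peer device; unless the VE-side device has allow_mac_change set, frames are additionally filtered by Ethernet address. Stripped of the skb plumbing, the filtering rule amounts to roughly the following predicate (a hypothetical helper with user-space types, not driver code, and it assumes the receiver lives in a VE when the sender is VE0):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define ETH_ALEN 6

/* Sketch of the MAC policy in veth_xmit():
 *  - VE0 -> VE:  multicast/broadcast passes; unicast must target the VE MAC;
 *  - VE  -> VE0: the source must be the VE device's own MAC;
 *  - once allow_mac_change is set on the VE-side device, nothing is filtered. */
static bool veth_would_forward(bool from_ve0, bool allow_mac_change,
                               const uint8_t *dst, const uint8_t *src,
                               const uint8_t *ve_mac, const uint8_t *own_mac)
{
        if (allow_mac_change)
                return true;
        if (from_ve0) {
                if (dst[0] & 1)                 /* multicast/broadcast */
                        return true;
                return memcmp(dst, ve_mac, ETH_ALEN) == 0;
        }
        return memcmp(src, own_mac, ETH_ALEN) == 0;
}

Frames that fail the check are freed and accounted as tx_dropped by the real code.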
+ */ +static int veth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats; + struct net_device *rcv = NULL; + struct veth_struct *entry; + int length; + + stats = veth_stats(dev, smp_processor_id()); + if (unlikely(get_exec_env()->disable_net)) + goto outf; + + entry = veth_from_netdev(dev); + rcv = entry->pair; + if (!rcv) + /* VE going down */ + goto outf; + + if (!(rcv->flags & IFF_UP)) { + /* Target VE does not want to receive packets */ + goto outf; + } + + if (unlikely(rcv->owner_env->disable_net)) + goto outf; + /* Filtering */ + if (ve_is_super(dev->owner_env) && + !veth_from_netdev(rcv)->allow_mac_change) { + /* from VE0 to VEX */ + if (ve_is_super(rcv->owner_env)) + goto out; + if (is_multicast_ether_addr( + ((struct ethhdr *)skb->data)->h_dest)) + goto out; + if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, + rcv->dev_addr)) + goto outf; + } else if (!ve_is_super(dev->owner_env) && + !entry->allow_mac_change) { + /* from VE to VE0 */ + if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source, + dev->dev_addr)) + goto outf; + } + +out: + skb->owner_env = rcv->owner_env; + + skb->dev = rcv; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, rcv); + + if (skb->protocol != __constant_htons(ETH_P_IP)) + skb_orphan(skb); + + dst_release(skb->dst); + skb->dst = NULL; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + length = skb->len; + + netif_rx(skb); + + stats->tx_bytes += length; + stats->tx_packets++; + if (rcv) { + struct net_device_stats *rcv_stats; + rcv_stats = veth_stats(rcv, smp_processor_id()); + rcv_stats->rx_bytes += length; + rcv_stats->rx_packets++; + } + + return 0; + +outf: + kfree_skb(skb); + stats->tx_dropped++; + return 0; +} + +static int veth_set_mac(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!ve_is_super(dev->owner_env) && + !veth_from_netdev(dev)->allow_mac_change) + return -EPERM; + if (netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + + return 0; +} + +int veth_init_dev(struct net_device *dev) +{ + dev->hard_start_xmit = veth_xmit; + dev->get_stats = get_stats; + dev->open = veth_open; + dev->stop = veth_close; + dev->destructor = veth_destructor; + + ether_setup(dev); + dev->set_mac_address = veth_set_mac; + + /* remove setted by ether_setup() handler */ + dev->change_mtu = NULL; + + dev->tx_queue_len = 0; + + veth_from_netdev(dev)->real_stats = + alloc_percpu(struct net_device_stats); + if (veth_from_netdev(dev)->real_stats == NULL) + return -ENOMEM; + + return 0; +} + +static int +veth_set_op(struct net_device *dev, u32 data, + int (*fop)(struct net_device *, u32)) +{ + struct net_device *pair; + int ret = 0; + + ret = fop(dev, data); + if (ret < 0) + goto out; + + pair = veth_from_netdev(dev)->pair; + if (pair) + ret = fop(pair, data); +out: + return ret; +} + +static int veth_op_set_sg(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_sg); +} + +static int veth_op_set_tx_csum(struct net_device *dev, u32 data) +{ + return veth_set_op(dev, data, ethtool_op_set_tx_csum); +} + +#define veth_op_set_rx_csum veth_op_set_tx_csum + +static struct ethtool_ops veth_ethtool_ops = { + .get_sg = ethtool_op_get_sg, + .set_sg = veth_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = veth_op_set_tx_csum, + .get_rx_csum = 
ethtool_op_get_tx_csum, + .set_rx_csum = veth_op_set_rx_csum, + .get_tso = ethtool_op_get_tso, +}; + +static void veth_setup(struct net_device *dev) +{ + dev->init = veth_init_dev; + /* + * No other features, as they are: + * - checksumming is required, and nobody else will done our job + */ + dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX | + NETIF_F_HIGHDMA; + + SET_ETHTOOL_OPS(dev, &veth_ethtool_ops); +} + +#ifdef CONFIG_PROC_FS +#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5] +static int vehwaddr_seq_show(struct seq_file *m, void *v) +{ + struct list_head *p; + struct veth_struct *entry; + + p = (struct list_head *)v; + if (p == &veth_hwaddr_list) { + seq_puts(m, "Version: 1.0\n"); + return 0; + } + entry = list_entry(p, struct veth_struct, hwaddr_list); + seq_printf(m, ADDR_FMT " %16s ", + ADDR_ARG(entry->pair->dev_addr), entry->pair->name); + seq_printf(m, ADDR_FMT " %16s %10u %5s\n", + ADDR_ARG(veth_to_netdev(entry)->dev_addr), + veth_to_netdev(entry)->name, + VEID(veth_to_netdev(entry)->owner_env), + entry->allow_mac_change ? "allow" : "deny"); + return 0; +} + +static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t l; + struct list_head *p; + + l = *pos; + read_lock(&ve_hwaddr_lock); + if (l == 0) + return &veth_hwaddr_list; + list_for_each(p, &veth_hwaddr_list) { + if (--l == 0) + return p; + } + return NULL; +} + +static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *p; + + p = (struct list_head *)v; + (*pos)++; + return p->next == &veth_hwaddr_list ? NULL : p->next; +} + +static void vehwaddr_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_hwaddr_lock); +} + +static struct seq_operations vehwaddr_seq_op = { + .start = vehwaddr_seq_start, + .next = vehwaddr_seq_next, + .stop = vehwaddr_seq_stop, + .show = vehwaddr_seq_show, +}; + +static int vehwaddr_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vehwaddr_seq_op); +} + +static struct file_operations proc_vehwaddr_operations = { + .open = vehwaddr_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + +int real_ve_hwaddr(envid_t veid, int op, + unsigned char *dev_addr, int addrlen, char *name, + unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve) +{ + int err; + struct ve_struct *ve; + char ve_addr[ETH_ALEN]; + + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto out; + + err = -EINVAL; + switch (op) { + case VE_ETH_ADD: + if (addrlen != ETH_ALEN) + goto out; + if (addrlen_ve != ETH_ALEN && addrlen_ve != 0) + goto out; + /* If ve addr is not set then we use dev_addr[3] & 0x80 for it */ + if (addrlen_ve == 0 && (dev_addr[3] & 0x80)) + goto out; + if (addrlen_ve == 0) { + memcpy(ve_addr, dev_addr, ETH_ALEN); + ve_addr[3] |= 0x80; + } else { + memcpy(ve_addr, dev_addr_ve, ETH_ALEN); + } + + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve); + up_read(&ve->op_sem); + put_ve(ve); + break; + + case VE_ETH_DEL: + if (name[0] == '\0') + goto out; + ve = get_ve_by_id(veid); + err = -ESRCH; + if (!ve) + goto out; + + down_read(&ve->op_sem); + if (ve->is_running) + err = veth_entry_del(ve, name); + up_read(&ve->op_sem); + put_ve(ve); + break; + case VE_ETH_ALLOW_MAC_CHANGE: + case VE_ETH_DENY_MAC_CHANGE: + err = veth_allow_change_mac(veid, name, + op == VE_ETH_ALLOW_MAC_CHANGE); + break; + } + 
+out: + return err; +} + +int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VETHCTL_VE_HWADDR: { + struct vzctl_ve_hwaddr s; + + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen, + s.dev_name, s.dev_addr_ve, s.addrlen_ve, + s.dev_name_ve); + } + break; + } + return err; +} + +static struct vzioctlinfo vethcalls = { + .type = VETHCTLTYPE, + .ioctl = veth_ioctl, + .compat_ioctl = veth_ioctl, + .owner = THIS_MODULE, +}; + +struct net_device * veth_dev_start(char *dev_addr, char *name) +{ + struct net_device *dev; + int err; + + if (!is_valid_ether_addr(dev_addr)) + return ERR_PTR(-EADDRNOTAVAIL); + + dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup); + if (!dev) + return ERR_PTR(-ENOMEM); + dev->nd_net = get_exec_env()->ve_ns->net_ns; + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err; + } + if ((err = register_netdev(dev)) != 0) + goto err; + + memcpy(dev->dev_addr, dev_addr, ETH_ALEN); + dev->addr_len = ETH_ALEN; + + return dev; +err: + free_netdev(dev); + printk(KERN_ERR "%s initialization error err=%d\n", name, err); + return ERR_PTR(err); +} + +static int veth_start(void *data) +{ + return 0; +} + +static void veth_stop(void *data) +{ + struct ve_struct *env; + struct veth_struct *entry, *tmp; + + env = (struct ve_struct *)data; + down(&hwaddr_sem); + list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list) + if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env)) + veth_pair_del(env, entry); + up(&hwaddr_sem); +} + +static struct ve_hook veth_ve_hook = { + .init = veth_start, + .fini = veth_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET, +}; + +__init int veth_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *de; + + de = create_proc_entry_mod("vz/veth", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de) + de->proc_fops = &proc_vehwaddr_operations; + else + printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n"); +#endif + + ve_hook_register(VE_SS_CHAIN, &veth_ve_hook); + vzioctl_register(&vethcalls); + KSYMRESOLVE(veth_open); + KSYMMODRESOLVE(vzethdev); + return 0; +} + +__exit void veth_exit(void) +{ + struct veth_struct *entry; + struct list_head *tmp, *n; + struct ve_struct *ve; + + KSYMMODUNRESOLVE(vzethdev); + KSYMUNRESOLVE(veth_open); + vzioctl_unregister(&vethcalls); + ve_hook_unregister(&veth_ve_hook); +#ifdef CONFIG_PROC_FS + remove_proc_entry("vz/veth", NULL); +#endif + + down(&hwaddr_sem); + list_for_each_safe(tmp, n, &veth_hwaddr_list) { + entry = list_entry(tmp, struct veth_struct, hwaddr_list); + ve = get_ve(veth_to_netdev(entry)->owner_env); + + veth_pair_del(ve, entry); + + put_ve(ve); + } + up(&hwaddr_sem); +} + +module_init(veth_init); +module_exit(veth_exit); + +MODULE_AUTHOR("Andrey Mirkin "); +MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device"); +MODULE_LICENSE("GPL v2"); + diff -uprN linux-2.6.24/drivers/net/wireless/b43/dma.c linux-2.6.24.ovz/drivers/net/wireless/b43/dma.c --- linux-2.6.24/drivers/net/wireless/b43/dma.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43/dma.c 2008-03-25 18:53:59.000000000 -0500 @@ -1106,7 +1106,7 @@ static int dma_tx_fragment(struct b43_dm { const struct b43_dma_ops *ops = ring->ops; u8 *header; - int slot; + int slot, old_top_slot, old_used_slots; int err; struct b43_dmadesc_generic *desc; struct b43_dmadesc_meta *meta; @@ 
-1116,20 +1116,31 @@ static int dma_tx_fragment(struct b43_dm #define SLOTS_PER_PACKET 2 B43_WARN_ON(skb_shinfo(skb)->nr_frags); + old_top_slot = ring->current_slot; + old_used_slots = ring->used_slots; + /* Get a slot for the header. */ slot = request_slot(ring); desc = ops->idx2desc(ring, slot, &meta_hdr); memset(meta_hdr, 0, sizeof(*meta_hdr)); header = &(ring->txhdr_cache[slot * sizeof(struct b43_txhdr_fw4)]); - b43_generate_txhdr(ring->dev, header, + err = b43_generate_txhdr(ring->dev, header, skb->data, skb->len, ctl, generate_cookie(ring, slot)); + if (unlikely(err)) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; + return err; + } meta_hdr->dmaaddr = map_descbuffer(ring, (unsigned char *)header, sizeof(struct b43_txhdr_fw4), 1); - if (dma_mapping_error(meta_hdr->dmaaddr)) + if (dma_mapping_error(meta_hdr->dmaaddr)) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; return -EIO; + } ops->fill_descriptor(ring, desc, meta_hdr->dmaaddr, sizeof(struct b43_txhdr_fw4), 1, 0, 0); @@ -1147,6 +1158,8 @@ static int dma_tx_fragment(struct b43_dm if (dma_mapping_error(meta->dmaaddr)) { bounce_skb = __dev_alloc_skb(skb->len, GFP_ATOMIC | GFP_DMA); if (!bounce_skb) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; err = -ENOMEM; goto out_unmap_hdr; } @@ -1157,6 +1170,8 @@ static int dma_tx_fragment(struct b43_dm meta->skb = skb; meta->dmaaddr = map_descbuffer(ring, skb->data, skb->len, 1); if (dma_mapping_error(meta->dmaaddr)) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; err = -EIO; goto out_free_bounce; } @@ -1219,6 +1234,13 @@ int b43_dma_tx(struct b43_wldev *dev, B43_WARN_ON(ring->stopped); err = dma_tx_fragment(ring, skb, ctl); + if (unlikely(err == -ENOKEY)) { + /* Drop this packet, as we don't have the encryption key + * anymore and must not transmit it unencrypted. */ + dev_kfree_skb_any(skb); + err = 0; + goto out_unlock; + } if (unlikely(err)) { b43err(dev->wl, "DMA tx mapping failure\n"); goto out_unlock; diff -uprN linux-2.6.24/drivers/net/wireless/b43/main.c linux-2.6.24.ovz/drivers/net/wireless/b43/main.c --- linux-2.6.24/drivers/net/wireless/b43/main.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43/main.c 2008-03-25 18:53:59.000000000 -0500 @@ -1800,6 +1800,18 @@ static int b43_upload_microcode(struct b err = -EOPNOTSUPP; goto out; } + if (fwrev > 351) { + b43err(dev->wl, "YOUR FIRMWARE IS TOO NEW. 
Please downgrade your " + "firmware.\n"); + b43err(dev->wl, "Use this firmware tarball: " + "http://downloads.openwrt.org/sources/broadcom-wl-4.80.53.0.tar.bz2\n"); + b43err(dev->wl, "Use this b43-fwcutter tarball: " + "http://bu3sch.de/b43/fwcutter/b43-fwcutter-009.tar.bz2\n"); + b43err(dev->wl, "Read, understand and _do_ what this message says, please.\n"); + b43_write32(dev, B43_MMIO_MACCTL, 0); + err = -EOPNOTSUPP; + goto out; + } b43dbg(dev->wl, "Loading firmware version %u.%u " "(20%.2i-%.2i-%.2i %.2i:%.2i:%.2i)\n", fwrev, fwpatch, @@ -3395,8 +3407,6 @@ static int b43_wireless_core_init(struct b43_bluetooth_coext_enable(dev); ssb_bus_powerup(bus, 1); /* Enable dynamic PCTL */ - memset(wl->bssid, 0, ETH_ALEN); - memset(wl->mac_addr, 0, ETH_ALEN); b43_upload_card_macaddress(dev); b43_security_init(dev); b43_rng_init(wl); @@ -3493,6 +3503,13 @@ static int b43_start(struct ieee80211_hw int did_init = 0; int err = 0; + /* Kill all old instance specific information to make sure + * the card won't use it in the short timeframe between start + * and mac80211 reconfiguring it. */ + memset(wl->bssid, 0, ETH_ALEN); + memset(wl->mac_addr, 0, ETH_ALEN); + wl->filter_flags = 0; + /* First register RFkill. * LEDs that are registered later depend on it. */ b43_rfkill_init(dev); diff -uprN linux-2.6.24/drivers/net/wireless/b43/xmit.c linux-2.6.24.ovz/drivers/net/wireless/b43/xmit.c --- linux-2.6.24/drivers/net/wireless/b43/xmit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43/xmit.c 2008-03-25 18:53:59.000000000 -0500 @@ -177,7 +177,7 @@ static u8 b43_calc_fallback_rate(u8 bitr return 0; } -static void generate_txhdr_fw4(struct b43_wldev *dev, +static int generate_txhdr_fw4(struct b43_wldev *dev, struct b43_txhdr_fw4 *txhdr, const unsigned char *fragment_data, unsigned int fragment_len, @@ -235,7 +235,15 @@ static void generate_txhdr_fw4(struct b4 B43_WARN_ON(key_idx >= dev->max_nr_keys); key = &(dev->key[key_idx]); - B43_WARN_ON(!key->keyconf); + + if (unlikely(!key->keyconf)) { + /* This key is invalid. This might only happen + * in a short timeframe after machine resume before + * we were able to reconfigure keys. + * Drop this packet completely. Do not transmit it + * unencrypted to avoid leaking information. */ + return -ENOKEY; + } /* Hardware appends ICV. 
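b43_generate_txhdr() can now fail, so dma_tx_fragment() (in the dma.c hunks above) snapshots the ring indices before taking slots and restores them on any error, and b43_dma_tx() turns -ENOKEY into a silent drop instead of transmitting the frame unencrypted. The control flow, reduced to placeholder names, is roughly:

#include <stdio.h>
#include <errno.h>

/* Placeholder types; this only mirrors the snapshot-and-restore shape used in
 * dma_tx_fragment(), it is not driver code. */
struct fake_ring {
        int current_slot;
        int used_slots;
};

static int fake_generate_txhdr(void)
{
        return -ENOKEY;         /* pretend the key vanished across suspend/resume */
}

static int fake_tx_fragment(struct fake_ring *r)
{
        int old_top_slot = r->current_slot;     /* snapshot before request_slot() */
        int old_used_slots = r->used_slots;
        int err;

        r->current_slot++;                      /* stands in for request_slot() */
        r->used_slots++;

        err = fake_generate_txhdr();
        if (err) {
                r->current_slot = old_top_slot; /* roll the ring back... */
                r->used_slots = old_used_slots;
                return err;                     /* ...and let the caller drop the skb */
        }
        return 0;
}

int main(void)
{
        struct fake_ring ring = { 0, 0 };

        if (fake_tx_fragment(&ring) == -ENOKEY)
                printf("dropped: ring untouched (slot=%d, used=%d)\n",
                       ring.current_slot, ring.used_slots);
        return 0;
}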
*/ plcp_fragment_len += txctl->icv_len; @@ -352,16 +360,18 @@ static void generate_txhdr_fw4(struct b4 txhdr->mac_ctl = cpu_to_le32(mac_ctl); txhdr->phy_ctl = cpu_to_le16(phy_ctl); txhdr->extra_ft = extra_ft; + + return 0; } -void b43_generate_txhdr(struct b43_wldev *dev, +int b43_generate_txhdr(struct b43_wldev *dev, u8 * txhdr, const unsigned char *fragment_data, unsigned int fragment_len, const struct ieee80211_tx_control *txctl, u16 cookie) { - generate_txhdr_fw4(dev, (struct b43_txhdr_fw4 *)txhdr, - fragment_data, fragment_len, txctl, cookie); + return generate_txhdr_fw4(dev, (struct b43_txhdr_fw4 *)txhdr, + fragment_data, fragment_len, txctl, cookie); } static s8 b43_rssi_postprocess(struct b43_wldev *dev, diff -uprN linux-2.6.24/drivers/net/wireless/b43/xmit.h linux-2.6.24.ovz/drivers/net/wireless/b43/xmit.h --- linux-2.6.24/drivers/net/wireless/b43/xmit.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43/xmit.h 2008-03-25 18:53:59.000000000 -0500 @@ -82,7 +82,7 @@ struct b43_txhdr_fw4 { #define B43_TX4_PHY_ANT1 0x0100 /* Use antenna 1 */ #define B43_TX4_PHY_ANTLAST 0x0300 /* Use last used antenna */ -void b43_generate_txhdr(struct b43_wldev *dev, +int b43_generate_txhdr(struct b43_wldev *dev, u8 * txhdr, const unsigned char *fragment_data, unsigned int fragment_len, diff -uprN linux-2.6.24/drivers/net/wireless/b43legacy/dma.c linux-2.6.24.ovz/drivers/net/wireless/b43legacy/dma.c --- linux-2.6.24/drivers/net/wireless/b43legacy/dma.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43legacy/dma.c 2008-03-25 18:53:59.000000000 -0500 @@ -1164,7 +1164,7 @@ static int dma_tx_fragment(struct b43leg { const struct b43legacy_dma_ops *ops = ring->ops; u8 *header; - int slot; + int slot, old_top_slot, old_used_slots; int err; struct b43legacy_dmadesc_generic *desc; struct b43legacy_dmadesc_meta *meta; @@ -1174,6 +1174,9 @@ static int dma_tx_fragment(struct b43leg #define SLOTS_PER_PACKET 2 B43legacy_WARN_ON(skb_shinfo(skb)->nr_frags != 0); + old_top_slot = ring->current_slot; + old_used_slots = ring->used_slots; + /* Get a slot for the header. 
*/ slot = request_slot(ring); desc = ops->idx2desc(ring, slot, &meta_hdr); @@ -1181,9 +1184,14 @@ static int dma_tx_fragment(struct b43leg header = &(ring->txhdr_cache[slot * sizeof( struct b43legacy_txhdr_fw3)]); - b43legacy_generate_txhdr(ring->dev, header, + err = b43legacy_generate_txhdr(ring->dev, header, skb->data, skb->len, ctl, generate_cookie(ring, slot)); + if (unlikely(err)) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; + return err; + } meta_hdr->dmaaddr = map_descbuffer(ring, (unsigned char *)header, sizeof(struct b43legacy_txhdr_fw3), 1); @@ -1206,6 +1214,8 @@ static int dma_tx_fragment(struct b43leg if (dma_mapping_error(meta->dmaaddr)) { bounce_skb = __dev_alloc_skb(skb->len, GFP_ATOMIC | GFP_DMA); if (!bounce_skb) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; err = -ENOMEM; goto out_unmap_hdr; } @@ -1216,6 +1226,8 @@ static int dma_tx_fragment(struct b43leg meta->skb = skb; meta->dmaaddr = map_descbuffer(ring, skb->data, skb->len, 1); if (dma_mapping_error(meta->dmaaddr)) { + ring->current_slot = old_top_slot; + ring->used_slots = old_used_slots; err = -EIO; goto out_free_bounce; } @@ -1282,6 +1294,13 @@ int b43legacy_dma_tx(struct b43legacy_wl B43legacy_BUG_ON(ring->stopped); err = dma_tx_fragment(ring, skb, ctl); + if (unlikely(err == -ENOKEY)) { + /* Drop this packet, as we don't have the encryption key + * anymore and must not transmit it unencrypted. */ + dev_kfree_skb_any(skb); + err = 0; + goto out_unlock; + } if (unlikely(err)) { b43legacyerr(dev->wl, "DMA tx mapping failure\n"); goto out_unlock; diff -uprN linux-2.6.24/drivers/net/wireless/b43legacy/main.c linux-2.6.24.ovz/drivers/net/wireless/b43legacy/main.c --- linux-2.6.24/drivers/net/wireless/b43legacy/main.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43legacy/main.c 2008-03-25 18:53:59.000000000 -0500 @@ -3215,8 +3215,6 @@ static int b43legacy_wireless_core_init( b43legacy_shm_write16(dev, B43legacy_SHM_SHARED, 0x0414, 0x01F4); ssb_bus_powerup(bus, 1); /* Enable dynamic PCTL */ - memset(wl->bssid, 0, ETH_ALEN); - memset(wl->mac_addr, 0, ETH_ALEN); b43legacy_upload_card_macaddress(dev); b43legacy_security_init(dev); b43legacy_rng_init(wl); @@ -3311,6 +3309,13 @@ static int b43legacy_start(struct ieee80 int did_init = 0; int err = 0; + /* Kill all old instance specific information to make sure + * the card won't use it in the short timeframe between start + * and mac80211 reconfiguring it. 
*/ + memset(wl->bssid, 0, ETH_ALEN); + memset(wl->mac_addr, 0, ETH_ALEN); + wl->filter_flags = 0; + mutex_lock(&wl->mutex); if (b43legacy_status(dev) < B43legacy_STAT_INITIALIZED) { diff -uprN linux-2.6.24/drivers/net/wireless/b43legacy/pio.c linux-2.6.24.ovz/drivers/net/wireless/b43legacy/pio.c --- linux-2.6.24/drivers/net/wireless/b43legacy/pio.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43legacy/pio.c 2008-03-25 18:53:59.000000000 -0500 @@ -181,7 +181,7 @@ union txhdr_union { struct b43legacy_txhdr_fw3 txhdr_fw3; }; -static void pio_tx_write_fragment(struct b43legacy_pioqueue *queue, +static int pio_tx_write_fragment(struct b43legacy_pioqueue *queue, struct sk_buff *skb, struct b43legacy_pio_txpacket *packet, size_t txhdr_size) @@ -189,14 +189,17 @@ static void pio_tx_write_fragment(struct union txhdr_union txhdr_data; u8 *txhdr = NULL; unsigned int octets; + int err; txhdr = (u8 *)(&txhdr_data.txhdr_fw3); B43legacy_WARN_ON(skb_shinfo(skb)->nr_frags != 0); - b43legacy_generate_txhdr(queue->dev, + err = b43legacy_generate_txhdr(queue->dev, txhdr, skb->data, skb->len, &packet->txstat.control, generate_cookie(queue, packet)); + if (err) + return err; tx_start(queue); octets = skb->len + txhdr_size; @@ -204,6 +207,8 @@ static void pio_tx_write_fragment(struct octets--; tx_data(queue, txhdr, (u8 *)skb->data, octets); tx_complete(queue, skb); + + return 0; } static void free_txpacket(struct b43legacy_pio_txpacket *packet, @@ -226,6 +231,7 @@ static int pio_tx_packet(struct b43legac struct b43legacy_pioqueue *queue = packet->queue; struct sk_buff *skb = packet->skb; u16 octets; + int err; octets = (u16)skb->len + sizeof(struct b43legacy_txhdr_fw3); if (queue->tx_devq_size < octets) { @@ -247,8 +253,14 @@ static int pio_tx_packet(struct b43legac if (queue->tx_devq_used + octets > queue->tx_devq_size) return -EBUSY; /* Now poke the device. */ - pio_tx_write_fragment(queue, skb, packet, + err = pio_tx_write_fragment(queue, skb, packet, sizeof(struct b43legacy_txhdr_fw3)); + if (unlikely(err == -ENOKEY)) { + /* Drop this packet, as we don't have the encryption key + * anymore and must not transmit it unencrypted. */ + free_txpacket(packet, 1); + return 0; + } /* Account for the packet size. * (We must not overflow the device TX queue) @@ -486,6 +498,9 @@ void b43legacy_pio_handle_txstatus(struc queue = parse_cookie(dev, status->cookie, &packet); B43legacy_WARN_ON(!queue); + if (!packet->skb) + return; + queue->tx_devq_packets--; queue->tx_devq_used -= (packet->skb->len + sizeof(struct b43legacy_txhdr_fw3)); diff -uprN linux-2.6.24/drivers/net/wireless/b43legacy/xmit.c linux-2.6.24.ovz/drivers/net/wireless/b43legacy/xmit.c --- linux-2.6.24/drivers/net/wireless/b43legacy/xmit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43legacy/xmit.c 2008-03-25 18:53:59.000000000 -0500 @@ -181,7 +181,7 @@ static u8 b43legacy_calc_fallback_rate(u return 0; } -static void generate_txhdr_fw3(struct b43legacy_wldev *dev, +static int generate_txhdr_fw3(struct b43legacy_wldev *dev, struct b43legacy_txhdr_fw3 *txhdr, const unsigned char *fragment_data, unsigned int fragment_len, @@ -252,6 +252,13 @@ static void generate_txhdr_fw3(struct b4 iv_len = min((size_t)txctl->iv_len, ARRAY_SIZE(txhdr->iv)); memcpy(txhdr->iv, ((u8 *)wlhdr) + wlhdr_len, iv_len); + } else { + /* This key is invalid. This might only happen + * in a short timeframe after machine resume before + * we were able to reconfigure keys. + * Drop this packet completely. 
Do not transmit it + * unencrypted to avoid leaking information. */ + return -ENOKEY; } } b43legacy_generate_plcp_hdr((struct b43legacy_plcp_hdr4 *) @@ -344,16 +351,18 @@ static void generate_txhdr_fw3(struct b4 /* Apply the bitfields */ txhdr->mac_ctl = cpu_to_le32(mac_ctl); txhdr->phy_ctl = cpu_to_le16(phy_ctl); + + return 0; } -void b43legacy_generate_txhdr(struct b43legacy_wldev *dev, +int b43legacy_generate_txhdr(struct b43legacy_wldev *dev, u8 *txhdr, const unsigned char *fragment_data, unsigned int fragment_len, const struct ieee80211_tx_control *txctl, u16 cookie) { - generate_txhdr_fw3(dev, (struct b43legacy_txhdr_fw3 *)txhdr, + return generate_txhdr_fw3(dev, (struct b43legacy_txhdr_fw3 *)txhdr, fragment_data, fragment_len, txctl, cookie); } diff -uprN linux-2.6.24/drivers/net/wireless/b43legacy/xmit.h linux-2.6.24.ovz/drivers/net/wireless/b43legacy/xmit.h --- linux-2.6.24/drivers/net/wireless/b43legacy/xmit.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/net/wireless/b43legacy/xmit.h 2008-03-25 18:53:59.000000000 -0500 @@ -76,7 +76,7 @@ struct b43legacy_txhdr_fw3 { -void b43legacy_generate_txhdr(struct b43legacy_wldev *dev, +int b43legacy_generate_txhdr(struct b43legacy_wldev *dev, u8 *txhdr, const unsigned char *fragment_data, unsigned int fragment_len, diff -uprN linux-2.6.24/drivers/pci/hotplug/fakephp.c linux-2.6.24.ovz/drivers/pci/hotplug/fakephp.c --- linux-2.6.24/drivers/pci/hotplug/fakephp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/pci/hotplug/fakephp.c 2008-03-25 18:53:59.000000000 -0500 @@ -39,6 +39,7 @@ #include #include #include +#include #include "../pci.h" #if !defined(MODULE) @@ -63,10 +64,16 @@ struct dummy_slot { struct list_head node; struct hotplug_slot *slot; struct pci_dev *dev; + struct work_struct remove_work; + unsigned long removed; }; static int debug; static LIST_HEAD(slot_list); +static struct workqueue_struct *dummyphp_wq; + +static void pci_rescan_worker(struct work_struct *work); +static DECLARE_WORK(pci_rescan_work, pci_rescan_worker); static int enable_slot (struct hotplug_slot *slot); static int disable_slot (struct hotplug_slot *slot); @@ -109,7 +116,7 @@ static int add_slot(struct pci_dev *dev) slot->name = &dev->dev.bus_id[0]; dbg("slot->name = %s\n", slot->name); - dslot = kmalloc(sizeof(struct dummy_slot), GFP_KERNEL); + dslot = kzalloc(sizeof(struct dummy_slot), GFP_KERNEL); if (!dslot) goto error_info; @@ -164,6 +171,14 @@ static void remove_slot(struct dummy_slo err("Problem unregistering a slot %s\n", dslot->slot->name); } +/* called from the single-threaded workqueue handler to remove a slot */ +static void remove_slot_worker(struct work_struct *work) +{ + struct dummy_slot *dslot = + container_of(work, struct dummy_slot, remove_work); + remove_slot(dslot); +} + /** * pci_rescan_slot - Rescan slot * @temp: Device template. Should be set: bus and devfn. 
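The fakephp changes around this point stop tearing a slot down synchronously from disable_slot(): removal is queued as a work item on the single-threaded dummyphp_wq, so the sysfs entry is destroyed from workqueue context rather than from the sysfs write that requested it, and the per-slot "removed" bit keeps a slot from being queued twice. The same deferral pattern in isolation (placeholder names, not driver code) looks like:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical object whose teardown must not run in the caller's context. */
struct deferred_obj {
        struct work_struct remove_work;
        unsigned long removed;                  /* bit 0: removal already queued */
};

static struct workqueue_struct *remove_wq;      /* create_singlethread_workqueue() */

static void deferred_remove_worker(struct work_struct *work)
{
        struct deferred_obj *obj =
                container_of(work, struct deferred_obj, remove_work);

        /* runs later in workqueue context, outside the requesting call chain */
        kfree(obj);
}

static int deferred_obj_remove(struct deferred_obj *obj)
{
        if (test_and_set_bit(0, &obj->removed))
                return -ENODEV;                 /* already scheduled for removal */
        INIT_WORK(&obj->remove_work, deferred_remove_worker);
        queue_work(remove_wq, &obj->remove_work);
        return 0;
}

enable_slot() is treated the same way in the hunk that follows: the PCI bus rescan is queued on the same workqueue instead of running inline.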
@@ -267,11 +282,17 @@ static inline void pci_rescan(void) { pci_rescan_buses(&pci_root_buses); } +/* called from the single-threaded workqueue handler to rescan all pci buses */ +static void pci_rescan_worker(struct work_struct *work) +{ + pci_rescan(); +} static int enable_slot(struct hotplug_slot *hotplug_slot) { /* mis-use enable_slot for rescanning of the pci bus */ - pci_rescan(); + cancel_work_sync(&pci_rescan_work); + queue_work(dummyphp_wq, &pci_rescan_work); return -ENODEV; } @@ -306,6 +327,10 @@ static int disable_slot(struct hotplug_s err("Can't remove PCI devices with other PCI devices behind it yet.\n"); return -ENODEV; } + if (test_and_set_bit(0, &dslot->removed)) { + dbg("Slot already scheduled for removal\n"); + return -ENODEV; + } /* search for subfunctions and disable them first */ if (!(dslot->dev->devfn & 7)) { for (func = 1; func < 8; func++) { @@ -328,8 +353,9 @@ static int disable_slot(struct hotplug_s /* remove the device from the pci core */ pci_remove_bus_device(dslot->dev); - /* blow away this sysfs entry and other parts. */ - remove_slot(dslot); + /* queue work item to blow away this sysfs entry and other parts. */ + INIT_WORK(&dslot->remove_work, remove_slot_worker); + queue_work(dummyphp_wq, &dslot->remove_work); return 0; } @@ -340,6 +366,7 @@ static void cleanup_slots (void) struct list_head *next; struct dummy_slot *dslot; + destroy_workqueue(dummyphp_wq); list_for_each_safe (tmp, next, &slot_list) { dslot = list_entry (tmp, struct dummy_slot, node); remove_slot(dslot); @@ -351,6 +378,10 @@ static int __init dummyphp_init(void) { info(DRIVER_DESC "\n"); + dummyphp_wq = create_singlethread_workqueue(MY_NAME); + if (!dummyphp_wq) + return -ENOMEM; + return pci_scan_buses(); } diff -uprN linux-2.6.24/drivers/pci/probe.c linux-2.6.24.ovz/drivers/pci/probe.c --- linux-2.6.24/drivers/pci/probe.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/pci/probe.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,7 @@ LIST_HEAD(pci_root_buses); EXPORT_SYMBOL(pci_root_buses); LIST_HEAD(pci_devices); +EXPORT_SYMBOL(pci_devices); /* * Some device drivers need know if pci is initiated. diff -uprN linux-2.6.24/drivers/sbus/char/bbc_envctrl.c linux-2.6.24.ovz/drivers/sbus/char/bbc_envctrl.c --- linux-2.6.24/drivers/sbus/char/bbc_envctrl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/sbus/char/bbc_envctrl.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include "bbc_i2c.h" #include "max1617.h" diff -uprN linux-2.6.24/drivers/sbus/char/envctrl.c linux-2.6.24.ovz/drivers/sbus/char/envctrl.c --- linux-2.6.24/drivers/sbus/char/envctrl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/sbus/char/envctrl.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,6 +32,7 @@ #include #include #include +#include #define ENVCTRL_MINOR 162 diff -uprN linux-2.6.24/drivers/usb/core/driver.c linux-2.6.24.ovz/drivers/usb/core/driver.c --- linux-2.6.24/drivers/usb/core/driver.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/core/driver.c 2008-03-25 18:53:59.000000000 -0500 @@ -534,8 +534,8 @@ const struct usb_device_id *usb_match_id id->driver_info is the way to create an entry that indicates that the driver want to examine every device and interface. 
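The usb_match_id() fix in the following hunk adds idProduct to the loop-termination test: the table is walked until an all-zero entry, so an id entry that sets only idProduct was previously mistaken for the terminator and neither it nor anything after it could match. An entry of that shape would look roughly like this (illustrative table, not taken from the patch):

#include <linux/usb.h>

static const struct usb_device_id example_id_table[] = {
        { USB_DEVICE(0x1234, 0x5678) },         /* ordinary vendor+product match */
        { .match_flags = USB_DEVICE_ID_MATCH_PRODUCT,
          .idProduct   = 0x9abc },              /* product-only match: every field the
                                                   old termination test looked at is 0 */
        { }                                     /* real terminating entry */
};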
*/ - for (; id->idVendor || id->bDeviceClass || id->bInterfaceClass || - id->driver_info; id++) { + for (; id->idVendor || id->idProduct || id->bDeviceClass || + id->bInterfaceClass || id->driver_info; id++) { if (usb_match_one_id(interface, id)) return id; } diff -uprN linux-2.6.24/drivers/usb/core/hub.c linux-2.6.24.ovz/drivers/usb/core/hub.c --- linux-2.6.24/drivers/usb/core/hub.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/core/hub.c 2008-03-25 18:53:59.000000000 -0500 @@ -2946,7 +2946,7 @@ static int config_descriptors_changed(st if (len < le16_to_cpu(udev->config[index].desc.wTotalLength)) len = le16_to_cpu(udev->config[index].desc.wTotalLength); } - buf = kmalloc (len, GFP_KERNEL); + buf = kmalloc(len, GFP_NOIO); if (buf == NULL) { dev_err(&udev->dev, "no mem to re-read configs after reset\n"); /* assume the worst */ diff -uprN linux-2.6.24/drivers/usb/gadget/fsl_usb2_udc.c linux-2.6.24.ovz/drivers/usb/gadget/fsl_usb2_udc.c --- linux-2.6.24/drivers/usb/gadget/fsl_usb2_udc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/gadget/fsl_usb2_udc.c 2008-03-25 18:53:59.000000000 -0500 @@ -776,7 +776,7 @@ fsl_ep_queue(struct usb_ep *_ep, struct VDBG("%s, bad params\n", __FUNCTION__); return -EINVAL; } - if (!_ep || (!ep->desc && ep_index(ep))) { + if (unlikely(!_ep || !ep->desc)) { VDBG("%s, bad ep\n", __FUNCTION__); return -EINVAL; } diff -uprN linux-2.6.24/drivers/usb/misc/usbtest.c linux-2.6.24.ovz/drivers/usb/misc/usbtest.c --- linux-2.6.24/drivers/usb/misc/usbtest.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/misc/usbtest.c 2008-03-25 18:53:59.000000000 -0500 @@ -1151,6 +1151,7 @@ static int verify_halted (int ep, struct dbg ("ep %02x couldn't get halt status, %d", ep, retval); return retval; } + le16_to_cpus(&status); if (status != 1) { dbg ("ep %02x bogus status: %04x != 1", ep, status); return -EINVAL; diff -uprN linux-2.6.24/drivers/usb/serial/cp2101.c linux-2.6.24.ovz/drivers/usb/serial/cp2101.c --- linux-2.6.24/drivers/usb/serial/cp2101.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/cp2101.c 2008-03-25 18:53:59.000000000 -0500 @@ -59,6 +59,7 @@ static struct usb_device_id id_table [] { USB_DEVICE(0x10A6, 0xAA26) }, /* Knock-off DCU-11 cable */ { USB_DEVICE(0x10AB, 0x10C5) }, /* Siemens MC60 Cable */ { USB_DEVICE(0x10B5, 0xAC70) }, /* Nokia CA-42 USB */ + { USB_DEVICE(0x10C4, 0x800A) }, /* SPORTident BSM7-D-USB main station */ { USB_DEVICE(0x10C4, 0x803B) }, /* Pololu USB-serial converter */ { USB_DEVICE(0x10C4, 0x8053) }, /* Enfora EDG1228 */ { USB_DEVICE(0x10C4, 0x8066) }, /* Argussoft In-System Programmer */ @@ -76,8 +77,13 @@ static struct usb_device_id id_table [] { USB_DEVICE(0x10C4, 0x8218) }, /* Lipowsky Industrie Elektronik GmbH, HARP-1 */ { USB_DEVICE(0x10C4, 0xEA60) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA61) }, /* Silicon Labs factory default */ + { USB_DEVICE(0x10C4, 0xF001) }, /* Elan Digital Systems USBscope50 */ + { USB_DEVICE(0x10C4, 0xF002) }, /* Elan Digital Systems USBwave12 */ + { USB_DEVICE(0x10C4, 0xF003) }, /* Elan Digital Systems USBpulse100 */ + { USB_DEVICE(0x10C4, 0xF004) }, /* Elan Digital Systems USBcount50 */ { USB_DEVICE(0x10C5, 0xEA61) }, /* Silicon Labs MobiData GPRS USB Modem */ { USB_DEVICE(0x13AD, 0x9999) }, /* Baltech card reader */ + { USB_DEVICE(0x166A, 0x0303) }, /* Clipsal 5500PCU C-Bus USB interface */ { USB_DEVICE(0x16D6, 0x0001) }, /* Jablotron serial interface */ { } /* Terminating Entry */ }; diff -uprN 
linux-2.6.24/drivers/usb/serial/ftdi_sio.c linux-2.6.24.ovz/drivers/usb/serial/ftdi_sio.c --- linux-2.6.24/drivers/usb/serial/ftdi_sio.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/ftdi_sio.c 2008-03-25 18:53:59.000000000 -0500 @@ -471,30 +471,28 @@ static struct usb_device_id id_table_com { USB_DEVICE(FTDI_VID, FTDI_IBS_PEDO_PID) }, { USB_DEVICE(FTDI_VID, FTDI_IBS_PROD_PID) }, /* - * These will probably use user-space drivers. Uncomment them if - * you need them or use the user-specified vendor/product module - * parameters (see ftdi_sio.h for the numbers). Make a fuss if - * you think the driver should recognize any of them by default. + * Due to many user requests for multiple ELV devices we enable + * them by default. */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_CLI7000_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_PPS7330_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_TFM100_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_UDF77_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_UIO88_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_UAD8_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_UDA7_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_USI2_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_T1100_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_PCD200_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_ULA200_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_CSI8_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_EM1000DL_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_PCK100_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_RFP500_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_FS20SIG_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_WS300PC_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_FHZ1300PC_PID) }, */ - /* { USB_DEVICE(FTDI_VID, FTDI_ELV_WS500_PID) }, */ + { USB_DEVICE(FTDI_VID, FTDI_ELV_CLI7000_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_PPS7330_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_TFM100_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_UDF77_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_UIO88_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_UAD8_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_UDA7_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_USI2_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_T1100_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_PCD200_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_ULA200_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_CSI8_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_EM1000DL_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_PCK100_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_RFP500_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_FS20SIG_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_WS300PC_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_FHZ1300PC_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELV_WS500_PID) }, { USB_DEVICE(FTDI_VID, LINX_SDMUSBQSS_PID) }, { USB_DEVICE(FTDI_VID, LINX_MASTERDEVEL2_PID) }, { USB_DEVICE(FTDI_VID, LINX_FUTURE_0_PID) }, @@ -545,6 +543,7 @@ static struct usb_device_id id_table_com { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16C_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16HR_PID) }, { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16HRC_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16IC_PID) }, { USB_DEVICE(KOBIL_VID, KOBIL_CONV_B1_PID) }, { USB_DEVICE(KOBIL_VID, KOBIL_CONV_KAAN_PID) }, { USB_DEVICE(POSIFLEX_VID, POSIFLEX_PP7000_PID) }, @@ -569,6 +568,7 @@ static struct usb_device_id id_table_com { USB_DEVICE(TELLDUS_VID, TELLDUS_TELLSTICK_PID) }, { USB_DEVICE(FTDI_VID, FTDI_MAXSTREAM_PID) }, { USB_DEVICE(TML_VID, TML_USB_SERIAL_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ELSTER_UNICOM_PID) }, { 
USB_DEVICE(OLIMEX_VID, OLIMEX_ARM_USB_OCD_PID), .driver_info = (kernel_ulong_t)&ftdi_olimex_quirk }, { }, /* Optional parameter entry */ diff -uprN linux-2.6.24/drivers/usb/serial/ftdi_sio.h linux-2.6.24.ovz/drivers/usb/serial/ftdi_sio.h --- linux-2.6.24/drivers/usb/serial/ftdi_sio.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/ftdi_sio.h 2008-03-25 18:53:59.000000000 -0500 @@ -245,6 +245,7 @@ #define FTDI_ELV_WS300PC_PID 0xE0F6 /* PC-Wetterstation (WS 300 PC) */ #define FTDI_ELV_FHZ1300PC_PID 0xE0E8 /* FHZ 1300 PC */ #define FTDI_ELV_WS500_PID 0xE0E9 /* PC-Wetterstation (WS 500) */ +#define FTDI_ELV_EM1010PC_PID 0xE0EF /* Engery monitor EM 1010 PC */ /* * Definitions for ID TECH (www.idt-net.com) devices @@ -278,6 +279,7 @@ #define FTDI_ATIK_ATK16C_PID 0xDF32 /* ATIK ATK-16C Colour Camera */ #define FTDI_ATIK_ATK16HR_PID 0xDF31 /* ATIK ATK-16HR Grayscale Camera */ #define FTDI_ATIK_ATK16HRC_PID 0xDF33 /* ATIK ATK-16HRC Colour Camera */ +#define FTDI_ATIK_ATK16IC_PID 0xDF35 /* ATIK ATK-16IC Grayscale Camera */ /* * Protego product ids @@ -534,6 +536,8 @@ #define OLIMEX_VID 0x15BA #define OLIMEX_ARM_USB_OCD_PID 0x0003 +/* www.elsterelectricity.com Elster Unicom III Optical Probe */ +#define FTDI_ELSTER_UNICOM_PID 0xE700 /* Product Id */ /* * The Mobility Lab (TML) diff -uprN linux-2.6.24/drivers/usb/serial/keyspan.c linux-2.6.24.ovz/drivers/usb/serial/keyspan.c --- linux-2.6.24/drivers/usb/serial/keyspan.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/keyspan.c 2008-03-25 18:53:59.000000000 -0500 @@ -838,7 +838,7 @@ static void usa49_indat_callback(struct port = (struct usb_serial_port *) urb->context; tty = port->tty; - if (urb->actual_length) { + if (tty && urb->actual_length) { /* 0x80 bit is error flag */ if ((data[0] & 0x80) == 0) { /* no error on any byte */ diff -uprN linux-2.6.24/drivers/usb/serial/kobil_sct.c linux-2.6.24.ovz/drivers/usb/serial/kobil_sct.c --- linux-2.6.24/drivers/usb/serial/kobil_sct.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/kobil_sct.c 2008-03-25 18:53:59.000000000 -0500 @@ -114,6 +114,7 @@ static struct usb_serial_driver kobil_de .usb_driver = &kobil_driver, .id_table = id_table, .num_interrupt_in = NUM_DONT_CARE, + .num_interrupt_out = NUM_DONT_CARE, .num_bulk_in = 0, .num_bulk_out = 0, .num_ports = 1, diff -uprN linux-2.6.24/drivers/usb/serial/option.c linux-2.6.24.ovz/drivers/usb/serial/option.c --- linux-2.6.24/drivers/usb/serial/option.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/option.c 2008-03-25 18:53:59.000000000 -0500 @@ -180,6 +180,7 @@ static struct usb_device_id option_ids[] { USB_DEVICE(DELL_VENDOR_ID, 0x8117) }, /* Dell Wireless 5700 Mobile Broadband CDMA/EVDO ExpressCard == Novatel Merlin XV620 CDMA/EV-DO */ { USB_DEVICE(DELL_VENDOR_ID, 0x8118) }, /* Dell Wireless 5510 Mobile Broadband HSDPA ExpressCard == Novatel Merlin XU870 HSDPA/3G */ { USB_DEVICE(DELL_VENDOR_ID, 0x8128) }, /* Dell Wireless 5700 Mobile Broadband CDMA/EVDO Mini-Card == Novatel Expedite E720 CDMA/EV-DO */ + { USB_DEVICE(DELL_VENDOR_ID, 0x8136) }, /* Dell Wireless HSDPA 5520 == Novatel Expedite EU860D */ { USB_DEVICE(DELL_VENDOR_ID, 0x8137) }, /* Dell Wireless HSDPA 5520 */ { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_E100A) }, { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_500A) }, diff -uprN linux-2.6.24/drivers/usb/serial/pl2303.c linux-2.6.24.ovz/drivers/usb/serial/pl2303.c --- linux-2.6.24/drivers/usb/serial/pl2303.c 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/pl2303.c 2008-03-25 18:53:59.000000000 -0500 @@ -65,6 +65,7 @@ static struct usb_device_id id_table [] { USB_DEVICE(ITEGNO_VENDOR_ID, ITEGNO_PRODUCT_ID_2080) }, { USB_DEVICE(MA620_VENDOR_ID, MA620_PRODUCT_ID) }, { USB_DEVICE(RATOC_VENDOR_ID, RATOC_PRODUCT_ID) }, + { USB_DEVICE(RATOC_VENDOR_ID, RATOC_PRODUCT_ID_USB60F) }, { USB_DEVICE(TRIPP_VENDOR_ID, TRIPP_PRODUCT_ID) }, { USB_DEVICE(RADIOSHACK_VENDOR_ID, RADIOSHACK_PRODUCT_ID) }, { USB_DEVICE(DCU10_VENDOR_ID, DCU10_PRODUCT_ID) }, @@ -84,9 +85,10 @@ static struct usb_device_id id_table [] { USB_DEVICE(DATAPILOT_U2_VENDOR_ID, DATAPILOT_U2_PRODUCT_ID) }, { USB_DEVICE(BELKIN_VENDOR_ID, BELKIN_PRODUCT_ID) }, { USB_DEVICE(ALCOR_VENDOR_ID, ALCOR_PRODUCT_ID) }, - { USB_DEVICE(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_ID) }, { USB_DEVICE(WS002IN_VENDOR_ID, WS002IN_PRODUCT_ID) }, { USB_DEVICE(COREGA_VENDOR_ID, COREGA_PRODUCT_ID) }, + { USB_DEVICE(HL340_VENDOR_ID, HL340_PRODUCT_ID) }, + { USB_DEVICE(YCCABLE_VENDOR_ID, YCCABLE_PRODUCT_ID) }, { } /* Terminating entry */ }; diff -uprN linux-2.6.24/drivers/usb/serial/pl2303.h linux-2.6.24.ovz/drivers/usb/serial/pl2303.h --- linux-2.6.24/drivers/usb/serial/pl2303.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/pl2303.h 2008-03-25 18:53:59.000000000 -0500 @@ -35,6 +35,7 @@ #define RATOC_VENDOR_ID 0x0584 #define RATOC_PRODUCT_ID 0xb000 +#define RATOC_PRODUCT_ID_USB60F 0xb020 #define TRIPP_VENDOR_ID 0x2478 #define TRIPP_PRODUCT_ID 0x2008 @@ -96,10 +97,6 @@ #define ALCOR_VENDOR_ID 0x058F #define ALCOR_PRODUCT_ID 0x9720 -/* Huawei E620 UMTS/HSDPA card (ID: 12d1:1001) */ -#define HUAWEI_VENDOR_ID 0x12d1 -#define HUAWEI_PRODUCT_ID 0x1001 - /* Willcom WS002IN Data Driver (by NetIndex Inc.) */ #define WS002IN_VENDOR_ID 0x11f6 #define WS002IN_PRODUCT_ID 0x2001 @@ -107,3 +104,11 @@ /* Corega CG-USBRS232R Serial Adapter */ #define COREGA_VENDOR_ID 0x07aa #define COREGA_PRODUCT_ID 0x002a + +/* HL HL-340 (ID: 4348:5523) */ +#define HL340_VENDOR_ID 0x4348 +#define HL340_PRODUCT_ID 0x5523 + +/* Y.C. 
Cable U.S.A., Inc - USB to RS-232 */ +#define YCCABLE_VENDOR_ID 0x05ad +#define YCCABLE_PRODUCT_ID 0x0fba diff -uprN linux-2.6.24/drivers/usb/serial/sierra.c linux-2.6.24.ovz/drivers/usb/serial/sierra.c --- linux-2.6.24/drivers/usb/serial/sierra.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/serial/sierra.c 2008-03-25 18:53:59.000000000 -0500 @@ -104,6 +104,7 @@ static struct usb_device_id id_table [] { USB_DEVICE(0x1199, 0x0019) }, /* Sierra Wireless AirCard 595 */ { USB_DEVICE(0x1199, 0x0021) }, /* Sierra Wireless AirCard 597E */ { USB_DEVICE(0x1199, 0x0120) }, /* Sierra Wireless USB Dongle 595U */ + { USB_DEVICE(0x1199, 0x0023) }, /* Sierra Wireless AirCard */ { USB_DEVICE(0x1199, 0x6802) }, /* Sierra Wireless MC8755 */ { USB_DEVICE(0x1199, 0x6804) }, /* Sierra Wireless MC8755 */ @@ -117,9 +118,15 @@ static struct usb_device_id id_table [] { USB_DEVICE(0x1199, 0x6851) }, /* Sierra Wireless AirCard 881 */ { USB_DEVICE(0x1199, 0x6852) }, /* Sierra Wireless AirCard 880 E */ { USB_DEVICE(0x1199, 0x6853) }, /* Sierra Wireless AirCard 881 E */ + { USB_DEVICE(0x1199, 0x6855) }, /* Sierra Wireless AirCard 880 U */ + { USB_DEVICE(0x1199, 0x6856) }, /* Sierra Wireless AirCard 881 U */ + + { USB_DEVICE(0x1199, 0x6468) }, /* Sierra Wireless MP3G - EVDO */ + { USB_DEVICE(0x1199, 0x6469) }, /* Sierra Wireless MP3G - UMTS/HSPA */ { USB_DEVICE(0x1199, 0x0112), .driver_info = DEVICE_1_PORT }, /* Sierra Wireless AirCard 580 */ { USB_DEVICE(0x0F3D, 0x0112), .driver_info = DEVICE_1_PORT }, /* Airprime/Sierra PC 5220 */ + { USB_DEVICE(0x05C6, 0x6613), .driver_info = DEVICE_1_PORT }, /* Onda H600/ZTE MF330 */ { USB_DEVICE(0x1199, 0x0FFF), .driver_info = DEVICE_INSTALLER}, { } @@ -129,6 +136,7 @@ MODULE_DEVICE_TABLE(usb, id_table); static struct usb_device_id id_table_1port [] = { { USB_DEVICE(0x1199, 0x0112) }, /* Sierra Wireless AirCard 580 */ { USB_DEVICE(0x0F3D, 0x0112) }, /* AirPrime/Sierra PC 5220 */ + { USB_DEVICE(0x05C6, 0x6613) }, /* Onda H600/ZTE MF330 */ { } }; @@ -142,6 +150,7 @@ static struct usb_device_id id_table_3po { USB_DEVICE(0x1199, 0x0019) }, /* Sierra Wireless AirCard 595 */ { USB_DEVICE(0x1199, 0x0021) }, /* Sierra Wireless AirCard 597E */ { USB_DEVICE(0x1199, 0x0120) }, /* Sierra Wireless USB Dongle 595U*/ + { USB_DEVICE(0x1199, 0x0023) }, /* Sierra Wireless AirCard */ { USB_DEVICE(0x1199, 0x6802) }, /* Sierra Wireless MC8755 */ { USB_DEVICE(0x1199, 0x6804) }, /* Sierra Wireless MC8755 */ @@ -155,6 +164,10 @@ static struct usb_device_id id_table_3po { USB_DEVICE(0x1199, 0x6851) }, /* Sierra Wireless AirCard 881 */ { USB_DEVICE(0x1199, 0x6852) }, /* Sierra Wireless AirCard 880E */ { USB_DEVICE(0x1199, 0x6853) }, /* Sierra Wireless AirCard 881E */ + { USB_DEVICE(0x1199, 0x6855) }, /* Sierra Wireless AirCard 880 U */ + { USB_DEVICE(0x1199, 0x6856) }, /* Sierra Wireless AirCard 881U */ + { USB_DEVICE(0x1199, 0x6468) }, /* Sierra Wireless MP3G - EVDO */ + { USB_DEVICE(0x1199, 0x6469) }, /* Sierra Wireless MP3G - UMTS/HSPA */ { } }; diff -uprN linux-2.6.24/drivers/usb/storage/unusual_devs.h linux-2.6.24.ovz/drivers/usb/storage/unusual_devs.h --- linux-2.6.24/drivers/usb/storage/unusual_devs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/drivers/usb/storage/unusual_devs.h 2008-03-25 18:53:59.000000000 -0500 @@ -86,6 +86,14 @@ UNUSUAL_DEV( 0x03f0, 0x0307, 0x0001, 0x US_SC_8070, US_PR_USBAT, init_usbat_cd, 0), #endif +/* Reported by Grant Grundler + * HP r707 camera in "Disk" mode with 2.00.23 or 2.00.24 firmware. 
+ */ +UNUSUAL_DEV( 0x03f0, 0x4002, 0x0001, 0x0001, + "HP", + "PhotoSmart R707", + US_SC_DEVICE, US_PR_DEVICE, NULL, US_FL_FIX_CAPACITY), + /* Reported by Sebastian Kapfer * and Olaf Hering (different bcd's, same vendor/product) * for USB floppies that need the SINGLE_LUN enforcement. diff -uprN linux-2.6.24/fs/Kconfig linux-2.6.24.ovz/fs/Kconfig --- linux-2.6.24/fs/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -562,6 +562,15 @@ config PRINT_QUOTA_WARNING Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. +config QUOTA_COMPAT + bool "Compatibility with older quotactl interface" + depends on QUOTA + help + This option enables compatibility layer for older version + of quotactl interface with byte granularity (QUOTAON at 0x0100, + GETQUOTA at 0x0D00). Interface versions older than that one and + with block granularity are still not supported. + config QFMT_V1 tristate "Old quota format support" depends on QUOTA @@ -577,6 +586,39 @@ config QFMT_V2 This quota format allows using quotas with 32-bit UIDs/GIDs. If you need this functionality say Y here. +config SIM_FS + tristate "VPS filesystem" + depends on VZ_QUOTA + default m + help + This file system is a part of Virtuozzo. It intoduces a fake + superblock and blockdev to VE to hide real device and show + statfs results taken from quota. + +config VZ_QUOTA + tristate "Virtuozzo Disk Quota support" + depends on QUOTA + select VZ_DEV + default m + help + Virtuozzo Disk Quota imposes disk quota on directories with their + files and subdirectories in total. Such disk quota is used to + account and limit disk usage by Virtuozzo VPS, but also may be used + separately. + +config VZ_QUOTA_UNLOAD + bool "Unloadable Virtuozzo Disk Quota module" + depends on VZ_QUOTA=m + default n + help + Make Virtuozzo Disk Quota module unloadable. + Doesn't work reliably now. 
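Pulling the new entries together (including CONFIG_VZ_QUOTA_UGID just below), a kernel configured for these features would typically carry a .config fragment like the one sketched here; the particular y/m choices are only an illustration, though they follow the defaults declared above, and VZ_DEV is selected automatically by VZ_QUOTA:

CONFIG_QUOTA=y
CONFIG_QUOTA_COMPAT=y
CONFIG_VZ_QUOTA=m
CONFIG_VZ_QUOTA_UGID=y
CONFIG_SIM_FS=m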
+ +config VZ_QUOTA_UGID + bool "Per-user and per-group quota in Virtuozzo quota partitions" + depends on VZ_QUOTA!=n + default y + config QUOTACTL bool depends on XFS_QUOTA || QUOTA diff -uprN linux-2.6.24/fs/Makefile linux-2.6.24.ovz/fs/Makefile --- linux-2.6.24/fs/Makefile 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -52,9 +52,15 @@ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_VZ_QUOTA) += vzdquota.o +vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o +vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o obj-$(CONFIG_DNOTIFY) += dnotify.o +obj-$(CONFIG_SIM_FS) += simfs.o + obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ diff -uprN linux-2.6.24/fs/aio.c linux-2.6.24.ovz/fs/aio.c --- linux-2.6.24/fs/aio.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/aio.c 2008-03-25 18:53:59.000000000 -0500 @@ -43,13 +43,16 @@ #endif /*------ sysctl variables----*/ -static DEFINE_SPINLOCK(aio_nr_lock); +DEFINE_SPINLOCK(aio_nr_lock); +EXPORT_SYMBOL_GPL(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ +EXPORT_SYMBOL_GPL(aio_nr); unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ static struct kmem_cache *kiocb_cachep; -static struct kmem_cache *kioctx_cachep; +struct kmem_cache *kioctx_cachep; +EXPORT_SYMBOL_GPL(kioctx_cachep); static struct workqueue_struct *aio_wq; @@ -60,7 +63,7 @@ static DECLARE_WORK(fput_work, aio_fput_ static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); -static void aio_kick_handler(struct work_struct *); +void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); /* aio_setup @@ -290,7 +293,7 @@ static void aio_cancel_all(struct kioctx spin_unlock_irq(&ctx->ctx_lock); } -static void wait_for_all_aios(struct kioctx *ctx) +void wait_for_all_aios(struct kioctx *ctx) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -313,6 +316,7 @@ static void wait_for_all_aios(struct kio out: spin_unlock_irq(&ctx->ctx_lock); } +EXPORT_SYMBOL_GPL(wait_for_all_aios); /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. @@ -835,7 +839,7 @@ static inline void aio_run_all_iocbs(str * space. * Run on aiod's context. 
*/ -static void aio_kick_handler(struct work_struct *work) +void aio_kick_handler(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, wq.work); mm_segment_t oldfs = get_fs(); @@ -856,7 +860,7 @@ static void aio_kick_handler(struct work if (requeue) queue_delayed_work(aio_wq, &ctx->wq, 0); } - +EXPORT_SYMBOL_GPL(aio_kick_handler); /* * Called by kick_iocb to queue the kiocb for retry diff -uprN linux-2.6.24/fs/autofs/init.c linux-2.6.24.ovz/fs/autofs/init.c --- linux-2.6.24/fs/autofs/init.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs/init.c 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = autofs_kill_sb, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs_fs(void) diff -uprN linux-2.6.24/fs/autofs/root.c linux-2.6.24.ovz/fs/autofs/root.c --- linux-2.6.24/fs/autofs/root.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs/root.c 2008-03-25 18:53:59.000000000 -0500 @@ -356,7 +356,7 @@ static int autofs_root_unlink(struct ino /* This allows root to remove symlinks */ lock_kernel(); - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) { + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) { unlock_kernel(); return -EACCES; } @@ -542,7 +542,7 @@ static int autofs_root_ioctl(struct inod _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; switch(cmd) { diff -uprN linux-2.6.24/fs/autofs4/autofs_i.h linux-2.6.24.ovz/fs/autofs4/autofs_i.h --- linux-2.6.24/fs/autofs4/autofs_i.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs4/autofs_i.h 2008-03-25 18:53:59.000000000 -0500 @@ -94,6 +94,10 @@ struct autofs_wait_queue { #define AUTOFS_TYPE_DIRECT 0x0002 #define AUTOFS_TYPE_OFFSET 0x0004 +/* flags for userspace automount daemon */ +#define AUTOFS_DEAMON_32BIT 0 /* automount is a 32bit process */ +#define _AUTOFS_DEAMON_32BIT (1 << AUTOFS_DEAMON_32BIT) + struct autofs_sb_info { u32 magic; int pipefd; @@ -114,6 +118,7 @@ struct autofs_sb_info { struct autofs_wait_queue *queues; /* Wait queue pointer */ spinlock_t rehash_lock; struct list_head rehash_list; + u32 flags; /* flags for userspace automount daemon */ }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) diff -uprN linux-2.6.24/fs/autofs4/init.c linux-2.6.24.ovz/fs/autofs4/init.c --- linux-2.6.24/fs/autofs4/init.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs4/init.c 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,7 @@ static struct file_system_type autofs_fs .name = "autofs", .get_sb = autofs_get_sb, .kill_sb = autofs4_kill_sb, + .fs_flags = FS_VIRTUALIZED, }; static int __init init_autofs4_fs(void) diff -uprN linux-2.6.24/fs/autofs4/inode.c linux-2.6.24.ovz/fs/autofs4/inode.c --- linux-2.6.24/fs/autofs4/inode.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs4/inode.c 2008-03-25 18:53:59.000000000 -0500 @@ -311,6 +311,7 @@ int autofs4_fill_super(struct super_bloc int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + struct task_struct *tsk = current; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -330,6 +331,12 @@ int autofs4_fill_super(struct super_bloc sbi->type = 0; sbi->min_proto = 0; sbi->max_proto = 0; +#ifdef __x86_64__ + if 
(task_thread_info(tsk)->flags & _TIF_IA32) { + /* mark that automount daemon is 32 bit */ + sbi->flags |= _AUTOFS_DEAMON_32BIT; + } +#endif mutex_init(&sbi->wq_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; diff -uprN linux-2.6.24/fs/autofs4/root.c linux-2.6.24.ovz/fs/autofs4/root.c --- linux-2.6.24/fs/autofs4/root.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs4/root.c 2008-03-25 18:53:59.000000000 -0500 @@ -762,7 +762,7 @@ static int autofs4_dir_unlink(struct ino struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EACCES; if (atomic_dec_and_test(&ino->count)) { @@ -982,7 +982,7 @@ static int autofs4_root_ioctl(struct ino _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; switch(cmd) { diff -uprN linux-2.6.24/fs/autofs4/waitq.c linux-2.6.24.ovz/fs/autofs4/waitq.c --- linux-2.6.24/fs/autofs4/waitq.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/autofs4/waitq.c 2008-03-25 18:53:59.000000000 -0500 @@ -102,27 +102,50 @@ static void autofs4_notify_daemon(struct /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: { - struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; + if (sbi->flags & _AUTOFS_DEAMON_32BIT) { + struct autofs_packet_missing_32bit *mp = &pkt.v4_pkt.missing_32bit; - pktsz = sizeof(*mp); + pktsz = sizeof(*mp); + mp->wait_queue_token = wq->wait_queue_token; + mp->len = wq->len; + memcpy(mp->name, wq->name, wq->len); + mp->name[wq->len] = '\0'; + break; + } else { + struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; + + pktsz = sizeof(*mp); - mp->wait_queue_token = wq->wait_queue_token; - mp->len = wq->len; - memcpy(mp->name, wq->name, wq->len); - mp->name[wq->len] = '\0'; - break; + mp->wait_queue_token = wq->wait_queue_token; + mp->len = wq->len; + memcpy(mp->name, wq->name, wq->len); + mp->name[wq->len] = '\0'; + break; + } } case autofs_ptype_expire_multi: { - struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; + if (sbi->flags & _AUTOFS_DEAMON_32BIT) { + struct autofs_packet_expire_multi_32bit *ep = &pkt.v4_pkt.expire_multi_32bit; + + pktsz = sizeof(*ep); + + ep->wait_queue_token = wq->wait_queue_token; + ep->len = wq->len; + memcpy(ep->name, wq->name, wq->len); + ep->name[wq->len] = '\0'; + break; + } else { + struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; - pktsz = sizeof(*ep); + pktsz = sizeof(*ep); - ep->wait_queue_token = wq->wait_queue_token; - ep->len = wq->len; - memcpy(ep->name, wq->name, wq->len); - ep->name[wq->len] = '\0'; - break; + ep->wait_queue_token = wq->wait_queue_token; + ep->len = wq->len; + memcpy(ep->name, wq->name, wq->len); + ep->name[wq->len] = '\0'; + break; + } } /* * Kernel protocol v5 packet for handling indirect and direct @@ -133,21 +156,39 @@ static void autofs4_notify_daemon(struct case autofs_ptype_missing_direct: case autofs_ptype_expire_direct: { - struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; + if (sbi->flags & _AUTOFS_DEAMON_32BIT) { + struct autofs_v5_packet_32bit *packet = &pkt.v5_pkt.v5_packet_32bit; + + pktsz = sizeof(*packet); + + packet->wait_queue_token = wq->wait_queue_token; + packet->len = wq->len; + memcpy(packet->name, wq->name, wq->len); + 
packet->name[wq->len] = '\0'; + packet->dev = wq->dev; + packet->ino = wq->ino; + packet->uid = wq->uid; + packet->gid = wq->gid; + packet->pid = wq->pid; + packet->tgid = wq->tgid; + break; + } else { + struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - pktsz = sizeof(*packet); + pktsz = sizeof(*packet); - packet->wait_queue_token = wq->wait_queue_token; - packet->len = wq->len; - memcpy(packet->name, wq->name, wq->len); - packet->name[wq->len] = '\0'; - packet->dev = wq->dev; - packet->ino = wq->ino; - packet->uid = wq->uid; - packet->gid = wq->gid; - packet->pid = wq->pid; - packet->tgid = wq->tgid; - break; + packet->wait_queue_token = wq->wait_queue_token; + packet->len = wq->len; + memcpy(packet->name, wq->name, wq->len); + packet->name[wq->len] = '\0'; + packet->dev = wq->dev; + packet->ino = wq->ino; + packet->uid = wq->uid; + packet->gid = wq->gid; + packet->pid = wq->pid; + packet->tgid = wq->tgid; + break; + } } default: printk("autofs4_notify_daemon: bad type %d!\n", type); diff -uprN linux-2.6.24/fs/binfmt_aout.c linux-2.6.24.ovz/fs/binfmt_aout.c --- linux-2.6.24/fs/binfmt_aout.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/binfmt_aout.c 2008-03-25 18:53:59.000000000 -0500 @@ -375,14 +375,14 @@ static int load_aout_binary(struct linux if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) { - printk(KERN_NOTICE "executable not page aligned\n"); + ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); error_time2 = jiffies; } if ((fd_offset & ~PAGE_MASK) != 0 && (jiffies-error_time) > 5*HZ) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", bprm->file->f_path.dentry->d_name.name); error_time = jiffies; @@ -499,7 +499,7 @@ static int load_aout_library(struct file if ((jiffies-error_time) > 5*HZ) { - printk(KERN_WARNING + ve_printk(VE_LOG, KERN_WARNING "N_TXTOFF is not page aligned. 
Please convert library: %s\n", file->f_path.dentry->d_name.name); error_time = jiffies; diff -uprN linux-2.6.24/fs/binfmt_elf.c linux-2.6.24.ovz/fs/binfmt_elf.c --- linux-2.6.24/fs/binfmt_elf.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/binfmt_elf.c 2008-03-25 18:53:59.000000000 -0500 @@ -373,7 +373,7 @@ static unsigned long load_elf_interp(str eppnt = elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { - int elf_type = MAP_PRIVATE | MAP_DENYWRITE; + int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO; int elf_prot = 0; unsigned long vaddr = 0; unsigned long k, map_addr; @@ -871,7 +871,8 @@ static int load_elf_binary(struct linux_ if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_EXECPRIO; vaddr = elf_ppnt->p_vaddr; if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { @@ -996,7 +997,7 @@ static int load_elf_binary(struct linux_ set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, executable_stack); + retval = arch_setup_additional_pages(bprm, executable_stack, 0); if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; @@ -1595,7 +1596,7 @@ static int elf_core_dump(long signr, str if (signr) { struct elf_thread_status *tmp; rcu_read_lock(); - do_each_thread(g,p) + do_each_thread_ve(g,p) if (current->mm == p->mm && current != p) { tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (!tmp) { @@ -1605,7 +1606,7 @@ static int elf_core_dump(long signr, str tmp->thread = p; list_add(&tmp->list, &thread_list); } - while_each_thread(g,p); + while_each_thread_ve(g,p); rcu_read_unlock(); list_for_each(t, &thread_list) { struct elf_thread_status *tmp; diff -uprN linux-2.6.24/fs/block_dev.c linux-2.6.24.ovz/fs/block_dev.c --- linux-2.6.24/fs/block_dev.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/block_dev.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1124,9 +1125,15 @@ static int do_open(struct block_device * { struct module *owner = NULL; struct gendisk *disk; - int ret = -ENXIO; + int ret; int part; + ret = get_device_perms_ve(S_IFBLK, bdev->bd_dev, + file->f_mode & (FMODE_READ | FMODE_WRITE)); + if (ret) + return ret; + + ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; lock_kernel(); disk = get_gendisk(bdev->bd_dev, &part); @@ -1384,7 +1391,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ -struct block_device *lookup_bdev(const char *path) +struct block_device *lookup_bdev(const char *path, int mode) { struct block_device *bdev; struct inode *inode; @@ -1402,6 +1409,11 @@ struct block_device *lookup_bdev(const c error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; + + error = get_device_perms_ve(S_IFBLK, inode->i_rdev, mode); + if (error) + goto fail; + error = -EACCES; if (nd.mnt->mnt_flags & MNT_NODEV) goto fail; @@ -1433,12 +1445,13 @@ struct block_device *open_bdev_excl(cons mode_t mode = FMODE_READ; int error = 0; - bdev = lookup_bdev(path); + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = lookup_bdev(path, mode); if (IS_ERR(bdev)) return bdev; - if (!(flags & MS_RDONLY)) - mode |= FMODE_WRITE; error = blkdev_get(bdev, mode, 0); if (error) return ERR_PTR(error); @@ -1486,7 +1499,7 @@ int __invalidate_device(struct block_dev * hold). 
*/ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes_check(sb, 1); drop_super(sb); } invalidate_bdev(bdev); diff -uprN linux-2.6.24/fs/buffer.c linux-2.6.24.ovz/fs/buffer.c --- linux-2.6.24/fs/buffer.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/buffer.c 2008-03-25 18:53:59.000000000 -0500 @@ -698,6 +698,8 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static int __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { + int acct = 0; + if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -712,12 +714,14 @@ static int __set_page_dirty(struct page __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); + acct = 1; } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } write_unlock_irq(&mapping->tree_lock); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; diff -uprN linux-2.6.24/fs/char_dev.c linux-2.6.24.ovz/fs/char_dev.c --- linux-2.6.24/fs/char_dev.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/char_dev.c 2008-03-25 18:53:59.000000000 -0500 @@ -22,6 +22,8 @@ #include #include +#include + #ifdef CONFIG_KMOD #include #endif @@ -363,6 +365,11 @@ int chrdev_open(struct inode * inode, st struct cdev *new = NULL; int ret = 0; + ret = get_device_perms_ve(S_IFCHR, inode->i_rdev, + filp->f_mode & (FMODE_READ | FMODE_WRITE)); + if (ret) + return ret; + spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { diff -uprN linux-2.6.24/fs/compat.c linux-2.6.24.ovz/fs/compat.c --- linux-2.6.24/fs/compat.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/compat.c 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ #include #include #include +#include #include #include @@ -72,6 +74,18 @@ int compat_printk(const char *fmt, ...) #include "read_write.h" +int ve_compat_printk(int dst, const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = ve_vprintk(dst, fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. 
@@ -243,6 +257,8 @@ asmlinkage long compat_sys_statfs(const struct kstatfs tmp; error = vfs_statfs(nd.dentry, &tmp); if (!error) + error = faudit_statfs(nd.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); path_release(&nd); } @@ -261,6 +277,8 @@ asmlinkage long compat_sys_fstatfs(unsig goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs(buf, &tmp); fput(file); out: @@ -311,6 +329,8 @@ asmlinkage long compat_sys_statfs64(cons struct kstatfs tmp; error = vfs_statfs(nd.dentry, &tmp); if (!error) + error = faudit_statfs(nd.mnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); path_release(&nd); } @@ -332,6 +352,8 @@ asmlinkage long compat_sys_fstatfs64(uns goto out; error = vfs_statfs(file->f_path.dentry, &tmp); if (!error) + error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); + if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); out: @@ -1357,6 +1379,10 @@ int compat_do_execve(char * filename, struct file *file; int retval; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) @@ -1406,6 +1432,11 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; + if (!gr_tpe_allow(file)) { + retval = -EACCES; + goto out; + } + retval = search_binary_handler(bprm, regs); if (retval >= 0) { /* execve success */ diff -uprN linux-2.6.24/fs/dcache.c linux-2.6.24.ovz/fs/dcache.c --- linux-2.6.24/fs/dcache.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/dcache.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,13 +26,19 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include #include "internal.h" +#include +#include int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -42,7 +48,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC EXPORT_SYMBOL(dcache_lock); -static struct kmem_cache *dentry_cache __read_mostly; +struct kmem_cache *dentry_cache __read_mostly; #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) @@ -135,6 +141,7 @@ static struct dentry *d_kill(struct dent list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ + preempt_enable_no_resched(); /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); parent = dentry->d_parent; @@ -169,25 +176,18 @@ static struct dentry *d_kill(struct dent * they too may now get deleted. * * no dcache lock, please. + * preemption is disabled by the caller. */ -void dput(struct dentry *dentry) +static void dput_recursive(struct dentry *dentry) { - if (!dentry) - return; - repeat: - if (atomic_read(&dentry->d_count) == 1) - might_sleep(); if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; + goto out_preempt; spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - return; - } + if (atomic_read(&dentry->d_count)) + goto out_unlock; /* * AV: ->d_delete() is _NOT_ allowed to block now. 
@@ -204,8 +204,11 @@ repeat: list_add(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; } +out_unlock: spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); +out_preempt: + preempt_enable(); return; unhash_it: @@ -219,8 +222,23 @@ kill_it: dentry_stat.nr_unused--; } dentry = d_kill(dentry); - if (dentry) + if (dentry) { + preempt_disable(); goto repeat; + } +} + +void dput(struct dentry *dentry) +{ + if (!dentry) + return; + + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); + + preempt_disable(); + ub_dentry_uncharge(dentry); + dput_recursive(dentry); } /** @@ -289,6 +307,8 @@ static inline struct dentry * __dget_loc dentry_stat.nr_unused--; list_del_init(&dentry->d_lru); } + + ub_dentry_charge_nofail(dentry); return dentry; } @@ -391,6 +411,7 @@ restart: static void prune_one_dentry(struct dentry * dentry) { __d_drop(dentry); + preempt_disable(); dentry = d_kill(dentry); /* @@ -409,6 +430,7 @@ static void prune_one_dentry(struct dent dentry_stat.nr_unused--; } __d_drop(dentry); + preempt_disable(); dentry = d_kill(dentry); spin_lock(&dcache_lock); } @@ -709,6 +731,8 @@ void shrink_dcache_for_umount(struct sup dentry = sb->s_root; sb->s_root = NULL; + /* "/" was also charged in d_alloc_root() */ + ub_dentry_uncharge(dentry); atomic_dec(&dentry->d_count); shrink_dcache_for_umount_subtree(dentry); @@ -871,12 +895,18 @@ void shrink_dcache_parent(struct dentry */ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { + int res = -1; + + KSTAT_PERF_ENTER(shrink_dcache) if (nr) { if (!(gfp_mask & __GFP_FS)) - return -1; + goto out; prune_dcache(nr, NULL); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; +out: + KSTAT_PERF_LEAVE(shrink_dcache) + return res; } static struct shrinker dcache_shrinker = { @@ -899,21 +929,27 @@ struct dentry *d_alloc(struct dentry * p struct dentry *dentry; char *dname; + dname = NULL; + if (name->len > DNAME_INLINE_LEN-1) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) + goto err_name; + } + + ub_dentry_alloc_start(); + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) - return NULL; + goto err_alloc; - if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); - if (!dname) { - kmem_cache_free(dentry_cache, dentry); - return NULL; - } - } else { + preempt_disable(); + if (dname == NULL) dname = dentry->d_iname; - } dentry->d_name.name = dname; + if (ub_dentry_alloc(dentry)) + goto err_charge; + dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); @@ -944,12 +980,27 @@ struct dentry *d_alloc(struct dentry * p } spin_lock(&dcache_lock); - if (parent) + if (parent) { list_add(&dentry->d_u.d_child, &parent->d_subdirs); + if (parent->d_flags & DCACHE_VIRTUAL) + dentry->d_flags |= DCACHE_VIRTUAL; + } dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); + preempt_enable(); + ub_dentry_alloc_end(); return dentry; + +err_charge: + preempt_enable(); + kmem_cache_free(dentry_cache, dentry); +err_alloc: + if (name->len > DNAME_INLINE_LEN - 1) + kfree(dname); + ub_dentry_alloc_end(); +err_name: + return NULL; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) @@ -1255,12 +1306,12 @@ struct dentry * __d_lookup(struct dentry unsigned int hash = name->hash; const unsigned char *str = name->name; struct hlist_head *head = d_hash(parent,hash); - struct dentry *found = NULL; struct hlist_node *node; - struct dentry *dentry; + struct dentry *dentry, 
*found; rcu_read_lock(); + found = NULL; hlist_for_each_entry_rcu(dentry, node, head, d_hash) { struct qstr *qstr; @@ -1297,6 +1348,8 @@ struct dentry * __d_lookup(struct dentry if (!d_unhashed(dentry)) { atomic_inc(&dentry->d_count); found = dentry; + if (ub_dentry_charge(found)) + goto charge_failure; } spin_unlock(&dentry->d_lock); break; @@ -1306,6 +1359,14 @@ next: rcu_read_unlock(); return found; + +charge_failure: + spin_unlock(&found->d_lock); + rcu_read_unlock(); + /* dentry is now unhashed, just kill it */ + dput(found); + /* ... and fail lookup */ + return NULL; } /** @@ -1764,6 +1825,32 @@ shouldnt_be_hashed: } /** + * __d_path_add_deleted - prepend "(deleted) " text + * @end: a pointer to the character after free space at the beginning of the + * buffer + * @buflen: remaining free space + */ +static inline char * __d_path_add_deleted(char * end, int buflen) +{ + buflen -= 10; + if (buflen < 0) + return ERR_PTR(-ENAMETOOLONG); + end -= 10; + memcpy(end, "(deleted) ", 10); + return end; +} + +/** + * d_root_check - checks if dentry is accessible from current's fs root + * @dentry: dentry to be verified + * @vfsmnt: vfsmnt to which the dentry belongs + */ +int d_root_check(struct dentry *dentry, struct vfsmount *vfsmnt) +{ + return PTR_ERR(d_path(dentry, vfsmnt, NULL, 0)); +} + +/** * d_path - return the path of a dentry * @dentry: dentry to report * @vfsmnt: vfsmnt to which the dentry belongs @@ -1779,41 +1866,40 @@ shouldnt_be_hashed: * * "buflen" should be positive. Caller holds the dcache_lock. */ -static char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, +char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, struct dentry *root, struct vfsmount *rootmnt, char *buffer, int buflen) { char * end = buffer+buflen; - char * retval; + char * retval = NULL; int namelen; + int deleted; + struct vfsmount *oldvfsmnt; - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen < 0) + oldvfsmnt = vfsmnt; + deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); + if (buffer != NULL) { + *--end = '\0'; + buflen--; + + if (buflen < 1) goto Elong; - memcpy(end, " (deleted)", 10); + /* Get '/' right */ + retval = end-1; + *retval = '/'; } - if (buflen < 1) - goto Elong; - /* Get '/' right */ - retval = end-1; - *retval = '/'; - for (;;) { struct dentry * parent; if (dentry == root && vfsmnt == rootmnt) break; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - /* Global root? */ + /* root of a tree? */ spin_lock(&vfsmount_lock); if (vfsmnt->mnt_parent == vfsmnt) { spin_unlock(&vfsmount_lock); - goto global_root; + goto other_root; } dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; @@ -1822,31 +1908,57 @@ static char * __d_path( struct dentry *d } parent = dentry->d_parent; prefetch(parent); + if (buffer != NULL) { + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + retval = end; + } + dentry = parent; + } + /* the given root point is reached */ +finish: + if (buffer != NULL && deleted) + retval = __d_path_add_deleted(end, buflen); + return retval; + +other_root: + /* + * We traversed the tree upward and reached a root, but the given + * lookup terminal point wasn't encountered. It means either that the + * dentry is out of our scope or belongs to an abstract space like + * sock_mnt or pipe_mnt. Check for it. + * + * There are different options to check it. 
+ * We may assume that any dentry tree is unreachable unless it's + * connected to `root' (defined as fs root of init aka child reaper) + * and expose all paths that are not connected to it. + * The other option is to allow exposing of known abstract spaces + * explicitly and hide the path information for other cases. + * This approach is more safe, let's take it. 2001/04/22 SAW + */ + if (!(oldvfsmnt->mnt_sb->s_flags & MS_NOUSER)) + return ERR_PTR(-EINVAL); + if (buffer != NULL) { namelen = dentry->d_name.len; - buflen -= namelen + 1; + buflen -= namelen; if (buflen < 0) goto Elong; - end -= namelen; - memcpy(end, dentry->d_name.name, namelen); - *--end = '/'; - retval = end; - dentry = parent; + retval -= namelen-1; /* hit the slash */ + memcpy(retval, dentry->d_name.name, namelen); } + goto finish; - return retval; - -global_root: - namelen = dentry->d_name.len; - buflen -= namelen; - if (buflen < 0) - goto Elong; - retval -= namelen-1; /* hit the slash */ - memcpy(retval, dentry->d_name.name, namelen); - return retval; Elong: return ERR_PTR(-ENAMETOOLONG); } +EXPORT_SYMBOL(__d_path); + /* write full pathname into buffer and return start of pathname */ char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, char *buf, int buflen) @@ -1861,8 +1973,11 @@ char * d_path(struct dentry *dentry, str * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * pipefs and socketfs methods assume valid buffer, d_root_check() + * supplies NULL one for access checks. */ - if (dentry->d_op && dentry->d_op->d_dname) + if (buf && dentry->d_op && dentry->d_op->d_dname) return dentry->d_op->d_dname(dentry, buf, buflen); read_lock(¤t->fs->lock); @@ -1877,6 +1992,229 @@ char * d_path(struct dentry *dentry, str return res; } +#ifdef CONFIG_VE +#include +#include +#include +#include +#include + +static void mark_sub_tree_virtual(struct dentry *d) +{ + struct dentry *orig_root; + + orig_root = d; + while (1) { + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_VIRTUAL; + spin_unlock(&d->d_lock); + + if (!list_empty(&d->d_subdirs)) { + d = list_entry(d->d_subdirs.next, + struct dentry, d_u.d_child); + continue; + } + if (d == orig_root) + break; + while (d == list_entry(d->d_parent->d_subdirs.prev, + struct dentry, d_u.d_child)) { + d = d->d_parent; + if (d == orig_root) + goto out; + } + d = list_entry(d->d_u.d_child.next, + struct dentry, d_u.d_child); + } +out: + return; +} + +void mark_tree_virtual(struct vfsmount *m, struct dentry *d) +{ + struct vfsmount *orig_rootmnt; + + spin_lock(&dcache_lock); + spin_lock(&vfsmount_lock); + orig_rootmnt = m; + while (1) { + mark_sub_tree_virtual(d); + if (!list_empty(&m->mnt_mounts)) { + m = list_entry(m->mnt_mounts.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + continue; + } + if (m == orig_rootmnt) + break; + while (m == list_entry(m->mnt_parent->mnt_mounts.prev, + struct vfsmount, mnt_child)) { + m = m->mnt_parent; + if (m == orig_rootmnt) + goto out; + } + m = list_entry(m->mnt_child.next, + struct vfsmount, mnt_child); + d = m->mnt_root; + } +out: + spin_unlock(&vfsmount_lock); + spin_unlock(&dcache_lock); +} +EXPORT_SYMBOL(mark_tree_virtual); + +static struct vz_rate_info area_ri = { 20, 10*HZ }; +#define VE_AREA_ACC_CHECK 0x0001 +#define VE_AREA_ACC_DENY 0x0002 +#define VE_AREA_EXEC_CHECK 0x0010 +#define VE_AREA_EXEC_DENY 0x0020 +#define VE0_AREA_ACC_CHECK 0x0100 +#define VE0_AREA_ACC_DENY 0x0200 
+#define VE0_AREA_EXEC_CHECK 0x1000 +#define VE0_AREA_EXEC_DENY 0x2000 +int ve_area_access_check = 0; + +static void print_connection_info(struct task_struct *tsk) +{ + struct files_struct *files; + struct fdtable *fdt; + int fd; + + files = get_files_struct(tsk); + if (!files) + return; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + struct file *file; + struct inode *inode; + struct socket *socket; + struct sock *sk; + struct inet_sock *inet; + + file = fdt->fd[fd]; + if (file == NULL) + continue; + + inode = file->f_dentry->d_inode; + if (!S_ISSOCK(inode->i_mode)) + continue; + + socket = SOCKET_I(inode); + if (socket == NULL) + continue; + + sk = socket->sk; + if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + || sk->sk_type != SOCK_STREAM) + continue; + + inet = inet_sk(sk); + printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", + NIPQUAD(inet->daddr), ntohs(inet->dport), + inet->num); + } + spin_unlock(&files->file_lock); + put_files_struct(files); +} + +static void check_alert(struct vfsmount *vfsmnt, struct dentry *dentry, + char *str) +{ + struct task_struct *tsk; + unsigned long page; + struct super_block *sb; + char *p; + + if (!vz_ratelimit(&area_ri)) + return; + + tsk = current; + p = ERR_PTR(-ENOMEM); + page = __get_free_page(GFP_KERNEL); + if (page) { + spin_lock(&dcache_lock); + p = __d_path(dentry, vfsmnt, tsk->fs->root, tsk->fs->rootmnt, + (char *)page, PAGE_SIZE); + spin_unlock(&dcache_lock); + } + if (IS_ERR(p)) + p = "(undefined)"; + + sb = dentry->d_sb; + printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" + "Task %d/%d[%s] from VE%d, execenv %d\n", + str, p, sb->s_type->owner_env->veid, + sb->s_type->name, sb->s_dev, + tsk->pid, task_pid_vnr(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid, + get_exec_env()->veid); + + free_page(page); + + print_connection_info(tsk); + + read_lock(&tasklist_lock); + tsk = tsk->parent; + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", + tsk->pid, task_pid_vnr(tsk), tsk->comm, + VE_TASK_INFO(tsk)->owner_env->veid); + + print_connection_info(tsk); + put_task_struct(tsk); + dump_stack(); +} +#endif + +int check_area_access_ve(struct dentry *dentry, struct vfsmount *mnt) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_ACC_CHECK; + alert = dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_ACC_DENY; + } else { + check = ve_area_access_check & VE_AREA_ACC_CHECK; + alert = !(dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_ACC_DENY; + } + + if (check && alert) + check_alert(mnt, dentry, "Access"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} + +int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) +{ +#ifdef CONFIG_VE + int check, alert, deny; + + if (ve_is_super(get_exec_env())) { + check = ve_area_access_check & VE0_AREA_EXEC_CHECK; + alert = dentry->d_flags & DCACHE_VIRTUAL; + deny = ve_area_access_check & VE0_AREA_EXEC_DENY; + } else { + check = ve_area_access_check & VE_AREA_EXEC_CHECK; + alert = !(dentry->d_flags & DCACHE_VIRTUAL); + deny = ve_area_access_check & VE_AREA_EXEC_DENY; + } + + if (check && alert) + check_alert(mnt, dentry, "Exec"); + if (deny && alert) + return -EACCES; +#endif + return 0; +} + /* * Helper function for dentry_operations.d_dname() members */ @@ -2034,10 +2372,12 @@ resume: goto repeat; } 
atomic_dec(&dentry->d_count); + ub_dentry_uncharge_locked(dentry); } if (this_parent != root) { next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); + ub_dentry_uncharge_locked(this_parent); this_parent = this_parent->d_parent; goto resume; } diff -uprN linux-2.6.24/fs/devpts/inode.c linux-2.6.24.ovz/fs/devpts/inode.c --- linux-2.6.24/fs/devpts/inode.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/devpts/inode.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,19 +20,21 @@ #include #include #include +#include #define DEVPTS_SUPER_MAGIC 0x1cd1 +struct devpts_config devpts_config = {.mode = 0600}; + +#ifndef CONFIG_VE static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; - -static struct { - int setuid; - int setgid; - uid_t uid; - gid_t gid; - umode_t mode; -} config = {.mode = 0600}; +#define config devpts_config +#else +#define devpts_mnt (get_exec_env()->devpts_mnt) +#define devpts_root (get_exec_env()->devpts_root) +#define config (*(get_exec_env()->devpts_config)) +#endif enum { Opt_uid, Opt_gid, Opt_mode, @@ -84,7 +86,8 @@ static int devpts_remount(struct super_b config.mode = option & ~S_IFMT; break; default: - printk(KERN_ERR "devpts: called with bogus options\n"); + ve_printk(VE_LOG, KERN_ERR + "devpts: called with bogus options\n"); return -EINVAL; } } @@ -136,13 +139,15 @@ static int devpts_get_sb(struct file_sys return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); } -static struct file_system_type devpts_fs_type = { +struct file_system_type devpts_fs_type = { .owner = THIS_MODULE, .name = "devpts", .get_sb = devpts_get_sb, .kill_sb = kill_anon_super, }; +EXPORT_SYMBOL(devpts_fs_type); + /* * The normal naming convention is simply /dev/pts/; this conforms * to the System V naming convention @@ -235,6 +240,7 @@ static int __init init_devpts_fs(void) static void __exit exit_devpts_fs(void) { + /* the code is never called, the argument is irrelevant */ unregister_filesystem(&devpts_fs_type); mntput(devpts_mnt); } diff -uprN linux-2.6.24/fs/direct-io.c linux-2.6.24.ovz/fs/direct-io.c --- linux-2.6.24/fs/direct-io.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/direct-io.c 2008-03-25 18:53:59.000000000 -0500 @@ -666,7 +666,7 @@ submit_page_section(struct dio *dio, str /* * Read accounting is performed in submit_bio() */ - task_io_account_write(len); + task_io_account_write(page, len, 1); } /* diff -uprN linux-2.6.24/fs/dquot.c linux-2.6.24.ovz/fs/dquot.c --- linux-2.6.24/fs/dquot.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/dquot.c 2008-03-25 18:53:59.000000000 -0500 @@ -162,7 +162,9 @@ static struct quota_format_type *find_qu struct quota_format_type *actqf; spin_lock(&dq_list_lock); - for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); + for (actqf = quota_formats; + actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL); + actqf = actqf->qf_next); if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; diff -uprN linux-2.6.24/fs/eventpoll.c linux-2.6.24.ovz/fs/eventpoll.c --- linux-2.6.24/fs/eventpoll.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/eventpoll.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -102,11 +103,6 @@ #define EP_UNACTIVE_PTR ((void *) -1L) -struct epoll_filefd { - struct file *file; - int fd; -}; - /* * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 
* It is used to keep track on all tasks that are currently inside the wake_up() code @@ -129,79 +125,6 @@ struct poll_safewake { spinlock_t lock; }; -/* - * Each file descriptor added to the eventpoll interface will - * have an entry of this type linked to the "rbr" RB tree. - */ -struct epitem { - /* RB tree node used to link this structure to the eventpoll RB tree */ - struct rb_node rbn; - - /* List header used to link this structure to the eventpoll ready list */ - struct list_head rdllink; - - /* - * Works together "struct eventpoll"->ovflist in keeping the - * single linked chain of items. - */ - struct epitem *next; - - /* The file descriptor information this item refers to */ - struct epoll_filefd ffd; - - /* Number of active wait queue attached to poll operations */ - int nwait; - - /* List containing poll wait queues */ - struct list_head pwqlist; - - /* The "container" of this item */ - struct eventpoll *ep; - - /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; - - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; -}; - -/* - * This structure is stored inside the "private_data" member of the file - * structure and rapresent the main data sructure for the eventpoll - * interface. - */ -struct eventpoll { - /* Protect the this structure access */ - spinlock_t lock; - - /* - * This mutex is used to ensure that files are not removed - * while epoll is using them. This is held during the event - * collection loop, the file cleanup path, the epoll file exit - * code and the ctl operations. - */ - struct mutex mtx; - - /* Wait queue used by sys_epoll_wait() */ - wait_queue_head_t wq; - - /* Wait queue used by file->poll() */ - wait_queue_head_t poll_wait; - - /* List of ready file descriptors */ - struct list_head rdllist; - - /* RB tree root used to store monitored fd structs */ - struct rb_root rbr; - - /* - * This is a single linked list that chains all the "struct epitem" that - * happened while transfering ready events to userspace w/out - * holding ->lock. - */ - struct epitem *ovflist; -}; - /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ @@ -229,7 +152,8 @@ struct ep_pqueue { /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ -static struct mutex epmutex; +struct mutex epmutex; +EXPORT_SYMBOL_GPL(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -353,7 +277,7 @@ static void ep_poll_safewake(struct poll spin_unlock_irqrestore(&psw->lock, flags); /* Do really wake up now */ - wake_up(wq); + wake_up_nested(wq, 1 + wake_nests); /* Remove the current task from the list */ spin_lock_irqsave(&psw->lock, flags); @@ -502,10 +426,11 @@ static unsigned int ep_eventpoll_poll(st } /* File callbacks that implement the eventpoll file behaviour */ -static const struct file_operations eventpoll_fops = { +const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll }; +EXPORT_SYMBOL(eventpoll_fops); /* Fast test to see if the file is an evenpoll file */ static inline int is_file_epoll(struct file *f) @@ -577,7 +502,7 @@ static int ep_alloc(struct eventpoll **p * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. 
*/ -static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; @@ -603,6 +528,7 @@ static struct epitem *ep_find(struct eve return epir; } +EXPORT_SYMBOL_GPL(ep_find); /* * This is the callback that is passed to the wait queue wakeup @@ -716,7 +642,7 @@ static void ep_rbtree_insert(struct even /* * Must be called with "mtx" held. */ -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { int error, revents, pwake = 0; @@ -814,6 +740,7 @@ error_unregister: error_return: return error; } +EXPORT_SYMBOL(ep_insert); /* * Modify the interest event mask by dropping an event if the new mask @@ -1109,6 +1036,7 @@ error_return: current, size, error)); return error; } +EXPORT_SYMBOL(sys_epoll_create); /* * The following function implements the controller interface for diff -uprN linux-2.6.24/fs/exec.c linux-2.6.24.ovz/fs/exec.c --- linux-2.6.24/fs/exec.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/exec.c 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -51,11 +52,14 @@ #include #include #include +#include #include #include #include +#include + #ifdef CONFIG_KMOD #include #endif @@ -66,6 +70,8 @@ int suid_dumpable = 0; /* The maximal length of core_pattern is also specified in sysctl.c */ +int sysctl_at_vsyscall; + static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); @@ -217,9 +223,13 @@ static int __bprm_mm_init(struct linux_b struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, + NULL, UB_SOFT)) + goto fail_charge; + + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL_UBC); if (!vma) - goto err; + goto fail_alloc; down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -253,7 +263,9 @@ err: bprm->vma = NULL; kmem_cache_free(vm_area_cachep, vma); } - +fail_alloc: + ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL); +fail_charge: return err; } @@ -695,10 +707,11 @@ int kernel_read(struct file *file, unsig EXPORT_SYMBOL(kernel_read); -static int exec_mmap(struct mm_struct *mm) +static int exec_mmap(struct linux_binprm *bprm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm, *mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -720,6 +733,10 @@ static int exec_mmap(struct mm_struct *m return -EINTR; } } + + ret = 0; + mm = bprm->mm; + mm->vps_dumpable = 1; task_lock(tsk); active_mm = tsk->active_mm; tsk->mm = mm; @@ -727,14 +744,24 @@ static int exec_mmap(struct mm_struct *m activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); + bprm->mm = NULL; /* We're using it now */ + +#ifdef CONFIG_VZ_GENCALLS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP, + bprm) & NOTIFY_FAIL) { + /* similar to binfmt_elf */ + send_sig(SIGKILL, current, 0); + ret = -ENOMEM; + } +#endif if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); mmput(old_mm); - return 0; + return ret; } mmdrop(active_mm); - return 0; + return ret; } /* @@ -861,6 +888,10 @@ static int de_thread(struct task_struct transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); 
list_replace_rcu(&leader->tasks, &tsk->tasks); +#ifdef CONFIG_VE + list_replace_rcu(&leader->ve_task_info.vetask_list, + &tsk->ve_task_info.vetask_list); +#endif tsk->group_leader = tsk; leader->group_leader = tsk; @@ -989,12 +1020,10 @@ int flush_old_exec(struct linux_binprm * /* * Release all of the old mmap stuff */ - retval = exec_mmap(bprm->mm); + retval = exec_mmap(bprm); if (retval) goto mmap_failed; - bprm->mm = NULL; /* We're using it now */ - /* This is the point of no return */ put_files_struct(files); @@ -1298,6 +1327,10 @@ int do_execve(char * filename, unsigned long env_p; int retval; + retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); + if (retval) + return retval; + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) @@ -1349,6 +1382,11 @@ int do_execve(char * filename, goto out; bprm->argv_len = env_p - bprm->p; + if (!gr_tpe_allow(file)) { + retval = -EACCES; + goto out; + } + retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ @@ -1561,7 +1599,7 @@ static inline int zap_threads(struct tas goto done; rcu_read_lock(); - for_each_process(g) { + for_each_process_ve(g) { if (g == tsk->group_leader) continue; @@ -1695,7 +1733,7 @@ int do_coredump(long signr, int exit_cod /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_waiters || !get_dumpable(mm)) { + if (mm->core_waiters || !get_dumpable(mm) || mm->vps_dumpable != 1) { up_write(&mm->mmap_sem); goto fail; } diff -uprN linux-2.6.24/fs/ext2/namei.c linux-2.6.24.ovz/fs/ext2/namei.c --- linux-2.6.24/fs/ext2/namei.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ext2/namei.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,6 +31,7 @@ */ #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -257,6 +258,8 @@ static int ext2_unlink(struct inode * di struct page * page; int err = -ENOENT; + DQUOT_INIT(inode); + de = ext2_find_entry (dir, dentry, &page); if (!de) goto out; @@ -299,6 +302,9 @@ static int ext2_rename (struct inode * o struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + if (new_inode) + DQUOT_INIT(new_inode); + old_de = ext2_find_entry (old_dir, old_dentry, &old_page); if (!old_de) goto out; diff -uprN linux-2.6.24/fs/ext2/super.c linux-2.6.24.ovz/fs/ext2/super.c --- linux-2.6.24/fs/ext2/super.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ext2/super.c 2008-03-25 18:53:59.000000000 -0500 @@ -1377,7 +1377,7 @@ static struct file_system_type ext2_fs_t .name = "ext2", .get_sb = ext2_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext2_fs(void) diff -uprN linux-2.6.24/fs/ext3/ioctl.c linux-2.6.24.ovz/fs/ext3/ioctl.c --- linux-2.6.24/fs/ext3/ioctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ext3/ioctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -79,7 +79,7 @@ int ext3_ioctl (struct inode * inode, st * the relevant capability. 
*/ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { + if (!capable(CAP_SYS_ADMIN)) { mutex_unlock(&inode->i_mutex); return -EPERM; } diff -uprN linux-2.6.24/fs/ext3/namei.c linux-2.6.24.ovz/fs/ext3/namei.c --- linux-2.6.24/fs/ext3/namei.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ext3/namei.c 2008-03-25 18:53:59.000000000 -0500 @@ -1355,7 +1355,7 @@ static int add_dirent_to_buf(handle_t *h if (err) ext3_std_error(dir->i_sb, err); brelse(bh); - return 0; + return err; } /* diff -uprN linux-2.6.24/fs/ext3/super.c linux-2.6.24.ovz/fs/ext3/super.c --- linux-2.6.24/fs/ext3/super.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ext3/super.c 2008-03-25 18:53:59.000000000 -0500 @@ -2869,7 +2869,7 @@ static struct file_system_type ext3_fs_t .name = "ext3", .get_sb = ext3_get_sb, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, }; static int __init init_ext3_fs(void) diff -uprN linux-2.6.24/fs/ext4/ioctl.c linux-2.6.24.ovz/fs/ext4/ioctl.c --- linux-2.6.24/fs/ext4/ioctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ext4/ioctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -79,7 +79,7 @@ int ext4_ioctl (struct inode * inode, st * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { + if (!capable(CAP_SYS_ADMIN)) { mutex_unlock(&inode->i_mutex); return -EPERM; } diff -uprN linux-2.6.24/fs/fcntl.c linux-2.6.24.ovz/fs/fcntl.c --- linux-2.6.24/fs/fcntl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/fcntl.c 2008-03-25 18:53:59.000000000 -0500 @@ -192,6 +192,7 @@ out_fput: fput(file); goto out; } +EXPORT_SYMBOL_GPL(sys_dup2); asmlinkage long sys_dup(unsigned int fildes) { @@ -210,6 +211,9 @@ static int setfl(int fd, struct file * f struct inode * inode = filp->f_path.dentry->d_inode; int error = 0; + if (!capable(CAP_SYS_RAWIO) && !odirect_enable) + arg &= ~O_DIRECT; + /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. diff -uprN linux-2.6.24/fs/file.c linux-2.6.24.ovz/fs/file.c --- linux-2.6.24/fs/file.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/file.c 2008-03-25 18:53:59.000000000 -0500 @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include +#include + struct fdtable_defer { spinlock_t lock; struct work_struct wq; @@ -35,9 +38,9 @@ static DEFINE_PER_CPU(struct fdtable_def static inline void * alloc_fdmem(unsigned int size) { if (size <= PAGE_SIZE) - return kmalloc(size, GFP_KERNEL); + return kmalloc(size, GFP_KERNEL_UBC); else - return vmalloc(size); + return ub_vmalloc(size); } static inline void free_fdarr(struct fdtable *fdt) @@ -150,7 +153,7 @@ static struct fdtable * alloc_fdtable(un if (nr > NR_OPEN) nr = NR_OPEN; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC); if (!fdt) goto out; fdt->max_fds = nr; @@ -185,7 +188,7 @@ out: * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_fdtable(struct files_struct *files, int nr) +int expand_fdtable(struct files_struct *files, int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -215,6 +218,7 @@ static int expand_fdtable(struct files_s } return 1; } +EXPORT_SYMBOL_GPL(expand_fdtable); /* * Expand files. 
diff -uprN linux-2.6.24/fs/file_table.c linux-2.6.24.ovz/fs/file_table.c --- linux-2.6.24/fs/file_table.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/file_table.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,9 +20,14 @@ #include #include #include +#include #include +#include +#include +#include + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE @@ -36,12 +41,15 @@ static struct percpu_counter nr_files __ static inline void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + put_ve(f->owner_env); kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { - percpu_counter_dec(&nr_files); + if (f->f_ub == get_ub0()) + percpu_counter_dec(&nr_files); + ub_file_uncharge(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -89,11 +97,14 @@ struct file *get_empty_filp(void) struct task_struct *tsk; static int old_max; struct file * f; + int acct; + acct = (get_exec_ub() == get_ub0()); /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (acct && get_nr_files() >= files_stat.max_files && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -106,7 +117,13 @@ struct file *get_empty_filp(void) if (f == NULL) goto fail; - percpu_counter_inc(&nr_files); + if (ub_file_charge(f)) + goto fail_ch; + if (acct) + percpu_counter_inc(&nr_files); + + f->owner_env = get_ve(get_exec_env()); + if (security_file_alloc(f)) goto fail_sec; @@ -133,6 +150,10 @@ fail_sec: file_free(f); fail: return NULL; + +fail_ch: + kmem_cache_free(filp_cachep, f); + return NULL; } EXPORT_SYMBOL(get_empty_filp); diff -uprN linux-2.6.24/fs/filesystems.c linux-2.6.24.ovz/fs/filesystems.c --- linux-2.6.24/fs/filesystems.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/filesystems.c 2008-03-25 18:53:59.000000000 -0500 @@ -12,6 +12,9 @@ #include #include #include +#include /* for 'current' */ +#include +#include #include /* @@ -21,8 +24,8 @@ * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it + * 2) we hold the reference to the element. + * The latter can be guaranteed by call of try_filesystem(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. 
*/ @@ -30,24 +33,46 @@ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); +int try_get_filesystem(struct file_system_type *fs) +{ + if (try_module_get(fs->owner)) { + (void)get_ve(fs->owner_env); + return 1; + } + return 0; +} + /* WARNING: This can be used only if we _already_ own a reference */ void get_filesystem(struct file_system_type *fs) { + (void)get_ve(fs->owner_env); __module_get(fs->owner); } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); + put_ve(fs->owner_env); +} + +static inline int check_ve_fstype(struct file_system_type *p, + struct ve_struct *env) +{ + return ((p->fs_flags & FS_VIRTUALIZED) || + ve_accessible_strict(p->owner_env, env)); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type **find_filesystem(const char *name, unsigned len, + struct ve_struct *env) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) + for (p=&file_systems; *p; p=&(*p)->next) { + if (!check_ve_fstype(*p, env)) + continue; if (strlen((*p)->name) == len && strncmp((*p)->name, name, len) == 0) break; + } return p; } @@ -73,8 +98,12 @@ int register_filesystem(struct file_syst if (fs->next) return -EBUSY; INIT_LIST_HEAD(&fs->fs_supers); + if (fs->owner_env == NULL) + fs->owner_env = get_ve0(); + if (fs->proto == NULL) + fs->proto = fs; write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); + p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env); if (*p) res = -EBUSY; else @@ -118,6 +147,75 @@ int unregister_filesystem(struct file_sy EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_VE +int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template, + struct file_system_type **p_fs_type, struct vfsmount **p_mnt) +{ + struct vfsmount *mnt; + struct file_system_type *local_fs_type; + int ret; + + local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *), + GFP_KERNEL); + if (local_fs_type == NULL) + return -ENOMEM; + + local_fs_type->name = template->name; + local_fs_type->fs_flags = template->fs_flags; + local_fs_type->get_sb = template->get_sb; + local_fs_type->kill_sb = template->kill_sb; + local_fs_type->owner = template->owner; + local_fs_type->owner_env = ve; + local_fs_type->proto = template; + + get_filesystem(local_fs_type); /* get_ve() inside */ + + ret = register_filesystem(local_fs_type); + if (ret) + goto reg_err; + + if (p_mnt == NULL) + goto done; + + mnt = vfs_kern_mount(local_fs_type, 0, local_fs_type->name, NULL); + if (IS_ERR(mnt)) + goto mnt_err; + + *p_mnt = mnt; +done: + *p_fs_type = local_fs_type; + return 0; + +mnt_err: + ret = PTR_ERR(mnt); + unregister_filesystem(local_fs_type); /* does not put */ + +reg_err: + put_filesystem(local_fs_type); + kfree(local_fs_type); + printk(KERN_DEBUG + "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); + return ret; +} + +EXPORT_SYMBOL(register_ve_fs_type); + +void unregister_ve_fs_type(struct file_system_type *local_fs_type, + struct vfsmount *local_fs_mount) +{ + if (local_fs_mount == NULL && local_fs_type == NULL) + return; + + unregister_filesystem(local_fs_type); + umount_ve_fs_type(local_fs_type); + if (local_fs_mount) + kern_umount(local_fs_mount); /* alias to mntput, drop our ref */ + put_filesystem(local_fs_type); +} + +EXPORT_SYMBOL(unregister_ve_fs_type); +#endif + static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -131,11 +229,14 @@ static int fs_index(const char __user * err = 
-EINVAL; read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { + for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; if (strcmp(tmp->name,name) == 0) { err = index; break; } + index++; } read_unlock(&file_systems_lock); putname(name); @@ -148,9 +249,15 @@ static int fs_name(unsigned int index, c int len, res; read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) - if (index <= 0 && try_module_get(tmp->owner)) - break; + for (tmp = file_systems; tmp; tmp = tmp->next) { + if (!check_ve_fstype(tmp, get_exec_env())) + continue; + if (!index) { + if (try_get_filesystem(tmp)) + break; + } else + index--; + } read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; @@ -168,8 +275,9 @@ static int fs_maxindex(void) int index; read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; + for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) + if (check_ve_fstype(tmp, get_exec_env())) + index++; read_unlock(&file_systems_lock); return index; } @@ -205,9 +313,10 @@ int get_filesystem_list(char * buf) read_lock(&file_systems_lock); tmp = file_systems; while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); + if (check_ve_fstype(tmp, get_exec_env())) + len += sprintf(buf+len, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); @@ -221,14 +330,14 @@ struct file_system_type *get_fs_type(con unsigned len = dot ? dot - name : strlen(name); read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, len, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); if (!fs && (request_module("%.*s", len, name) == 0)) { read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) + fs = *(find_filesystem(name, len, get_exec_env())); + if (fs && !try_get_filesystem(fs)) fs = NULL; read_unlock(&file_systems_lock); } @@ -241,3 +350,5 @@ struct file_system_type *get_fs_type(con } EXPORT_SYMBOL(get_fs_type); +EXPORT_SYMBOL(get_filesystem); +EXPORT_SYMBOL(put_filesystem); diff -uprN linux-2.6.24/fs/fuse/control.c linux-2.6.24.ovz/fs/fuse/control.c --- linux-2.6.24/fs/fuse/control.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/fuse/control.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,8 @@ #include #include +#include +#include #define FUSE_CTL_SUPER_MAGIC 0x65735543 @@ -17,7 +19,11 @@ * This is non-NULL when the single instance of the control filesystem * exists. 
Protected by fuse_mutex */ +#ifdef CONFIG_VE +#define fuse_control_sb (get_exec_env()->_fuse_control_sb) +#else static struct super_block *fuse_control_sb; +#endif static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { @@ -211,12 +217,51 @@ static struct file_system_type fuse_ctl_ .kill_sb = fuse_ctl_kill_sb, }; +#ifdef CONFIG_VE +static int fuse_ctl_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type != NULL) + return -EBUSY; + + return register_ve_fs_type(ve, &fuse_ctl_fs_type, + &ve->fuse_ctl_fs_type, NULL); +} + +static void fuse_ctl_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_ctl_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL); + ve->fuse_ctl_fs_type = NULL; +} + +static struct ve_hook fuse_ctl_ve_hook = { + .init = fuse_ctl_start, + .fini = fuse_ctl_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + int __init fuse_ctl_init(void) { - return register_filesystem(&fuse_ctl_fs_type); + int err; + + err = register_filesystem(&fuse_ctl_fs_type); + if (err == 0) + ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook); + return err; } void fuse_ctl_cleanup(void) { + ve_hook_unregister(&fuse_ctl_ve_hook); unregister_filesystem(&fuse_ctl_fs_type); } diff -uprN linux-2.6.24/fs/fuse/fuse_i.h linux-2.6.24.ovz/fs/fuse/fuse_i.h --- linux-2.6.24/fs/fuse/fuse_i.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/fuse/fuse_i.h 2008-03-25 18:53:59.000000000 -0500 @@ -41,7 +41,11 @@ #define FUSE_ALLOW_OTHER (1 << 1) /** List of active connections */ +#ifdef CONFIG_VE +#define fuse_conn_list (get_exec_env()->_fuse_conn_list) +#else extern struct list_head fuse_conn_list; +#endif /** Global mutex protecting fuse_conn_list and the control filesystem */ extern struct mutex fuse_mutex; diff -uprN linux-2.6.24/fs/fuse/inode.c linux-2.6.24.ovz/fs/fuse/inode.c --- linux-2.6.24/fs/fuse/inode.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/fuse/inode.c 2008-03-25 18:53:59.000000000 -0500 @@ -18,13 +18,16 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; +#ifndef CONFIG_VE struct list_head fuse_conn_list; +#endif DEFINE_MUTEX(fuse_mutex); #define FUSE_SUPER_MAGIC 0x65735546 @@ -819,6 +822,41 @@ static void fuse_sysfs_cleanup(void) subsystem_unregister(&fuse_subsys); } +#ifdef CONFIG_VE +static int fuse_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type != NULL) + return -EBUSY; + + INIT_LIST_HEAD(&ve->_fuse_conn_list); + return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL); +} + +static void fuse_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->fuse_fs_type == NULL) + return; + + unregister_ve_fs_type(ve->fuse_fs_type, NULL); + kfree(ve->fuse_fs_type); + ve->fuse_fs_type = NULL; + BUG_ON(!list_empty(&ve->_fuse_conn_list)); +} + +static struct ve_hook fuse_ve_hook = { + .init = fuse_start, + .fini = fuse_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif + static int __init fuse_init(void) { int res; @@ -843,6 +881,7 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook); return 0; err_sysfs_cleanup: @@ -859,6 +898,7 @@ static void __exit fuse_exit(void) { printk(KERN_DEBUG "fuse exit\n"); + 
ve_hook_unregister(&fuse_ve_hook); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); diff -uprN linux-2.6.24/fs/gfs2/glock.c linux-2.6.24.ovz/fs/gfs2/glock.c --- linux-2.6.24/fs/gfs2/glock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/gfs2/glock.c 2008-03-25 18:53:59.000000000 -0500 @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff -uprN linux-2.6.24/fs/gfs2/ops_address.c linux-2.6.24.ovz/fs/gfs2/ops_address.c --- linux-2.6.24/fs/gfs2/ops_address.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/gfs2/ops_address.c 2008-03-25 18:53:59.000000000 -0500 @@ -250,7 +250,7 @@ static int gfs2_readpage(struct file *fi if (likely(file != &gfs2_internal_file_sentinel)) { if (file) { - gf = file->private_data; + gf = file_private(file); if (test_bit(GFF_EXLOCK, &gf->f_flags)) /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */ goto skip_lock; @@ -316,7 +316,7 @@ static int gfs2_readpages(struct file *f if (likely(file != &gfs2_internal_file_sentinel)) { if (file) { - struct gfs2_file *gf = file->private_data; + struct gfs2_file *gf = file_private(file); if (test_bit(GFF_EXLOCK, &gf->f_flags)) goto skip_lock; } diff -uprN linux-2.6.24/fs/gfs2/ops_vm.c linux-2.6.24.ovz/fs/gfs2/ops_vm.c --- linux-2.6.24/fs/gfs2/ops_vm.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/gfs2/ops_vm.c 2008-03-25 18:53:59.000000000 -0500 @@ -107,7 +107,7 @@ static int gfs2_sharewrite_fault(struct struct vm_fault *vmf) { struct file *file = vma->vm_file; - struct gfs2_file *gf = file->private_data; + struct gfs2_file *gf = file_private(file); struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_holder i_gh; int alloc_required; diff -uprN linux-2.6.24/fs/inode.c linux-2.6.24.ovz/fs/inode.c --- linux-2.6.24/fs/inode.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/inode.c 2008-03-25 18:53:59.000000000 -0500 @@ -8,10 +8,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include @@ -22,6 +25,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -97,7 +101,8 @@ static DEFINE_MUTEX(iprune_mutex); */ struct inodes_stat_t inodes_stat; -static struct kmem_cache * inode_cachep __read_mostly; +struct kmem_cache * inode_cachep __read_mostly; + static void wake_up_inode(struct inode *inode) { @@ -108,11 +113,13 @@ static void wake_up_inode(struct inode * wake_up_bit(&inode->i_state, __I_LOCK); } +static struct address_space_operations vfs_empty_aops; +struct inode_operations vfs_empty_iops; +static struct file_operations vfs_empty_fops; +EXPORT_SYMBOL(vfs_empty_iops); + static struct inode *alloc_inode(struct super_block *sb) { - static const struct address_space_operations empty_aops; - static struct inode_operations empty_iops; - static const struct file_operations empty_fops; struct inode *inode; if (sb->s_op->alloc_inode) @@ -127,8 +134,8 @@ static struct inode *alloc_inode(struct inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; atomic_set(&inode->i_count, 1); - inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; + inode->i_op = &vfs_empty_iops; + inode->i_fop = &vfs_empty_fops; inode->i_nlink = 1; atomic_set(&inode->i_writecount, 0); inode->i_size = 0; @@ -152,15 +159,15 @@ static struct inode *alloc_inode(struct } spin_lock_init(&inode->i_lock); - lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key); mutex_init(&inode->i_mutex); - 
lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key); init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key); - mapping->a_ops = &empty_aops; + mapping->a_ops = &vfs_empty_aops; mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); @@ -310,13 +317,76 @@ static void dispose_list(struct list_hea spin_unlock(&inode_lock); } +static void show_header(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + printk("VFS: Busy inodes after unmount. " + "sb = %p, fs type = %s, sb count = %d, " + "sb->s_root = %s\n", sb, + (sb->s_type != NULL) ? sb->s_type->name : "", + sb->s_count, + (sb->s_root != NULL) ? + (char *)sb->s_root->d_name.name : ""); +} + +static void show_inode(struct inode *inode) +{ + struct dentry *d; + struct vfsmount *mnt; + int i; + + printk("inode = %p, inode->i_count = %d, " + "inode->i_nlink = %d, " + "inode->i_mode = %d, " + "inode->i_state = %ld, " + "inode->i_flags = %d, " + "inode->i_devices.next = %p, " + "inode->i_devices.prev = %p, " + "inode->i_ino = %ld\n", + inode, + atomic_read(&inode->i_count), + inode->i_nlink, + inode->i_mode, + inode->i_state, + inode->i_flags, + inode->i_devices.next, + inode->i_devices.prev, + inode->i_ino); + printk("inode dump: "); + for (i = 0; i < sizeof(*inode); i++) + printk("%2.2x ", *((u_char *)inode + i)); + printk("\n"); + list_for_each_entry(d, &inode->i_dentry, d_alias) { + printk(" d_alias %s d_count=%d d_flags=%x\n", + d->d_name.name, atomic_read(&d->d_count), d->d_flags); + for (i = 0; i < sizeof(*d); i++) + printk("%2.2x ", *((u_char *)d + i)); + printk("\n"); + } + + spin_lock(&vfsmount_lock); + list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) { + if (mnt->mnt_sb != inode->i_sb) + continue; + printk("mnt=%p count=%d flags=%x exp_mask=%x\n", + mnt, atomic_read(&mnt->mnt_count), + mnt->mnt_flags, + mnt->mnt_expiry_mark); + for (i = 0; i < sizeof(*mnt); i++) + printk("%2.2x ", *((u_char *)mnt + i)); + printk("\n"); + } + spin_unlock(&vfsmount_lock); +} + /* * Invalidate all inodes for a device. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0, count = 0, once = 1; next = head->next; for (;;) { @@ -343,6 +413,14 @@ static int invalidate_list(struct list_h continue; } busy = 1; + + if (check) { + if (once) { + once = 0; + show_header(inode); + } + show_inode(inode); + } } /* only unused inodes may be cached with i_count zero */ inodes_stat.nr_unused -= count; @@ -357,7 +435,7 @@ static int invalidate_list(struct list_h * fails because there are busy inodes then a non zero value is returned. * If the discard is successful all the inodes have been discarded. 
*/ -int invalidate_inodes(struct super_block * sb) +int invalidate_inodes_check(struct super_block * sb, int check) { int busy; LIST_HEAD(throw_away); @@ -365,7 +443,7 @@ int invalidate_inodes(struct super_block mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); + busy = invalidate_list(&sb->s_inodes, &throw_away, check); spin_unlock(&inode_lock); dispose_list(&throw_away); @@ -374,7 +452,7 @@ int invalidate_inodes(struct super_block return busy; } -EXPORT_SYMBOL(invalidate_inodes); +EXPORT_SYMBOL(invalidate_inodes_check); static int can_unuse(struct inode *inode) { @@ -464,6 +542,7 @@ static void prune_icache(int nr_to_scan) */ static int shrink_icache_memory(int nr, gfp_t gfp_mask) { + KSTAT_PERF_ENTER(shrink_icache) if (nr) { /* * Nasty deadlock avoidance. We may hold various FS locks, @@ -474,6 +553,7 @@ static int shrink_icache_memory(int nr, return -1; prune_icache(nr); } + KSTAT_PERF_LEAVE(shrink_icache) return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -585,7 +665,7 @@ void unlock_new_inode(struct inode *inod */ mutex_destroy(&inode->i_mutex); mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); + lockdep_set_class(&inode->i_mutex, &type->proto->i_mutex_dir_key); } #endif /* diff -uprN linux-2.6.24/fs/inotify.c linux-2.6.24.ovz/fs/inotify.c --- linux-2.6.24/fs/inotify.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/inotify.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,6 +32,7 @@ #include #include #include +#include static atomic_t inotify_cookie; @@ -69,19 +70,6 @@ static atomic_t inotify_cookie; * inotify_add_watch() to the final put_inotify_watch(). */ -/* - * struct inotify_handle - represents an inotify instance - * - * This structure is protected by the mutex 'mutex'. - */ -struct inotify_handle { - struct idr idr; /* idr mapping wd -> watch */ - struct mutex mutex; /* protects this bad boy */ - struct list_head watches; /* list of watches */ - atomic_t count; /* reference count */ - u32 last_wd; /* the last wd allocated */ - const struct inotify_operations *in_ops; /* inotify caller operations */ -}; static inline void get_inotify_handle(struct inotify_handle *ih) { @@ -118,6 +106,10 @@ void put_inotify_watch(struct inotify_wa struct inotify_handle *ih = watch->ih; iput(watch->inode); + dput(watch->dentry); + mntput(watch->mnt); + watch->dentry = NULL; + watch->mnt = NULL; ih->in_ops->destroy_watch(watch); put_inotify_handle(ih); } @@ -483,6 +475,8 @@ void inotify_init_watch(struct inotify_w INIT_LIST_HEAD(&watch->i_list); atomic_set(&watch->count, 0); get_inotify_watch(watch); /* initial get */ + watch->dentry = NULL; + watch->mnt = NULL; } EXPORT_SYMBOL_GPL(inotify_init_watch); @@ -623,8 +617,10 @@ EXPORT_SYMBOL_GPL(inotify_find_update_wa * Caller must ensure it only calls inotify_add_watch() once per watch. * Calls inotify_handle_get_wd() so may sleep. */ -s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, - struct inode *inode, u32 mask) +s32 __inotify_add_watch(struct inotify_handle *ih, + struct inotify_watch *watch, + struct dentry *d, struct vfsmount *mnt, + struct inode * inode, u32 mask) { int ret = 0; @@ -651,6 +647,10 @@ s32 inotify_add_watch(struct inotify_han * Save a reference to the inode and bump the ref count to make it * official. We hold a reference to nameidata, which makes this safe. 
*/ + if (d) { + watch->dentry = dget(d); + watch->mnt = mntget(mnt); + } watch->inode = igrab(inode); if (!inotify_inode_watched(inode)) @@ -666,6 +666,19 @@ out: } EXPORT_SYMBOL_GPL(inotify_add_watch); +s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, + struct inode *inode, u32 mask) +{ + return __inotify_add_watch(ih, watch, NULL, NULL, inode, mask); +} + +s32 inotify_add_watch_dget(struct inotify_handle *ih, + struct inotify_watch *watch, struct dentry *d, + struct vfsmount *mnt, u32 mask) +{ + return __inotify_add_watch(ih, watch, d, mnt, d->d_inode, mask); +} + /** * inotify_clone_watch - put the watch next to existing one * @old: already installed watch diff -uprN linux-2.6.24/fs/inotify_user.c linux-2.6.24.ovz/fs/inotify_user.c --- linux-2.6.24/fs/inotify_user.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/inotify_user.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -66,46 +67,6 @@ int inotify_max_queued_events __read_mos * first event, or to inotify_destroy(). */ -/* - * struct inotify_device - represents an inotify instance - * - * This structure is protected by the mutex 'mutex'. - */ -struct inotify_device { - wait_queue_head_t wq; /* wait queue for i/o */ - struct mutex ev_mutex; /* protects event queue */ - struct mutex up_mutex; /* synchronizes watch updates */ - struct list_head events; /* list of queued events */ - atomic_t count; /* reference count */ - struct user_struct *user; /* user who opened this dev */ - struct inotify_handle *ih; /* inotify handle */ - unsigned int queue_size; /* size of the queue (bytes) */ - unsigned int event_count; /* number of pending events */ - unsigned int max_events; /* maximum number of events */ -}; - -/* - * struct inotify_kernel_event - An inotify event, originating from a watch and - * queued for user-space. A list of these is attached to each instance of the - * device. In read(), this list is walked and all events that can fit in the - * buffer are returned. - * - * Protected by dev->ev_mutex of the device in which we are queued. - */ -struct inotify_kernel_event { - struct inotify_event event; /* the user-space event */ - struct list_head list; /* entry in inotify_device's list */ - char *name; /* filename, if any */ -}; - -/* - * struct inotify_user_watch - our version of an inotify_watch, we add - * a reference to the associated inotify_device. - */ -struct inotify_user_watch { - struct inotify_device *dev; /* associated device */ - struct inotify_watch wdata; /* inotify watch data */ -}; #ifdef CONFIG_SYSCTL @@ -361,8 +322,8 @@ static int find_inode(const char __user * * Callers must hold dev->up_mutex. 
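The *_dget variant added above lets a watch pin the dentry and vfsmount it was created through: __inotify_add_watch() takes dget()/mntget() references and put_inotify_watch() drops them with dput()/mntput(), so the watched path stays pinned for the watch's lifetime. A minimal sketch of an in-kernel caller, assuming a nameidata obtained from path_lookup(); example_watch_path() itself is hypothetical and only illustrates the calling convention used by inotify_create_watch() below.

#include <linux/inotify.h>
#include <linux/namei.h>

static int example_watch_path(struct inotify_handle *ih,
                              struct inotify_watch *watch,
                              struct nameidata *nd, u32 mask)
{
        s32 wd;

        inotify_init_watch(watch);      /* also NULLs watch->dentry/->mnt */
        wd = inotify_add_watch_dget(ih, watch, nd->dentry, nd->mnt, mask);
        if (wd < 0)
                return wd;              /* no extra references left behind on failure */

        /*
         * The dget()/mntget() taken inside __inotify_add_watch() are
         * released by the final put_inotify_watch(), i.e. when the watch
         * is removed via inotify_rm_wd() or the handle is destroyed.
         */
        return 0;
}

sys_inotify_add_watch() goes through the same path via the now-exported inotify_create_watch(), passing nd.dentry and nd.mnt from its own lookup.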
*/ -static int create_watch(struct inotify_device *dev, struct inode *inode, - u32 mask) +int inotify_create_watch(struct inotify_device *dev, struct dentry *d, + struct vfsmount *mnt, u32 mask) { struct inotify_user_watch *watch; int ret; @@ -382,12 +343,13 @@ static int create_watch(struct inotify_d atomic_inc(&dev->user->inotify_watches); inotify_init_watch(&watch->wdata); - ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); + ret = inotify_add_watch_dget(dev->ih, &watch->wdata, d, mnt, mask); if (ret < 0) free_inotify_user_watch(&watch->wdata); return ret; } +EXPORT_SYMBOL(inotify_create_watch); /* Device Interface */ @@ -527,13 +489,14 @@ static long inotify_ioctl(struct file *f return ret; } -static const struct file_operations inotify_fops = { +const struct file_operations inotify_fops = { .poll = inotify_poll, .read = inotify_read, .release = inotify_release, .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; +EXPORT_SYMBOL(inotify_fops); static const struct inotify_operations inotify_user_ops = { .handle_event = inotify_dev_queue_event, @@ -610,6 +573,7 @@ out_put_fd: put_unused_fd(fd); return ret; } +EXPORT_SYMBOL(sys_inotify_init); asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) { @@ -646,7 +610,7 @@ asmlinkage long sys_inotify_add_watch(in mutex_lock(&dev->up_mutex); ret = inotify_find_update_watch(dev->ih, inode, mask); if (ret == -ENOENT) - ret = create_watch(dev, inode, mask); + ret = inotify_create_watch(dev, nd.dentry, nd.mnt, mask); mutex_unlock(&dev->up_mutex); path_release(&nd); diff -uprN linux-2.6.24/fs/ioprio.c linux-2.6.24.ovz/fs/ioprio.c --- linux-2.6.24/fs/ioprio.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ioprio.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,8 @@ #include #include #include +#include +#include static int set_task_ioprio(struct task_struct *task, int ioprio) { @@ -61,8 +63,11 @@ asmlinkage long sys_ioprio_set(int which int data = IOPRIO_PRIO_DATA(ioprio); struct task_struct *p, *g; struct user_struct *user; - struct pid *pgrp; int ret; + struct pid *pgrp; + + if (!ve_is_super(get_exec_env())) + return -EPERM; switch (class) { case IOPRIO_CLASS_RT: @@ -122,17 +127,23 @@ asmlinkage long sys_ioprio_set(int which if (!user) break; - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (p->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; - } while_each_thread(g, p); + } while_each_thread_all(g, p); free_uid: if (who) free_uid(user); break; + case IOPRIO_WHO_UBC: + if (class != IOPRIO_CLASS_BE) + return -ERANGE; + + ret = bc_set_ioprio(who, data); + break; default: ret = -EINVAL; } @@ -175,9 +186,9 @@ asmlinkage long sys_ioprio_get(int which { struct task_struct *g, *p; struct user_struct *user; - struct pid *pgrp; int ret = -ESRCH; int tmpio; + struct pid *pgrp; read_lock(&tasklist_lock); switch (which) { @@ -213,7 +224,7 @@ asmlinkage long sys_ioprio_get(int which if (!user) break; - do_each_thread(g, p) { + do_each_thread_ve(g, p) { if (p->uid != user->uid) continue; tmpio = get_task_ioprio(p); @@ -223,7 +234,7 @@ asmlinkage long sys_ioprio_get(int which ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); if (who) free_uid(user); diff -uprN linux-2.6.24/fs/lockd/clntproc.c linux-2.6.24.ovz/fs/lockd/clntproc.c --- linux-2.6.24/fs/lockd/clntproc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/lockd/clntproc.c 2008-03-25 18:53:59.000000000 -0500 @@ -159,6 +159,7 @@ 
nlmclnt_proc(struct inode *inode, int cm sigset_t oldset; unsigned long flags; int status, vers; + struct ve_struct *ve; vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1; if (NFS_PROTO(inode)->version > 3) { @@ -166,16 +167,19 @@ nlmclnt_proc(struct inode *inode, int cm return -ENOLCK; } + ve = set_exec_env(NFS_CLIENT(inode)->cl_xprt->owner_env); rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr)); host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers, nfssrv->nfs_client->cl_hostname, strlen(nfssrv->nfs_client->cl_hostname)); + status = -ENOLCK; if (host == NULL) - return -ENOLCK; + goto fail; call = nlm_alloc_call(host); + status = -ENOMEM; if (call == NULL) - return -ENOMEM; + goto fail; nlmclnt_locks_init_private(fl, host); /* Set up the argument struct */ @@ -217,6 +221,8 @@ nlmclnt_proc(struct inode *inode, int cm spin_unlock_irqrestore(¤t->sighand->siglock, flags); dprintk("lockd: clnt proc returns %d\n", status); +fail: + (void)set_exec_env(ve); return status; } EXPORT_SYMBOL(nlmclnt_proc); diff -uprN linux-2.6.24/fs/lockd/host.c linux-2.6.24.ovz/fs/lockd/host.c --- linux-2.6.24/fs/lockd/host.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/lockd/host.c 2008-03-25 18:53:59.000000000 -0500 @@ -52,6 +52,7 @@ nlm_lookup_host(int server, const struct struct nlm_host *host; struct nsm_handle *nsm = NULL; int hash; + struct ve_struct *ve; dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT ", p=%d, v=%d, my role=%s, name=%.*s)\n", @@ -77,10 +78,14 @@ nlm_lookup_host(int server, const struct * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ + + ve = get_exec_env(); chain = &nlm_hosts[hash]; hlist_for_each_entry(host, pos, chain, h_hash) { if (!nlm_cmp_addr(&host->h_addr, sin)) continue; + if (!ve_accessible_strict(host->owner_env, ve)) + continue; /* See if we have an NSM handle for this client */ if (!nsm) @@ -140,6 +145,7 @@ nlm_lookup_host(int server, const struct spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); + host->owner_env = ve; if (++nrhosts > NLM_HOST_MAX) next_gc = 0; @@ -440,6 +446,52 @@ nlm_gc_hosts(void) next_gc = jiffies + NLM_HOST_COLLECT; } +#ifdef CONFIG_VE +void ve_nlm_shutdown_hosts(struct ve_struct *ve) +{ + envid_t veid = ve->veid; + int i; + + dprintk("lockd: shutting down host module for ve %d\n", veid); + mutex_lock(&nlm_host_mutex); + + /* Perform a garbage collection pass */ + for (i = 0; i < NLM_HOST_NRHASH; i++) { + struct nlm_host *host; + struct hlist_node *pos; + + hlist_for_each_entry(host, pos, &nlm_hosts[i], h_hash) { + struct rpc_clnt *clnt; + + if (ve != host->owner_env) + continue; + + hlist_del(&host->h_hash); + if (host->h_nsmhandle) + host->h_nsmhandle->sm_monitored = 0; + dprintk("lockd: delete host %s ve %d\n", host->h_name, + veid); + if ((clnt = host->h_rpcclnt) != NULL) { + if (!list_empty(&clnt->cl_tasks)) { + struct rpc_xprt *xprt; + + printk(KERN_WARNING + "lockd: active RPC handle\n"); + rpc_killall_tasks(clnt); + xprt = clnt->cl_xprt; + xprt_disconnect(xprt); + xprt->ops->close(xprt); + } else + rpc_shutdown_client(clnt); + } + kfree(host); + nrhosts--; + } + } + + mutex_unlock(&nlm_host_mutex); +} +#endif /* * Manage NSM handles diff -uprN linux-2.6.24/fs/lockd/svc.c linux-2.6.24.ovz/fs/lockd/svc.c --- linux-2.6.24/fs/lockd/svc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/lockd/svc.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include 
+#include #include #include @@ -47,11 +48,12 @@ struct nlmsvc_binding * nlmsvc_ops; EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); -static unsigned int nlmsvc_users; -static pid_t nlmsvc_pid; +static unsigned int _nlmsvc_users; +static pid_t _nlmsvc_pid; static struct svc_serv *nlmsvc_serv; -int nlmsvc_grace_period; -unsigned long nlmsvc_timeout; +int _nlmsvc_grace_period; +unsigned long _nlmsvc_timeout; + static DECLARE_COMPLETION(lockd_start_done); static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); @@ -179,8 +181,13 @@ lockd(struct svc_rqst *rqstp) * recvfrom routine. */ err = svc_recv(rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) + if (err == -EAGAIN || err == -EINTR) { +#ifdef CONFIG_VE + if (!get_exec_env()->is_running) + break; +#endif continue; + } if (err < 0) { printk(KERN_WARNING "lockd: terminating on error %d\n", @@ -494,6 +501,29 @@ static int lockd_authenticate(struct svc return SVC_DENIED; } +#ifdef CONFIG_VE +extern void ve_nlm_shutdown_hosts(struct ve_struct *ve); + +static int ve_lockd_start(void *data) +{ + return 0; +} + +static void ve_lockd_stop(void *data) +{ + struct ve_struct *ve = (struct ve_struct *)data; + + ve_nlm_shutdown_hosts(ve); + flush_scheduled_work(); +} + +static struct ve_hook lockd_hook = { + .init = ve_lockd_start, + .fini = ve_lockd_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_FS, +}; +#endif param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, @@ -522,12 +552,14 @@ module_param(nsm_use_hostnames, bool, 06 static int __init init_nlm(void) { nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); + ve_hook_register(VE_SS_CHAIN, &lockd_hook); return nlm_sysctl_table ? 0 : -ENOMEM; } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ + ve_hook_unregister(&lockd_hook); nlm_shutdown_hosts(); unregister_sysctl_table(nlm_sysctl_table); } diff -uprN linux-2.6.24/fs/lockd/svcsubs.c linux-2.6.24.ovz/fs/lockd/svcsubs.c --- linux-2.6.24/fs/lockd/svcsubs.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/lockd/svcsubs.c 2008-03-25 18:53:59.000000000 -0500 @@ -318,6 +318,9 @@ nlmsvc_same_host(struct nlm_host *host, static int nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy) { + if (!ve_accessible_strict(host->owner_env, get_exec_env())) + return 0; + if (host->h_server) { /* we are destroying locks even though the client * hasn't asked us too, so don't unmonitor the diff -uprN linux-2.6.24/fs/locks.c linux-2.6.24.ovz/fs/locks.c --- linux-2.6.24/fs/locks.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/locks.c 2008-03-25 18:53:59.000000000 -0500 @@ -129,6 +129,8 @@ #include #include +#include + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) @@ -145,9 +147,25 @@ static LIST_HEAD(blocked_list); static struct kmem_cache *filelock_cache __read_mostly; /* Allocate an empty lock structure. 
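The fs/locks.c hunks that follow turn file locks into a beancounter-accounted resource: locks_alloc_lock() grows a charge argument, ub_flock_charge() can fail the allocation, locks_free_lock() uncharges, and the slab cache is created with SLAB_UBC. Only locks that will really be linked into an inode's lock list are charged; F_UNLCK requests and the scratch file_lock structures used by fcntl_setlk()/fcntl_setlk64() pass 0. A compressed sketch of that lifecycle, written as if it sat inside fs/locks.c (both helpers are static there, so this is purely illustrative):

static int example_flock_lifecycle(short type)
{
        struct file_lock *fl;

        /* charge only if the lock will actually occupy a slot */
        fl = locks_alloc_lock(type != F_UNLCK);
        if (fl == NULL)
                return -ENOMEM;         /* kmem failure or beancounter limit hit */

        /* ... fill in fl and insert it with locks_insert_lock() ... */

        locks_free_lock(fl);            /* ub_flock_uncharge() runs before kmem_cache_free() */
        return 0;
}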
*/ -static struct file_lock *locks_alloc_lock(void) +static struct file_lock *locks_alloc_lock(int charge) { - return kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl; + + fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + if (fl == NULL) + goto out; + fl->fl_charged = 0; + if (!charge) + goto out; + if (!ub_flock_charge(fl, 1)) + goto out; + + kmem_cache_free(filelock_cache, fl); + fl = NULL; +out: +#endif + return fl; } static void locks_release_private(struct file_lock *fl) @@ -172,6 +190,7 @@ static void locks_free_lock(struct file_ BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!list_empty(&fl->fl_link)); + ub_flock_uncharge(fl); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } @@ -273,7 +292,7 @@ static int flock_make_lock(struct file * if (type < 0) return type; - fl = locks_alloc_lock(); + fl = locks_alloc_lock(type != F_UNLCK); if (fl == NULL) return -ENOMEM; @@ -460,7 +479,7 @@ static int lease_init(struct file *filp, /* Allocate a file_lock initialised to this type of lease */ static struct file_lock *lease_alloc(struct file *filp, int type) { - struct file_lock *fl = locks_alloc_lock(); + struct file_lock *fl = locks_alloc_lock(1); int error = -ENOMEM; if (fl == NULL) @@ -746,8 +765,13 @@ static int flock_lock_file(struct file * goto find_conflict; if (request->fl_type != F_UNLCK) { + /* + * Nont F_UNLCK request must be already charged in + * flock_make_lock(). Actually new_fl must be charged not the + * request, but we try to fail earlier. + */ error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(0); if (new_fl == NULL) goto out; error = 0; @@ -797,6 +821,10 @@ find_conflict: } if (request->fl_flags & FL_ACCESS) goto out; + + set_flock_charged(new_fl); + unset_flock_charged(request); + locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; @@ -828,8 +856,11 @@ static int __posix_lock_file(struct inod if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { - new_fl = locks_alloc_lock(); - new_fl2 = locks_alloc_lock(); + if (request->fl_type != F_UNLCK) + new_fl = locks_alloc_lock(1); + else + new_fl = NULL; + new_fl2 = locks_alloc_lock(0); } lock_kernel(); @@ -963,7 +994,7 @@ static int __posix_lock_file(struct inod * bail out. */ error = -ENOLCK; /* "no luck" */ - if (right && left == right && !new_fl2) + if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2)) goto out; error = 0; @@ -974,23 +1005,32 @@ static int __posix_lock_file(struct inod goto out; } - if (!new_fl) { - error = -ENOLCK; + error = -ENOLCK; + if (!new_fl) + goto out; + if (right && (left == right) && ub_flock_charge(new_fl, 1)) goto out; - } locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; + error = 0; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. 
*/ + error = -ENOLCK; + if (added && ub_flock_charge(new_fl2, + request->fl_type != F_UNLCK)) + goto out; + /* FIXME move all fl_charged manipulations in ub code */ + set_flock_charged(new_fl2); left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock(before, left); + error = 0; } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1381,7 +1421,7 @@ int generic_setlease(struct file *filp, goto out; error = -ENOMEM; - new_fl = locks_alloc_lock(); + new_fl = locks_alloc_lock(1); if (new_fl == NULL) goto out; @@ -1615,6 +1655,7 @@ asmlinkage long sys_flock(unsigned int f out: return error; } +EXPORT_SYMBOL_GPL(sys_flock); /** * vfs_test_lock - test file byte range lock @@ -1635,7 +1676,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = fl->fl_pid; + flock->l_pid = pid_to_vpid(fl->fl_pid); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1657,7 +1698,7 @@ static int posix_lock_to_flock(struct fl #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = fl->fl_pid; + flock->l_pid = pid_to_vpid(fl->fl_pid); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1751,7 +1792,7 @@ EXPORT_SYMBOL_GPL(vfs_lock_file); int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock flock; struct inode *inode; int error; @@ -1875,7 +1916,7 @@ out: int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 __user *l) { - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock64 flock; struct inode *inode; int error; @@ -2084,7 +2125,9 @@ static void lock_get_status(struct seq_f int id, char *pfx) { struct inode *inode = NULL; + unsigned int fl_pid; + fl_pid = pid_to_vpid(fl->fl_pid); if (fl->fl_file != NULL) inode = fl->fl_file->f_path.dentry->d_inode; @@ -2124,16 +2167,16 @@ static void lock_get_status(struct seq_f } if (inode) { #ifdef WE_CAN_BREAK_LSLK_NOW - seq_printf(f, "%d %s:%ld ", fl->fl_pid, + seq_printf(f, "%d %s:%ld ", fl_pid, inode->i_sb->s_id, inode->i_ino); #else /* userspace relies on this representation of dev_t ;-( */ - seq_printf(f, "%d %02x:%02x:%ld ", fl->fl_pid, + seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); #endif } else { - seq_printf(f, "%d :0 ", fl->fl_pid); + seq_printf(f, "%d :0 ", fl_pid); } if (IS_POSIX(fl)) { if (fl->fl_end == OFFSET_MAX) @@ -2150,6 +2193,8 @@ static int locks_show(struct seq_file *f struct file_lock *fl, *bfl; fl = list_entry(v, struct file_lock, fl_link); + if (!ve_accessible(fl->fl_file->owner_env, get_exec_env())) + goto out; lock_get_status(f, fl, (long)f->private, ""); @@ -2157,6 +2202,7 @@ static int locks_show(struct seq_file *f lock_get_status(f, bfl, (long)f->private, " ->"); f->private++; +out: return 0; } @@ -2266,7 +2312,7 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, + sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC, init_once); return 0; } diff -uprN linux-2.6.24/fs/mpage.c linux-2.6.24.ovz/fs/mpage.c --- linux-2.6.24/fs/mpage.c 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/mpage.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include /* * I/O completion handler for multipage BIOs. diff -uprN linux-2.6.24/fs/namei.c linux-2.6.24.ovz/fs/namei.c --- linux-2.6.24/fs/namei.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/namei.c 2008-03-25 18:53:59.000000000 -0500 @@ -141,6 +141,7 @@ char * getname(const char __user * filen { char *tmp, *result; + /*ub_dentry_checkup();*/ result = ERR_PTR(-ENOMEM); tmp = __getname(); if (tmp) { @@ -428,6 +429,21 @@ static struct dentry * cached_lookup(str if (!dentry) dentry = d_lookup(parent, name); + /* + * The revalidation rules are simple: + * d_revalidate operation is called when we're about to use a cached + * dentry rather than call d_lookup. + * d_revalidate method may unhash the dentry itself or return FALSE, in + * which case if the dentry can be released d_lookup will be called. + * + * Additionally, by request of NFS people + * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c) + * d_revalidate is called when `/', `.' or `..' are looked up. + * Since re-lookup is impossible on them, we introduce a hack and + * return an error in this case. + * + * 2003/02/19 SAW + */ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) dentry = do_revalidate(dentry, nd); @@ -487,6 +503,7 @@ static struct dentry * real_lookup(struc struct dentry * result; struct inode *dir = parent->d_inode; +repeat: mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created @@ -525,7 +542,7 @@ static struct dentry * real_lookup(struc if (result->d_op && result->d_op->d_revalidate) { result = do_revalidate(result, nd); if (!result) - result = ERR_PTR(-ENOENT); + goto repeat; } return result; } @@ -751,6 +768,13 @@ static __always_inline void follow_dotdo read_unlock(&fs->lock); break; } +#ifdef CONFIG_VE + if (nd->dentry == get_exec_env()->fs_root && + nd->mnt == get_exec_env()->fs_rootmnt) { + read_unlock(¤t->fs->lock); + break; + } +#endif read_unlock(&fs->lock); spin_lock(&dcache_lock); if (nd->dentry != nd->mnt->mnt_root) { @@ -792,6 +816,10 @@ static int do_lookup(struct nameidata *n if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: + if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; __follow_mount(path); @@ -829,6 +857,7 @@ static fastcall int __link_path_walk(con struct inode *inode; int err; unsigned int lookup_flags = nd->flags; + int real_components = 0; while (*name=='/') name++; @@ -898,6 +927,7 @@ static fastcall int __link_path_walk(con break; } /* This does the actual lookups.. 
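LOOKUP_STRICT, tested above in do_lookup() and below around the follow_link cases, makes path resolution fail with -ENOENT rather than cross a mountpoint or follow a symlink. The flag itself is defined elsewhere in the patch; the sketch below only shows how a strict lookup would be issued, and example_resolve_strict() is a hypothetical helper.

#include <linux/namei.h>

static int example_resolve_strict(const char *name, struct nameidata *nd)
{
        /*
         * With LOOKUP_STRICT, -ENOENT can also mean "the path now crosses
         * a mountpoint or a symlink", not only "no such entry".
         */
        return path_lookup(name, LOOKUP_STRICT, nd);
}

The change to follow_dotdot() above is related but independent: inside a VE, ".." additionally stops at the VE's fs_root/fs_rootmnt pair.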
*/ + real_components++; err = do_lookup(nd, &this, &next); if (err) break; @@ -911,6 +941,9 @@ static fastcall int __link_path_walk(con goto out_dput; if (inode->i_op->follow_link) { + err = -ENOENT; + if (lookup_flags & LOOKUP_STRICT) + goto out_dput; err = do_follow_link(&next, nd); if (err) goto return_err; @@ -958,6 +991,7 @@ last_component: break; inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) + && !(lookup_flags & LOOKUP_STRICT) && inode && inode->i_op && inode->i_op->follow_link) { err = do_follow_link(&next, nd); if (err) @@ -979,26 +1013,40 @@ lookup_parent: nd->last_type = LAST_NORM; if (this.name[0] != '.') goto return_base; - if (this.len == 1) + if (this.len == 1) { nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') + goto return_reval; + } else if (this.len == 2 && this.name[1] == '.') { nd->last_type = LAST_DOTDOT; - else - goto return_base; + goto return_reval; + } +return_base: + if (!(nd->flags & LOOKUP_NOAREACHECK)) { + err = check_area_access_ve(nd->dentry, nd->mnt); + if (err) + break; + } + return 0; return_reval: /* * We bypassed the ordinary revalidation routines. * We may need to check the cached dentry for staleness. */ - if (nd->dentry && nd->dentry->d_sb && + if (!real_components && nd->dentry && nd->dentry->d_sb && (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { err = -ESTALE; /* Note: we do not d_invalidate() */ if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd)) + /* + * This lookup is for `/' or `.' or `..'. + * The filesystem unhashed the dentry itself + * inside d_revalidate (otherwise, d_invalidate + * wouldn't succeed). As a special courtesy to + * NFS we return an error. 2003/02/19 SAW + */ break; } -return_base: - return 0; + goto return_base; out_dput: dput_path(&next, nd); break; @@ -1996,6 +2044,7 @@ asmlinkage long sys_mknod(const char __u { return sys_mknodat(AT_FDCWD, filename, mode, dev); } +EXPORT_SYMBOL_GPL(sys_mknod); int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { @@ -2056,6 +2105,7 @@ asmlinkage long sys_mkdir(const char __u { return sys_mkdirat(AT_FDCWD, pathname, mode); } +EXPORT_SYMBOL_GPL(sys_mkdir); /* * We try to drop the dentry early: we should have @@ -2083,6 +2133,7 @@ void dentry_unhash(struct dentry *dentry spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); } +EXPORT_SYMBOL(sys_symlink); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -2163,6 +2214,7 @@ asmlinkage long sys_rmdir(const char __u { return do_rmdir(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_rmdir); int vfs_unlink(struct inode *dir, struct dentry *dentry) { @@ -2262,6 +2314,7 @@ asmlinkage long sys_unlink(const char __ { return do_unlinkat(AT_FDCWD, pathname); } +EXPORT_SYMBOL_GPL(sys_unlink); int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) { @@ -2422,6 +2475,7 @@ asmlinkage long sys_link(const char __us { return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +EXPORT_SYMBOL(sys_rename); /* * The worst of all namespace operations - renaming directory. 
"Perverted" @@ -2533,6 +2587,9 @@ int vfs_rename(struct inode *old_dir, st int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); const char *old_name; + if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) + return -EXDEV; + if (old_dentry->d_inode == new_dentry->d_inode) return 0; diff -uprN linux-2.6.24/fs/namespace.c linux-2.6.24.ovz/fs/namespace.c --- linux-2.6.24/fs/namespace.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/namespace.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,13 +32,15 @@ /* spinlock for vfsmount related operations, inplace of dcache_lock */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +EXPORT_SYMBOL(vfsmount_lock); static int event; static struct list_head *mount_hashtable __read_mostly; static int hash_mask __read_mostly, hash_bits __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +struct rw_semaphore namespace_sem; +EXPORT_SYMBOL(namespace_sem); /* /sys/fs */ decl_subsys(fs, NULL, NULL); @@ -56,6 +58,7 @@ struct vfsmount *alloc_vfsmnt(const char { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { + mnt->owner = VEID(get_exec_env()); atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -67,7 +70,7 @@ struct vfsmount *alloc_vfsmnt(const char INIT_LIST_HEAD(&mnt->mnt_slave); if (name) { int size = strlen(name) + 1; - char *newname = kmalloc(size, GFP_KERNEL); + char *newname = kmalloc(size, GFP_KERNEL_UBC); if (newname) { memcpy(newname, name, size); mnt->mnt_devname = newname; @@ -343,10 +346,33 @@ static inline void mangle(struct seq_fil seq_escape(m, s, " \t\n\\"); } +static int prepare_mnt_root_mangle(struct vfsmount *mnt, + char **path_buf, char **path) +{ + /* skip FS_NOMOUNT mounts (rootfs) */ + if (mnt->mnt_sb->s_flags & MS_NOUSER) + return -EACCES; + + *path_buf = (char *)__get_free_page(GFP_KERNEL); + if (!*path_buf) + return -ENOMEM; + + *path = d_path(mnt->mnt_root, mnt, *path_buf, PAGE_SIZE); + if (IS_ERR(*path)) { + free_page((unsigned long)*path_buf); + /* + * This means that the file position will be incremented, i.e. + * the total number of "invisible" vfsmnt will leak. + */ + return -EACCES; + } + return 0; +} + static int show_vfsmnt(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; + int err; static struct proc_fs_info { int flag; char *str; @@ -366,10 +392,20 @@ static int show_vfsmnt(struct seq_file * { 0, NULL } }; struct proc_fs_info *fs_infop; + char *path_buf, *path; - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + err = prepare_mnt_root_mangle(mnt, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 0 : err); + + if (ve_is_super(get_exec_env()) || + !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC)) + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + else + mangle(m, mnt->mnt_sb->s_type->name); seq_putc(m, ' '); - seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); + mangle(m, path); + free_page((unsigned long) path_buf); seq_putc(m, ' '); mangle(m, mnt->mnt_sb->s_type->name); if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { @@ -401,18 +437,27 @@ struct seq_operations mounts_op = { static int show_vfsstat(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; + char *path_buf, *path; + int err; + + err = prepare_mnt_root_mangle(mnt, &path_buf, &path); + if (err < 0) + return (err == -EACCES ? 
0 : err); /* device */ if (mnt->mnt_devname) { seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); + if (ve_is_super(get_exec_env())) + mangle(m, mnt->mnt_devname); + else + mangle(m, mnt->mnt_sb->s_type->name); } else seq_puts(m, "no device"); /* mount point */ seq_puts(m, " mounted on "); - seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); + mangle(m, path); + free_page((unsigned long)path_buf); seq_putc(m, ' '); /* file system type */ @@ -511,6 +556,7 @@ void release_mounts(struct list_head *he mntput(mnt); } } +EXPORT_SYMBOL(release_mounts); void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { @@ -533,6 +579,7 @@ void umount_tree(struct vfsmount *mnt, i change_mnt_propagation(p, MS_PRIVATE); } } +EXPORT_SYMBOL(umount_tree); static int do_umount(struct vfsmount *mnt, int flags) { @@ -620,6 +667,34 @@ static int do_umount(struct vfsmount *mn return retval; } +#ifdef CONFIG_VE +void umount_ve_fs_type(struct file_system_type *local_fs_type) +{ + struct vfsmount *mnt; + struct list_head *p, *q; + LIST_HEAD(kill); + LIST_HEAD(umount_list); + + down_write(&namespace_sem); + spin_lock(&vfsmount_lock); + list_for_each_safe(p, q, ¤t->nsproxy->mnt_ns->list) { + mnt = list_entry(p, struct vfsmount, mnt_list); + if (mnt->mnt_sb->s_type != local_fs_type) + continue; + list_del(p); + list_add(p, &kill); + } + + while (!list_empty(&kill)) { + mnt = list_entry(kill.next, struct vfsmount, mnt_list); + umount_tree(mnt, 1, &umount_list); + } + spin_unlock(&vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); +} +#endif + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -643,7 +718,7 @@ asmlinkage long sys_umount(char __user * goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) goto dput_and_out; retval = do_umount(nd.mnt, flags); @@ -667,7 +742,7 @@ asmlinkage long sys_oldumount(char __use static int mount_is_safe(struct nameidata *nd) { - if (capable(CAP_SYS_ADMIN)) + if (capable(CAP_VE_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -906,6 +981,8 @@ static int do_change_type(struct nameida if (nd->dentry != nd->mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid)) + return -EPERM; down_write(&namespace_sem); spin_lock(&vfsmount_lock); @@ -919,7 +996,8 @@ static int do_change_type(struct nameida /* * do loopback mount. 
*/ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, int recurse, + int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; @@ -949,6 +1027,7 @@ static int do_loopback(struct nameidata if (!mnt) goto out; + mnt->mnt_flags |= mnt_flags; err = graft_tree(mnt, nd); if (err) { LIST_HEAD(umount_list); @@ -974,8 +1053,9 @@ static int do_remount(struct nameidata * { int err; struct super_block *sb = nd->mnt->mnt_sb; + int bind; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!check_mnt(nd->mnt)) @@ -984,12 +1064,23 @@ static int do_remount(struct nameidata * if (nd->dentry != nd->mnt->mnt_root) return -EINVAL; + if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid)) + return -EPERM; + + /* do not allow to remount bind-mounts with another mountpoint flags */ + bind = 0; + if (nd->dentry != sb->s_root) { + if ((flags & ~(MS_BIND|MS_POSIXACL|MS_NOUSER)) != 0) + return -EINVAL; + bind = 1; + } + down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); + err = bind ? 0 : do_remount_sb(sb, flags, data, 0); if (!err) nd->mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); - if (!err) + if (!err && !bind) security_sb_post_remount(nd->mnt, flags, data); return err; } @@ -1009,7 +1100,7 @@ static int do_move_mount(struct nameidat struct nameidata old_nd, parent_nd; struct vfsmount *p; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1017,6 +1108,10 @@ static int do_move_mount(struct nameidat if (err) return err; + err = -EPERM; + if (!ve_accessible_veid(old_nd.mnt->owner, get_exec_env()->veid)) + goto out_nosem; + down_write(&namespace_sem); while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) ; @@ -1072,6 +1167,7 @@ out: up_write(&namespace_sem); if (!err) path_release(&parent_nd); +out_nosem: path_release(&old_nd); return err; } @@ -1089,7 +1185,7 @@ static int do_new_mount(struct nameidata return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; mnt = do_kern_mount(type, flags, name, data); @@ -1127,6 +1223,11 @@ int do_add_mount(struct vfsmount *newmnt goto unlock; newmnt->mnt_flags = mnt_flags; + + /* make this before graft_tree reveals mnt_root to the world... */ + if (nd->dentry->d_flags & DCACHE_VIRTUAL) + newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; + if ((err = graft_tree(newmnt, nd))) goto unlock; @@ -1446,7 +1547,7 @@ long do_mount(char *dev_name, char *dir_ retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags & MS_REC, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) @@ -1588,6 +1689,7 @@ out1: free_page(type_page); return retval; } +EXPORT_SYMBOL_GPL(sys_mount); /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 
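Together with register_ve_fs_type()/unregister_ve_fs_type() in fs/filesystems.c and umount_ve_fs_type() above, the per-VE filesystem lifecycle looks like this: when a VE starts, a copy of the file_system_type tagged with owner_env is registered for it, and when the VE stops the copy is unregistered and any remaining mounts of it are force-umounted. The fuse and fuse_ctl hooks earlier in the patch follow exactly this pattern; a condensed sketch for a hypothetical examplefs (the ve_struct field and all examplefs_* names are illustrative only):

static struct file_system_type examplefs_fs_type;      /* the usual global template */

#ifdef CONFIG_VE
static int examplefs_ve_init(void *data)
{
        struct ve_struct *ve = data;

        if (ve->examplefs_fs_type != NULL)
                return -EBUSY;

        /*
         * Clones examplefs_fs_type with ->owner_env = ve and registers the
         * clone; the NULL last argument means no per-VE kern_mount is needed.
         */
        return register_ve_fs_type(ve, &examplefs_fs_type,
                                   &ve->examplefs_fs_type, NULL);
}

static void examplefs_ve_fini(void *data)
{
        struct ve_struct *ve = data;

        if (ve->examplefs_fs_type == NULL)
                return;

        /*
         * Unregisters the clone, force-umounts whatever is still mounted on
         * it inside the VE (via umount_ve_fs_type) and drops the module and
         * VE references taken by register_ve_fs_type().
         */
        unregister_ve_fs_type(ve->examplefs_fs_type, NULL);
        kfree(ve->examplefs_fs_type);
        ve->examplefs_fs_type = NULL;
}

static struct ve_hook examplefs_ve_hook = {
        .init           = examplefs_ve_init,
        .fini           = examplefs_ve_fini,
        .owner          = THIS_MODULE,
        .priority       = HOOK_PRIO_FS,
};
#endif

The hook pair would be attached with ve_hook_register(VE_SS_CHAIN, &examplefs_ve_hook) from the filesystem's module init and detached with ve_hook_unregister() on exit, as fuse_init()/fuse_exit() do above.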
@@ -1639,7 +1741,7 @@ static void chroot_fs_refs(struct nameid struct fs_struct *fs; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_ve(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -1654,7 +1756,7 @@ static void chroot_fs_refs(struct nameid put_fs_struct(fs); } else task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_ve(g, p); read_unlock(&tasklist_lock); } @@ -1821,7 +1923,7 @@ void __init mnt_init(void) init_rwsem(&namespace_sem); mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); @@ -1883,3 +1985,5 @@ void __put_mnt_ns(struct mnt_namespace * release_mounts(&umount_list); kfree(ns); } + +EXPORT_SYMBOL_GPL(__put_mnt_ns); diff -uprN linux-2.6.24/fs/ncpfs/mmap.c linux-2.6.24.ovz/fs/ncpfs/mmap.c --- linux-2.6.24/fs/ncpfs/mmap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/ncpfs/mmap.c 2008-03-25 18:53:59.000000000 -0500 @@ -50,10 +50,6 @@ static int ncp_file_mmap_fault(struct vm pos = vmf->pgoff << PAGE_SHIFT; count = PAGE_SIZE; - if ((unsigned long)vmf->virtual_address + PAGE_SIZE > area->vm_end) { - WARN_ON(1); /* shouldn't happen? */ - count = area->vm_end - (unsigned long)vmf->virtual_address; - } /* what we can read in one go */ bufsize = NCP_SERVER(inode)->buffer_size; diff -uprN linux-2.6.24/fs/nfs/client.c linux-2.6.24.ovz/fs/nfs/client.c --- linux-2.6.24/fs/nfs/client.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/nfs/client.c 2008-03-25 18:53:59.000000000 -0500 @@ -116,6 +116,7 @@ static struct nfs_client *nfs_alloc_clie atomic_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; + clp->owner_env = get_exec_env(); clp->cl_nfsversion = nfsversion; memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); @@ -210,7 +211,9 @@ void nfs_put_client(struct nfs_client *c static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port) { struct nfs_client *clp; + struct ve_struct *ve; + ve = get_exec_env(); list_for_each_entry(clp, &nfs_client_list, cl_share_link) { /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) @@ -220,6 +223,9 @@ static struct nfs_client *__nfs_find_cli if (clp->cl_nfsversion != nfsversion) continue; + if (!ve_accessible_strict(clp->owner_env, ve)) + continue; + if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr, sizeof(clp->cl_addr.sin_addr)) != 0) continue; diff -uprN linux-2.6.24/fs/nfs/super.c linux-2.6.24.ovz/fs/nfs/super.c --- linux-2.6.24/fs/nfs/super.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/nfs/super.c 2008-03-25 18:53:59.000000000 -0500 @@ -48,6 +48,9 @@ #include #include #include +#include +#include +#include #include #include @@ -208,7 +211,8 @@ static struct file_system_type nfs_fs_ty .name = "nfs", .get_sb = nfs_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; struct file_system_type nfs_xdev_fs_type = { @@ -216,7 +220,8 @@ struct file_system_type nfs_xdev_fs_type .name = "nfs", .get_sb = nfs_xdev_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT| + FS_BINARY_MOUNTDATA|FS_VIRTUALIZED, }; static const struct super_operations nfs_sops = { @@ -280,6 +285,55 @@ 
static struct shrinker acl_shrinker = { .seeks = DEFAULT_SEEKS, }; +#ifdef CONFIG_VE +static int ve_nfs_start(void *data) +{ + return 0; +} + +static void ve_nfs_stop(void *data) +{ + struct ve_struct *ve; + struct super_block *sb; + + flush_scheduled_work(); + + ve = (struct ve_struct *)data; + /* Basically, on a valid stop we can be here iff NFS was mounted + read-only. In such a case client force-stop is not a problem. + If we are here and NFS is read-write, we are in a FORCE stop, so + force the client to stop. + Lock daemon is already dead. + Only superblock client remains. Den */ + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + struct rpc_clnt *clnt; + struct rpc_xprt *xprt; + if (sb->s_type != &nfs_fs_type) + continue; + clnt = NFS_SB(sb)->client; + if (!ve_accessible_strict(clnt->cl_xprt->owner_env, ve)) + continue; + clnt->cl_broken = 1; + rpc_killall_tasks(clnt); + + xprt = clnt->cl_xprt; + xprt_disconnect(xprt); + xprt->ops->close(xprt); + } + spin_unlock(&sb_lock); + + flush_scheduled_work(); +} + +static struct ve_hook nfs_hook = { + .init = ve_nfs_start, + .fini = ve_nfs_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_POST, +}; +#endif + /* * Register the NFS filesystems */ @@ -300,6 +354,7 @@ int __init register_nfs_fs(void) goto error_2; #endif register_shrinker(&acl_shrinker); + ve_hook_register(VE_SS_CHAIN, &nfs_hook); return 0; #ifdef CONFIG_NFS_V4 @@ -318,6 +373,7 @@ error_0: void __exit unregister_nfs_fs(void) { unregister_shrinker(&acl_shrinker); + ve_hook_unregister(&nfs_hook); #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); #endif @@ -561,6 +617,9 @@ static void nfs_umount_begin(struct vfsm struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb); struct rpc_clnt *rpc; + /* + * FIXME - think over wether this is OK + */ shrink_submounts(vfsmnt, &nfs_automount_list); if (!(flags & MNT_FORCE)) @@ -1353,6 +1412,11 @@ static int nfs_get_sb(struct file_system .mntflags = flags, }; int error; + struct ve_struct *ve; + + ve = get_exec_env(); + if (!ve_is_super(ve) && !(get_exec_env()->features & VE_FEATURE_NFS)) + return -ENODEV; /* Validate the mount data */ error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); diff -uprN linux-2.6.24/fs/open.c linux-2.6.24.ovz/fs/open.c --- linux-2.6.24/fs/open.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/open.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,21 @@ int vfs_statfs(struct dentry *dentry, st EXPORT_SYMBOL(vfs_statfs); -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +int faudit_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct faudit_statfs_arg arg; + + arg.sb = sb; + arg.stat = buf; + + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) + != NOTIFY_DONE) + return arg.err; + return 0; +} + +static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt, + struct statfs *buf) { struct kstatfs st; int retval; @@ -58,6 +73,10 @@ static int vfs_statfs_native(struct dent if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -92,7 +111,8 @@ static int vfs_statfs_native(struct dent return 0; } -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt, + struct statfs64 *buf) { struct kstatfs st; int retval; @@ 
-101,6 +121,10 @@ static int vfs_statfs64(struct dentry *d if (retval) return retval; + retval = faudit_statfs(mnt->mnt_sb, &st); + if (retval) + return retval; + if (sizeof(*buf) == sizeof(st)) memcpy(buf, &st, sizeof(st)); else { @@ -127,7 +151,7 @@ asmlinkage long sys_statfs(const char __ error = user_path_walk(path, &nd); if (!error) { struct statfs tmp; - error = vfs_statfs_native(nd.dentry, &tmp); + error = vfs_statfs_native(nd.dentry, nd.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -146,7 +170,7 @@ asmlinkage long sys_statfs64(const char error = user_path_walk(path, &nd); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(nd.dentry, &tmp); + error = vfs_statfs64(nd.dentry, nd.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -165,7 +189,7 @@ asmlinkage long sys_fstatfs(unsigned int file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); + error = vfs_statfs_native(file->f_path.dentry, file->f_path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -186,7 +210,7 @@ asmlinkage long sys_fstatfs64(unsigned i file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); + error = vfs_statfs64(file->f_path.dentry, file->f_path.mnt, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -591,15 +615,20 @@ out: return err; } -asmlinkage long sys_fchmodat(int dfd, const char __user *filename, - mode_t mode) +static long do_fchmodat(int dfd, const char __user *filename, mode_t mode, + int flags) { struct nameidata nd; struct inode * inode; - int error; + int error = -EINVAL; struct iattr newattrs; + int follow; + + if ((flags & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; - error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + follow = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + error = __user_walk_fd(dfd, filename, follow, &nd); if (error) goto out; inode = nd.dentry->d_inode; @@ -626,6 +655,12 @@ out: return error; } +asmlinkage long sys_fchmodat(int dfd, const char __user *filename, + mode_t mode) +{ + return do_fchmodat(dfd, filename, mode, 0); +} + asmlinkage long sys_chmod(const char __user *filename, mode_t mode) { return sys_fchmodat(AT_FDCWD, filename, mode); @@ -680,6 +715,7 @@ asmlinkage long sys_chown(const char __u out: return error; } +EXPORT_SYMBOL_GPL(sys_chown); asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) @@ -897,6 +933,7 @@ struct file *nameidata_to_filp(struct na return filp; } +int odirect_enable = 0; /* * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an * error. 
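odirect_enable, added just above, is consumed in the next hunk: dentry_open() strips O_DIRECT for callers without CAP_SYS_RAWIO unless the knob is set, so direct I/O is unavailable inside containers by default. The flag is presumably exposed as a sysctl elsewhere in the patch; the registration is not part of this hunk, so the sketch below, with example_* names, only shows what a minimal fs.odirect_enable entry would look like with the 2.6.24 ctl_table API.

#include <linux/sysctl.h>

extern int odirect_enable;              /* defined in fs/open.c above */

static struct ctl_table example_odirect_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "odirect_enable",
                .data           = &odirect_enable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static struct ctl_table example_fs_table[] = {
        {
                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = example_odirect_table,
        },
        { .ctl_name = 0 }
};

/* register_sysctl_table(example_fs_table) would expose /proc/sys/fs/odirect_enable. */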
@@ -906,6 +943,9 @@ struct file *dentry_open(struct dentry * int error; struct file *f; + if (!capable(CAP_SYS_RAWIO) && !odirect_enable) + flags &= ~O_DIRECT; + error = -ENFILE; f = get_empty_filp(); if (f == NULL) { @@ -1061,7 +1101,7 @@ asmlinkage long sys_open(const char __us prevent_tail_call(ret); return ret; } -EXPORT_UNUSED_SYMBOL_GPL(sys_open); /* To be deleted for 2.6.25 */ +EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) @@ -1195,3 +1235,8 @@ int nonseekable_open(struct inode *inode } EXPORT_SYMBOL(nonseekable_open); + +asmlinkage long sys_lchmod(char __user * filename, mode_t mode) +{ + return do_fchmodat(AT_FDCWD, filename, mode, AT_SYMLINK_NOFOLLOW); +} diff -uprN linux-2.6.24/fs/partitions/check.c linux-2.6.24.ovz/fs/partitions/check.c --- linux-2.6.24/fs/partitions/check.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/partitions/check.c 2008-03-25 18:53:59.000000000 -0500 @@ -130,6 +130,7 @@ char *disk_name(struct gendisk *hd, int return buf; } +EXPORT_SYMBOL(disk_name); const char *bdevname(struct block_device *bdev, char *buf) { diff -uprN linux-2.6.24/fs/pipe.c linux-2.6.24.ovz/fs/pipe.c --- linux-2.6.24/fs/pipe.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/pipe.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,8 @@ #include #include +#include + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -477,7 +479,7 @@ redo1: int error, atomic = 1; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_UBC); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; @@ -857,7 +859,7 @@ struct pipe_inode_info * alloc_pipe_info { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC); if (pipe) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; @@ -1075,6 +1077,8 @@ int do_pipe(int *fd) return error; } +EXPORT_SYMBOL_GPL(do_pipe); + /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. 
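dentry_open() above now clears O_DIRECT from the open flags unless the caller has CAP_SYS_RAWIO or the new odirect_enable knob is non-zero, so in a restricted container the flag can be dropped silently rather than the open failing. How odirect_enable is exposed to user space is not part of these hunks, so the sketch below only checks the observable result: open with O_DIRECT and read the file status flags back. File name and test path are illustrative; only standard Linux APIs are used.

/* odirect_probe.c - open with O_DIRECT and report whether the flag
 * survived; under this patch an unprivileged container task may see
 * dentry_open() clear it. */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/tmp/odirect_probe";
    int fd, flags;

    fd = open(path, O_CREAT | O_WRONLY | O_DIRECT, 0600);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    flags = fcntl(fd, F_GETFL);
    if (flags < 0) {
        perror("fcntl");
        return 1;
    }
    printf("O_DIRECT was %s\n", (flags & O_DIRECT) ? "kept" : "dropped");
    close(fd);
    return 0;
}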
So we don't need diff -uprN linux-2.6.24/fs/proc/array.c linux-2.6.24.ovz/fs/proc/array.c --- linux-2.6.24/fs/proc/array.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/array.c 2008-03-25 18:53:59.000000000 -0500 @@ -79,6 +79,9 @@ #include #include +#include + +#include #include #include #include "internal.h" @@ -203,6 +206,19 @@ static inline char *task_state(struct ta put_group_info(group_info); buffer += sprintf(buffer, "\n"); + +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + buffer += sprintf(buffer, + "envID:\t%d\n" + "VPid:\t%d\n" + "PNState:\t%u\n" + "StopState:\t%u\n", + p->ve_task_info.owner_env->veid, + task_pid_vnr(p), + p->pn_state, + p->stopped_state); +#endif return buffer; } @@ -246,10 +262,10 @@ static void collect_sigign_sigcatch(stru } } -static inline char *task_sig(struct task_struct *p, char *buffer) +char *task_sig(struct task_struct *p, char *buffer) { unsigned long flags; - sigset_t pending, shpending, blocked, ignored, caught; + sigset_t pending, shpending, blocked, ignored, caught, saved; int num_threads = 0; unsigned long qsize = 0; unsigned long qlim = 0; @@ -259,12 +275,14 @@ static inline char *task_sig(struct task sigemptyset(&blocked); sigemptyset(&ignored); sigemptyset(&caught); + sigemptyset(&saved); rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; + saved = p->saved_sigmask; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); qsize = atomic_read(&p->user->sigpending); @@ -282,6 +300,7 @@ static inline char *task_sig(struct task buffer = render_sigset_t("SigBlk:\t", &blocked, buffer); buffer = render_sigset_t("SigIgn:\t", &ignored, buffer); buffer = render_sigset_t("SigCgt:\t", &caught, buffer); + buffer = render_sigset_t("SigSvd:\t", &saved, buffer); return buffer; } @@ -296,6 +315,20 @@ static inline char *task_cap(struct task cap_t(p->cap_effective)); } +#ifdef CONFIG_BEANCOUNTERS +static inline void ub_dump_task_info(struct task_struct *tsk, + char *stsk, int ltsk, char *smm, int lmm) +{ + print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); + task_lock(tsk); + if (tsk->mm) + print_ub_uid(tsk->mm->mm_ub, smm, lmm); + else + strncpy(smm, "N/A", lmm); + task_unlock(tsk); +} +#endif + static inline char *task_context_switch_counts(struct task_struct *p, char *buffer) { @@ -309,6 +342,9 @@ int proc_pid_status(struct task_struct * { char *orig = buffer; struct mm_struct *mm = get_task_mm(task); +#ifdef CONFIG_BEANCOUNTERS + char tsk_ub_info[64], mm_ub_info[64]; +#endif buffer = task_name(task, buffer); buffer = task_state(task, buffer); @@ -324,6 +360,14 @@ int proc_pid_status(struct task_struct * buffer = task_show_regs(task, buffer); #endif buffer = task_context_switch_counts(task, buffer); +#ifdef CONFIG_BEANCOUNTERS + ub_dump_task_info(task, + tsk_ub_info, sizeof(tsk_ub_info), + mm_ub_info, sizeof(mm_ub_info)); + + buffer += sprintf(buffer, "TaskUB:\t%s\n", tsk_ub_info); + buffer += sprintf(buffer, "MMUB:\t%s\n", mm_ub_info); +#endif return buffer - orig; } @@ -406,6 +450,10 @@ static int do_task_stat(struct task_stru char tcomm[sizeof(task->comm)]; unsigned long flags; struct pid_namespace *ns; +#ifdef CONFIG_BEANCOUNTERS + char ub_task_info[64]; + char ub_mm_info[64]; +#endif ns = current->nsproxy->pid_ns; @@ -486,6 +534,7 @@ static int do_task_stat(struct task_stru priority = task_prio(task); nice = task_nice(task); +#ifndef CONFIG_VE /* Temporary variable needed for gcc-2.96 */ /* convert 
timespec -> nsec*/ start_time = @@ -493,10 +542,25 @@ static int do_task_stat(struct task_stru + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); +#else + start_time = ve_relative_clock(&task->start_time); +#endif + +#ifdef CONFIG_BEANCOUNTERS + ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info), + ub_mm_info, sizeof(ub_mm_info)); +#endif res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld" +#ifdef CONFIG_VE + " 0 0 0 0 0 0 0 %d %u" +#endif +#ifdef CONFIG_BEANCOUNTERS + " %s %s" +#endif + "\n", task_pid_nr_ns(task, ns), tcomm, state, @@ -543,7 +607,16 @@ static int do_task_stat(struct task_stru task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime) +#ifdef CONFIG_VE + , task_pid_vnr(task), + VEID(VE_TASK_INFO(task)->owner_env) +#endif +#ifdef CONFIG_BEANCOUNTERS + , ub_task_info, + ub_mm_info +#endif + ); if (mm) mmput(mm); return res; diff -uprN linux-2.6.24/fs/proc/base.c linux-2.6.24.ovz/fs/proc/base.c --- linux-2.6.24/fs/proc/base.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/base.c 2008-03-25 18:53:59.000000000 -0500 @@ -165,8 +165,11 @@ static int proc_cwd_link(struct inode *i } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); + result = d_root_check(fs->pwd, fs->pwdmnt); + if (result == 0) { + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); + } read_unlock(&fs->lock); result = 0; put_fs_struct(fs); @@ -404,17 +407,19 @@ static int proc_pid_limits(struct task_s static int proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + int err; + /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. */ + err = -ENOENT; task = get_proc_task(inode); if (task) { - allowed = ptrace_may_attach(task); + err = (ptrace_may_attach(task) ? 0 : -EACCES); put_task_struct(task); } - return allowed; + return err; } static int proc_setattr(struct dentry *dentry, struct iattr *attr) @@ -843,6 +848,8 @@ static ssize_t oom_adjust_write(struct f if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; + if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env())) + return -EPERM; if (*end == '\n') end++; task = get_proc_task(file->f_path.dentry->d_inode); @@ -1086,13 +1093,14 @@ static const struct file_operations proc static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - int error = -EACCES; + int error; /* We don't need a base pointer in the /proc filesystem */ path_release(nd); /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) + error = proc_fd_access_allowed(inode); + if (error < 0) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); @@ -1130,13 +1138,14 @@ static int do_proc_readlink(struct dentr static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { - int error = -EACCES; + int error; struct inode *inode = dentry->d_inode; struct dentry *de; struct vfsmount *mnt = NULL; /* Are we allowed to snoop on the tasks file descriptors? 
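With the fs/proc/array.c changes above, /proc/<pid>/status grows container-related lines (envID, VPid, PNState and StopState when read from the host environment, SigSvd always, TaskUB and MMUB with CONFIG_BEANCOUNTERS) and /proc/<pid>/stat gains extra trailing fields. The illustrative reader below just filters for those keys; which of them actually appear depends on the config options, so missing lines are expected on other kernels.

/* status_fields.c - dump selected lines from /proc/self/status, including
 * the envID/VPid/TaskUB/MMUB keys added by this patch (absent on kernels
 * built without CONFIG_VE/CONFIG_BEANCOUNTERS). */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/self/status", "r");
    char line[256];

    if (!f) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (!strncmp(line, "Name:", 5)   || !strncmp(line, "envID:", 6) ||
            !strncmp(line, "VPid:", 5)   || !strncmp(line, "TaskUB:", 7) ||
            !strncmp(line, "MMUB:", 5)   || !strncmp(line, "SigSvd:", 7))
            fputs(line, stdout);
    }
    fclose(f);
    return 0;
}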
*/ - if (!proc_fd_access_allowed(inode)) + error = proc_fd_access_allowed(inode); + if (error < 0) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); @@ -1180,6 +1189,10 @@ static struct inode *proc_pid_make_inode struct inode * inode; struct proc_inode *ei; + if (!ve_accessible(task->ve_task_info.owner_env, + sb->s_type->owner_env)) + return NULL; + /* We need a new inode */ inode = new_inode(sb); @@ -1379,6 +1392,9 @@ static int proc_fd_info(struct inode *in struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); + int err; + + err = -ENOENT; if (task) { files = get_files_struct(task); @@ -1391,7 +1407,9 @@ static int proc_fd_info(struct inode *in */ spin_lock(&files->file_lock); file = fcheck_files(files, fd); - if (file) { + err = -EACCES; + if (file && !d_root_check(file->f_path.dentry, + file->f_path.mnt)) { if (mnt) *mnt = mntget(file->f_path.mnt); if (dentry) @@ -1409,7 +1427,7 @@ static int proc_fd_info(struct inode *in spin_unlock(&files->file_lock); put_files_struct(files); } - return -ENOENT; + return err; } static int proc_fd_link(struct inode *inode, struct dentry **dentry, diff -uprN linux-2.6.24/fs/proc/generic.c linux-2.6.24.ovz/fs/proc/generic.c --- linux-2.6.24/fs/proc/generic.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/generic.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include "internal.h" @@ -33,7 +35,7 @@ static loff_t proc_file_lseek(struct fil DEFINE_SPINLOCK(proc_subdir_lock); -static int proc_match(int len, const char *name, struct proc_dir_entry *de) +int proc_match(int len, const char *name, struct proc_dir_entry *de) { if (de->namelen != len) return 0; @@ -239,6 +241,10 @@ static int proc_notify_change(struct den struct proc_dir_entry *de = PDE(inode); int error; + if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) && + LPDE(inode) == GPDE(inode)) + return -EPERM; + error = inode_change_ok(inode, iattr); if (error) goto out; @@ -247,9 +253,12 @@ static int proc_notify_change(struct den if (error) goto out; - de->uid = inode->i_uid; - de->gid = inode->i_gid; - de->mode = inode->i_mode; + if (iattr->ia_valid & ATTR_UID) + de->uid = inode->i_uid; + if (iattr->ia_valid & ATTR_GID) + de->gid = inode->i_gid; + if (iattr->ia_valid & ATTR_MODE) + de->mode = inode->i_mode; out: return error; } @@ -275,7 +284,7 @@ static const struct inode_operations pro * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. 
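The fs/proc/base.c hunks above (and the matching proc_exe_link() ones later in task_mmu.c/task_nommu.c) pass the target through d_root_check() before exposing it, so /proc/<pid>/cwd, exe and fd/* links only resolve when the object is reachable from the reader's root; otherwise the link is withheld rather than leaking a host-side path. From user space this is still plain readlink(), as in this illustrative sketch:

/* proc_links.c - resolve /proc/self/cwd and /proc/self/exe; under this
 * patch the readlink fails for objects outside the caller's VE root. */
#include <stdio.h>
#include <unistd.h>
#include <limits.h>

static void show(const char *link)
{
    char buf[PATH_MAX];
    ssize_t n = readlink(link, buf, sizeof(buf) - 1);

    if (n < 0) {
        perror(link);
        return;
    }
    buf[n] = '\0';
    printf("%s -> %s\n", link, buf);
}

int main(void)
{
    show("/proc/self/cwd");
    show("/proc/self/exe");
    return 0;
}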
*/ -static int xlate_proc_name(const char *name, +static int __xlate_proc_name(struct proc_dir_entry *root, const char *name, struct proc_dir_entry **ret, const char **residual) { const char *cp = name, *next; @@ -283,8 +292,13 @@ static int xlate_proc_name(const char *n int len; int rtn = 0; + if (*ret) { + de_get(*ret); + return 0; + } + spin_lock(&proc_subdir_lock); - de = &proc_root; + de = root; while (1) { next = strchr(cp, '/'); if (!next) @@ -302,12 +316,29 @@ static int xlate_proc_name(const char *n cp += len + 1; } *residual = cp; - *ret = de; + *ret = de_get(de); out: spin_unlock(&proc_subdir_lock); return rtn; } +#ifndef CONFIG_VE +#define xlate_proc_loc_name xlate_proc_name +#else +static int xlate_proc_loc_name(const char *name, + struct proc_dir_entry **ret, const char **residual) +{ + return __xlate_proc_name(get_exec_env()->proc_root, + name, ret, residual); +} +#endif + +static int xlate_proc_name(const char *name, + struct proc_dir_entry **ret, const char **residual) +{ + return __xlate_proc_name(&proc_root, name, ret, residual); +} + static DEFINE_IDR(proc_inum_idr); static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ @@ -379,6 +410,22 @@ static struct dentry_operations proc_den .d_delete = proc_delete_dentry, }; +static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, + struct dentry *d) +{ + struct proc_dir_entry *de; + + for (de = dir->subdir; de; de = de->next) { + if (de->namelen != d->d_name.len) + continue; + if (!memcmp(d->d_name.name, de->name, de->namelen)) + break; + } + if (de && de->shadow_proc) + de = de->shadow_proc(current, de); + return de_get(de); +} + /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -386,41 +433,121 @@ static struct dentry_operations proc_den struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; - struct proc_dir_entry * de; + struct proc_dir_entry *lde, *gde; int error = -ENOENT; lock_kernel(); spin_lock(&proc_subdir_lock); - de = PDE(dir); - if (de) { - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; - - if (de->shadow_proc) - de = de->shadow_proc(current, de); - ino = de->low_ino; - de_get(de); - spin_unlock(&proc_subdir_lock); - error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); - spin_lock(&proc_subdir_lock); - break; - } - } - } + lde = LPDE(dir); + if (lde) + lde = __proc_lookup(lde, dentry); + if (lde && !try_module_get(lde->owner)) { + de_put(lde); + lde = NULL; + } +#ifdef CONFIG_VE + gde = GPDE(dir); + if (gde) + gde = __proc_lookup(gde, dentry); + if (!lde && gde && !try_module_get(gde->owner)) { + de_put(gde); + gde = NULL; + } +#else + gde = NULL; +#endif spin_unlock(&proc_subdir_lock); + + /* + * There are following possible cases after lookup: + * + * lde gde + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * NULL NULL ENOENT + * loc NULL found in local tree + * loc glob found in both trees + * NULL glob found in global tree + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * We initialized inode as follows after lookup: + * + * inode->lde inode->gde + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * loc NULL in local tree + * loc glob both trees + * glob glob global tree + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * i.e. 
inode->lde is always initialized + */ + + if (lde == NULL && gde == NULL) + goto out; + + if (lde != NULL) { + de_get(lde); + inode = proc_get_inode(dir->i_sb, lde->low_ino, lde); + } else { + de_get(gde); + inode = proc_get_inode(dir->i_sb, gde->low_ino, gde); + } + + /* + * We can sleep in proc_get_inode(), but since we have i_sem + * being taken, no one can setup GPDE/LPDE on this inode. + */ + if (!inode) + goto out_put; + +#ifdef CONFIG_VE + GPDE(inode) = de_get(gde); + if (gde) + __module_get(gde->owner); + + /* if dentry is found in both trees and it is a directory + * then inode's nlink count must be altered, because local + * and global subtrees may differ. + * on the other hand, they may intersect, so actual nlink + * value is difficult to calculate - upper estimate is used + * instead of it. + * dentry found in global tree only must not be writable + * in non-super ve. + */ + if (lde && gde && lde != gde && gde->nlink > 1) + inode->i_nlink += gde->nlink - 2; + if (lde == NULL && !ve_is_super(dir->i_sb->s_type->owner_env)) + inode->i_mode &= ~S_IWUGO; +#endif + unlock_kernel(); + dentry->d_op = &proc_dentry_operations; + d_add(dentry, inode); + de_put(lde); + de_put(gde); + return NULL; + +out_put: + if (lde) + module_put(lde->owner); + else + module_put(gde->owner); + de_put(lde); + de_put(gde); +out: unlock_kernel(); + return ERR_PTR(error); +} - if (inode) { - dentry->d_op = &proc_dentry_operations; - d_add(dentry, inode); - return NULL; +static inline int in_tree(struct proc_dir_entry *de, struct proc_dir_entry *dir) +{ + struct proc_dir_entry *gde; + + for (gde = dir->subdir; gde; gde = gde->next) { + if (de->namelen != gde->namelen) + continue; + if (memcmp(de->name, gde->name, gde->namelen)) + continue; + return 1; } - de_put(de); - return ERR_PTR(error); + return 0; } /* @@ -470,11 +597,8 @@ int proc_readdir(struct file * filp, de = de->subdir; i -= 2; for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } + if (!de) + goto chk_global; if (!i) break; de = de->next; @@ -487,8 +611,9 @@ int proc_readdir(struct file * filp, /* filldir passes info to user space */ de_get(de); spin_unlock(&proc_subdir_lock); - if (filldir(dirent, de->name, de->namelen, filp->f_pos, - de->low_ino, de->mode >> 12) < 0) { + if (filldir(dirent, de->name, de->namelen, + filp->f_pos, de->low_ino, + de->mode >> 12) < 0) { de_put(de); goto out; } @@ -498,6 +623,44 @@ int proc_readdir(struct file * filp, de_put(de); de = next; } while (de); +chk_global: +#ifdef CONFIG_VE + de = GPDE(inode); + if (de == NULL) + goto done; + + de = de->subdir; + while (de) { + struct proc_dir_entry *tmp; + + /* skip local names */ + if (in_tree(de, LPDE(inode))) { + de = de->next; + continue; + } + + if (i > 0) { + i--; + de = de->next; + continue; + } + + de_get(de); + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, + filp->f_pos, de->low_ino, + de->mode >> 12) < 0) { + de_put(de); + goto out; + } + spin_lock(&proc_subdir_lock); + tmp = de->next; + de_put(de); + filp->f_pos++; + de = tmp; + } +done: +#endif spin_unlock(&proc_subdir_lock); } ret = 1; @@ -551,7 +714,7 @@ static int proc_register(struct proc_dir spin_lock(&proc_subdir_lock); dp->next = dir->subdir; - dp->parent = dir; + dp->parent = de_get(dir); dir->subdir = dp; spin_unlock(&proc_subdir_lock); @@ -570,17 +733,18 @@ static struct proc_dir_entry *proc_creat /* make sure name is valid */ if (!name || !strlen(name)) goto out; - if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) + if 
(xlate_proc_loc_name(name, parent, &fn) != 0) goto out; /* At this point there must not be any '/' characters beyond *fn */ if (strchr(fn, '/')) - goto out; + goto out_put; len = strlen(fn); ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); - if (!ent) goto out; + if (!ent) + goto out_put; memset(ent, 0, sizeof(struct proc_dir_entry)); memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); @@ -592,8 +756,12 @@ static struct proc_dir_entry *proc_creat ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); ent->pde_unload_completion = NULL; - out: return ent; + +out_put: + de_put(*parent); +out: + return NULL; } struct proc_dir_entry *proc_symlink(const char *name, @@ -617,6 +785,7 @@ struct proc_dir_entry *proc_symlink(cons kfree(ent); ent = NULL; } + de_put(parent); } return ent; } @@ -632,6 +801,7 @@ struct proc_dir_entry *proc_mkdir_mode(c kfree(ent); ent = NULL; } + de_put(parent); } return ent; } @@ -666,9 +836,28 @@ struct proc_dir_entry *create_proc_entry kfree(ent); ent = NULL; } + de_put(parent); } return ent; } +EXPORT_SYMBOL(remove_proc_glob_entry); + +struct proc_dir_entry *create_proc_glob_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent) +{ + const char *path; + struct proc_dir_entry *ent; + + path = name; + if (xlate_proc_name(path, &parent, &name) != 0) + return NULL; + + ent = create_proc_entry(name, mode, parent); + de_put(parent); + return ent; +} + +EXPORT_SYMBOL(create_proc_glob_entry); void free_proc_entry(struct proc_dir_entry *de) { @@ -687,15 +876,13 @@ void free_proc_entry(struct proc_dir_ent /* * Remove a /proc entry and free it if it's not currently in use. */ -void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +static void __remove_proc_entry(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry **p; struct proc_dir_entry *de; const char *fn = name; int len; - if (!parent && xlate_proc_name(name, &parent, &fn) != 0) - goto out; len = strlen(fn); spin_lock(&proc_subdir_lock); @@ -734,11 +921,42 @@ continue_removing: parent->nlink--; de->nlink = 0; WARN_ON(de->subdir); - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); + de_put(parent); + de_put(de); break; } spin_unlock(&proc_subdir_lock); -out: - return; +} + +void remove_proc_loc_entry(const char *name, struct proc_dir_entry *parent) +{ + const char *path; + + path = name; + if (xlate_proc_loc_name(path, &parent, &name) != 0) + return; + + __remove_proc_entry(name, parent); + de_put(parent); +} + +void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent) +{ + const char *path; + + path = name; + if (xlate_proc_name(path, &parent, &name) != 0) + return; + + __remove_proc_entry(name, parent); + de_put(parent); +} + +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + remove_proc_loc_entry(name, parent); +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + remove_proc_glob_entry(name, parent); +#endif } diff -uprN linux-2.6.24/fs/proc/inode.c linux-2.6.24.ovz/fs/proc/inode.c --- linux-2.6.24/fs/proc/inode.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/inode.c 2008-03-25 18:53:59.000000000 -0500 @@ -36,16 +36,13 @@ struct proc_dir_entry *de_get(struct pro void de_put(struct proc_dir_entry *de) { if (de) { - lock_kernel(); if (!atomic_read(&de->count)) { printk("de_put: entry %s already free!\n", de->name); - unlock_kernel(); return; } if (atomic_dec_and_test(&de->count)) free_proc_entry(de); - unlock_kernel(); } } @@ -62,16 +59,25 @@ static void 
proc_delete_inode(struct ino put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ - de = PROC_I(inode)->pde; + de = LPDE(inode); if (de) { if (de->owner) module_put(de->owner); de_put(de); } +#ifdef CONFIG_VE + de = GPDE(inode); + if (de) { + module_put(de->owner); + de_put(de); + } +#endif clear_inode(inode); } +#ifndef CONFIG_VE struct vfsmount *proc_mnt; +#endif static void proc_read_inode(struct inode * inode) { @@ -94,6 +100,9 @@ static struct inode *proc_alloc_inode(st ei->pde = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; +#ifdef CONFIG_VE + GPDE(inode) = NULL; +#endif return inode; } @@ -398,12 +407,9 @@ struct inode *proc_get_inode(struct supe { struct inode * inode; - if (de != NULL && !try_module_get(de->owner)) - goto out_mod; - inode = iget(sb, ino); if (!inode) - goto out_ino; + goto out_mod; PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; @@ -436,9 +442,6 @@ struct inode *proc_get_inode(struct supe return inode; -out_ino: - if (de != NULL) - module_put(de->owner); out_mod: return NULL; } @@ -463,6 +466,12 @@ int proc_fill_super(struct super_block * s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; +#ifdef CONFIG_VE + LPDE(root_inode) = de_get(get_exec_env()->proc_root); + GPDE(root_inode) = &proc_root; +#else + LPDE(root_inode) = &proc_root; +#endif return 0; out_no_root: diff -uprN linux-2.6.24/fs/proc/kmsg.c linux-2.6.24.ovz/fs/proc/kmsg.c --- linux-2.6.24/fs/proc/kmsg.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/kmsg.c 2008-03-25 18:53:59.000000000 -0500 @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -40,7 +42,7 @@ static ssize_t kmsg_read(struct file *fi static unsigned int kmsg_poll(struct file *file, poll_table *wait) { - poll_wait(file, &log_wait, wait); + poll_wait(file, &ve_log_wait, wait); if (do_syslog(9, NULL, 0)) return POLLIN | POLLRDNORM; return 0; @@ -53,3 +55,4 @@ const struct file_operations proc_kmsg_o .open = kmsg_open, .release = kmsg_release, }; +EXPORT_SYMBOL(proc_kmsg_operations); diff -uprN linux-2.6.24/fs/proc/proc_misc.c linux-2.6.24.ovz/fs/proc/proc_misc.c --- linux-2.6.24/fs/proc/proc_misc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/proc_misc.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -53,8 +55,10 @@ #include #include "internal.h" -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +#ifdef CONFIG_FAIRSCHED +#include +#endif + /* * Warning: stuff below (imported functions) assumes that its output will fit * into one page. For some of those functions it may be wrong. 
Moreover, we @@ -83,15 +87,30 @@ static int loadavg_read_proc(char *page, { int a, b, c; int len; + long running, threads; + struct ve_struct *ve; - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + ve = get_exec_env(); + if (ve_is_super(ve)) { + a = avenrun[0] + (FIXED_1/200); + b = avenrun[1] + (FIXED_1/200); + c = avenrun[2] + (FIXED_1/200); + running = nr_running(); + threads = nr_threads; +#ifdef CONFIG_VE + } else { + a = ve->avenrun[0] + (FIXED_1/200); + b = ve->avenrun[1] + (FIXED_1/200); + c = ve->avenrun[2] + (FIXED_1/200); + running = nr_running_ve(ve); + threads = atomic_read(&ve->pcounter); +#endif + } + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%ld %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, + running, threads, task_active_pid_ns(current)->last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -106,6 +125,13 @@ static int uptime_read_proc(char *page, do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) { + set_normalized_timespec(&uptime, + uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, + uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); + } +#endif cputime_to_timespec(idletime, &idle); len = sprintf(page,"%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, @@ -119,29 +145,49 @@ static int uptime_read_proc(char *page, static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct sysinfo i; + struct meminfo mi; int len; - unsigned long committed; - unsigned long allowed; + unsigned long dummy; struct vmalloc_info vmi; - long cached; + + get_zone_counts(&mi.active, &mi.inactive, &dummy); /* * display in kilobytes. */ #define K(x) ((x) << (PAGE_SHIFT - 10)) - si_meminfo(&i); - si_swapinfo(&i); - committed = atomic_read(&vm_committed_space); - allowed = ((totalram_pages - hugetlb_total_pages()) + si_meminfo(&mi.si); + si_swapinfo(&mi.si); + mi.committed_space = atomic_read(&vm_committed_space); + mi.swapcache = total_swapcache_pages; + mi.allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; - cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; - if (cached < 0) - cached = 0; + mi.cache = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - mi.si.bufferram; + if (mi.cache < 0) + mi.cache = 0; get_vmalloc_info(&vmi); + mi.vmalloc_used = vmi.used >> PAGE_SHIFT; + mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; + mi.vmalloc_total = VMALLOC_TOTAL >> PAGE_SHIFT; + + mi.pi.nr_file_dirty = global_page_state(NR_FILE_DIRTY); + mi.pi.nr_writeback = global_page_state(NR_WRITEBACK); + mi.pi.nr_anon_pages = global_page_state(NR_ANON_PAGES); + mi.pi.nr_file_mapped = global_page_state(NR_FILE_MAPPED); + mi.pi.nr_slab_rec = global_page_state(NR_SLAB_RECLAIMABLE); + mi.pi.nr_slab_unrec = global_page_state(NR_SLAB_UNRECLAIMABLE); + mi.pi.nr_pagetable = global_page_state(NR_PAGETABLE); + mi.pi.nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); + mi.pi.nr_bounce = global_page_state(NR_BOUNCE); + +#ifdef CONFIG_BEANCOUNTERS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) + & NOTIFY_FAIL) + return -ENOMSG; +#endif /* * Tagged format, for easy grepping and expansion. 
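loadavg_read_proc() above selects either the global avenrun[] or the container's ve->avenrun[] plus per-VE task counters, but the formatting is the same FSHIFT fixed-point split that the removed LOAD_INT/LOAD_FRAC macros performed. The worked example below repeats those macros for illustration; FSHIFT = 11 matches the standard <linux/sched.h> value of this era, and the sample input is made up.

/* loadavg_fixed.c - convert an FSHIFT fixed-point load average sample
 * to the "X.YY" form printed by /proc/loadavg. */
#include <stdio.h>

#define FSHIFT       11              /* bits of fractional precision */
#define FIXED_1      (1 << FSHIFT)   /* 1.0 in fixed point */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
    unsigned long sample = 2355;               /* illustrative raw avenrun value */
    unsigned long a = sample + FIXED_1 / 200;  /* round, as the kernel does */

    printf("%lu.%02lu\n", LOAD_INT(a), LOAD_FRAC(a));  /* prints 1.15 */
    return 0;
}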
@@ -177,37 +223,37 @@ static int meminfo_read_proc(char *page, "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n", - K(i.totalram), - K(i.freeram), - K(i.bufferram), - K(cached), - K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(mi.si.totalram), + K(mi.si.freeram), + K(mi.si.bufferram), + K(mi.cache), + K(mi.swapcache), + K(mi.active), + K(mi.inactive), #ifdef CONFIG_HIGHMEM - K(i.totalhigh), - K(i.freehigh), - K(i.totalram-i.totalhigh), - K(i.freeram-i.freehigh), -#endif - K(i.totalswap), - K(i.freeswap), - K(global_page_state(NR_FILE_DIRTY)), - K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), - K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_SLAB_RECLAIMABLE)), - K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_PAGETABLE)), - K(global_page_state(NR_UNSTABLE_NFS)), - K(global_page_state(NR_BOUNCE)), - K(allowed), - K(committed), - (unsigned long)VMALLOC_TOTAL >> 10, - vmi.used >> 10, - vmi.largest_chunk >> 10 + K(mi.si.totalhigh), + K(mi.si.freehigh), + K(mi.si.totalram-mi.si.totalhigh), + K(mi.si.freeram-mi.si.freehigh), +#endif + K(mi.si.totalswap), + K(mi.si.freeswap), + K(mi.pi.nr_file_dirty), + K(mi.pi.nr_writeback), + K(mi.pi.nr_anon_pages), + K(mi.pi.nr_file_mapped), + K(mi.pi.nr_slab_rec + + mi.pi.nr_slab_unrec), + K(mi.pi.nr_slab_rec), + K(mi.pi.nr_slab_unrec), + K(mi.pi.nr_pagetable), + K(mi.pi.nr_unstable_nfs), + K(mi.pi.nr_bounce), + K(mi.allowed), + K(mi.committed_space), + K(mi.vmalloc_total), + K(mi.vmalloc_used), + K(mi.vmalloc_largest) ); len += hugetlb_report_meminfo(page + len); @@ -451,25 +497,21 @@ static const struct file_operations proc #endif #endif -static int show_stat(struct seq_file *p, void *v) +static void show_stat_ve0(struct seq_file *p) { int i; - unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; cputime64_t guest; u64 sum = 0; - struct timespec boottime; unsigned int *per_irq_sum; per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); if (!per_irq_sum) - return -ENOMEM; + return; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; guest = cputime64_zero; - getboottime(&boottime); - jif = boottime.tv_sec; for_each_possible_cpu(i) { int j; @@ -529,9 +571,89 @@ static int show_stat(struct seq_file *p, for (i = 0; i < NR_IRQS; i++) seq_printf(p, " %u", per_irq_sum[i]); + kfree(per_irq_sum); +#ifdef CONFIG_VM_EVENT_COUNTERS + seq_printf(p, "\nswap %lu %lu\n", + vm_events(PSWPIN), vm_events(PSWPOUT)); +#else + seq_printf(p, "\nswap 0 0\n"); +#endif +} + +#ifdef CONFIG_VE +static void show_stat_ve(struct seq_file *p, struct ve_struct *ve) +{ + int i; + u64 user, nice, system; + cycles_t idle, iowait; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + + user = nice = system = idle = iowait = 0; + for_each_cpu_mask(i, ve_cpus) { + user += VE_CPU_STATS(ve, i)->user; + nice += VE_CPU_STATS(ve, i)->nice; + system += VE_CPU_STATS(ve, i)->system; + idle += ve_sched_get_idle_time(ve, i); + iowait += ve_sched_get_iowait_time(ve, i); + } + + seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + + for_each_cpu_mask(i, 
ve_cpus) { + user = VE_CPU_STATS(ve, i)->user; + nice = VE_CPU_STATS(ve, i)->nice; + system = VE_CPU_STATS(ve, i)->system; + idle = ve_sched_get_idle_time(ve, i); + iowait = ve_sched_get_iowait_time(ve, i); + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cycles_to_clocks(idle), + (unsigned long long)cycles_to_clocks(iowait)); + } + seq_printf(p, "intr 0\nswap 0 0\n"); +} +#endif + +int show_stat(struct seq_file *p, void *v) +{ + extern unsigned long total_forks; + unsigned long seq, jif; + struct ve_struct *env; + unsigned long __nr_running, __nr_iowait; + + do { + seq = read_seqbegin(&xtime_lock); + jif = - wall_to_monotonic.tv_sec; + if (wall_to_monotonic.tv_nsec) + --jif; + } while (read_seqretry(&xtime_lock, seq)); + + env = get_exec_env(); + if (ve_is_super(env)) { + show_stat_ve0(p); + __nr_running = nr_running(); + __nr_iowait = nr_iowait(); + } +#ifdef CONFIG_VE + else { + show_stat_ve(p, env); + __nr_running = nr_running_ve(env); + __nr_iowait = nr_iowait_ve(env); + } +#endif seq_printf(p, - "\nctxt %llu\n" + "ctxt %llu\n" "btime %lu\n" "processes %lu\n" "procs_running %lu\n" @@ -539,10 +661,9 @@ static int show_stat(struct seq_file *p, nr_context_switches(), (unsigned long)jif, total_forks, - nr_running(), - nr_iowait()); + __nr_running, + __nr_iowait); - kfree(per_irq_sum); return 0; } @@ -630,7 +751,8 @@ static int cmdline_read_proc(char *page, { int len; - len = sprintf(page, "%s\n", saved_command_line); + len = sprintf(page, "%s\n", + ve_is_super(get_exec_env()) ? saved_command_line : "quiet"); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -661,11 +783,16 @@ static ssize_t write_sysrq_trigger(struc size_t count, loff_t *ppos) { if (count) { - char c; + int i, cnt; + char c[32]; - if (get_user(c, buf)) + cnt = min(count, sizeof(c)); + if (copy_from_user(c, buf, cnt)) return -EFAULT; - __handle_sysrq(c, NULL, 0); + + + for (i = 0; i < cnt && c[i] != '\n'; i++) + __handle_sysrq(c[i], NULL, 0); } return count; } diff -uprN linux-2.6.24/fs/proc/proc_net.c linux-2.6.24.ovz/fs/proc/proc_net.c --- linux-2.6.24/fs/proc/proc_net.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/proc_net.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,7 +31,7 @@ struct proc_dir_entry *proc_net_fops_cre { struct proc_dir_entry *res; - res = create_proc_entry(name, mode, net->proc_net); + res = create_proc_entry(name, mode, get_exec_env()->_proc_net); if (res) res->proc_fops = fops; return res; @@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(proc_net_fops_create); void proc_net_remove(struct net *net, const char *name) { - remove_proc_entry(name, net->proc_net); + remove_proc_entry(name, get_exec_env()->_proc_net); } EXPORT_SYMBOL_GPL(proc_net_remove); @@ -50,68 +50,26 @@ struct net *get_proc_net(const struct in } EXPORT_SYMBOL_GPL(get_proc_net); -static struct proc_dir_entry *shadow_pde; - static struct proc_dir_entry *proc_net_shadow(struct task_struct *task, struct proc_dir_entry *de) { return task->nsproxy->net_ns->proc_net; } -static __net_init int proc_net_ns_init(struct net *net) -{ - struct proc_dir_entry *root, *netd, *net_statd; - int err; - - err = -ENOMEM; - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - goto out; - - err = -EEXIST; - netd = proc_mkdir("net", root); - if (!netd) - goto free_root; - - err = -EEXIST; - net_statd = proc_mkdir("stat", netd); - if (!net_statd) - 
goto free_net; - - root->data = net; - netd->data = net; - net_statd->data = net; - - net->proc_net_root = root; - net->proc_net = netd; - net->proc_net_stat = net_statd; - err = 0; - -out: - return err; -free_net: - remove_proc_entry("net", root); -free_root: - kfree(root); - goto out; -} - -static __net_exit void proc_net_ns_exit(struct net *net) -{ - remove_proc_entry("stat", net->proc_net); - remove_proc_entry("net", net->proc_net_root); - kfree(net->proc_net_root); -} - -static struct pernet_operations __net_initdata proc_net_ns_ops = { - .init = proc_net_ns_init, - .exit = proc_net_ns_exit, -}; - int __init proc_net_init(void) { - shadow_pde = proc_mkdir("net", NULL); - shadow_pde->shadow_proc = proc_net_shadow; + struct proc_dir_entry *pde; + + pde = proc_mkdir("net", NULL); + pde->shadow_proc = proc_net_shadow; + init_net.proc_net = pde; + ve0._proc_net = pde; + pde->data = &init_net; + + pde = proc_mkdir("stat", pde); + init_net.proc_net_stat = pde; + ve0._proc_net_stat = pde; + pde->data = &init_net; - return register_pernet_subsys(&proc_net_ns_ops); + return 0; } diff -uprN linux-2.6.24/fs/proc/proc_sysctl.c linux-2.6.24.ovz/fs/proc/proc_sysctl.c --- linux-2.6.24/fs/proc/proc_sysctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/proc_sysctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -1,15 +1,15 @@ /* * /proc/sys support */ - +#include #include #include #include #include "internal.h" static struct dentry_operations proc_sys_dentry_operations; -static const struct file_operations proc_sys_file_operations; -static struct inode_operations proc_sys_inode_operations; +extern const struct file_operations proc_sys_file_operations; +extern struct inode_operations proc_sys_inode_operations; static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table) { @@ -440,17 +440,19 @@ static int proc_sys_setattr(struct dentr /* I'm lazy and don't distinguish between files and directories, * until access time. 
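write_sysrq_trigger() in the proc_misc.c hunk above now copies up to 32 bytes and feeds every character before a newline to __handle_sysrq(), where the unpatched code handled only the first byte, so a single write to /proc/sysrq-trigger can queue several SysRq actions. The sketch below only shows the shape of such a write; it needs root, kernel.sysrq must be enabled, and the harmless 'h' (help) key is used purely as a placeholder.

/* sysrq_multi.c - send more than one SysRq key in a single write();
 * with this patch each character before '\n' is handled, while the
 * unpatched file acted on the first byte only. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    const char keys[] = "hh\n";   /* two 'help' keys, then stop at newline */
    int fd = open("/proc/sysrq-trigger", O_WRONLY);

    if (fd < 0) {
        perror("open /proc/sysrq-trigger");
        return 1;
    }
    if (write(fd, keys, strlen(keys)) < 0)
        perror("write");
    close(fd);
    return 0;
}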
*/ -static const struct file_operations proc_sys_file_operations = { +const struct file_operations proc_sys_file_operations = { .read = proc_sys_read, .write = proc_sys_write, .readdir = proc_sys_readdir, }; +EXPORT_SYMBOL_GPL(proc_sys_file_operations); -static struct inode_operations proc_sys_inode_operations = { +struct inode_operations proc_sys_inode_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, }; +EXPORT_SYMBOL_GPL(proc_sys_inode_operations); static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { @@ -466,13 +468,11 @@ static struct dentry_operations proc_sys .d_revalidate = proc_sys_revalidate, }; -static struct proc_dir_entry *proc_sys_root; - int proc_sys_init(void) { - proc_sys_root = proc_mkdir("sys", NULL); - proc_sys_root->proc_iops = &proc_sys_inode_operations; - proc_sys_root->proc_fops = &proc_sys_file_operations; - proc_sys_root->nlink = 0; + ve0.proc_sys_root = proc_mkdir("sys", NULL); + ve0.proc_sys_root->proc_iops = &proc_sys_inode_operations; + ve0.proc_sys_root->proc_fops = &proc_sys_file_operations; + ve0.proc_sys_root->nlink = 0; return 0; } diff -uprN linux-2.6.24/fs/proc/proc_tty.c linux-2.6.24.ovz/fs/proc/proc_tty.c --- linux-2.6.24/fs/proc/proc_tty.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/proc_tty.c 2008-03-25 18:53:59.000000000 -0500 @@ -13,6 +13,7 @@ #include #include #include +#include #include static int tty_ldiscs_read_proc(char *page, char **start, off_t off, @@ -73,6 +74,9 @@ static int show_tty_driver(struct seq_fi dev_t from = MKDEV(p->major, p->minor_start); dev_t to = from + p->num; + if (!ve_accessible_strict(p->owner_env, get_exec_env())) + goto out; + if (&p->tty_drivers == tty_drivers.next) { /* pseudo-drivers first */ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); @@ -100,6 +104,7 @@ static int show_tty_driver(struct seq_fi } if (from != to) show_tty_range(m, p, from, to - from); +out: return 0; } diff -uprN linux-2.6.24/fs/proc/root.c linux-2.6.24.ovz/fs/proc/root.c --- linux-2.6.24/fs/proc/root.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/root.c 2008-03-25 18:53:59.000000000 -0500 @@ -45,7 +45,9 @@ static int proc_get_sb(struct file_syste struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + struct vfsmount *proc_mnt; + proc_mnt = proc_mnt(fs_type->owner_env); if (proc_mnt) { /* Seed the root directory with a pid so it doesn't need * to be special in base.c. 
I would do this earlier but @@ -98,12 +100,14 @@ static void proc_kill_sb(struct super_bl put_pid_ns(ns); } -static struct file_system_type proc_fs_type = { +struct file_system_type proc_fs_type = { .name = "proc", .get_sb = proc_get_sb, .kill_sb = proc_kill_sb, }; +EXPORT_SYMBOL(proc_fs_type); + void __init proc_root_init(void) { int err = proc_init_inodecache(); @@ -112,9 +116,9 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - err = PTR_ERR(proc_mnt); - if (IS_ERR(proc_mnt)) { + proc_mnt(get_ve0()) = kern_mount_data(&proc_fs_type, &init_pid_ns); + err = PTR_ERR(proc_mnt(get_ve0())); + if (IS_ERR(proc_mnt(get_ve0()))) { unregister_filesystem(&proc_fs_type); return; } diff -uprN linux-2.6.24/fs/proc/task_mmu.c linux-2.6.24.ovz/fs/proc/task_mmu.c --- linux-2.6.24/fs/proc/task_mmu.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/task_mmu.c 2008-03-25 18:53:59.000000000 -0500 @@ -95,9 +95,12 @@ int proc_exe_link(struct inode *inode, s } if (vma) { - *mnt = mntget(vma->vm_file->f_path.mnt); - *dentry = dget(vma->vm_file->f_path.dentry); - result = 0; + result = d_root_check(vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); + if (!result) { + *mnt = mntget(vma->vm_file->f_path.mnt); + *dentry = dget(vma->vm_file->f_path.dentry); + } } up_read(&mm->mmap_sem); diff -uprN linux-2.6.24/fs/proc/task_nommu.c linux-2.6.24.ovz/fs/proc/task_nommu.c --- linux-2.6.24/fs/proc/task_nommu.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/proc/task_nommu.c 2008-03-25 18:53:59.000000000 -0500 @@ -127,9 +127,12 @@ int proc_exe_link(struct inode *inode, s } if (vma) { - *mnt = mntget(vma->vm_file->f_path.mnt); - *dentry = dget(vma->vm_file->f_path.dentry); - result = 0; + result = d_root_check(vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); + if (!result) { + *mnt = mntget(vma->vm_file->f_path.mnt); + *dentry = dget(vma->vm_file->f_path.dentry); + } } up_read(&mm->mmap_sem); diff -uprN linux-2.6.24/fs/quota.c linux-2.6.24.ovz/fs/quota.c --- linux-2.6.24/fs/quota.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/quota.c 2008-03-25 18:53:59.000000000 -0500 @@ -82,11 +82,11 @@ static int generic_quotactl_valid(struct if (cmd == Q_GETQUOTA) { if (((type == USRQUOTA && current->euid != id) || (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; @@ -133,10 +133,10 @@ static int xqm_quotactl_valid(struct sup if (cmd == Q_XGETQUOTA) { if (((type == XQM_USRQUOTA && current->euid != id) || (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_VE_SYS_ADMIN)) return -EPERM; } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; } @@ -178,6 +178,8 @@ static void quota_sync_sb(struct super_b continue; if (!sb_has_quota_enabled(sb, cnt)) continue; + if (!sb_dqopt(sb)->files[cnt]) + continue; mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); @@ -208,7 +210,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_qcop->quota_sync) + if (sb->s_root && sb->s_qcop && 
sb->s_qcop->quota_sync) quota_sync_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); @@ -342,7 +344,7 @@ static inline struct super_block *quotac if (IS_ERR(tmp)) return ERR_PTR(PTR_ERR(tmp)); - bdev = lookup_bdev(tmp); + bdev = lookup_bdev(tmp, FMODE_QUOTACTL); putname(tmp); if (IS_ERR(bdev)) return ERR_PTR(PTR_ERR(bdev)); @@ -357,6 +359,215 @@ static inline struct super_block *quotac #endif } +#ifdef CONFIG_QUOTA_COMPAT + +#define QC_QUOTAON 0x0100 /* enable quotas */ +#define QC_QUOTAOFF 0x0200 /* disable quotas */ +/* GETQUOTA, SETQUOTA and SETUSE which were at 0x0300-0x0500 has now other parameteres */ +#define QC_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ +#define QC_SETQLIM 0x0700 /* set limits */ +/* GETSTATS at 0x0800 is now longer... */ +#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... */ +#define QC_SETINFO 0x0A00 /* set info about quotas */ +#define QC_SETGRACE 0x0B00 /* set inode and block grace */ +#define QC_SETFLAGS 0x0C00 /* set flags for quota */ +#define QC_GETQUOTA 0x0D00 /* get limits and usage */ +#define QC_SETQUOTA 0x0E00 /* set limits and usage */ +#define QC_SETUSE 0x0F00 /* set usage */ +/* 0x1000 used by old RSQUASH */ +#define QC_GETSTATS 0x1100 /* get collected stats */ + +struct compat_dqblk { + unsigned int dqb_ihardlimit; + unsigned int dqb_isoftlimit; + unsigned int dqb_curinodes; + unsigned int dqb_bhardlimit; + unsigned int dqb_bsoftlimit; + qsize_t dqb_curspace; + __kernel_time_t dqb_btime; + __kernel_time_t dqb_itime; +}; + +struct compat_dqinfo { + unsigned int dqi_bgrace; + unsigned int dqi_igrace; + unsigned int dqi_flags; + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; +}; + +struct compat_dqstats { + __u32 lookups; + __u32 drops; + __u32 reads; + __u32 writes; + __u32 cache_hits; + __u32 allocated_dquots; + __u32 free_dquots; + __u32 syncs; + __u32 version; +}; + +asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); +static long compat_quotactl(unsigned int cmds, unsigned int type, + const char __user *special, qid_t id, + void __user *addr) +{ + struct super_block *sb; + long ret; + + sb = NULL; + switch (cmds) { + case QC_QUOTAON: + return sys_quotactl(QCMD(Q_QUOTAON, type), + special, id, addr); + + case QC_QUOTAOFF: + return sys_quotactl(QCMD(Q_QUOTAOFF, type), + special, id, addr); + + case QC_SYNC: + return sys_quotactl(QCMD(Q_SYNC, type), + special, id, addr); + + case QC_GETQUOTA: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + break; + cdq.dqb_ihardlimit = idq.dqb_ihardlimit; + cdq.dqb_isoftlimit = idq.dqb_isoftlimit; + cdq.dqb_curinodes = idq.dqb_curinodes; + cdq.dqb_bhardlimit = idq.dqb_bhardlimit; + cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit; + cdq.dqb_curspace = idq.dqb_curspace; + cdq.dqb_btime = idq.dqb_btime; + cdq.dqb_itime = idq.dqb_itime; + ret = 0; + if (copy_to_user(addr, &cdq, sizeof(cdq))) + ret = -EFAULT; + break; + } + + case QC_SETQUOTA: + case QC_SETUSE: + case QC_SETQLIM: { + struct if_dqblk idq; + struct compat_dqblk cdq; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cdq, addr, sizeof(cdq))) + break; + 
idq.dqb_ihardlimit = cdq.dqb_ihardlimit; + idq.dqb_isoftlimit = cdq.dqb_isoftlimit; + idq.dqb_curinodes = cdq.dqb_curinodes; + idq.dqb_bhardlimit = cdq.dqb_bhardlimit; + idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit; + idq.dqb_curspace = cdq.dqb_curspace; + idq.dqb_valid = 0; + if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM) + idq.dqb_valid |= QIF_LIMITS; + if (cmds == QC_SETQUOTA || cmds == QC_SETUSE) + idq.dqb_valid |= QIF_USAGE; + ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); + break; + } + + case QC_GETINFO: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); + if (ret) + break; + ret = sb->s_qcop->get_info(sb, type, &iinf); + if (ret) + break; + cinf.dqi_bgrace = iinf.dqi_bgrace; + cinf.dqi_igrace = iinf.dqi_igrace; + cinf.dqi_flags = 0; + if (iinf.dqi_flags & DQF_INFO_DIRTY) + cinf.dqi_flags |= 0x0010; + cinf.dqi_blocks = 0; + cinf.dqi_free_blk = 0; + cinf.dqi_free_entry = 0; + ret = 0; + if (copy_to_user(addr, &cinf, sizeof(cinf))) + ret = -EFAULT; + break; + } + + case QC_SETINFO: + case QC_SETGRACE: + case QC_SETFLAGS: { + struct if_dqinfo iinf; + struct compat_dqinfo cinf; + + sb = quotactl_block(special); + ret = PTR_ERR(sb); + if (IS_ERR(sb)) + break; + ret = check_quotactl_valid(sb, type, Q_SETINFO, id); + if (ret) + break; + ret = -EFAULT; + if (copy_from_user(&cinf, addr, sizeof(cinf))) + break; + iinf.dqi_bgrace = cinf.dqi_bgrace; + iinf.dqi_igrace = cinf.dqi_igrace; + iinf.dqi_flags = cinf.dqi_flags; + iinf.dqi_valid = 0; + if (cmds == QC_SETINFO || cmds == QC_SETGRACE) + iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE; + if (cmds == QC_SETINFO || cmds == QC_SETFLAGS) + iinf.dqi_valid |= IIF_FLAGS; + ret = sb->s_qcop->set_info(sb, type, &iinf); + break; + } + + case QC_GETSTATS: { + struct compat_dqstats stat; + + memset(&stat, 0, sizeof(stat)); + stat.version = 6*10000+5*100+0; + ret = 0; + if (copy_to_user(addr, &stat, sizeof(stat))) + ret = -EFAULT; + break; + } + + default: + ret = -ENOSYS; + break; + } + if (sb && !IS_ERR(sb)) + drop_super(sb); + return ret; +} + +#endif + /* * This is the system call interface. This communicates with * the user-level programs. 
Currently this only supports diskquota @@ -372,6 +583,11 @@ asmlinkage long sys_quotactl(unsigned in cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; +#ifdef CONFIG_QUOTA_COMPAT + if (cmds >= 0x0100 && cmds < 0x3000) + return compat_quotactl(cmds, type, special, id, addr); +#endif + if (cmds != Q_SYNC || special) { sb = quotactl_block(special); if (IS_ERR(sb)) diff -uprN linux-2.6.24/fs/read_write.c linux-2.6.24.ovz/fs/read_write.c --- linux-2.6.24/fs/read_write.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/read_write.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,8 @@ #include #include +#include + const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -354,6 +356,29 @@ static inline void file_pos_write(struct file->f_pos = pos; } +static inline void bc_acct_write(size_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { + ub = get_exec_ub(); + ub_percpu_inc(ub, write); + ub_percpu_add(ub, wchar, bytes); + } +} + +static inline void bc_acct_read(size_t bytes) +{ + struct user_beancounter *ub; + + if (bytes > 0) { + ub = get_exec_ub(); + ub_percpu_inc(ub, read); + ub_percpu_add(ub, rchar, bytes); + } +} + + asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; @@ -366,6 +391,8 @@ asmlinkage ssize_t sys_read(unsigned int ret = vfs_read(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -384,6 +411,8 @@ asmlinkage ssize_t sys_write(unsigned in ret = vfs_write(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -405,6 +434,8 @@ asmlinkage ssize_t sys_pread64(unsigned if (file->f_mode & FMODE_PREAD) ret = vfs_read(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } return ret; @@ -426,6 +457,8 @@ asmlinkage ssize_t sys_pwrite64(unsigned if (file->f_mode & FMODE_PWRITE) ret = vfs_write(file, buf, count, &pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } return ret; @@ -673,6 +706,8 @@ sys_readv(unsigned long fd, const struct ret = vfs_readv(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_read(ret); } if (ret > 0) @@ -694,6 +729,8 @@ sys_writev(unsigned long fd, const struc ret = vfs_writev(file, vec, vlen, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); + + bc_acct_write(ret); } if (ret > 0) diff -uprN linux-2.6.24/fs/reiserfs/namei.c linux-2.6.24.ovz/fs/reiserfs/namei.c --- linux-2.6.24/fs/reiserfs/namei.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/reiserfs/namei.c 2008-03-25 18:53:59.000000000 -0500 @@ -859,6 +859,9 @@ static int reiserfs_rmdir(struct inode * INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + inode = dentry->d_inode; + DQUOT_INIT(inode); + /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. 
* The quota structure is possibly deleted only on last iput => outside @@ -883,8 +886,6 @@ static int reiserfs_rmdir(struct inode * goto end_rmdir; } - inode = dentry->d_inode; - reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -947,6 +948,7 @@ static int reiserfs_unlink(struct inode unsigned long savelink; inode = dentry->d_inode; + DQUOT_INIT(inode); /* in this transaction we can be doing at max two balancings and update * two stat datas, we change quotas of the owner of the directory and of @@ -1254,6 +1256,8 @@ static int reiserfs_rename(struct inode old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; + if (new_dentry_inode) + DQUOT_INIT(new_dentry_inode); // make sure, that oldname still exists and points to an object we // are going to rename diff -uprN linux-2.6.24/fs/select.c linux-2.6.24.ovz/fs/select.c --- linux-2.6.24/fs/select.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/select.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,8 @@ #include +#include + struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; @@ -331,7 +333,8 @@ static int core_sys_select(int n, fd_set if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ? + GFP_KERNEL_UBC : GFP_KERNEL); if (!bits) goto out_nofds; } @@ -677,7 +680,7 @@ int do_sys_poll(struct pollfd __user *uf len = min(todo, POLLFD_PER_PAGE); size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; - walk = walk->next = kmalloc(size, GFP_KERNEL); + walk = walk->next = kmalloc(size, GFP_KERNEL_UBC); if (!walk) { err = -ENOMEM; goto out_fds; @@ -709,7 +712,7 @@ out_fds: return err; } -static long do_restart_poll(struct restart_block *restart_block) +long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0; int nfds = restart_block->arg1; @@ -725,6 +728,7 @@ static long do_restart_poll(struct resta } return ret; } +EXPORT_SYMBOL_GPL(do_restart_poll); asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, long timeout_msecs) diff -uprN linux-2.6.24/fs/seq_file.c linux-2.6.24.ovz/fs/seq_file.c --- linux-2.6.24/fs/seq_file.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/seq_file.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,7 +31,7 @@ int seq_open(struct file *file, const st struct seq_file *p = file->private_data; if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc(sizeof(*p), GFP_KERNEL_UBC); if (!p) return -ENOMEM; file->private_data = p; @@ -86,7 +86,7 @@ ssize_t seq_read(struct file *file, char m->version = file->f_version; /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); if (!m->buf) goto Enomem; } @@ -120,7 +120,7 @@ ssize_t seq_read(struct file *file, char goto Fill; m->op->stop(m, p); kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); if (!m->buf) goto Enomem; m->count = 0; @@ -189,7 +189,7 @@ static int traverse(struct seq_file *m, return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); if (!m->buf) return -ENOMEM; } @@ -224,7 +224,7 @@ static int traverse(struct seq_file *m, Eoverflow: m->op->stop(m, p); kfree(m->buf); 
- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -349,6 +349,8 @@ int seq_path(struct seq_file *m, if (m->count < m->size) { char *s = m->buf + m->count; char *p = d_path(dentry, mnt, s, m->size - m->count); + if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) + return 0; if (!IS_ERR(p)) { while (s <= p) { char c = *p++; @@ -392,7 +394,7 @@ static void single_stop(struct seq_file int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC); int res = -ENOMEM; if (op) { @@ -436,7 +438,7 @@ void *__seq_open_private(struct file *f, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_UBC); if (private == NULL) goto out; diff -uprN linux-2.6.24/fs/simfs.c linux-2.6.24.ovz/fs/simfs.c --- linux-2.6.24/fs/simfs.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/simfs.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,332 @@ +/* + * fs/simfs.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb + +static struct super_operations sim_super_ops; + +static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct super_block *sb; + struct inode *inode; + + inode = dentry->d_inode; + if (!inode->i_op->getattr) { + generic_fillattr(inode, stat); + if (!stat->blksize) { + unsigned blocks; + + sb = inode->i_sb; + blocks = (stat->size + sb->s_blocksize-1) >> + sb->s_blocksize_bits; + stat->blocks = (sb->s_blocksize / 512) * blocks; + stat->blksize = sb->s_blocksize; + } + } else { + int err; + + err = inode->i_op->getattr(mnt, dentry, stat); + if (err) + return err; + } + + sb = mnt->mnt_sb; + if (sb->s_op == &sim_super_ops) + stat->dev = sb->s_dev; + return 0; +} + +static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct dq_stat qstat; + struct virt_info_quota q; + long free_file, adj_file; + s64 blk, free_blk, adj_blk; + int bsize_bits; + + q.super = sb; + q.qstat = &qstat; + err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); + if (err != NOTIFY_OK) + return; + + bsize_bits = ffs(buf->f_bsize) - 1; + + if (qstat.bsoftlimit > qstat.bcurrent) + free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; + else + free_blk = 0; + /* + * In the regular case, we always set buf->f_bfree and buf->f_blocks to + * the values reported by quota. In case of real disk space shortage, + * we adjust the values. We want this adjustment to look as if the + * total disk space were reduced, not as if the usage were increased. + * -- SAW + */ + adj_blk = 0; + if (buf->f_bfree < free_blk) + adj_blk = free_blk - buf->f_bfree; + buf->f_bfree = free_blk - adj_blk; + + if (free_blk < buf->f_bavail) + buf->f_bavail = free_blk; + + blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; + buf->f_blocks = blk > LONG_MAX ? 
LONG_MAX : blk; + + free_file = qstat.isoftlimit - qstat.icurrent; + if (free_file < 0) + free_file = 0; + if (buf->f_type == REISERFS_SUPER_MAGIC) + /* + * reiserfs doesn't initialize f_ffree and f_files values of + * kstatfs because it doesn't have an inode limit. + */ + buf->f_ffree = free_file; + adj_file = 0; + if (buf->f_ffree < free_file) + adj_file = free_file - buf->f_ffree; + buf->f_ffree = free_file - adj_file; + buf->f_files = qstat.isoftlimit - adj_file; +} + +static int sim_statfs(struct super_block *sb, struct kstatfs *buf) +{ + int err; + struct super_block *lsb; + struct kstatfs statbuf; + + err = 0; + if (sb->s_op != &sim_super_ops) + return 0; + + memset(&statbuf, 0, sizeof(statbuf)); + lsb = SIMFS_GET_LOWER_FS_SB(sb); + + err = -ENOSYS; + if (lsb && lsb->s_op && lsb->s_op->statfs) + err = lsb->s_op->statfs(lsb->s_root, &statbuf); + if (err) + return err; + + quota_get_stat(sb, &statbuf); + + buf->f_files = statbuf.f_files; + buf->f_ffree = statbuf.f_ffree; + buf->f_blocks = statbuf.f_blocks; + buf->f_bfree = statbuf.f_bfree; + buf->f_bavail = statbuf.f_bavail; + return 0; +} + +static int sim_systemcall(struct vnotifier_block *me, unsigned long n, + void *d, int old_ret) +{ + int err; + + switch (n) { + case VIRTINFO_FAUDIT_STAT: { + struct faudit_stat_arg *arg; + + arg = (struct faudit_stat_arg *)d; + err = sim_getattr(arg->mnt, arg->dentry, arg->stat); + arg->err = err; + } + break; + case VIRTINFO_FAUDIT_STATFS: { + struct faudit_statfs_arg *arg; + + arg = (struct faudit_statfs_arg *)d; + err = sim_statfs(arg->sb, arg->stat); + arg->err = err; + } + break; + default: + return old_ret; + } + return (err ? NOTIFY_BAD : NOTIFY_OK); +} + +static struct inode *sim_quota_root(struct super_block *sb) +{ + return sb->s_root->d_inode; +} + +/* + * NOTE: We need to setup s_bdev field on super block, since sys_quotactl() + * does lookup_bdev() and get_super() which are comparing sb->s_bdev. 
+ * so this is a MUST if we want unmodified sys_quotactl + * to work correctly on /dev/simfs inside VE + */ +static int sim_init_blkdev(struct super_block *sb) +{ + static struct hd_struct fake_hd; + struct block_device *blkdev; + + blkdev = bdget(sb->s_dev); + if (blkdev == NULL) + return -ENOMEM; + + blkdev->bd_part = &fake_hd; /* required for bdev_read_only() */ + sb->s_bdev = blkdev; + + return 0; +} + +static void sim_free_blkdev(struct super_block *sb) +{ + /* set bd_part back to NULL */ + sb->s_bdev->bd_part = NULL; + bdput(sb->s_bdev); +} + +static void sim_quota_init(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); +} + +static void sim_quota_free(struct super_block *sb) +{ + struct virt_info_quota viq; + + viq.super = sb; + virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); +} + +static struct super_operations sim_super_ops = { + .get_quota_root = sim_quota_root, +}; + +static int sim_fill_super(struct super_block *s, void *data) +{ + int err; + struct nameidata *nd; + + err = set_anon_super(s, NULL); + if (err) + goto out; + + err = 0; + nd = (struct nameidata *)data; + s->s_fs_info = mntget(nd->mnt); + s->s_root = dget(nd->dentry); + s->s_op = &sim_super_ops; +out: + return err; +} + +static int sim_get_sb(struct file_system_type *type, int flags, + const char *dev_name, void *opt, struct vfsmount *mnt) +{ + int err; + struct nameidata nd; + struct super_block *sb; + + err = -EINVAL; + if (opt == NULL) + goto out; + + err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (err) + goto out; + + sb = sget(type, NULL, sim_fill_super, &nd); + err = PTR_ERR(sb); + if (IS_ERR(sb)) + goto out_path; + + err = sim_init_blkdev(sb); + if (err) + goto out_killsb; + + sim_quota_init(sb); + + path_release(&nd); + return simple_set_mnt(mnt, sb); + +out_killsb: + up_write(&sb->s_umount); + deactivate_super(sb); +out_path: + path_release(&nd); +out: + return err; +} + +static void sim_kill_sb(struct super_block *sb) +{ + dput(sb->s_root); + sb->s_root = NULL; + mntput((struct vfsmount *)(sb->s_fs_info)); + + sim_quota_free(sb); + sim_free_blkdev(sb); + + kill_anon_super(sb); +} + +static struct file_system_type sim_fs_type = { + .owner = THIS_MODULE, + .name = "simfs", + .get_sb = sim_get_sb, + .kill_sb = sim_kill_sb, + .fs_flags = FS_MANGLE_PROC, +}; + +static struct vnotifier_block sim_syscalls = { + .notifier_call = sim_systemcall, +}; + +static int __init init_simfs(void) +{ + int err; + + err = register_filesystem(&sim_fs_type); + if (err) + return err; + + virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); + return 0; +} + +static void __exit exit_simfs(void) +{ + virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); + unregister_filesystem(&sim_fs_type); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); +MODULE_LICENSE("GPL v2"); + +module_init(init_simfs); +module_exit(exit_simfs); diff -uprN linux-2.6.24/fs/smbfs/sock.c linux-2.6.24.ovz/fs/smbfs/sock.c --- linux-2.6.24/fs/smbfs/sock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/smbfs/sock.c 2008-03-25 18:53:59.000000000 -0500 @@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *ser VERBOSE("closing socket %p\n", sock); sock->sk->sk_data_ready = server->data_ready; + sock->sk->sk_user_data = NULL; server->sock_file = NULL; fput(file); } diff -uprN linux-2.6.24/fs/splice.c linux-2.6.24.ovz/fs/splice.c --- linux-2.6.24/fs/splice.c 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/splice.c 2008-03-25 18:53:59.000000000 -0500 @@ -1184,6 +1184,9 @@ static int copy_from_user_mmap_sem(void { int partial; + if (!access_ok(VERIFY_READ, src, n)) + return -EFAULT; + pagefault_disable(); partial = __copy_from_user_inatomic(dst, src, n); pagefault_enable(); @@ -1236,7 +1239,7 @@ static int get_iovec_page_array(const st if (unlikely(!len)) break; error = -EFAULT; - if (unlikely(!base)) + if (!access_ok(VERIFY_READ, base, len)) break; /* @@ -1392,6 +1395,11 @@ static long vmsplice_to_user(struct file break; } + if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { + error = -EFAULT; + break; + } + sd.len = 0; sd.total_len = len; sd.flags = flags; diff -uprN linux-2.6.24/fs/stat.c linux-2.6.24.ovz/fs/stat.c --- linux-2.6.24/fs/stat.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/stat.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, st { struct inode *inode = dentry->d_inode; int retval; + struct faudit_stat_arg arg; retval = security_inode_getattr(mnt, dentry); if (retval) return retval; + arg.mnt = mnt; + arg.dentry = dentry; + arg.stat = stat; + if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) + != NOTIFY_DONE) + return arg.err; + if (inode->i_op->getattr) return inode->i_op->getattr(mnt, dentry, stat); diff -uprN linux-2.6.24/fs/super.c linux-2.6.24.ovz/fs/super.c --- linux-2.6.24/fs/super.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/super.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,11 +37,14 @@ #include #include #include +#include #include LIST_HEAD(super_blocks); +EXPORT_SYMBOL_GPL(super_blocks); DEFINE_SPINLOCK(sb_lock); +EXPORT_SYMBOL_GPL(sb_lock); /** * alloc_super - create new superblock @@ -70,13 +73,15 @@ static struct super_block *alloc_super(s INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); - lockdep_set_class(&s->s_umount, &type->s_umount_key); + lockdep_set_class(&s->s_umount, + &type->proto->s_umount_key); /* * The locking rules for s_lock are up to the * filesystem. For example ext3fs has different * lock ordering than usbfs: */ - lockdep_set_class(&s->s_lock, &type->s_lock_key); + lockdep_set_class(&s->s_lock, + &type->proto->s_lock_key); down_write(&s->s_umount); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); @@ -300,7 +305,7 @@ void generic_shutdown_super(struct super sop->put_super(sb); /* Forget any remaining inodes */ - if (invalidate_inodes(sb)) { + if (invalidate_inodes_check(sb, 1)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. 
Have a nice day...\n", sb->s_id); @@ -529,17 +534,26 @@ rescan: spin_unlock(&sb_lock); return NULL; } +EXPORT_SYMBOL(user_get_super); asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) { + dev_t kdev; struct super_block *s; struct ustat tmp; struct kstatfs sbuf; - int err = -EINVAL; + int err; + + kdev = new_decode_dev(dev); + err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); + if (err) + goto out; + + err = -EINVAL; + s = user_get_super(kdev); + if (s == NULL) + goto out; - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) @@ -655,6 +669,13 @@ void emergency_remount(void) static struct idr unnamed_dev_idr; static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ +/* for compatibility with coreutils still unaware of new minor sizes */ +int unnamed_dev_majors[] = { + 0, 144, 145, 146, 242, 243, 244, 245, + 246, 247, 248, 249, 250, 251, 252, 253 +}; +EXPORT_SYMBOL(unnamed_dev_majors); + int set_anon_super(struct super_block *s, void *data) { int dev; @@ -672,13 +693,13 @@ int set_anon_super(struct super_block *s else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); idr_remove(&unnamed_dev_idr, dev); spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_dev = make_unnamed_dev(dev); return 0; } @@ -686,8 +707,9 @@ EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { - int slot = MINOR(sb->s_dev); + int slot; + slot = unnamed_dev_idx(sb->s_dev); generic_shutdown_super(sb); spin_lock(&unnamed_dev_lock); idr_remove(&unnamed_dev_idr, slot); diff -uprN linux-2.6.24/fs/sync.c linux-2.6.24.ovz/fs/sync.c --- linux-2.6.24/fs/sync.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sync.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,10 @@ #include #include +#include + +#include + #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -38,7 +42,14 @@ static void do_sync(unsigned long wait) asmlinkage long sys_sync(void) { + struct user_beancounter *ub; + + ub = get_exec_ub(); + ub_percpu_inc(ub, sync); + do_sync(1); + + ub_percpu_inc(ub, sync_done); return 0; } @@ -80,6 +91,7 @@ long do_fsync(struct file *file, int dat int ret; int err; struct address_space *mapping = file->f_mapping; + struct user_beancounter *ub; if (!file->f_op || !file->f_op->fsync) { /* Why? 
We can still call filemap_fdatawrite */ @@ -87,6 +99,12 @@ long do_fsync(struct file *file, int dat goto out; } + ub = get_exec_ub(); + if (datasync) + ub_percpu_inc(ub, fdsync); + else + ub_percpu_inc(ub, fsync); + ret = filemap_fdatawrite(mapping); /* @@ -101,6 +119,11 @@ long do_fsync(struct file *file, int dat err = filemap_fdatawait(mapping); if (!ret) ret = err; + + if (datasync) + ub_percpu_inc(ub, fdsync_done); + else + ub_percpu_inc(ub, fsync_done); out: return ret; } @@ -251,12 +274,16 @@ int do_sync_mapping_range(struct address loff_t endbyte, unsigned int flags) { int ret; + struct user_beancounter *ub; if (!mapping) { ret = -EINVAL; - goto out; + goto out_noacct; } + ub = get_exec_ub(); + ub_percpu_inc(ub, frsync); + ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = wait_on_page_writeback_range(mapping, @@ -279,6 +306,8 @@ int do_sync_mapping_range(struct address endbyte >> PAGE_CACHE_SHIFT); } out: + ub_percpu_inc(ub, frsync_done); +out_noacct: return ret; } EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff -uprN linux-2.6.24/fs/sysfs/bin.c linux-2.6.24.ovz/fs/sysfs/bin.c --- linux-2.6.24/fs/sysfs/bin.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/bin.c 2008-03-25 18:53:59.000000000 -0500 @@ -177,6 +177,9 @@ static int open(struct inode * inode, st struct bin_buffer *bb = NULL; int error; + if (!ve_sysfs_alowed()) + return 0; + /* binary file operations requires both @sd and its parent */ if (!sysfs_get_active_two(attr_sd)) return -ENODEV; @@ -238,6 +241,9 @@ const struct file_operations bin_fops = int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); @@ -252,6 +258,8 @@ int sysfs_create_bin_file(struct kobject void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) { + if (!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff -uprN linux-2.6.24/fs/sysfs/dir.c linux-2.6.24.ovz/fs/sysfs/dir.c --- linux-2.6.24/fs/sysfs/dir.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/dir.c 2008-03-25 18:53:59.000000000 -0500 @@ -481,6 +481,9 @@ static void sysfs_drop_dentry(struct sys struct inode *inode; struct dentry *dentry; + if (!ve_sysfs_alowed()) + return; + inode = ilookup(sysfs_sb, sd->s_ino); if (!inode) return; @@ -652,12 +655,15 @@ int sysfs_create_dir(struct kobject * ko struct sysfs_dirent *parent_sd, *sd; int error = 0; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!kobj); if (kobj->parent) parent_sd = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); if (!error) @@ -758,6 +764,9 @@ void sysfs_remove_dir(struct kobject * k { struct sysfs_dirent *sd = kobj->sd; + if (!ve_sysfs_alowed()) + return; + spin_lock(&sysfs_assoc_lock); kobj->sd = NULL; spin_unlock(&sysfs_assoc_lock); @@ -773,6 +782,9 @@ int sysfs_rename_dir(struct kobject * ko const char *dup_name = NULL; int error; + if (!ve_sysfs_alowed()) + return 0; + mutex_lock(&sysfs_rename_mutex); error = 0; @@ -841,7 +853,7 @@ int sysfs_move_dir(struct kobject *kobj, mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + new_parent_sd = new_parent_kobj->sd ? 
new_parent_kobj->sd : ve_sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) diff -uprN linux-2.6.24/fs/sysfs/file.c linux-2.6.24.ovz/fs/sysfs/file.c --- linux-2.6.24/fs/sysfs/file.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/file.c 2008-03-25 18:53:59.000000000 -0500 @@ -549,6 +549,8 @@ int sysfs_add_file(struct sysfs_dirent * int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return 0; BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); @@ -641,16 +643,18 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { + if (!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, attr->name); } - /** * sysfs_remove_file_from_group - remove an attribute file from a group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ + void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { diff -uprN linux-2.6.24/fs/sysfs/group.c linux-2.6.24.ovz/fs/sysfs/group.c --- linux-2.6.24/fs/sysfs/group.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/group.c 2008-03-25 18:53:59.000000000 -0500 @@ -45,6 +45,8 @@ int sysfs_create_group(struct kobject * struct sysfs_dirent *sd; int error; + if (!ve_sysfs_alowed()) + return 0; BUG_ON(!kobj || !kobj->sd); if (grp->name) { @@ -69,6 +71,9 @@ void sysfs_remove_group(struct kobject * struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; + if (!ve_sysfs_alowed()) + return; + if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); BUG_ON(!sd); diff -uprN linux-2.6.24/fs/sysfs/inode.c linux-2.6.24.ovz/fs/sysfs/inode.c --- linux-2.6.24/fs/sysfs/inode.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/inode.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,8 +20,6 @@ #include #include "sysfs.h" -extern struct super_block * sysfs_sb; - static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, diff -uprN linux-2.6.24/fs/sysfs/mount.c linux-2.6.24.ovz/fs/sysfs/mount.c --- linux-2.6.24/fs/sysfs/mount.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/mount.c 2008-03-25 18:53:59.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "sysfs.h" @@ -22,8 +23,11 @@ /* Random magic number */ #define SYSFS_MAGIC 0x62656572 -static struct vfsmount *sysfs_mount; +#ifndef CONFIG_VE +struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; +#endif + struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -39,6 +43,13 @@ struct sysfs_dirent sysfs_root = { .s_ino = 1, }; +static void init_ve0_sysfs_root(void) +{ +#ifdef CONFIG_VE + get_ve0()->_sysfs_root = &sysfs_root; +#endif +} + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; @@ -52,7 +63,7 @@ static int sysfs_fill_super(struct super sysfs_sb = sb; /* get root inode, initialize and unlock it */ - inode = sysfs_get_inode(&sysfs_root); + inode = sysfs_get_inode(ve_sysfs_root); if (!inode) { pr_debug("sysfs: could not get root inode\n"); return -ENOMEM; @@ -65,7 +76,7 @@ static int sysfs_fill_super(struct super iput(inode); return -ENOMEM; } - root->d_fsdata = &sysfs_root; + root->d_fsdata = ve_sysfs_root; sb->s_root = root; return 0; } @@ -76,16 +87,19 @@ static int sysfs_get_sb(struct file_syst return 
get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); } -static struct file_system_type sysfs_fs_type = { +struct file_system_type sysfs_fs_type = { .name = "sysfs", .get_sb = sysfs_get_sb, .kill_sb = kill_anon_super, }; +EXPORT_SYMBOL(sysfs_fs_type); + int __init sysfs_init(void) { int err = -ENOMEM; + init_ve0_sysfs_root(); sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", sizeof(struct sysfs_dirent), 0, 0, NULL); diff -uprN linux-2.6.24/fs/sysfs/symlink.c linux-2.6.24.ovz/fs/sysfs/symlink.c --- linux-2.6.24/fs/sysfs/symlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/symlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -66,10 +66,13 @@ int sysfs_create_link(struct kobject * k struct sysfs_addrm_cxt acxt; int error; + if (!ve_sysfs_alowed()) + return 0; + BUG_ON(!name); if (!kobj) - parent_sd = &sysfs_root; + parent_sd = ve_sysfs_root; else parent_sd = kobj->sd; @@ -121,6 +124,8 @@ int sysfs_create_link(struct kobject * k void sysfs_remove_link(struct kobject * kobj, const char * name) { + if(!ve_sysfs_alowed()) + return; sysfs_hash_and_remove(kobj->sd, name); } diff -uprN linux-2.6.24/fs/sysfs/sysfs.h linux-2.6.24.ovz/fs/sysfs/sysfs.h --- linux-2.6.24/fs/sysfs/sysfs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/sysfs/sysfs.h 2008-03-25 18:53:59.000000000 -0500 @@ -8,67 +8,17 @@ * This file is released under the GPLv2. */ -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - struct attribute *attr; - struct sysfs_open_dirent *open; -}; - -struct sysfs_elem_bin_attr { - struct bin_attribute *bin_attr; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. 
- */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; - struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; - const char *s_name; - - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - struct sysfs_elem_bin_attr s_bin_attr; - }; - - unsigned int s_flags; - ino_t s_ino; - umode_t s_mode; - struct iattr *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) - -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0200 +#ifndef CONFIG_VE +extern struct vfsmount *sysfs_mount; +extern struct super_block *sysfs_sb; +#define ve_sysfs_alowed() 1 +#else +#include +#include +#define sysfs_mount (get_exec_env()->sysfs_mnt) +#define sysfs_sb (get_exec_env()->sysfs_sb) +#define ve_sysfs_alowed() (sysfs_sb != NULL) +#endif static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { @@ -88,8 +38,12 @@ struct sysfs_addrm_cxt { /* * mount.c */ +#ifdef CONFIG_VE +#define ve_sysfs_root (get_exec_env()->_sysfs_root) +#else extern struct sysfs_dirent sysfs_root; -extern struct super_block *sysfs_sb; +#define ve_sysfs_root (&sysfs_root) +#endif extern struct kmem_cache *sysfs_dir_cachep; /* diff -uprN linux-2.6.24/fs/utimes.c linux-2.6.24.ovz/fs/utimes.c --- linux-2.6.24/fs/utimes.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/fs/utimes.c 2008-03-25 18:53:59.000000000 -0500 @@ -52,7 +52,7 @@ static bool nsec_valid(long nsec) */ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) { - int error; + int error = -EINVAL; struct nameidata nd; struct dentry *dentry; struct inode *inode; @@ -198,3 +198,18 @@ asmlinkage long sys_utimes(char __user * { return sys_futimesat(AT_FDCWD, filename, utimes); } + +asmlinkage long sys_lutime(char __user *filename, struct utimbuf __user *times) +{ + struct timespec tv[2]; + + if (times) { + if (get_user(tv[0].tv_sec, &times->actime) || + get_user(tv[1].tv_sec, &times->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + + return do_utimes(AT_FDCWD, filename, times ? tv : NULL, AT_SYMLINK_NOFOLLOW); +} diff -uprN linux-2.6.24/fs/vzdq_file.c linux-2.6.24.ovz/fs/vzdq_file.c --- linux-2.6.24/fs/vzdq_file.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/vzdq_file.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,909 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo quota files as proc entry implementation. + * It is required for std quota tools to work correctly as they are expecting + * aquota.user and aquota.group files. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * File read operation + * + * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, + * perhaps) abuse vz_quota_sem. + * Taking a global semaphore for lengthy and user-controlled operations inside + * VPSs is not a good idea in general.
+ * In this case, the reasons for taking this semaphore are completely unclear, + * especially taking into account that the only function that has comments + * about the necessity to be called under this semaphore + * (create_proc_quotafile) is actually called OUTSIDE it. + * + * --------------------------------------------------------------------- */ + +#define DQBLOCK_SIZE 1024 +#define DQUOTBLKNUM 21U +#define DQTREE_DEPTH 4 +#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) +#define ISINDBLOCK(num) ((num)%2 != 0) +#define FIRST_DATABLK 2 /* first even number */ +#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) +#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) +#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ + & QUOTATREE_BMASK) + +#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) +#error xBITS and DQTREE_DEPTH does not correspond +#endif + +#define BLOCK_NOT_FOUND 1 + +/* data for quota file -- one per proc entry */ +struct quotatree_data { + struct list_head list; + struct vz_quota_master *qmblk; + int type; /* type of the tree */ +}; + +/* serialized by vz_quota_sem */ +static LIST_HEAD(qf_data_head); + +static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; +static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; +static const char aquota_user[] = "aquota.user"; +static const char aquota_group[] = "aquota.group"; + + +static inline loff_t get_depoff(int depth) +{ + loff_t res = 1; + while (depth) { + res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); + depth--; + } + return res; +} + +static inline loff_t get_blknum(loff_t num, int depth) +{ + loff_t res; + res = (num << 1) + get_depoff(depth); + return res; +} + +static int get_depth(loff_t num) +{ + int i; + for (i = 0; i < DQTREE_DEPTH; i++) { + if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 + || num < get_depoff(i + 1))) + return i; + } + return -1; +} + +static inline loff_t get_offset(loff_t num) +{ + loff_t res, tmp; + + tmp = get_depth(num); + if (tmp < 0) + return -1; + num -= get_depoff(tmp); + BUG_ON(num < 0); + res = num >> 1; + + return res; +} + +static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) +{ + /* return maximum available block num */ + return tree->levels[level].freenum; +} + +static inline loff_t get_block_num(struct quotatree_tree *tree) +{ + loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; + + quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); + max_quot = TREENUM_2_BLKNUM(quot_blk_num); + ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); + max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) + : get_blknum(ind_blk_num, 0); + + return (max_ind > max_quot) ? 
max_ind + 1 : max_quot + 1; +} + +/* Write quota file header */ +static int read_header(void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int type) +{ + struct v2_disk_dqheader *dqh; + struct v2_disk_dqinfo *dq_disk_info; + + dqh = buf; + dq_disk_info = buf + sizeof(struct v2_disk_dqheader); + + dqh->dqh_magic = vzquota_magics[type]; + dqh->dqh_version = vzquota_versions[type]; + + dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; + dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; + dq_disk_info->dqi_flags = 0; /* no flags */ + dq_disk_info->dqi_blocks = get_block_num(tree); + dq_disk_info->dqi_free_blk = 0; /* first block in the file */ + dq_disk_info->dqi_free_entry = FIRST_DATABLK; + + return 0; +} + +static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) +{ + int i, j, lev_num; + + lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; + for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { + struct quotatree_node *next, *parent; + + parent = p; + next = p; + for (j = lev_num; j >= 0; j--) { + if (!next->blocks[GETLEVINDX(i,j)]) { + buf[i] = 0; + goto bad_branch; + } + parent = next; + next = next->blocks[GETLEVINDX(i,j)]; + } + buf[i] = (depth == DQTREE_DEPTH - 1) ? + TREENUM_2_BLKNUM(parent->num) + : get_blknum(next->num, depth + 1); + + bad_branch: + ; + } + + return 0; +} + +/* + * Write index block to disk (or buffer) + * @buf has length 256*sizeof(u_int32_t) bytes + */ +static int read_index_block(int num, u_int32_t *buf, + struct quotatree_tree *tree) +{ + struct quotatree_node *p; + u_int32_t index; + loff_t off; + int depth, res; + + res = BLOCK_NOT_FOUND; + index = 0; + depth = get_depth(num); + off = get_offset(num); + if (depth < 0 || off < 0) + return -EINVAL; + + list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, + list) { + if (p->num >= off) + res = 0; + if (p->num != off) + continue; + get_block_child(depth, p, buf); + break; + } + + return res; +} + +static inline void convert_quot_format(struct v2_disk_dqblk *dq, + struct vz_quota_ugid *vzq) +{ + dq->dqb_id = vzq->qugid_id; + dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; + dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; + dq->dqb_curinodes = vzq->qugid_stat.icurrent; + dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; + dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; + dq->dqb_curspace = vzq->qugid_stat.bcurrent; + dq->dqb_btime = vzq->qugid_stat.btime; + dq->dqb_itime = vzq->qugid_stat.itime; +} + +static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) +{ + int res, i, entries = 0; + struct v2_disk_dqdbheader *dq_header; + struct quotatree_node *p; + struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); + + res = BLOCK_NOT_FOUND; + dq_header = buf; + memset(dq_header, 0, sizeof(*dq_header)); + + list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), + list) { + if (TREENUM_2_BLKNUM(p->num) >= num) + res = 0; + if (TREENUM_2_BLKNUM(p->num) != num) + continue; + + for (i = 0; i < QUOTATREE_BSIZE; i++) { + if (!p->blocks[i]) + continue; + convert_quot_format(blk + entries, + (struct vz_quota_ugid *)p->blocks[i]); + entries++; + res = 0; + } + break; + } + dq_header->dqdh_entries = entries; + + return res; +} + +static int read_block(int num, void *buf, struct quotatree_tree *tree, + struct dq_info *dq_ugid_info, int magic) +{ + int res; + + memset(buf, 0, DQBLOCK_SIZE); + if (!num) + res = read_header(buf, tree, dq_ugid_info, magic); + else if (ISINDBLOCK(num)) + res = 
read_index_block(num, (u_int32_t*)buf, tree); + else + res = read_dquot(num, buf, tree); + + return res; +} + +/* + * FIXME: this function can handle quota files up to 2GB only. + */ +static int read_proc_quotafile(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + off_t blk_num, blk_off, buf_off; + char *tmp; + size_t buf_size; + struct quotatree_data *qtd; + struct quotatree_tree *tree; + struct dq_info *dqi; + int res; + + *start = NULL; + tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + qtd = data; + down(&vz_quota_sem); + down(&qtd->qmblk->dq_sem); + + res = 0; + tree = QUGID_TREE(qtd->qmblk, qtd->type); + if (!tree) { + *eof = 1; + goto out_dq; + } + + dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; + + buf_off = 0; + buf_size = count; + blk_num = off / DQBLOCK_SIZE; + blk_off = off % DQBLOCK_SIZE; + + while (buf_size > 0) { + off_t len; + + len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); + res = read_block(blk_num, tmp, tree, dqi, qtd->type); + if (res < 0) + goto out_err; + if (res == BLOCK_NOT_FOUND) { + *eof = 1; + break; + } + memcpy(page + buf_off, tmp + blk_off, len); + + blk_num++; + buf_size -= len; + blk_off = 0; + buf_off += len; + } + res = buf_off; + +out_err: + *start += count; +out_dq: + up(&qtd->qmblk->dq_sem); + up(&vz_quota_sem); + kfree(tmp); + + return res; +} + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID/aquota.* files + * + * FIXME: this code lacks serialization of read/readdir/lseek. + * However, this problem should be fixed after the mainstream issue of what + * appears to be non-atomic read and update of file position in sys_read. + * + * --------------------------------------------------------------------- */ + +static inline unsigned long vzdq_aquot_getino(dev_t dev) +{ + return 0xec000000UL + dev; +} + +static inline dev_t vzdq_aquot_getidev(struct inode *inode) +{ + return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; +} + +static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) +{ + PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; +} + +static ssize_t vzdq_aquotf_read(struct file *file, + char __user *buf, size_t size, loff_t *ppos) +{ + char *page; + size_t bufsize; + ssize_t l, l2, copied; + char *start; + struct inode *inode; + struct block_device *bdev; + struct super_block *sb; + struct quotatree_data data; + int eof, err; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (page == NULL) + goto out_err; + + err = -ENODEV; + inode = file->f_dentry->d_inode; + bdev = bdget(vzdq_aquot_getidev(inode)); + if (bdev == NULL) + goto out_err; + sb = get_super(bdev); + bdput(bdev); + if (sb == NULL) + goto out_err; + data.qmblk = vzquota_find_qmblk(sb); + data.type = PROC_I(inode)->fd - 1; + drop_super(sb); + if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) + goto out_err; + + copied = 0; + l = l2 = 0; + while (1) { + bufsize = min(size, (size_t)PAGE_SIZE); + if (bufsize <= 0) + break; + + l = read_proc_quotafile(page, &start, *ppos, bufsize, + &eof, &data); + if (l <= 0) + break; + + l2 = copy_to_user(buf, page, l); + copied += l - l2; + if (l2) + break; + + buf += l; + size -= l; + *ppos += (unsigned long)start; + l = l2 = 0; + } + + qmblk_put(data.qmblk); + free_page((unsigned long)page); + if (copied) + return copied; + else if (l2) /* last copy_to_user failed */ + return -EFAULT; + else /* read error or EOF */ + return l; + +out_err: + if (page != NULL) + free_page((unsigned 
long)page); + return err; +} + +static struct file_operations vzdq_aquotf_file_operations = { + .read = &vzdq_aquotf_read, +}; + +static struct inode_operations vzdq_aquotf_inode_operations = { +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota/QID directory + * + * --------------------------------------------------------------------- */ + +static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) +{ + loff_t n; + int err; + + n = file->f_pos; + for (err = 0; !err; n++) { + /* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */ + switch ((unsigned long)n) { + case 0: + err = (*filler)(data, ".", 1, n, + file->f_dentry->d_inode->i_ino, + DT_DIR); + break; + case 1: + err = (*filler)(data, "..", 2, n, + parent_ino(file->f_dentry), DT_DIR); + break; + case 2: + err = (*filler)(data, aquota_user, + sizeof(aquota_user)-1, n, + file->f_dentry->d_inode->i_ino + + USRQUOTA + 1, + DT_REG); + break; + case 3: + err = (*filler)(data, aquota_group, + sizeof(aquota_group)-1, n, + file->f_dentry->d_inode->i_ino + + GRPQUOTA + 1, + DT_REG); + break; + default: + goto out; + } + } +out: + file->f_pos = n; + return err; +} + +struct vzdq_aquotq_lookdata { + dev_t dev; + int type; + struct vz_quota_master *qmblk; +}; + +static int vzdq_aquotq_looktest(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + + d = data; + return inode->i_op == &vzdq_aquotf_inode_operations && + vzdq_aquot_getidev(inode) == d->dev && + PROC_I(inode)->fd == d->type + 1; +} + +static int vzdq_aquotq_lookset(struct inode *inode, void *data) +{ + struct vzdq_aquotq_lookdata *d; + struct quotatree_tree *tree; + + d = data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 1; + inode->i_op = &vzdq_aquotf_inode_operations; + inode->i_fop = &vzdq_aquotf_file_operations; + PROC_I(inode)->fd = d->type + 1; + vzdq_aquot_setidev(inode, d->dev); + + /* Setting size */ + tree = QUGID_TREE(d->qmblk, PROC_I(inode)->fd - 1); + inode->i_size = get_block_num(tree) * 1024; + return 0; +} + +static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd) +{ + return 0; +} + +static struct dentry_operations vzdq_aquotq_dentry_operations = { + .d_revalidate = &vzdq_aquotq_revalidate, +}; + +static struct vz_quota_master *find_qmblk_by_dev(dev_t dev) +{ + struct super_block *sb; + struct vz_quota_master *qmblk; + + qmblk = NULL; + sb = user_get_super(dev); + if (sb != NULL) { + qmblk = vzquota_find_qmblk(sb); + drop_super(sb); + + if (qmblk == VZ_QUOTA_BAD) + qmblk = NULL; + } + + return qmblk; +} + +static struct dentry *vzdq_aquotq_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + struct vzdq_aquotq_lookdata d; + int k; + + if (dentry->d_name.len == sizeof(aquota_user)-1) { + if (memcmp(dentry->d_name.name, aquota_user, + sizeof(aquota_user)-1)) + goto out; + k = USRQUOTA; + } else if (dentry->d_name.len == sizeof(aquota_group)-1) { + if (memcmp(dentry->d_name.name, aquota_group, + sizeof(aquota_group)-1)) + goto out; + k = GRPQUOTA; + } else + goto out; + d.dev = vzdq_aquot_getidev(dir); + d.type = k; + d.qmblk = find_qmblk_by_dev(d.dev); + if (d.qmblk == NULL) + goto out; + + inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, + vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); + if (inode == NULL) + goto out; 
+ unlock_new_inode(inode); + dentry->d_op = &vzdq_aquotq_dentry_operations; + d_add(dentry, inode); + return NULL; + +out: + return ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotq_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotq_readdir, +}; + +static struct inode_operations vzdq_aquotq_inode_operations = { + .lookup = &vzdq_aquotq_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * /proc/vz/vzaquota directory + * + * --------------------------------------------------------------------- */ + +struct vzdq_aquot_de { + struct list_head list; + struct vfsmount *mnt; +}; + +static int vzdq_aquot_buildmntlist(struct ve_struct *ve, + struct list_head *head) + { + struct vfsmount *rmnt, *mnt; + struct vzdq_aquot_de *p; + int err; + +#ifdef CONFIG_VE + rmnt = mntget(ve->fs_rootmnt); +#else + read_lock(&current->fs->lock); + rmnt = mntget(current->fs->rootmnt); + read_unlock(&current->fs->lock); +#endif + mnt = rmnt; + spin_lock(&vfsmount_lock); + while (1) { + list_for_each_entry(p, head, list) { + if (p->mnt->mnt_sb == mnt->mnt_sb) + goto skip; + } + + err = -ENOMEM; + p = kmalloc(sizeof(*p), GFP_ATOMIC); + if (p == NULL) + goto out; + p->mnt = mntget(mnt); + list_add_tail(&p->list, head); + +skip: + err = 0; + if (list_empty(&mnt->mnt_mounts)) { + while (1) { + if (mnt == rmnt) + goto out; + if (mnt->mnt_child.next != + &mnt->mnt_parent->mnt_mounts) + break; + mnt = mnt->mnt_parent; + } + mnt = list_entry(mnt->mnt_child.next, + struct vfsmount, mnt_child); + } else + mnt = list_entry(mnt->mnt_mounts.next, + struct vfsmount, mnt_child); + } +out: + spin_unlock(&vfsmount_lock); + mntput(rmnt); + return err; +} + +static void vzdq_aquot_releasemntlist(struct ve_struct *ve, + struct list_head *head) +{ + struct vzdq_aquot_de *p; + + while (!list_empty(head)) { + p = list_entry(head->next, typeof(*p), list); + mntput(p->mnt); + list_del(&p->list); + kfree(p); + } +} + +static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) +{ + struct ve_struct *ve, *old_ve; + struct list_head mntlist; + struct vzdq_aquot_de *de; + struct super_block *sb; + struct vz_quota_master *qmblk; + loff_t i, n; + char buf[24]; + int l, err; + + i = 0; + n = file->f_pos; + ve = file->f_dentry->d_sb->s_type->owner_env; + old_ve = set_exec_env(ve); + + INIT_LIST_HEAD(&mntlist); +#ifdef CONFIG_VE + /* + * The only reason of disabling readdir for the host system is that + * this readdir can be slow and CPU consuming with large number of VPSs + * (or just mount points).
+ */ + err = ve_is_super(ve); +#else + err = 0; +#endif + if (!err) { + err = vzdq_aquot_buildmntlist(ve, &mntlist); + if (err) + goto out_err; + } + + if (i >= n) { + if ((*filler)(data, ".", 1, i, + file->f_dentry->d_inode->i_ino, DT_DIR)) + goto out_fill; + } + i++; + + if (i >= n) { + if ((*filler)(data, "..", 2, i, + parent_ino(file->f_dentry), DT_DIR)) + goto out_fill; + } + i++; + + list_for_each_entry (de, &mntlist, list) { + sb = de->mnt->mnt_sb; + if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) + continue; + + qmblk = vzquota_find_qmblk(sb); + if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) + continue; + + qmblk_put(qmblk); + i++; + if (i <= n) + continue; + + l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); + if ((*filler)(data, buf, l, i - 1, + vzdq_aquot_getino(sb->s_dev), DT_DIR)) + break; + } + +out_fill: + err = 0; + file->f_pos = i; +out_err: + vzdq_aquot_releasemntlist(ve, &mntlist); + (void)set_exec_env(old_ve); + return err; +} + +static int vzdq_aquotd_looktest(struct inode *inode, void *data) +{ + return inode->i_op == &vzdq_aquotq_inode_operations && + vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; +} + +static int vzdq_aquotd_lookset(struct inode *inode, void *data) +{ + dev_t dev; + + dev = (dev_t)(unsigned long)data; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = vzdq_aquot_getino(dev); + inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_nlink = 2; + inode->i_op = &vzdq_aquotq_inode_operations; + inode->i_fop = &vzdq_aquotq_file_operations; + vzdq_aquot_setidev(inode, dev); + return 0; +} + +static struct dentry *vzdq_aquotd_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct ve_struct *ve, *old_ve; + const unsigned char *s; + int l; + dev_t dev; + struct inode *inode; + + ve = dir->i_sb->s_type->owner_env; + old_ve = set_exec_env(ve); +#ifdef CONFIG_VE + /* + * Lookup is much lighter than readdir, so it can be allowed for the + * host system. But it would be strange to be able to do lookup only + * without readdir... 
+ */ + if (ve_is_super(ve)) + goto out; +#endif + + dev = 0; + l = dentry->d_name.len; + if (l <= 0) + goto out; + for (s = dentry->d_name.name; l > 0; s++, l--) { + if (!isxdigit(*s)) + goto out; + if (dev & ~(~0UL >> 4)) + goto out; + dev <<= 4; + if (isdigit(*s)) + dev += *s - '0'; + else if (islower(*s)) + dev += *s - 'a' + 10; + else + dev += *s - 'A' + 10; + } + dev = new_decode_dev(dev); + + if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) + goto out; + + inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), + vzdq_aquotd_looktest, vzdq_aquotd_lookset, + (void *)(unsigned long)dev); + if (inode == NULL) + goto out; + unlock_new_inode(inode); + + d_add(dentry, inode); + (void)set_exec_env(old_ve); + return NULL; + +out: + (void)set_exec_env(old_ve); + return ERR_PTR(-ENOENT); +} + +static struct file_operations vzdq_aquotd_file_operations = { + .read = &generic_read_dir, + .readdir = &vzdq_aquotd_readdir, +}; + +static struct inode_operations vzdq_aquotd_inode_operations = { + .lookup = &vzdq_aquotd_lookup, +}; + + +/* ---------------------------------------------------------------------- + * + * Initialization and deinitialization + * + * --------------------------------------------------------------------- */ +static ctl_table fs_table[] = { + { + .ctl_name = FS_DQSTATS, + .procname = "quota", + .mode = 0555, + }, + {}, +}; + +static ctl_table sys_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + {}, +}; + +/* + * FIXME: creation of proc entries here is unsafe with respect to module + * unloading. + */ +void vzaquota_init(void) +{ + struct proc_dir_entry *de; + + de = create_proc_glob_entry("vz/vzaquota", + S_IFDIR | S_IRUSR | S_IXUSR, NULL); + if (de != NULL) { + de->proc_iops = &vzdq_aquotd_inode_operations; + de->proc_fops = &vzdq_aquotd_file_operations; + } else + printk("VZDQ: vz/vzaquota creation failed\n"); + register_glob_sysctl_table(sys_table); +} + +void vzaquota_fini(void) +{ + remove_proc_entry("vz/vzaquota", NULL); +} diff -uprN linux-2.6.24/fs/vzdq_mgmt.c linux-2.6.24.ovz/fs/vzdq_mgmt.c --- linux-2.6.24/fs/vzdq_mgmt.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/vzdq_mgmt.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,769 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Switching quota on. 
+ * --------------------------------------------------------------------- */ + +/* + * check limits copied from user + */ +int vzquota_check_sane_limits(struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + + /* softlimit must be less then hardlimit */ + if (qstat->bsoftlimit > qstat->bhardlimit) + goto out; + + if (qstat->isoftlimit > qstat->ihardlimit) + goto out; + + err = 0; +out: + return err; +} + +/* + * check usage values copied from user + */ +int vzquota_check_sane_values(struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + + /* expiration time must not be set if softlimit was not exceeded */ + if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0) + goto out; + + if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0) + goto out; + + err = vzquota_check_sane_limits(qstat); +out: + return err; +} + +/* + * create new quota master block + * this function should: + * - copy limits and usage parameters from user buffer; + * - allock, initialize quota block and insert it to hash; + */ +static int vzquota_create(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -EFAULT; + if (!compat) { + if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) + goto out; + compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); + compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); +#endif + } + + err = -EINVAL; + if (quota_id == 0) + goto out; + + if (vzquota_check_sane_values(&qstat.dq_stat)) + goto out; + err = 0; + qmblk = vzquota_alloc_master(quota_id, &qstat); + + if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ + err = PTR_ERR(qmblk); +out: + up(&vz_quota_sem); + + return err; +} + +/** + * vzquota_on - turn quota on + * + * This function should: + * - find and get refcnt of directory entry for quota root and corresponding + * mountpoint; + * - find corresponding quota block and mark it with given path; + * - check quota tree; + * - initialize quota for the tree root. + */ +static int vzquota_on(unsigned int quota_id, const char __user *quota_root, + char __user *buf) +{ + int err; + struct nameidata nd; + struct vz_quota_master *qmblk; + struct super_block *dqsb; + + dqsb = NULL; + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; + + err = user_path_walk(quota_root, &nd); + if (err) + goto out; + /* init path must be a directory */ + err = -ENOTDIR; + if (!S_ISDIR(nd.dentry->d_inode->i_mode)) + goto out_path; + + qmblk->dq_root_dentry = nd.dentry; + qmblk->dq_root_mnt = nd.mnt; + qmblk->dq_sb = nd.dentry->d_inode->i_sb; + err = vzquota_get_super(qmblk->dq_sb); + if (err) + goto out_super; + + /* + * Serialization with quota initialization and operations is performed + * through generation check: generation is memorized before qmblk is + * found and compared under inode_qmblk_lock with assignment. + * + * Note that the dentry tree is shrunk only for high-level logical + * serialization, purely as a courtesy to the user: to have consistent + * quota statistics, files should be closed etc. on quota on. 
+ */ + err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_dentry->d_inode, + qmblk, buf); + if (err) + goto out_init; + qmblk->dq_state = VZDQ_WORKING; + + up(&vz_quota_sem); + return 0; + +out_init: + dqsb = qmblk->dq_sb; +out_super: + /* clear for qmblk_put/quota_free_master */ + qmblk->dq_sb = NULL; + qmblk->dq_root_dentry = NULL; + qmblk->dq_root_mnt = NULL; +out_path: + path_release(&nd); +out: + if (dqsb) + vzquota_put_super(dqsb); + up(&vz_quota_sem); + return err; +} + + +/* ---------------------------------------------------------------------- + * Switching quota off. + * --------------------------------------------------------------------- */ + +/* + * destroy quota block by ID + */ +static int vzquota_destroy(unsigned int quota_id) +{ + int err; + struct vz_quota_master *qmblk; + struct dentry *dentry; + struct vfsmount *mnt; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state == VZDQ_WORKING) + goto out; /* quota_off first */ + + list_del_init(&qmblk->dq_hash); + dentry = qmblk->dq_root_dentry; + qmblk->dq_root_dentry = NULL; + mnt = qmblk->dq_root_mnt; + qmblk->dq_root_mnt = NULL; + + if (qmblk->dq_sb) + vzquota_put_super(qmblk->dq_sb); + up(&vz_quota_sem); + + qmblk_put(qmblk); + dput(dentry); + mntput(mnt); + return 0; + +out: + up(&vz_quota_sem); + return err; +} + +/** + * vzquota_off - turn quota off + */ + +static int __vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk, + enum writeback_sync_modes sync_mode) +{ + struct writeback_control wbc; + LIST_HEAD(list); + struct vz_quota_ilink *qlnk; + struct inode *inode; + int err, ret; + + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = sync_mode; + + err = ret = 0; + while (!list_empty(lh)) { + if (need_resched()) { + inode_qmblk_unlock(qmblk->dq_sb); + schedule(); + inode_qmblk_lock(qmblk->dq_sb); + continue; + } + + qlnk = list_first_entry(lh, struct vz_quota_ilink, list); + list_move(&qlnk->list, &list); + + inode = igrab(QLNK_INODE(qlnk)); + if (!inode) + continue; + + inode_qmblk_unlock(qmblk->dq_sb); + + wbc.nr_to_write = LONG_MAX; + ret = sync_inode(inode, &wbc); + if (ret) + err = ret; + iput(inode); + + inode_qmblk_lock(qmblk->dq_sb); + } + + list_splice(&list, lh); + return err; +} + +static int vzquota_sync_list(struct list_head *lh, + struct vz_quota_master *qmblk) +{ + (void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); + return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); +} + +static int vzquota_sync_inodes(struct vz_quota_master *qmblk) +{ + int err; + LIST_HEAD(qlnk_list); + + list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); + err = vzquota_sync_list(&qlnk_list, qmblk); + if (!err && !list_empty(&qmblk->dq_ilink_list)) + err = -EBUSY; + list_splice(&qlnk_list, &qmblk->dq_ilink_list); + + return err; +} + +static int vzquota_off(unsigned int quota_id, char __user *buf, int force) +{ + int err, ret; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EALREADY; + if (qmblk->dq_state != VZDQ_WORKING) + goto out; + + inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ + ret = vzquota_sync_inodes(qmblk); + inode_qmblk_unlock(qmblk->dq_sb); + + err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force); + if (err) + goto out; + + err = ret; + /* vzquota_destroy will free resources */ + qmblk->dq_state = VZDQ_STOPING; +out: + up(&vz_quota_sem); + + return err; +} + + 
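The vzquota_off() path above flushes every inode charged to the quota master block in two passes: __vzquota_sync_list() is called first with WB_SYNC_NONE to start writeback without blocking on each inode, then again with WB_SYNC_ALL to wait for completion, so the accounted usage is stable on disk before the state moves to VZDQ_STOPING. A minimal user-space sketch of the same start-then-wait idiom follows, using sync_file_range() and fsync() as rough analogues of the two writeback modes; the helper name and the fixed-size fd array are illustrative assumptions, not part of the patch.

/*
 * Illustrative only: user-space analogue of the two-pass flush done by
 * vzquota_sync_list().  Pass 1 starts writeback asynchronously (compare
 * WB_SYNC_NONE), pass 2 waits for the data to reach disk (compare
 * WB_SYNC_ALL).
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int flush_two_pass(const int *fds, int nfds)
{
	int i, err = 0;

	for (i = 0; i < nfds; i++)	/* pass 1: kick off writeback, do not wait */
		sync_file_range(fds[i], 0, 0, SYNC_FILE_RANGE_WRITE);

	for (i = 0; i < nfds; i++)	/* pass 2: wait until the data is stable */
		if (fsync(fds[i]) != 0)
			err = -1;
	return err;
}

int main(int argc, char **argv)
{
	int fds[16], i, n = 0;

	for (i = 1; i < argc && n < 16; i++) {
		fds[n] = open(argv[i], O_RDWR);
		if (fds[n] >= 0)
			n++;
	}
	if (flush_two_pass(fds, n) != 0)
		perror("flush_two_pass");
	for (i = 0; i < n; i++)
		close(fds[i]);
	return 0;
}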
+/* ---------------------------------------------------------------------- + * Other VZQUOTA ioctl's. + * --------------------------------------------------------------------- */ + +/* + * this function should: + * - set new limits/buffer under quota master block lock + * - if new softlimit less then usage, then set expiration time + * - no need to alloc ugid hash table - we'll do that on demand + */ +int vzquota_update_limit(struct dq_stat *_qstat, + struct dq_stat *qstat) +{ + int err; + + err = -EINVAL; + if (vzquota_check_sane_limits(qstat)) + goto out; + + err = 0; + + /* limits */ + _qstat->bsoftlimit = qstat->bsoftlimit; + _qstat->bhardlimit = qstat->bhardlimit; + /* + * If the soft limit is exceeded, administrator can override the moment + * when the grace period for limit exceeding ends. + * Specifying the moment may be useful if the soft limit is set to be + * lower than the current usage. In the latter case, if the grace + * period end isn't specified, the grace period will start from the + * moment of the first write operation. + * There is a race with the user level. Soft limit may be already + * exceeded before the limit change, and grace period end calculated by + * the kernel will be overriden. User level may check if the limit is + * already exceeded, but check and set calls are not atomic. + * This race isn't dangerous. Under normal cicrumstances, the + * difference between the grace period end calculated by the kernel and + * the user level should be not greater than as the difference between + * the moments of check and set calls, i.e. not bigger than the quota + * timer resolution - 1 sec. + */ + if (qstat->btime != (time_t)0 && + _qstat->bcurrent >= _qstat->bsoftlimit) + _qstat->btime = qstat->btime; + + _qstat->isoftlimit = qstat->isoftlimit; + _qstat->ihardlimit = qstat->ihardlimit; + if (qstat->itime != (time_t)0 && + _qstat->icurrent >= _qstat->isoftlimit) + _qstat->itime = qstat->itime; + +out: + return err; +} + +/* + * set new quota limits. + * this function should: + * copy new limits from user level + * - find quota block + * - set new limits and flags. + */ +static int vzquota_setlimit(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); /* for hash list protection */ + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (!compat) { + if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) + goto out; + compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); + compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); +#endif + } + + qmblk_data_write_lock(qmblk); + err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); + if (err == 0) + qmblk->dq_info = qstat.dq_info; + qmblk_data_write_unlock(qmblk); + +out: + up(&vz_quota_sem); + return err; +} + +/* + * get quota limits. 
+ * very simple - just return stat buffer to user + */ +static int vzquota_getstat(unsigned int quota_id, + struct vz_quota_stat __user *u_qstat, int compat) +{ + int err; + struct vz_quota_stat qstat; + struct vz_quota_master *qmblk; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + qmblk_data_read_lock(qmblk); + /* copy whole buffer under lock */ + memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); + memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); + qmblk_data_read_unlock(qmblk); + + if (!compat) + err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); + else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_stat cqstat; + dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat); + dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info); + err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat)); +#endif + } + if (err) + err = -EFAULT; + +out: + up(&vz_quota_sem); + return err; +} + +/* + * This is a system call to turn per-VE disk quota on. + * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat) +{ + int ret; + int force = 0; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_CREATE: + ret = vzquota_create(quota_id, qstat, compat); + break; + case VZ_DQ_DESTROY: + ret = vzquota_destroy(quota_id); + break; + case VZ_DQ_ON: + /* + * qstat is just a pointer to userspace buffer to + * store busy files path in case of vzquota_on fail + */ + ret = vzquota_on(quota_id, ve_root, (char *)qstat); + break; + case VZ_DQ_OFF_FORCED: + force = 1; + case VZ_DQ_OFF: + /* + * ve_root is just a pointer to userspace buffer to + * store busy files path in case of vzquota_off fail + */ + ret = vzquota_off(quota_id, (char *)ve_root, force); + break; + case VZ_DQ_SETLIMIT: + ret = vzquota_setlimit(quota_id, qstat, compat); + break; + case VZ_DQ_GETSTAT: + ret = vzquota_getstat(quota_id, qstat, compat); + break; + + default: + ret = -EINVAL; + goto out; + } + +out: + return ret; +} + + +/* ---------------------------------------------------------------------- + * Proc filesystem routines + * ---------------------------------------------------------------------*/ + +#if defined(CONFIG_PROC_FS) + +#define QUOTA_UINT_LEN 15 +#define QUOTA_TIME_LEN_FMT_UINT "%11u" +#define QUOTA_NUM_LEN_FMT_UINT "%15u" +#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" +#define QUOTA_TIME_LEN_FMT_STR "%11s" +#define QUOTA_NUM_LEN_FMT_STR "%15s" +#define QUOTA_PROC_MAX_LINE_LEN 2048 + +/* + * prints /proc/ve_dq header line + */ +static int print_proc_header(char * buffer) +{ + return sprintf(buffer, + "%-11s" + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_NUM_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + QUOTA_TIME_LEN_FMT_STR + "\n", + "qid: path", + "usage", "softlimit", "hardlimit", "time", "expire"); +} + +/* + * prints proc master record id, dentry path + */ +static int print_proc_master_id(char * buffer, char * path_buf, + struct vz_quota_master * qp) +{ + char *path; + int over; + + path = NULL; + switch (qp->dq_state) { + case VZDQ_WORKING: + if (!path_buf) { + path = ""; + break; + } + path = d_path(qp->dq_root_dentry, + qp->dq_root_mnt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + path = ""; + break; + } + /* do not print large path, truncate it */ + over = strlen(path) - + (QUOTA_PROC_MAX_LINE_LEN 
- 3 - 3 - + QUOTA_UINT_LEN); + if (over > 0) { + path += over - 3; + path[0] = path[1] = path[3] = '.'; + } + break; + case VZDQ_STARTING: + path = "-- started --"; + break; + case VZDQ_STOPING: + path = "-- stopped --"; + break; + } + + return sprintf(buffer, "%u: %s\n", qp->dq_id, path); +} + +/* + * prints struct vz_quota_stat data + */ +static int print_proc_stat(char * buffer, struct dq_stat *qs, + struct dq_info *qi) +{ + return sprintf(buffer, + "%11s" + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_NUM_LEN_FMT_ULL + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n" + "%11s" + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_NUM_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + QUOTA_TIME_LEN_FMT_UINT + "\n", + "1k-blocks", + (unsigned long long)qs->bcurrent >> 10, + (unsigned long long)qs->bsoftlimit >> 10, + (unsigned long long)qs->bhardlimit >> 10, + (unsigned int)qs->btime, + (unsigned int)qi->bexpire, + "inodes", + qs->icurrent, + qs->isoftlimit, + qs->ihardlimit, + (unsigned int)qs->itime, + (unsigned int)qi->iexpire); +} + + +/* + * for /proc filesystem output + */ +static int vzquota_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len, i; + off_t printed = 0; + char *p = page; + struct vz_quota_master *qp; + struct vz_quota_ilink *ql2; + struct list_head *listp; + char *path_buf; + + path_buf = (char*)__get_free_page(GFP_KERNEL); + if (path_buf == NULL) + return -ENOMEM; + + len = print_proc_header(p); + printed += len; + if (off < printed) /* keep header in output */ { + *start = p + off; + p += len; + } + + down(&vz_quota_sem); + + /* traverse master hash table for all records */ + for (i = 0; i < vzquota_hash_size; i++) { + list_for_each(listp, &vzquota_hash_table[i]) { + qp = list_entry(listp, + struct vz_quota_master, dq_hash); + + /* Skip other VE's information if not root of VE0 */ + if ((!capable(CAP_SYS_ADMIN) || + !capable(CAP_SYS_RESOURCE))) { + ql2 = INODE_QLNK(current->fs->root->d_inode); + if (ql2 == NULL || qp != ql2->qmblk) + continue; + } + /* + * Now print the next record + */ + len = 0; + /* we print quotaid and path only in VE0 */ + if (capable(CAP_SYS_ADMIN)) + len += print_proc_master_id(p+len,path_buf, qp); + len += print_proc_stat(p+len, &qp->dq_stat, + &qp->dq_info); + printed += len; + /* skip unnecessary lines */ + if (printed <= off) + continue; + p += len; + /* provide start offset */ + if (*start == NULL) + *start = p + (off - printed); + /* have we printed all requested size? 
*/ + if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || + (p - *start) >= count) + goto out; + } + } + + *eof = 1; /* checked all hash */ +out: + up(&vz_quota_sem); + + len = 0; + if (*start != NULL) { + len = (p - *start); + if (len > count) + len = count; + } + + if (path_buf) + free_page((unsigned long) path_buf); + + return len; +} + +/* + * Register procfs read callback + */ +int vzquota_proc_init(void) +{ + struct proc_dir_entry *de; + + de = create_proc_entry_mod("vz/vzquota", S_IFREG|S_IRUSR, NULL, + THIS_MODULE); + if (de == NULL) { + /* create "vz" subdirectory, if not exist */ + de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); + if (de == NULL) + goto out_err; + de = create_proc_entry_mod("vzquota", S_IFREG|S_IRUSR, de, + THIS_MODULE); + if (de == NULL) + goto out_err; + } + de->read_proc = vzquota_read_proc; + de->data = NULL; + return 0; +out_err: + return -EBUSY; +} + +void vzquota_proc_release(void) +{ + /* Unregister procfs read callback */ + remove_proc_entry("vz/vzquota", NULL); +} + +#endif diff -uprN linux-2.6.24/fs/vzdq_ops.c linux-2.6.24.ovz/fs/vzdq_ops.c --- linux-2.6.24/fs/vzdq_ops.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/vzdq_ops.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,633 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* ---------------------------------------------------------------------- + * Quota superblock operations - helper functions. + * --------------------------------------------------------------------- */ + +static inline void vzquota_incr_inodes(struct dq_stat *dqstat, + unsigned long number) +{ + dqstat->icurrent += number; +} + +static inline void vzquota_incr_space(struct dq_stat *dqstat, + __u64 number) +{ + dqstat->bcurrent += number; +} + +static inline void vzquota_decr_inodes(struct dq_stat *dqstat, + unsigned long number) +{ + if (dqstat->icurrent > number) + dqstat->icurrent -= number; + else + dqstat->icurrent = 0; + if (dqstat->icurrent < dqstat->isoftlimit) + dqstat->itime = (time_t) 0; +} + +static inline void vzquota_decr_space(struct dq_stat *dqstat, + __u64 number) +{ + if (dqstat->bcurrent > number) + dqstat->bcurrent -= number; + else + dqstat->bcurrent = 0; + if (dqstat->bcurrent < dqstat->bsoftlimit) + dqstat->btime = (time_t) 0; +} + +/* + * better printk() message or use /proc/vzquotamsg interface + * similar to /proc/kmsg + */ +static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag, + const char *fmt) +{ + if (dq_info->flags & flag) /* warning already printed for this + masterblock */ + return; + printk(fmt, dq_id); + dq_info->flags |= flag; +} + +/* + * ignore_hardlimit - + * + * Intended to allow superuser of VE0 to overwrite hardlimits. 
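+ * (when enabled it would return true only for a CAP_SYS_RESOURCE task in
+ * VE0 on a quota with the VZ_QUOTA_OPT_RSQUASH option set, see the
+ * #if 0 body below).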
+ * + * ignore_hardlimit() has a very bad feature: + * + * writepage() operation for writable mapping of a file with holes + * may trigger get_block() with wrong current and as a consequence, + * opens a possibility to overcommit hardlimits + */ +/* for the reason above, it is disabled now */ +static inline int ignore_hardlimit(struct dq_info *dqstat) +{ +#if 0 + return ve_is_super(get_exec_env()) && + capable(CAP_SYS_RESOURCE) && + (dqstat->options & VZ_QUOTA_OPT_RSQUASH); +#else + return 0; +#endif +} + +static int vzquota_check_inodes(struct dq_info *dq_info, + struct dq_stat *dqstat, + unsigned long number, int dq_id) +{ + if (number == 0) + return QUOTA_OK; + + if (dqstat->icurrent + number > dqstat->ihardlimit && + !ignore_hardlimit(dq_info)) { + vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, + "VZ QUOTA: file hardlimit reached for id=%d\n"); + return NO_QUOTA; + } + + if (dqstat->icurrent + number > dqstat->isoftlimit) { + if (dqstat->itime == (time_t)0) { + vzquota_warn(dq_info, dq_id, 0, + "VZ QUOTA: file softlimit exceeded " + "for id=%d\n"); + dqstat->itime = CURRENT_TIME_SECONDS + + dq_info->iexpire; + } else if (CURRENT_TIME_SECONDS >= dqstat->itime && + !ignore_hardlimit(dq_info)) { + vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, + "VZ QUOTA: file softlimit expired " + "for id=%d\n"); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +static int vzquota_check_space(struct dq_info *dq_info, + struct dq_stat *dqstat, + __u64 number, int dq_id, char prealloc) +{ + if (number == 0) + return QUOTA_OK; + + if (prealloc == DQUOT_CMD_FORCE) + return QUOTA_OK; + + if (dqstat->bcurrent + number > dqstat->bhardlimit && + !ignore_hardlimit(dq_info)) { + if (!prealloc) + vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, + "VZ QUOTA: disk hardlimit reached " + "for id=%d\n"); + return NO_QUOTA; + } + + if (dqstat->bcurrent + number > dqstat->bsoftlimit) { + if (dqstat->btime == (time_t)0) { + if (!prealloc) { + vzquota_warn(dq_info, dq_id, 0, + "VZ QUOTA: disk softlimit exceeded " + "for id=%d\n"); + dqstat->btime = CURRENT_TIME_SECONDS + + dq_info->bexpire; + } else { + /* + * Original Linux quota doesn't allow + * preallocation to exceed softlimit so + * exceeding will be always printed + */ + return NO_QUOTA; + } + } else if (CURRENT_TIME_SECONDS >= dqstat->btime && + !ignore_hardlimit(dq_info)) { + if (!prealloc) + vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, + "VZ QUOTA: disk quota " + "softlimit expired " + "for id=%d\n"); + return NO_QUOTA; + } + } + + return QUOTA_OK; +} + +#ifdef CONFIG_VZ_QUOTA_UGID +static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + int type, unsigned long number) +{ + struct dq_info *dqinfo; + struct dq_stat *dqstat; + + if (qugid[type] == NULL) + return QUOTA_OK; + if (qugid[type] == VZ_QUOTA_UGBAD) + return NO_QUOTA; + + if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) + return QUOTA_OK; + if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) + return QUOTA_OK; + if (number == 0) + return QUOTA_OK; + + dqinfo = &qmblk->dq_ugid_info[type]; + dqstat = &qugid[type]->qugid_stat; + + if (dqstat->ihardlimit != 0 && + dqstat->icurrent + number > dqstat->ihardlimit) + return NO_QUOTA; + + if (dqstat->isoftlimit != 0 && + dqstat->icurrent + number > dqstat->isoftlimit) { + if (dqstat->itime == (time_t)0) + dqstat->itime = CURRENT_TIME_SECONDS + + dqinfo->iexpire; + else if (CURRENT_TIME_SECONDS >= dqstat->itime) + return NO_QUOTA; + } + + return QUOTA_OK; +} + +static int vzquota_check_ugid_space(struct 
vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + int type, __u64 number, char prealloc) +{ + struct dq_info *dqinfo; + struct dq_stat *dqstat; + + if (prealloc == DQUOT_CMD_FORCE) + return QUOTA_OK; + + if (qugid[type] == NULL) + return QUOTA_OK; + if (qugid[type] == VZ_QUOTA_UGBAD) + return NO_QUOTA; + + if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) + return QUOTA_OK; + if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) + return QUOTA_OK; + if (number == 0) + return QUOTA_OK; + + dqinfo = &qmblk->dq_ugid_info[type]; + dqstat = &qugid[type]->qugid_stat; + + if (dqstat->bhardlimit != 0 && + dqstat->bcurrent + number > dqstat->bhardlimit) + return NO_QUOTA; + + if (dqstat->bsoftlimit != 0 && + dqstat->bcurrent + number > dqstat->bsoftlimit) { + if (dqstat->btime == (time_t)0) { + if (!prealloc) + dqstat->btime = CURRENT_TIME_SECONDS + + dqinfo->bexpire; + else + /* + * Original Linux quota doesn't allow + * preallocation to exceed softlimit so + * exceeding will be always printed + */ + return NO_QUOTA; + } else if (CURRENT_TIME_SECONDS >= dqstat->btime) + return NO_QUOTA; + } + + return QUOTA_OK; +} +#endif + +/* ---------------------------------------------------------------------- + * Quota superblock operations + * --------------------------------------------------------------------- */ + +/* + * S_NOQUOTA note. + * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for + * - quota file (absent in our case) + * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like + * filesystem-specific new_inode, before the inode gets outside links. + * For the latter case, the only quota operation where care about S_NOQUOTA + * might be required is vzquota_drop, but there S_NOQUOTA has already been + * checked in DQUOT_DROP(). + * So, S_NOQUOTA may be ignored for now in the VZDQ code. + * + * The above note is not entirely correct. + * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from + * delete_inode if new_inode fails (for example, because of inode quota + * limits), so S_NOQUOTA check is needed in free_inode. + * This seems to be the dark corner of the current quota API. + */ + +/* + * Initialize quota operations for the specified inode. + */ +static int vzquota_initialize(struct inode *inode, int type) +{ + vzquota_inode_init_call(inode); + return 0; /* ignored by caller */ +} + +/* + * Release quota for the specified inode. + */ +static int vzquota_drop(struct inode *inode) +{ + vzquota_inode_drop_call(inode); + return 0; /* ignored by caller */ +} + +/* + * Allocate block callback. + * + * If (prealloc) disk quota exceeding warning is not printed. + * See Linux quota to know why. 
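+ * The master block limits are checked first, then (under
+ * CONFIG_VZ_QUOTA_UGID) each per-UID/GID record; the usage counters are
+ * incremented only after all checks have passed.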
+ * + * Return: + * QUOTA_OK == 0 on SUCCESS + * NO_QUOTA == 1 if allocation should fail + */ +static int vzquota_alloc_space(struct inode *inode, + qsize_t number, int prealloc) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + int ret = QUOTA_OK; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid[MAXQUOTAS]; +#endif + + /* checking first */ + ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, + number, qmblk->dq_id, prealloc); + if (ret == NO_QUOTA) + goto no_quota; +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; + ret = vzquota_check_ugid_space(qmblk, qugid, + cnt, number, prealloc); + if (ret == NO_QUOTA) + goto no_quota; + } + /* check ok, may increment */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (qugid[cnt] == NULL) + continue; + vzquota_incr_space(&qugid[cnt]->qugid_stat, number); + } +#endif + vzquota_incr_space(&qmblk->dq_stat, number); + vzquota_data_unlock(inode, &data); + } + + inode_add_bytes(inode, number); + might_sleep(); + return QUOTA_OK; + +no_quota: + vzquota_data_unlock(inode, &data); + return NO_QUOTA; +} + +/* + * Allocate inodes callback. + * + * Return: + * QUOTA_OK == 0 on SUCCESS + * NO_QUOTA == 1 if allocation should fail + */ +static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + int ret = QUOTA_OK; + + qmblk = vzquota_inode_data((struct inode *)inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid *qugid[MAXQUOTAS]; +#endif + + /* checking first */ + ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, + number, qmblk->dq_id); + if (ret == NO_QUOTA) + goto no_quota; +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; + ret = vzquota_check_ugid_inodes(qmblk, qugid, + cnt, number); + if (ret == NO_QUOTA) + goto no_quota; + } + /* check ok, may increment */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (qugid[cnt] == NULL) + continue; + vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); + } +#endif + vzquota_incr_inodes(&qmblk->dq_stat, number); + vzquota_data_unlock((struct inode *)inode, &data); + } + + might_sleep(); + return QUOTA_OK; + +no_quota: + vzquota_data_unlock((struct inode *)inode, &data); + return NO_QUOTA; +} + +/* + * Free space callback. + */ +static int vzquota_free_space(struct inode *inode, qsize_t number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; /* isn't checked by the caller */ + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + vzquota_decr_space(&qmblk->dq_stat, number); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_space(&qugid->qugid_stat, number); + } +#endif + vzquota_data_unlock(inode, &data); + } + inode_sub_bytes(inode, number); + might_sleep(); + return QUOTA_OK; +} + +/* + * Free inodes callback. 
+ */ +static int vzquota_free_inode(const struct inode *inode, unsigned long number) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + qmblk = vzquota_inode_data((struct inode *)inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return NO_QUOTA; + if (qmblk != NULL) { +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + vzquota_decr_inodes(&qmblk->dq_stat, number); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_inodes(&qugid->qugid_stat, number); + } +#endif + vzquota_data_unlock((struct inode *)inode, &data); + } + might_sleep(); + return QUOTA_OK; +} + +void vzquota_inode_off(struct inode * inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* The call is made through virtinfo, it can be an inode + * not controlled by vzquota. + */ + if (inode->i_sb->dq_op != &vz_quota_operations) + return; + + qmblk = vzquota_inode_data(inode, &data); + if (qmblk == VZ_QUOTA_BAD) + return; + + if (qmblk == NULL) { + /* Tricky place. If qmblk == NULL, it means that this inode + * is not in area controlled by vzquota (except for rare + * case of already set S_NOQUOTA). But we have to set + * S_NOQUOTA in any case because vzquota can be turned + * on later, when this inode is invalid from viewpoint + * of vzquota. + * + * To be safe, we reacquire vzquota lock. + */ + inode_qmblk_lock(inode->i_sb); + inode->i_flags |= S_NOQUOTA; + inode_qmblk_unlock(inode->i_sb); + return; + } else { + loff_t bytes = inode_get_bytes(inode); +#ifdef CONFIG_VZ_QUOTA_UGID + int cnt; + struct vz_quota_ugid * qugid; +#endif + + inode->i_flags |= S_NOQUOTA; + + vzquota_decr_space(&qmblk->dq_stat, bytes); + vzquota_decr_inodes(&qmblk->dq_stat, 1); +#ifdef CONFIG_VZ_QUOTA_UGID + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + qugid = INODE_QLNK(inode)->qugid[cnt]; + if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) + continue; + vzquota_decr_space(&qugid->qugid_stat, bytes); + vzquota_decr_inodes(&qugid->qugid_stat, 1); + } +#endif + + vzquota_data_unlock(inode, &data); + + vzquota_inode_drop_call(inode); + } +} + + +#ifdef CONFIG_VZ_QUOTA_UGID + +/* + * helper function for quota_transfer + * check that we can add inode to this quota_id + */ +static int vzquota_transfer_check(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid[], + unsigned int type, __u64 size) +{ + if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK || + vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK) + return -1; + return 0; +} + +int vzquota_transfer_usage(struct inode *inode, + int mask, + struct vz_quota_ilink *qlnk) +{ + struct vz_quota_ugid *qugid_old; + __u64 space; + int i; + + space = inode_get_bytes(inode); + for (i = 0; i < MAXQUOTAS; i++) { + if (!(mask & (1 << i))) + continue; + /* + * Do not permit chown a file if its owner does not have + * ugid record. This might happen if we somehow exceeded + * the UID/GID (e.g. set uglimit less than number of users). 
+ */ + if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD) + return -1; + if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space)) + return -1; + } + + for (i = 0; i < MAXQUOTAS; i++) { + if (!(mask & (1 << i))) + continue; + qugid_old = INODE_QLNK(inode)->qugid[i]; + vzquota_decr_space(&qugid_old->qugid_stat, space); + vzquota_decr_inodes(&qugid_old->qugid_stat, 1); + vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space); + vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1); + } + return 0; +} + +/* + * Transfer the inode between diffent user/group quotas. + */ +static int vzquota_transfer(struct inode *inode, struct iattr *iattr) +{ + return vzquota_inode_transfer_call(inode, iattr) ? + NO_QUOTA : QUOTA_OK; +} + +#else /* CONFIG_VZ_QUOTA_UGID */ + +static int vzquota_transfer(struct inode *inode, struct iattr *iattr) +{ + return QUOTA_OK; +} + +#endif + +/* + * Called under following semaphores: + * old_d->d_inode->i_sb->s_vfs_rename_sem + * old_d->d_inode->i_sem + * new_d->d_inode->i_sem + * [not verified --SAW] + */ +static int vzquota_rename(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + return vzquota_rename_check(inode, old_dir, new_dir) ? + NO_QUOTA : QUOTA_OK; +} + +/* + * Structure of superblock diskquota operations. + */ +struct dquot_operations vz_quota_operations = { + .initialize = vzquota_initialize, + .drop = vzquota_drop, + .alloc_space = vzquota_alloc_space, + .alloc_inode = vzquota_alloc_inode, + .free_space = vzquota_free_space, + .free_inode = vzquota_free_inode, + .transfer = vzquota_transfer, + .rename = vzquota_rename, +}; diff -uprN linux-2.6.24/fs/vzdq_tree.c linux-2.6.24.ovz/fs/vzdq_tree.c --- linux-2.6.24/fs/vzdq_tree.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/vzdq_tree.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,286 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * This file contains Virtuozzo quota tree implementation + */ + +#include +#include +#include + +struct quotatree_tree *quotatree_alloc(void) +{ + int l; + struct quotatree_tree *tree; + + tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); + if (tree == NULL) + goto out; + + for (l = 0; l < QUOTATREE_DEPTH; l++) { + INIT_LIST_HEAD(&tree->levels[l].usedlh); + INIT_LIST_HEAD(&tree->levels[l].freelh); + tree->levels[l].freenum = 0; + } + tree->root = NULL; + tree->leaf_num = 0; +out: + return tree; +} + +static struct quotatree_node * +quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, + struct quotatree_find_state *st) +{ + void **block; + struct quotatree_node *parent; + int l, index; + + parent = NULL; + block = (void **)&tree->root; + l = 0; + while (l < level && *block != NULL) { + index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; + parent = *block; + block = parent->blocks + index; + l++; + } + if (st != NULL) { + st->block = block; + st->level = l; + } + + return parent; +} + +void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st) +{ + quotatree_follow(tree, id, QUOTATREE_DEPTH, st); + if (st->level == QUOTATREE_DEPTH) + return *st->block; + else + return NULL; +} + +void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) +{ + int i, count; + struct quotatree_node *p; + void *leaf; + + if (QTREE_LEAFNUM(tree) <= index) + return NULL; + + count = 0; + list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { + for (i = 0; i < QUOTATREE_BSIZE; i++) { + leaf = p->blocks[i]; + if (leaf == NULL) + continue; + if (count == index) + return leaf; + count++; + } + } + return NULL; +} + +/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) + * in the tree... 
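+ * The walk starts at the slot following @id inside its leaf node and then
+ * continues along the used-leaf list until a non-NULL block is found.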
*/ +void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id) +{ + int off; + struct quotatree_node *parent, *p; + struct list_head *lh; + + /* get parent refering correct quota tree node of the last level */ + parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL); + if (!parent) + return NULL; + + off = (id & QUOTATREE_BMASK) + 1; /* next ugid */ + lh = &parent->list; + do { + p = list_entry(lh, struct quotatree_node, list); + for ( ; off < QUOTATREE_BSIZE; off++) + if (p->blocks[off]) + return p->blocks[off]; + off = 0; + lh = lh->next; + } while (lh != &QTREE_LEAFLVL(tree)->usedlh); + + return NULL; +} + +int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st, void *data) +{ + struct quotatree_node *p; + int l, index; + + while (st->level < QUOTATREE_DEPTH) { + l = st->level; + if (!list_empty(&tree->levels[l].freelh)) { + p = list_entry(tree->levels[l].freelh.next, + struct quotatree_node, list); + list_del(&p->list); + } else { + p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL); + if (p == NULL) + return -ENOMEM; + /* save block number in the l-level + * it uses for quota file generation */ + p->num = tree->levels[l].freenum++; + } + list_add(&p->list, &tree->levels[l].usedlh); + memset(p->blocks, 0, sizeof(p->blocks)); + *st->block = p; + + index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; + st->block = p->blocks + index; + st->level++; + } + tree->leaf_num++; + *st->block = data; + + return 0; +} + +static struct quotatree_node * +quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id, + int level) +{ + struct quotatree_node *parent; + struct quotatree_find_state st; + + parent = quotatree_follow(tree, id, level, &st); + if (st.level == QUOTATREE_DEPTH) + tree->leaf_num--; + *st.block = NULL; + return parent; +} + +void quotatree_remove(struct quotatree_tree *tree, quotaid_t id) +{ + struct quotatree_node *p; + int level, i; + + p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH); + for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) { + for (i = 0; i < QUOTATREE_BSIZE; i++) + if (p->blocks[i] != NULL) + return; + list_move(&p->list, &tree->levels[level].freelh); + p = quotatree_remove_ptr(tree, id, level); + } +} + +#if 0 +static void quotatree_walk(struct quotatree_tree *tree, + struct quotatree_node *node_start, + quotaid_t id_start, + int level_start, int level_end, + int (*callback)(struct quotatree_tree *, + quotaid_t id, + int level, + void *ptr, + void *data), + void *data) +{ + struct quotatree_node *p; + int l, shift, index; + quotaid_t id; + struct quotatree_find_state st; + + p = node_start; + l = level_start; + shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; + id = id_start; + index = 0; + + /* + * Invariants: + * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; + * id & ((1 << shift) - 1) == 0 + * p is l-level node corresponding to id + */ + do { + if (!p) + break; + + if (l < level_end) { + for (; index < QUOTATREE_BSIZE; index++) + if (p->blocks[index] != NULL) + break; + if (index < QUOTATREE_BSIZE) { + /* descend */ + p = p->blocks[index]; + l++; + shift -= QUOTAID_BBITS; + id += (quotaid_t)index << shift; + index = 0; + continue; + } + } + + if ((*callback)(tree, id, l, p, data)) + break; + + /* ascend and to the next node */ + p = quotatree_follow(tree, id, l, &st); + + index = ((id >> shift) & QUOTATREE_BMASK) + 1; + l--; + shift += QUOTAID_BBITS; + id &= ~(((quotaid_t)1 << shift) - 1); + } while (l >= level_start); +} +#endif + +static void free_list(struct 
list_head *node_list) +{ + struct quotatree_node *p, *tmp; + + list_for_each_entry_safe(p, tmp, node_list, list) { + list_del(&p->list); + kfree(p); + } +} + +static inline void quotatree_free_nodes(struct quotatree_tree *tree) +{ + int i; + + for (i = 0; i < QUOTATREE_DEPTH; i++) { + free_list(&tree->levels[i].usedlh); + free_list(&tree->levels[i].freelh); + } +} + +static void quotatree_free_leafs(struct quotatree_tree *tree, + void (*dtor)(void *)) +{ + int i; + struct quotatree_node *p; + + list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { + for (i = 0; i < QUOTATREE_BSIZE; i++) { + if (p->blocks[i] == NULL) + continue; + + dtor(p->blocks[i]); + } + } +} + +void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)) +{ + quotatree_free_leafs(tree, dtor); + quotatree_free_nodes(tree); + kfree(tree); +} diff -uprN linux-2.6.24/fs/vzdq_ugid.c linux-2.6.24.ovz/fs/vzdq_ugid.c --- linux-2.6.24/fs/vzdq_ugid.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/vzdq_ugid.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1221 @@ +/* + * Copyright (C) 2002 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo UID/GID disk quota implementation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * XXX + * may be something is needed for sb->s_dquot->info[]? + */ + +#define USRQUOTA_MASK (1 << USRQUOTA) +#define GRPQUOTA_MASK (1 << GRPQUOTA) +#define QTYPE2MASK(type) (1 << (type)) + +static struct kmem_cache *vz_quota_ugid_cachep; + +/* guard to protect vz_quota_master from destroy in quota_on/off. Also protects + * list on the hash table */ +extern struct semaphore vz_quota_sem; + +inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) +{ + if (qugid != VZ_QUOTA_UGBAD) + atomic_inc(&qugid->qugid_count); + return qugid; +} + +/* we don't limit users with zero limits */ +static inline int vzquota_fake_stat(struct dq_stat *stat) +{ + return stat->bhardlimit == 0 && stat->bsoftlimit == 0 && + stat->ihardlimit == 0 && stat->isoftlimit == 0; +} + +/* callback function for quotatree_free() */ +static inline void vzquota_free_qugid(void *ptr) +{ + kmem_cache_free(vz_quota_ugid_cachep, ptr); +} + +/* + * destroy ugid, if it have zero refcount, limits and usage + * must be called under qmblk->dq_sem + */ +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid) +{ + if (qugid == VZ_QUOTA_UGBAD) + return; + qmblk_data_read_lock(qmblk); + if (atomic_dec_and_test(&qugid->qugid_count) && + (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 && + vzquota_fake_stat(&qugid->qugid_stat) && + qugid->qugid_stat.bcurrent == 0 && + qugid->qugid_stat.icurrent == 0) { + quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type), + qugid->qugid_id); + qmblk->dq_ugid_count--; + vzquota_free_qugid(qugid); + } + qmblk_data_read_unlock(qmblk); +} + +/* + * Get ugid block by its index, like it would present in array. + * In reality, this is not array - this is leafs chain of the tree. + * NULL if index is out of range. + * qmblk semaphore is required to protect the tree. 
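+ * The index counts only occupied leaf slots, in tree traversal order
+ * (see quotatree_leaf_byindex()).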
+ */ +static inline struct vz_quota_ugid * +vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type) +{ + return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index); +} + +/* + * get next element from ugid "virtual array" + * ugid must be in current array and this array may not be changed between + * two accesses (quaranteed by "stopped" quota state and quota semaphore) + * qmblk semaphore is required to protect the tree + */ +static inline struct vz_quota_ugid * +vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid) +{ + return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type), + qugid->qugid_id); +} + +/* + * requires dq_sem + */ +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags) +{ + struct vz_quota_ugid *qugid; + struct quotatree_tree *tree; + struct quotatree_find_state st; + + tree = QUGID_TREE(qmblk, type); + qugid = quotatree_find(tree, quota_id, &st); + if (qugid) + goto success; + + /* caller does not want alloc */ + if (flags & VZDQUG_FIND_DONT_ALLOC) + goto fail; + + if (flags & VZDQUG_FIND_FAKE) + goto doit; + + /* check limit */ + if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max) + goto fail; + + /* see comment at VZDQUG_FIXED_SET define */ + if (qmblk->dq_flags & VZDQUG_FIXED_SET) + goto fail; + +doit: + /* alloc new structure */ + qugid = kmem_cache_alloc(vz_quota_ugid_cachep, + GFP_NOFS | __GFP_NOFAIL); + if (qugid == NULL) + goto fail; + + /* initialize new structure */ + qugid->qugid_id = quota_id; + memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat)); + qugid->qugid_type = type; + atomic_set(&qugid->qugid_count, 0); + + /* insert in tree */ + if (quotatree_insert(tree, quota_id, &st, qugid) < 0) + goto fail_insert; + qmblk->dq_ugid_count++; + +success: + vzquota_get_ugid(qugid); + return qugid; + +fail_insert: + vzquota_free_qugid(qugid); +fail: + return VZ_QUOTA_UGBAD; +} + +/* + * takes dq_sem, may schedule + */ +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags) +{ + struct vz_quota_ugid *qugid; + + down(&qmblk->dq_sem); + qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); + up(&qmblk->dq_sem); + + return qugid; +} + +/* + * destroy all ugid records on given quota master + */ +void vzquota_kill_ugid(struct vz_quota_master *qmblk) +{ + BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) || + (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL)); + + if (qmblk->dq_uid_tree != NULL) { + quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid); + quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid); + } +} + + +/* ---------------------------------------------------------------------- + * Management interface to ugid quota for (super)users. 
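+ * (pass-through dquot_operations plus the vz_quotactl_operations
+ * entry points defined below).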
+ * --------------------------------------------------------------------- */ + +static int vzquota_initialize2(struct inode *inode, int type) +{ + return QUOTA_OK; +} + +static int vzquota_drop2(struct inode *inode) +{ + return QUOTA_OK; +} + +static int vzquota_alloc_space2(struct inode *inode, + qsize_t number, int prealloc) +{ + inode_add_bytes(inode, number); + return QUOTA_OK; +} + +static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) +{ + return QUOTA_OK; +} + +static int vzquota_free_space2(struct inode *inode, qsize_t number) +{ + inode_sub_bytes(inode, number); + return QUOTA_OK; +} + +static int vzquota_free_inode2(const struct inode *inode, unsigned long number) +{ + return QUOTA_OK; +} + +static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) +{ + return QUOTA_OK; +} + +struct dquot_operations vz_quota_operations2 = { + .initialize = vzquota_initialize2, + .drop = vzquota_drop2, + .alloc_space = vzquota_alloc_space2, + .alloc_inode = vzquota_alloc_inode2, + .free_space = vzquota_free_space2, + .free_inode = vzquota_free_inode2, + .transfer = vzquota_transfer2, +}; + + +asmlinkage long sys_unlink(const char __user * pathname); +asmlinkage long sys_rename(const char __user * oldname, + const char __user * newname); +asmlinkage long sys_symlink(const char __user * oldname, + const char __user * newname); + +/* called under sb->s_umount semaphore */ +static int vz_restore_symlink(struct super_block *sb, char *path, int type) +{ + mm_segment_t oldfs; + char *newpath; + char dest[64]; + const char *names[] = { + [USRQUOTA] "aquota.user", + [GRPQUOTA] "aquota.group" + }; + int err; + + newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL); + if (newpath == NULL) + return -ENOMEM; + + strcpy(newpath, path); + strcat(newpath, ".new"); + + sprintf(dest, "/proc/vz/vzaquota/%08x/%s", + new_encode_dev(sb->s_dev), names[type]); + + /* + * Lockdep will learn unneeded dependency while unlink(2): + * ->s_umount => ->i_mutex/1 => ->i_mutex + * Reverse dependency is, + * open_namei() => ->i_mutex => lookup_hash() => __lookup_hash() + * => ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev() + * => user_get_super() => ->s_umount + * + * However, first set of ->i_mutex'es belong to /, second to /proc . + * Right fix is to get rid of vz_restore_symlink(), of course. + */ + up_read(&sb->s_umount); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = sys_unlink(newpath); + if (err < 0 && err != -ENOENT) + goto out_restore; + err = sys_symlink(dest, newpath); + if (err < 0) + goto out_restore; + err = sys_rename(newpath, path); +out_restore: + set_fs(oldfs); + + down_read(&sb->s_umount); + /* umounted meanwhile? 
*/ + if (err == 0 && !sb->s_root) + err = -ENODEV; + + kfree(newpath); + return err; +} + +/* called under sb->s_umount semaphore */ +static int vz_quota_on(struct super_block *sb, int type, + int format_id, char *path) +{ + struct vz_quota_master *qmblk; + int mask, mask2; + int err; + + qmblk = vzquota_find_qmblk(sb); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = vz_restore_symlink(sb, path, type); + if (err < 0) + goto out_put; + + down(&vz_quota_sem); + mask = 0; + mask2 = 0; + sb->dq_op = &vz_quota_operations2; + sb->s_qcop = &vz_quotactl_operations; + if (type == USRQUOTA) { + mask = DQUOT_USR_ENABLED; + mask2 = VZDQ_USRQUOTA; + } + if (type == GRPQUOTA) { + mask = DQUOT_GRP_ENABLED; + mask2 = VZDQ_GRPQUOTA; + } + err = -EBUSY; + if (qmblk->dq_flags & mask2) + goto out_sem; + + err = 0; + qmblk->dq_flags |= mask2; + sb->s_dquot.flags |= mask; + +out_sem: + up(&vz_quota_sem); +out_put: + qmblk_put(qmblk); +out: + return err; +} + +static int vz_quota_off(struct super_block *sb, int type) +{ + struct vz_quota_master *qmblk; + int mask2; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + mask2 = 0; + if (type == USRQUOTA) + mask2 = VZDQ_USRQUOTA; + if (type == GRPQUOTA) + mask2 = VZDQ_GRPQUOTA; + err = -EINVAL; + if (!(qmblk->dq_flags & mask2)) + goto out; + + qmblk->dq_flags &= ~mask2; + err = 0; + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +static int vz_quota_sync(struct super_block *sb, int type) +{ + return 0; /* vz quota is always uptodate */ +} + +static int vz_get_dqblk(struct super_block *sb, int type, + qid_t id, struct if_dqblk *di) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid *ugid; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = 0; + ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC); + if (ugid != VZ_QUOTA_UGBAD) { + qmblk_data_read_lock(qmblk); + di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10; + di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; + di->dqb_curspace = ugid->qugid_stat.bcurrent; + di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; + di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; + di->dqb_curinodes = ugid->qugid_stat.icurrent; + di->dqb_btime = ugid->qugid_stat.btime; + di->dqb_itime = ugid->qugid_stat.itime; + qmblk_data_read_unlock(qmblk); + di->dqb_valid = QIF_ALL; + vzquota_put_ugid(qmblk, ugid); + } else { + memset(di, 0, sizeof(*di)); + di->dqb_valid = QIF_ALL; + } + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +/* must be called under vz_quota_sem */ +static int __vz_set_dqblk(struct vz_quota_master *qmblk, + int type, qid_t id, struct if_dqblk *di) +{ + struct vz_quota_ugid *ugid; + + ugid = vzquota_find_ugid(qmblk, id, type, 0); + if (ugid == VZ_QUOTA_UGBAD) + return -ESRCH; + + qmblk_data_write_lock(qmblk); + /* + * Subtle compatibility breakage. + * + * Some old non-vz kernel quota didn't start grace period + * if the new soft limit happens to be below the usage. + * Non-vz kernel quota in 2.4.20 starts the grace period + * (if it hasn't been started). + * Current non-vz kernel performs even more complicated + * manipulations... 
+ * + * Also, current non-vz kernels have inconsistency related to + * the grace time start. In regular operations the grace period + * is started if the usage is greater than the soft limit (and, + * strangely, is cancelled if the usage is less). + * However, set_dqblk starts the grace period if the usage is greater + * or equal to the soft limit. + * + * Here we try to mimic the behavior of the current non-vz kernel. + */ + if (di->dqb_valid & QIF_BLIMITS) { + ugid->qugid_stat.bhardlimit = + (__u64)di->dqb_bhardlimit << 10; + ugid->qugid_stat.bsoftlimit = + (__u64)di->dqb_bsoftlimit << 10; + if (di->dqb_bsoftlimit == 0 || + ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) + ugid->qugid_stat.btime = 0; + else if (!(di->dqb_valid & QIF_BTIME)) + ugid->qugid_stat.btime = CURRENT_TIME_SECONDS + + qmblk->dq_ugid_info[type].bexpire; + else + ugid->qugid_stat.btime = di->dqb_btime; + } + if (di->dqb_valid & QIF_ILIMITS) { + ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; + ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; + if (di->dqb_isoftlimit == 0 || + ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) + ugid->qugid_stat.itime = 0; + else if (!(di->dqb_valid & QIF_ITIME)) + ugid->qugid_stat.itime = CURRENT_TIME_SECONDS + + qmblk->dq_ugid_info[type].iexpire; + else + ugid->qugid_stat.itime = di->dqb_itime; + } + qmblk_data_write_unlock(qmblk); + vzquota_put_ugid(qmblk, ugid); + + return 0; +} + +static int vz_set_dqblk(struct super_block *sb, int type, + qid_t id, struct if_dqblk *di) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + err = __vz_set_dqblk(qmblk, type, id, di); +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +static int vz_get_dqinfo(struct super_block *sb, int type, + struct if_dqinfo *ii) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = 0; + ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; + ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; + ii->dqi_flags = 0; + ii->dqi_valid = IIF_ALL; + +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +/* must be called under vz_quota_sem */ +static int __vz_set_dqinfo(struct vz_quota_master *qmblk, + int type, struct if_dqinfo *ii) +{ + if (ii->dqi_valid & IIF_FLAGS) + if (ii->dqi_flags & DQF_MASK) + return -EINVAL; + + if (ii->dqi_valid & IIF_BGRACE) + qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; + if (ii->dqi_valid & IIF_IGRACE) + qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; + return 0; +} + +static int vz_set_dqinfo(struct super_block *sb, int type, + struct if_dqinfo *ii) +{ + struct vz_quota_master *qmblk; + int err; + + qmblk = vzquota_find_qmblk(sb); + down(&vz_quota_sem); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + err = __vz_set_dqinfo(qmblk, type, ii); +out: + up(&vz_quota_sem); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + return err; +} + +#ifdef CONFIG_QUOTA_COMPAT + +#define Q_GETQUOTI_SIZE 1024 + +#define UGID2DQBLK(dst, src) \ + do { \ + (dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \ + (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ + 
(dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \ + /* in 1K blocks */ \ + (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ + /* in 1K blocks */ \ + (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ + /* in bytes, 64 bit */ \ + (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \ + (dst)->dqb_btime = (src)->qugid_stat.btime; \ + (dst)->dqb_itime = (src)->qugid_stat.itime; \ + } while (0) + +static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, + struct v2_disk_dqblk __user *dqblk) +{ + struct vz_quota_master *qmblk; + struct v2_disk_dqblk *data, *kbuf; + struct vz_quota_ugid *ugid; + int count; + int err; + + qmblk = vzquota_find_qmblk(sb); + err = -ESRCH; + if (qmblk == NULL) + goto out; + err = -EIO; + if (qmblk == VZ_QUOTA_BAD) + goto out; + + err = -ENOMEM; + kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf)); + if (!kbuf) + goto out; + + down(&vz_quota_sem); + down(&qmblk->dq_sem); + for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; + ugid != NULL && count < Q_GETQUOTI_SIZE; + count++) + { + data = kbuf + count; + qmblk_data_read_lock(qmblk); + UGID2DQBLK(data, ugid); + qmblk_data_read_unlock(qmblk); + data->dqb_id = ugid->qugid_id; + + /* Find next entry */ + ugid = vzquota_get_next(qmblk, ugid); + BUG_ON(ugid != NULL && ugid->qugid_type != type); + } + up(&qmblk->dq_sem); + up(&vz_quota_sem); + + err = count; + if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf))) + err = -EFAULT; + + vfree(kbuf); +out: + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + qmblk_put(qmblk); + + return err; +} + +#endif + +struct quotactl_ops vz_quotactl_operations = { + .quota_on = vz_quota_on, + .quota_off = vz_quota_off, + .quota_sync = vz_quota_sync, + .get_info = vz_get_dqinfo, + .set_info = vz_set_dqinfo, + .get_dqblk = vz_get_dqblk, + .set_dqblk = vz_set_dqblk, +#ifdef CONFIG_QUOTA_COMPAT + .get_quoti = vz_get_quoti, +#endif +}; + + +/* ---------------------------------------------------------------------- + * Management interface for host system admins. 
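+ * (handlers for the VZ_DQ_UGID_* commands dispatched from
+ * do_vzquotaugidctl() below).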
+ * --------------------------------------------------------------------- */ + +static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, + struct vz_quota_iface __user *u_ugid_buf, int compat) +{ + struct vz_quota_master *qmblk; + int ret; + + down(&vz_quota_sem); + + ret = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + ret = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; /* working quota doesn't accept new ugids */ + + ret = 0; + /* start to add ugids */ + for (ret = 0; ret < ugid_size; ret++) { + struct vz_quota_iface ugid_buf; + struct vz_quota_ugid *ugid; + + if (!compat) { + if (copy_from_user(&ugid_buf, u_ugid_buf, + sizeof(ugid_buf))) + break; + u_ugid_buf++; /* next user buffer */ + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_iface oqif; + if (copy_from_user(&oqif, u_ugid_buf, + sizeof(oqif))) + break; + ugid_buf.qi_id = oqif.qi_id; + ugid_buf.qi_type = oqif.qi_type; + compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat); + u_ugid_buf = (struct vz_quota_iface __user *) + (((void *)u_ugid_buf) + sizeof(oqif)); +#endif + } + + if (ugid_buf.qi_type >= MAXQUOTAS) + break; /* bad quota type - this is the only check */ + + ugid = vzquota_find_ugid(qmblk, + ugid_buf.qi_id, ugid_buf.qi_type, 0); + if (ugid == VZ_QUOTA_UGBAD) { + qmblk->dq_flags |= VZDQUG_FIXED_SET; + break; /* limit reached */ + } + + /* update usage/limits + * we can copy the data without the lock, because the data + * cannot be modified in VZDQ_STARTING state */ + ugid->qugid_stat = ugid_buf.qi_stat; + + vzquota_put_ugid(qmblk, ugid); + } +out: + up(&vz_quota_sem); + + return ret; +} + +static int quota_ugid_setgrace(unsigned int quota_id, + struct dq_info __user u_dq_info[], int compat) +{ + struct vz_quota_master *qmblk; + struct dq_info dq_info[MAXQUOTAS]; + struct dq_info *target; + int err, type; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EBUSY; + if (qmblk->dq_state != VZDQ_STARTING) + goto out; /* working quota doesn't accept changing options */ + + err = -EFAULT; + if (!compat) { + if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) + goto out; + } else { +#ifdef CONFIG_COMPAT + struct compat_dq_info odqi[MAXQUOTAS]; + if (copy_from_user(odqi, u_dq_info, sizeof(odqi))) + goto out; + for (type = 0; type < MAXQUOTAS; type++) + compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]); +#endif + } + + err = 0; + + /* update in qmblk */ + for (type = 0; type < MAXQUOTAS; type++) { + target = &qmblk->dq_ugid_info[type]; + target->bexpire = dq_info[type].bexpire; + target->iexpire = dq_info[type].iexpire; + } +out: + up(&vz_quota_sem); + + return err; +} + +static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, + struct vz_quota_iface *u_ugid_buf) +{ + int type, count; + struct vz_quota_ugid *ugid; + + if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + + QTREE_LEAFNUM(qmblk->dq_gid_tree) + <= index) + return 0; + + count = 0; + + type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? 
USRQUOTA : GRPQUOTA; + if (type == GRPQUOTA) + index -= QTREE_LEAFNUM(qmblk->dq_uid_tree); + + /* loop through ugid and then qgid quota */ +repeat: + for (ugid = vzquota_get_byindex(qmblk, index, type); + ugid != NULL && count < size; + ugid = vzquota_get_next(qmblk, ugid), count++) + { + struct vz_quota_iface ugid_buf; + + /* form interface buffer and send in to user-level */ + qmblk_data_read_lock(qmblk); + memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat, + sizeof(ugid_buf.qi_stat)); + qmblk_data_read_unlock(qmblk); + ugid_buf.qi_id = ugid->qugid_id; + ugid_buf.qi_type = ugid->qugid_type; + + memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf)); + u_ugid_buf++; /* next portion of user buffer */ + } + + if (type == USRQUOTA && count < size) { + type = GRPQUOTA; + index = 0; + goto repeat; + } + + return count; +} + +static int quota_ugid_getstat(unsigned int quota_id, + int index, int size, struct vz_quota_iface __user *u_ugid_buf, + int compat) +{ + struct vz_quota_master *qmblk; + struct vz_quota_iface *k_ugid_buf; + int err; + + if (index < 0 || size < 0) + return -EINVAL; + + if (size > INT_MAX / sizeof(struct vz_quota_iface)) + return -EINVAL; + + k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface)); + if (k_ugid_buf == NULL) + return -ENOMEM; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + down(&qmblk->dq_sem); + err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf); + up(&qmblk->dq_sem); + if (err < 0) + goto out; + + if (!compat) { + if (copy_to_user(u_ugid_buf, k_ugid_buf, + err * sizeof(struct vz_quota_iface))) + err = -EFAULT; + } else { +#ifdef CONFIG_COMPAT + struct compat_vz_quota_iface oqif; + int i; + for (i = 0; i < err; i++) { + oqif.qi_id = k_ugid_buf[i].qi_id; + oqif.qi_type = k_ugid_buf[i].qi_type; + dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat, + &oqif.qi_stat); + if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif))) + err = -EFAULT; + u_ugid_buf = (struct vz_quota_iface __user *) + (((void *)u_ugid_buf) + sizeof(oqif)); + } +#endif + } + +out: + up(&vz_quota_sem); + vfree(k_ugid_buf); + return err; +} + +static int quota_ugid_getgrace(unsigned int quota_id, + struct dq_info __user u_dq_info[], int compat) +{ + struct vz_quota_master *qmblk; + struct dq_info dq_info[MAXQUOTAS]; + struct dq_info *target; + int err, type; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = 0; + /* update from qmblk */ + for (type = 0; type < MAXQUOTAS; type ++) { + target = &qmblk->dq_ugid_info[type]; + dq_info[type].bexpire = target->bexpire; + dq_info[type].iexpire = target->iexpire; + dq_info[type].flags = target->flags; + } + + if (!compat) { + if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info))) + err = -EFAULT; + } else { +#ifdef CONFIG_COMPAT + struct compat_dq_info odqi[MAXQUOTAS]; + for (type = 0; type < MAXQUOTAS; type ++) + dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]); + if (copy_to_user(u_dq_info, odqi, sizeof(odqi))) + err = -EFAULT; +#endif + } +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_getconfig(unsigned int quota_id, + struct vz_quota_ugid_stat __user *info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_stat kinfo; + int err; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = 0; + kinfo.limit = qmblk->dq_ugid_max; + kinfo.count = qmblk->dq_ugid_count; + kinfo.flags = qmblk->dq_flags; + + if 
(copy_to_user(info, &kinfo, sizeof(kinfo))) + err = -EFAULT; +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setconfig(unsigned int quota_id, + struct vz_quota_ugid_stat __user *info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_stat kinfo; + int err; + + down(&vz_quota_sem); + + err = -ENOENT; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&kinfo, info, sizeof(kinfo))) + goto out; + + err = 0; + qmblk->dq_ugid_max = kinfo.limit; + if (qmblk->dq_state == VZDQ_STARTING) { + qmblk->dq_flags = kinfo.flags; + if (qmblk->dq_flags & VZDQUG_ON) + qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; + } + +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setlimit(unsigned int quota_id, + struct vz_quota_ugid_setlimit __user *u_lim) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_setlimit lim; + int err; + + down(&vz_quota_sem); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&lim, u_lim, sizeof(lim))) + goto out; + + err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); + +out: + up(&vz_quota_sem); + + return err; +} + +static int quota_ugid_setinfo(unsigned int quota_id, + struct vz_quota_ugid_setinfo __user *u_info) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ugid_setinfo info; + int err; + + down(&vz_quota_sem); + + err = -ESRCH; + qmblk = vzquota_find_master(quota_id); + if (qmblk == NULL) + goto out; + + err = -EFAULT; + if (copy_from_user(&info, u_info, sizeof(info))) + goto out; + + err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); + +out: + up(&vz_quota_sem); + + return err; +} + +/* + * This is a system call to maintain UGID quotas + * Note this call is allowed to run ONLY from VE0 + */ +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat) +{ + int ret; + + ret = -EPERM; + /* access allowed only from root of VE0 */ + if (!capable(CAP_SYS_RESOURCE) || + !capable(CAP_SYS_ADMIN)) + goto out; + + switch (cmd) { + case VZ_DQ_UGID_GETSTAT: + ret = quota_ugid_getstat(quota_id, + ugid_index, ugid_size, + (struct vz_quota_iface __user *)addr, + compat); + break; + case VZ_DQ_UGID_ADDSTAT: + ret = quota_ugid_addstat(quota_id, ugid_size, + (struct vz_quota_iface __user *) addr, + compat); + break; + case VZ_DQ_UGID_GETGRACE: + ret = quota_ugid_getgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_SETGRACE: + ret = quota_ugid_setgrace(quota_id, + (struct dq_info __user *)addr, compat); + break; + case VZ_DQ_UGID_GETCONFIG: + ret = quota_ugid_getconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETCONFIG: + ret = quota_ugid_setconfig(quota_id, + (struct vz_quota_ugid_stat __user *) + addr); + break; + case VZ_DQ_UGID_SETLIMIT: + ret = quota_ugid_setlimit(quota_id, + (struct vz_quota_ugid_setlimit __user *) + addr); + break; + case VZ_DQ_UGID_SETINFO: + ret = quota_ugid_setinfo(quota_id, + (struct vz_quota_ugid_setinfo __user *) + addr); + break; + default: + ret = -EINVAL; + goto out; + } +out: + return ret; +} + +static void ugid_quota_on_sb(struct super_block *sb) +{ + struct super_block *real_sb; + struct vz_quota_master *qmblk; + + if (!sb->s_op->get_quota_root) + return; + + real_sb = sb->s_op->get_quota_root(sb)->i_sb; + if (real_sb->dq_op != &vz_quota_operations) + return; + + sb->dq_op = &vz_quota_operations2; + 
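+ /* install the same per-sb quota hooks that vz_quota_on() sets up */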
sb->s_qcop = &vz_quotactl_operations; + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; + + qmblk = vzquota_find_qmblk(sb); + if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) + return; + down(&vz_quota_sem); + if (qmblk->dq_flags & VZDQ_USRQUOTA) + sb->s_dquot.flags |= DQUOT_USR_ENABLED; + if (qmblk->dq_flags & VZDQ_GRPQUOTA) + sb->s_dquot.flags |= DQUOT_GRP_ENABLED; + up(&vz_quota_sem); + qmblk_put(qmblk); +} + +static void ugid_quota_off_sb(struct super_block *sb) +{ + /* can't make quota off on mounted super block */ + BUG_ON(sb->s_root != NULL); +} + +static int ugid_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int old_ret) +{ + struct virt_info_quota *viq; + + viq = (struct virt_info_quota *)data; + + switch (n) { + case VIRTINFO_QUOTA_ON: + ugid_quota_on_sb(viq->super); + break; + case VIRTINFO_QUOTA_OFF: + ugid_quota_off_sb(viq->super); + break; + case VIRTINFO_QUOTA_GETSTAT: + break; + default: + return old_ret; + } + return NOTIFY_OK; +} + +static struct vnotifier_block ugid_notifier_block = { + .notifier_call = ugid_notifier_call, +}; + +/* ---------------------------------------------------------------------- + * Init/exit. + * --------------------------------------------------------------------- */ + +int vzquota_ugid_init(void) +{ + int err; + + vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", + sizeof(struct vz_quota_ugid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (vz_quota_ugid_cachep == NULL) + goto err_slab; + + err = register_quota_format(&vz_quota_empty_v2_format); + if (err) + goto err_reg; + + virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); + return 0; + +err_reg: + kmem_cache_destroy(vz_quota_ugid_cachep); + return err; + +err_slab: + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + return -ENOMEM; +} + +void vzquota_ugid_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); + unregister_quota_format(&vz_quota_empty_v2_format); + + kmem_cache_destroy(vz_quota_ugid_cachep); +} diff -uprN linux-2.6.24/fs/vzdquot.c linux-2.6.24.ovz/fs/vzdquot.c --- linux-2.6.24/fs/vzdquot.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/fs/vzdquot.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1954 @@ +/* + * Copyright (C) 2001, 2002, 2004, 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains the core of Virtuozzo disk quota implementation: + * maintenance of VZDQ information in inodes, + * external interfaces, + * module entry. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ---------------------------------------------------------------------- + * + * Locking + * + * ---------------------------------------------------------------------- */ + +/* + * Serializes on/off and all other do_vzquotactl operations. + * Protects qmblk hash. 
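+ * Also taken by the ugid management handlers (quota_ugid_*) around
+ * master block lookup and updates.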
+ */ +struct semaphore vz_quota_sem; + +/* + * Data access locks + * inode_qmblk + * protects qmblk pointers in all inodes and qlnk content in general + * (but not qmblk content); + * also protects related qmblk invalidation procedures; + * can't be per-inode because of vzquota_dtree_qmblk complications + * and problems with serialization with quota_on, + * but can be per-superblock; + * qmblk_data + * protects qmblk fields (such as current usage) + * quota_data + * protects charge/uncharge operations, thus, implies + * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock + * (to protect ugid pointers). + * + * Lock order: + * inode_qmblk_lock -> dcache_lock + * inode_qmblk_lock -> qmblk_data + */ +static DEFINE_SPINLOCK(vzdq_qmblk_lock); + +inline void inode_qmblk_lock(struct super_block *sb) +{ + spin_lock(&vzdq_qmblk_lock); +} + +inline void inode_qmblk_unlock(struct super_block *sb) +{ + spin_unlock(&vzdq_qmblk_lock); +} + +inline void qmblk_data_read_lock(struct vz_quota_master *qmblk) +{ + spin_lock(&qmblk->dq_data_lock); +} + +inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk) +{ + spin_unlock(&qmblk->dq_data_lock); +} + +inline void qmblk_data_write_lock(struct vz_quota_master *qmblk) +{ + spin_lock(&qmblk->dq_data_lock); +} + +inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk) +{ + spin_unlock(&qmblk->dq_data_lock); +} + +struct quota_format_type vz_quota_empty_v2_format = { + .qf_fmt_id = QFMT_VFS_V0, + .qf_ops = NULL, + .qf_owner = THIS_MODULE, +}; + +/* ---------------------------------------------------------------------- + * + * Master hash table handling. + * + * SMP not safe, serialied by vz_quota_sem within quota syscalls + * + * --------------------------------------------------------------------- */ + +static struct kmem_cache *vzquota_cachep; + +/* + * Hash function. 
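+ * Folds the quota id onto a 64-entry table:
+ * hash = ((qid >> QHASH_BITS) ^ qid) & QHASH_MASK.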
+ */ +#define QHASH_BITS 6 +#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) +#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) + +struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; +int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; + +static inline int vzquota_hash_func(unsigned int qid) +{ + return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); +} + +/** + * vzquota_alloc_master - alloc and instantiate master quota record + * + * Returns: + * pointer to newly created record if SUCCESS + * -ENOMEM if out of memory + * -EEXIST if record with given quota_id already exist + */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat) +{ + int err; + struct vz_quota_master *qmblk; + + err = -EEXIST; + if (vzquota_find_master(quota_id) != NULL) + goto out; + + err = -ENOMEM; + qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL); + if (qmblk == NULL) + goto out; +#ifdef CONFIG_VZ_QUOTA_UGID + qmblk->dq_uid_tree = quotatree_alloc(); + if (!qmblk->dq_uid_tree) + goto out_free; + + qmblk->dq_gid_tree = quotatree_alloc(); + if (!qmblk->dq_gid_tree) + goto out_free_tree; +#endif + + qmblk->dq_state = VZDQ_STARTING; + init_MUTEX(&qmblk->dq_sem); + spin_lock_init(&qmblk->dq_data_lock); + + qmblk->dq_id = quota_id; + qmblk->dq_stat = qstat->dq_stat; + qmblk->dq_info = qstat->dq_info; + qmblk->dq_root_dentry = NULL; + qmblk->dq_root_mnt = NULL; + qmblk->dq_sb = NULL; + qmblk->dq_ugid_count = 0; + qmblk->dq_ugid_max = 0; + qmblk->dq_flags = 0; + memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); + INIT_LIST_HEAD(&qmblk->dq_ilink_list); + + atomic_set(&qmblk->dq_count, 1); + + /* insert in hash chain */ + list_add(&qmblk->dq_hash, + &vzquota_hash_table[vzquota_hash_func(quota_id)]); + + /* success */ + return qmblk; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_free_tree: + quotatree_free(qmblk->dq_uid_tree, NULL); +out_free: + kmem_cache_free(vzquota_cachep, qmblk); +#endif +out: + return ERR_PTR(err); +} + +static struct vz_quota_master *vzquota_alloc_fake(void) +{ + struct vz_quota_master *qmblk; + + qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL); + if (qmblk == NULL) + return NULL; + memset(qmblk, 0, sizeof(*qmblk)); + qmblk->dq_state = VZDQ_STOPING; + qmblk->dq_flags = VZDQ_NOQUOT; + spin_lock_init(&qmblk->dq_data_lock); + INIT_LIST_HEAD(&qmblk->dq_ilink_list); + atomic_set(&qmblk->dq_count, 1); + return qmblk; +} + +/** + * vzquota_find_master - find master record with given id + * + * Returns qmblk without touching its refcounter. + * Called under vz_quota_sem. + */ +struct vz_quota_master *vzquota_find_master(unsigned int quota_id) +{ + int i; + struct vz_quota_master *qp; + + i = vzquota_hash_func(quota_id); + list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { + if (qp->dq_id == quota_id) + return qp; + } + return NULL; +} + +/** + * vzquota_free_master - release resources taken by qmblk, freeing memory + * + * qmblk is assumed to be already taken out from the hash. + * Should be called outside vz_quota_sem. + */ +void vzquota_free_master(struct vz_quota_master *qmblk) +{ +#ifdef CONFIG_VZ_QUOTA_UGID + vzquota_kill_ugid(qmblk); +#endif + BUG_ON(!list_empty(&qmblk->dq_ilink_list)); + kmem_cache_free(vzquota_cachep, qmblk); +} + + +/* ---------------------------------------------------------------------- + * + * Passing quota information through current + * + * Used in inode -> qmblk lookup at inode creation stage (since at that + * time there are no links between the inode being created and its parent + * directory). 
+ * + * --------------------------------------------------------------------- */ + +#define VZDQ_CUR_MAGIC 0x57d0fee2 + +static inline int vzquota_cur_qmblk_check(void) +{ + return current->magic == VZDQ_CUR_MAGIC; +} + +static inline struct inode *vzquota_cur_qmblk_fetch(void) +{ + return current->ino; +} + +static inline void vzquota_cur_qmblk_set(struct inode *data) +{ + struct task_struct *tsk; + + tsk = current; + tsk->magic = VZDQ_CUR_MAGIC; + tsk->ino = data; +} + +#if 0 +static inline void vzquota_cur_qmblk_reset(void) +{ + current->magic = 0; +} +#endif + + +/* ---------------------------------------------------------------------- + * + * Superblock quota operations + * + * --------------------------------------------------------------------- */ + +/* + * Kernel structure abuse. + * We use files[0] pointer as an int variable: + * reference counter of how many quota blocks uses this superblock. + * files[1] is used for generations structure which helps us to track + * when traversing of dentries is really required. + */ +#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master +#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\ + &sb->s_dquot.dqio_mutex) + +#if defined(VZ_QUOTA_UNLOAD) + +#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count + +struct dquot_operations *orig_dq_op; +struct quotactl_ops *orig_dq_cop; + +/** + * quota_get_super - account for new a quoted tree under the superblock + * + * One superblock can have multiple directory subtrees with different VZ + * quotas. We keep a counter of such subtrees and set VZ quota operations or + * reset the default ones. + * + * Called under vz_quota_sem (from quota_on). + */ +int vzquota_get_super(struct super_block *sb) +{ + if (sb->dq_op != &vz_quota_operations) { + down(&sb->s_dquot.dqonoff_sem); + if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) { + up(&sb->s_dquot.dqonoff_sem); + return -EEXIST; + } + if (orig_dq_op == NULL && sb->dq_op != NULL) + orig_dq_op = sb->dq_op; + sb->dq_op = &vz_quota_operations; + if (orig_dq_cop == NULL && sb->s_qcop != NULL) + orig_dq_cop = sb->s_qcop; + /* XXX this may race with sys_quotactl */ +#ifdef CONFIG_VZ_QUOTA_UGID + sb->s_qcop = &vz_quotactl_operations; +#else + sb->s_qcop = NULL; +#endif + do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; + /* + * To get quotaops.h call us we need to mark superblock + * as having quota. These flags mark the moment when + * our dq_op start to be called. + * + * The ordering of dq_op and s_dquot.flags assignment + * needs to be enforced, but other CPUs do not do rmb() + * between s_dquot.flags and dq_op accesses. + */ + wmb(); synchronize_sched(); + sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; + __module_get(THIS_MODULE); + up(&sb->s_dquot.dqonoff_sem); + } + /* protected by vz_quota_sem */ + __VZ_QUOTA_SBREF(sb)++; + return 0; +} + +/** + * quota_put_super - release superblock when one quota tree goes away + * + * Called under vz_quota_sem. 
+ */ +void vzquota_put_super(struct super_block *sb) +{ + int count; + + count = --__VZ_QUOTA_SBREF(sb); + if (count == 0) { + down(&sb->s_dquot.dqonoff_sem); + sb->s_dquot.flags = 0; + wmb(); synchronize_sched(); + sema_init(&sb->s_dquot.dqio_sem, 1); + sb->s_qcop = orig_dq_cop; + sb->dq_op = orig_dq_op; + inode_qmblk_lock(sb); + quota_gen_put(SB_QGEN(sb)); + SB_QGEN(sb) = NULL; + /* release qlnk's without qmblk */ + remove_inode_quota_links_list(&non_vzquota_inodes_lh, + sb, NULL); + /* + * Races with quota initialization: + * after this inode_qmblk_unlock all inode's generations are + * invalidated, quota_inode_qmblk checks superblock operations. + */ + inode_qmblk_unlock(sb); + /* + * Module refcounting: in theory, this is the best place + * to call module_put(THIS_MODULE). + * In reality, it can't be done because we can't be sure that + * other CPUs do not enter our code segment through dq_op + * cached long time ago. Quotaops interface isn't supposed to + * go into modules currently (that is, into unloadable + * modules). By omitting module_put, our module isn't + * unloadable. + */ + up(&sb->s_dquot.dqonoff_sem); + } +} + +#else + +struct vzquota_new_sop { + struct super_operations new_op; + const struct super_operations *old_op; +}; + +/** + * vzquota_shutdown_super - callback on umount + */ +void vzquota_shutdown_super(struct super_block *sb) +{ + struct vz_quota_master *qmblk; + struct vzquota_new_sop *sop; + + qmblk = __VZ_QUOTA_NOQUOTA(sb); + __VZ_QUOTA_NOQUOTA(sb) = NULL; + if (qmblk != NULL) + qmblk_put(qmblk); + sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); + sb->s_op = sop->old_op; + kfree(sop); + if (sb->s_op->put_super != NULL) + (*sb->s_op->put_super)(sb); +} + +/** + * vzquota_get_super - account for new a quoted tree under the superblock + * + * One superblock can have multiple directory subtrees with different VZ + * quotas. + * + * Called under vz_quota_sem (from vzquota_on). + */ +int vzquota_get_super(struct super_block *sb) +{ + struct vz_quota_master *qnew; + struct vzquota_new_sop *sop; + int err; + + mutex_lock(&sb->s_dquot.dqonoff_mutex); + err = -EEXIST; + if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) && + sb->dq_op != &vz_quota_operations) + goto out_up; + + /* + * This allocation code should be under sb->dq_op check below, but + * it doesn't really matter... + */ + if (__VZ_QUOTA_NOQUOTA(sb) == NULL) { + qnew = vzquota_alloc_fake(); + if (qnew == NULL) + goto out_up; + __VZ_QUOTA_NOQUOTA(sb) = qnew; + } + + if (sb->dq_op != &vz_quota_operations) { + sop = kmalloc(sizeof(*sop), GFP_KERNEL); + if (sop == NULL) { + vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); + __VZ_QUOTA_NOQUOTA(sb) = NULL; + goto out_up; + } + memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); + sop->new_op.put_super = &vzquota_shutdown_super; + sop->old_op = sb->s_op; + sb->s_op = &sop->new_op; + + sb->dq_op = &vz_quota_operations; +#ifdef CONFIG_VZ_QUOTA_UGID + sb->s_qcop = &vz_quotactl_operations; +#else + sb->s_qcop = NULL; +#endif + do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); + + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + /* these 2 list heads are checked in sync_dquots() */ + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + sb->s_dquot.info[USRQUOTA].dqi_format = + &vz_quota_empty_v2_format; + sb->s_dquot.info[GRPQUOTA].dqi_format = + &vz_quota_empty_v2_format; + + /* + * To get quotaops.h to call us we need to mark superblock + * as having quota. 
These flags mark the moment when + * our dq_op start to be called. + * + * The ordering of dq_op and s_dquot.flags assignment + * needs to be enforced, but other CPUs do not do rmb() + * between s_dquot.flags and dq_op accesses. + */ + wmb(); synchronize_sched(); + sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; + } + err = 0; + +out_up: + mutex_unlock(&sb->s_dquot.dqonoff_mutex); + return err; +} + +/** + * vzquota_put_super - one quota tree less on this superblock + * + * Called under vz_quota_sem. + */ +void vzquota_put_super(struct super_block *sb) +{ + /* + * Even if this put is the last one, + * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop + * won't be called and the remaining qmblk references won't be put. + */ +} + +#endif + + +/* ---------------------------------------------------------------------- + * + * Helpers for inode -> qmblk link maintenance + * + * --------------------------------------------------------------------- */ + +#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd) +#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT) +#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops) +extern struct inode_operations vfs_empty_iops; + +static int VZ_QUOTA_IS_ACTUAL(struct inode *inode) +{ + struct vz_quota_master *qmblk; + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk == VZ_QUOTA_BAD) + return 1; + if (qmblk == __VZ_QUOTA_EMPTY) + return 0; + if (qmblk->dq_flags & VZDQ_NOACT) + /* not actual (invalidated) qmblk */ + return 0; + return 1; +} + +static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk) +{ + return qlnk->qmblk == __VZ_QUOTA_EMPTY; +} + +static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk, + unsigned char origin) +{ + qlnk->origin[0] = qlnk->origin[1]; + qlnk->origin[1] = origin; +} + +static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk) +{ + qlnk->qmblk = __VZ_QUOTA_EMPTY; + set_qlnk_origin(qlnk, VZ_QUOTAO_SETE); +} + +void vzquota_qlnk_init(struct vz_quota_ilink *qlnk) +{ + memset(qlnk, 0, sizeof(*qlnk)); + INIT_LIST_HEAD(&qlnk->list); + vzquota_qlnk_set_empty(qlnk); + set_qlnk_origin(qlnk, VZ_QUOTAO_INIT); +} + +void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk) +{ + might_sleep(); + if (vzquota_qlnk_is_empty(qlnk)) + return; +#if defined(CONFIG_VZ_QUOTA_UGID) + if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) { + struct vz_quota_master *qmblk; + struct vz_quota_ugid *quid, *qgid; + qmblk = qlnk->qmblk; + quid = qlnk->qugid[USRQUOTA]; + qgid = qlnk->qugid[GRPQUOTA]; + if (quid != NULL || qgid != NULL) { + down(&qmblk->dq_sem); + if (qgid != NULL) + vzquota_put_ugid(qmblk, qgid); + if (quid != NULL) + vzquota_put_ugid(qmblk, quid); + up(&qmblk->dq_sem); + } + } +#endif + if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) + qmblk_put(qlnk->qmblk); + set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR); +} + +/** + * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents + * @qlt: temporary + * @qli: inode's + * + * Locking is provided by the caller (depending on the context). + * After swap, @qli is inserted into the corresponding dq_ilink_list, + * @qlt list is reinitialized. 
+ */ +static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, + struct vz_quota_ilink *qli) +{ + struct vz_quota_master *qb; + struct vz_quota_ugid *qu; + int i; + + qb = qlt->qmblk; + qlt->qmblk = qli->qmblk; + qli->qmblk = qb; + list_del_init(&qli->list); + if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) + list_add(&qli->list, &qb->dq_ilink_list); + INIT_LIST_HEAD(&qlt->list); + set_qlnk_origin(qli, VZ_QUOTAO_SWAP); + + for (i = 0; i < MAXQUOTAS; i++) { + qu = qlt->qugid[i]; + qlt->qugid[i] = qli->qugid[i]; + qli->qugid[i] = qu; + } +} + +/** + * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks + * + * Called under dcache_lock and inode_qmblk locks. + * Returns 1 if locks were dropped inside, 0 if atomic. + */ +static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk, + struct inode *inode) +{ + if (vzquota_qlnk_is_empty(qlnk)) + return 0; + if (qlnk->qmblk == VZ_QUOTA_BAD) { + vzquota_qlnk_set_empty(qlnk); + set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK); + return 0; + } + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(qlnk); + vzquota_qlnk_init(qlnk); + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); + return 1; +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content + * + * Similar to vzquota_qlnk_reinit_locked, called under different locks. + */ +static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct vz_quota_master *qmblk) +{ + if (vzquota_qlnk_is_empty(qlnk)) + return 0; + /* may be optimized if qlnk->qugid all NULLs */ + qmblk_data_write_unlock(qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(qlnk); + vzquota_qlnk_init(qlnk); + inode_qmblk_lock(inode->i_sb); + qmblk_data_write_lock(qmblk); + return 1; +} +#endif + +/** + * vzquota_qlnk_fill - fill vz_quota_ilink content + * @qlnk: vz_quota_ilink to fill + * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid) + * @qmblk: qmblk to which this @qlnk will belong + * + * Called under dcache_lock and inode_qmblk locks. + * Returns 1 if locks were dropped inside, 0 if atomic. + * @qlnk is expected to be empty. + */ +static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct vz_quota_master *qmblk) +{ + if (qmblk != VZ_QUOTA_BAD) + qmblk_get(qmblk); + qlnk->qmblk = qmblk; + +#if defined(CONFIG_VZ_QUOTA_UGID) + if (qmblk != VZ_QUOTA_BAD && + !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && + (qmblk->dq_flags & VZDQUG_ON)) { + struct vz_quota_ugid *quid, *qgid; + + spin_unlock(&dcache_lock); + inode_qmblk_unlock(inode->i_sb); + + down(&qmblk->dq_sem); + quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); + qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); + up(&qmblk->dq_sem); + + inode_qmblk_lock(inode->i_sb); + spin_lock(&dcache_lock); + qlnk->qugid[USRQUOTA] = quid; + qlnk->qugid[GRPQUOTA] = qgid; + return 1; + } +#endif + + return 0; +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid + * + * This function is a helper for vzquota_transfer, and differs from + * vzquota_qlnk_fill only by locking. 
+ */ +static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk, + struct inode *inode, + struct iattr *iattr, + int mask, + struct vz_quota_master *qmblk) +{ + qmblk_get(qmblk); + qlnk->qmblk = qmblk; + + if (mask) { + struct vz_quota_ugid *quid, *qgid; + + quid = qgid = NULL; /* to make gcc happy */ + if (!(mask & (1 << USRQUOTA))) + quid = vzquota_get_ugid(INODE_QLNK(inode)-> + qugid[USRQUOTA]); + if (!(mask & (1 << GRPQUOTA))) + qgid = vzquota_get_ugid(INODE_QLNK(inode)-> + qugid[GRPQUOTA]); + + qmblk_data_write_unlock(qmblk); + inode_qmblk_unlock(inode->i_sb); + + down(&qmblk->dq_sem); + if (mask & (1 << USRQUOTA)) + quid = __vzquota_find_ugid(qmblk, iattr->ia_uid, + USRQUOTA, 0); + if (mask & (1 << GRPQUOTA)) + qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid, + GRPQUOTA, 0); + up(&qmblk->dq_sem); + + inode_qmblk_lock(inode->i_sb); + qmblk_data_write_lock(qmblk); + qlnk->qugid[USRQUOTA] = quid; + qlnk->qugid[GRPQUOTA] = qgid; + return 1; + } + + return 0; +} +#endif + +/** + * __vzquota_inode_init - make sure inode's qlnk is initialized + * + * May be called if qlnk is already initialized, detects this situation itself. + * Called under inode_qmblk_lock. + */ +static void __vzquota_inode_init(struct inode *inode, unsigned char origin) +{ + if (inode->i_dquot[USRQUOTA] == NODQUOT) { + vzquota_qlnk_init(INODE_QLNK(inode)); + inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT; + } + set_qlnk_origin(INODE_QLNK(inode), origin); +} + +/** + * vzquota_inode_drop - destroy VZ quota information in the inode + * + * Inode must not be externally accessible or dirty. + */ +static void vzquota_inode_drop(struct inode *inode) +{ + struct vz_quota_ilink qlnk; + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(inode->i_sb); + vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL); + inode->i_dquot[USRQUOTA] = NODQUOT; + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk); +} + +/** + * vzquota_inode_qmblk_set - initialize inode's qlnk + * @inode: inode to be initialized + * @qmblk: quota master block to which this inode should belong (may be BAD) + * @qlnk: placeholder to store data to resolve locking issues + * + * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise. + * Called under dcache_lock and inode_qmblk locks. + * @qlnk will be destroyed in the caller chain. + * + * It is not mandatory to restart parent checks since quota on/off currently + * shrinks dentry tree and checks that there are not outside references. + * But if at some time that shink is removed, restarts will be required. + * Additionally, the restarts prevent inconsistencies if the dentry tree + * changes (inode is moved). This is not a big deal, but anyway... 
+ */ +static int vzquota_inode_qmblk_set(struct inode *inode, + struct vz_quota_master *qmblk, + struct vz_quota_ilink *qlnk) +{ + if (qmblk == NULL) { + printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, " + "dev %s, inode %lu, fs %s\n", + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "current %d (%s), VE %d\n", + current->pid, current->comm, + VEID(get_exec_env())); + dump_stack(); + qmblk = VZ_QUOTA_BAD; + } + while (1) { + if (vzquota_qlnk_is_empty(qlnk) && + vzquota_qlnk_fill(qlnk, inode, qmblk)) + return 1; + if (qlnk->qmblk == qmblk) + break; + if (vzquota_qlnk_reinit_locked(qlnk, inode)) + return 1; + } + vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * vzquota_inode_qmblk (inode -> qmblk lookup) parts + * + * --------------------------------------------------------------------- */ + +static int vzquota_dparents_check_attach(struct inode *inode) +{ + if (!list_empty(&inode->i_dentry)) + return 0; + printk(KERN_ERR "VZDQ: no parent for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + return -1; +} + +static struct inode *vzquota_dparents_check_actual(struct inode *inode) +{ + struct dentry *de; + + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + /* first access to parent, make sure its qlnk initialized */ + __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); + if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) + return de->d_parent->d_inode; + } + return NULL; +} + +static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) +{ + struct dentry *de; + struct vz_quota_master *qmblk; + + qmblk = NULL; + list_for_each_entry(de, &inode->i_dentry, d_alias) { + if (de->d_parent == de) /* detached dentry, perhaps */ + continue; + if (qmblk == NULL) { + qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; + continue; + } + if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { + printk(KERN_WARNING "VZDQ: multiple quotas for " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + break; + } + } + if (qmblk == NULL) { + printk(KERN_WARNING "VZDQ: not attached to tree, " + "dev %s, inode %lu, fs %s\n", + inode->i_sb->s_id, + inode->i_ino, + inode->i_sb->s_type->name); + qmblk = VZ_QUOTA_BAD; + } + return qmblk; +} + +static void vzquota_dbranch_actualize(struct inode *inode, + struct inode *refinode) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + vzquota_qlnk_init(&qlnk); + +start: + if (inode == inode->i_sb->s_root->d_inode) { + /* filesystem root */ + atomic_inc(&inode->i_count); + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); + goto out; + } + + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + inode = pinode; + goto start; + } + } + + atomic_inc(&inode->i_count); + while (1) { + if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ + break; + /* + * Need to check parents again if we have slept inside + * vzquota_inode_qmblk_set() in the loop. 
+ * If the state of parents is different, just return and repeat + * the actualizing process again from the inode passed to + * vzquota_inode_qmblk_recalc(). + */ + if (!vzquota_dparents_check_attach(inode)) { + if (vzquota_dparents_check_actual(inode) != NULL) + break; + qmblk = vzquota_dparents_check_same(inode); + } else + qmblk = VZ_QUOTA_BAD; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT); + break; + } + } + +out: + spin_unlock(&dcache_lock); + inode_qmblk_unlock(refinode->i_sb); + vzquota_qlnk_destroy(&qlnk); + iput(inode); + inode_qmblk_lock(refinode->i_sb); + spin_lock(&dcache_lock); +} + +static void vzquota_dtree_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + struct inode *pinode; + struct vz_quota_master *qmblk; + + if (inode == inode->i_sb->s_root->d_inode) { + /* filesystem root */ + do { + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); + return; + } + +start: + if (VZ_QUOTA_IS_ACTUAL(inode)) + return; + /* + * Here qmblk is (re-)initialized for all ancestors. + * This is not a very efficient procedure, but it guarantees that + * the quota tree is consistent (that is, the inode doesn't have two + * ancestors with different qmblk). + */ + if (!vzquota_dparents_check_attach(inode)) { + pinode = vzquota_dparents_check_actual(inode); + if (pinode != NULL) { + vzquota_dbranch_actualize(pinode, inode); + goto start; + } + qmblk = vzquota_dparents_check_same(inode); + } else + qmblk = VZ_QUOTA_BAD; + + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE); +} + +static void vzquota_det_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + struct inode *parent; + struct vz_quota_master *qmblk; + char *msg; + int cnt; + time_t timeout; + + cnt = 0; + parent = NULL; +start: + /* + * qmblk of detached inodes shouldn't be considered as not actual. + * They are not in any dentry tree, so quota on/off shouldn't affect + * them. + */ + if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) + return; + + timeout = 3; + qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); + /* + * Scenario: + * open + * unlink + * quotaon + * generic_delete_inode + * + * This is the first time vzquota sees inode. inode is outside of + * vzquota area of interest, otherwise quotaon would have got -EBUSY + * due to shrink_dcache_parent(). + * inode is almost completely destroyed, so don't intervene. + * + * dev@: + * However, there is a small race here... + * dput() first removes itself from all the lists, + * so shrink_dcache_parent() can succeed while dentry_iput is not + * done yet. 
+ */ + if (inode->i_state & I_FREEING) + goto set; + + msg = "detached inode not in creation"; + if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) + goto fail; + qmblk = VZ_QUOTA_BAD; + msg = "unexpected creation context"; + if (!vzquota_cur_qmblk_check()) + goto fail; + timeout = 0; + parent = vzquota_cur_qmblk_fetch(); + msg = "uninitialized parent"; + if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) + goto fail; + msg = "parent not in tree"; + if (list_empty(&parent->i_dentry)) + goto fail; + msg = "parent has 0 refcount"; + if (!atomic_read(&parent->i_count)) + goto fail; + msg = "parent has different sb"; + if (parent->i_sb != inode->i_sb) + goto fail; + if (!VZ_QUOTA_IS_ACTUAL(parent)) { + vzquota_dbranch_actualize(parent, inode); + goto start; + } + + qmblk = INODE_QLNK(parent)->qmblk; +set: + if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) + goto start; + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET); + return; + +fail: + { + struct timeval tv, tvo; + do_gettimeofday(&tv); + memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); + tv.tv_sec -= tvo.tv_sec; + if (tv.tv_usec < tvo.tv_usec) { + tv.tv_sec--; + tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; + } else + tv.tv_usec -= tvo.tv_usec; + if (tv.tv_sec < timeout) + goto set; + printk(KERN_ERR "VZDQ: %s, orig {%u, %u}," + " dev %s, inode %lu, fs %s\n", + msg, + INODE_QLNK(inode)->origin[0], + INODE_QLNK(inode)->origin[1], + inode->i_sb->s_id, inode->i_ino, + inode->i_sb->s_type->name); + printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count)); + printk(KERN_ERR "i_mode %o, ", inode->i_mode); + printk(KERN_ERR "i_state %lx, ", inode->i_state); + printk(KERN_ERR "i_flags %x\n", inode->i_flags); + printk(KERN_ERR "i_op %p, vfs_empty_iops %p, " + "i_fop %p, i_mapping %p\n", + inode->i_op, &vfs_empty_iops, + inode->i_fop, inode->i_mapping); + if (!cnt++) { + printk(KERN_ERR "current %d (%s), VE %d," + " time %ld.%06ld\n", + current->pid, current->comm, + VEID(get_exec_env()), + tv.tv_sec, (long)tv.tv_usec); + dump_stack(); + } + if (parent != NULL) + printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", + inode->i_ino, parent->i_ino); + } + goto set; +} + +static void vzquota_inode_qmblk_recalc(struct inode *inode, + struct vz_quota_ilink *qlnk) +{ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_dtree_qmblk_recalc(inode, qlnk); + else + vzquota_det_qmblk_recalc(inode, qlnk); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_qmblk - obtain inode's qmblk + * + * Returns qmblk with refcounter taken, %NULL if not under + * VZ quota or %VZ_QUOTA_BAD. + * + * FIXME: This function should be removed when vzquota_find_qmblk / + * get_quota_root / vzquota_dstat code is cleaned up. 
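+ * The returned reference must be dropped with qmblk_put() when the caller is done with it, as vzquota_dstat() does.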
+ */ +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk; + + might_sleep(); + + if (inode->i_sb->dq_op != &vz_quota_operations) + return NULL; +#if defined(VZ_QUOTA_UNLOAD) +#error Make sure qmblk does not disappear +#endif + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) + qmblk_get(qmblk); + else + qmblk = NULL; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk); + return qmblk; +} + +/** + * vzquota_find_qmblk - helper to emulate quota on virtual filesystems + * + * This function finds a quota master block corresponding to the root of + * a virtual filesystem. + * Returns a quota master block with reference taken, or %NULL if not under + * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation + * operations will fail). + * + * Note: this function uses vzquota_inode_qmblk(). + * The latter is a rather confusing function: it returns qmblk that used to be + * on the inode some time ago (without guarantee that it still has any + * relations to the inode). So, vzquota_find_qmblk() leaves it up to the + * caller to think whether the inode could have changed its qmblk and what to + * do in that case. + * Currently, the callers appear to not care :( + */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) +{ + struct inode *qrinode; + struct vz_quota_master *qmblk; + + qmblk = NULL; + qrinode = NULL; + if (sb->s_op->get_quota_root != NULL) + qrinode = sb->s_op->get_quota_root(sb); + if (qrinode != NULL) + qmblk = vzquota_inode_qmblk(qrinode); + return qmblk; +} + +/* ---------------------------------------------------------------------- + * + * Calls from quota operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_inode_init_call - call from DQUOT_INIT + */ +void vzquota_inode_init_call(struct inode *inode) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + + /* initializes inode's quota inside */ + qmblk = vzquota_inode_data(inode, &data); + if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) + vzquota_data_unlock(inode, &data); + + /* + * The check is needed for repeated new_inode() calls from a single + * ext3 call like create or mkdir in case of -ENOSPC. + */ + spin_lock(&dcache_lock); + if (!list_empty(&inode->i_dentry)) + vzquota_cur_qmblk_set(inode); + spin_unlock(&dcache_lock); +} + +/** + * vzquota_inode_drop_call - call from DQUOT_DROP + */ +void vzquota_inode_drop_call(struct inode *inode) +{ + vzquota_inode_drop(inode); +} + +/** + * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs + * @inode: the inode + * @data: storage space + * + * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. + * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: + * qmblk in inode's qlnk is the same as returned, + * ugid pointers inside inode's qlnk are valid, + * some locks are taken (and should be released by vzquota_data_unlock). + * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. 
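 + *
 + * Typical call pattern (cf. vzquota_inode_init_call() above):
 + *	qmblk = vzquota_inode_data(inode, &data);
 + *	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) {
 + *		... charge or uncharge under the locks taken ...
 + *		vzquota_data_unlock(inode, &data);
 + *	}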
+ */ +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *data) +{ + struct vz_quota_master *qmblk; + + might_sleep(); + + vzquota_qlnk_init(&data->qlnk); + inode_qmblk_lock(inode->i_sb); + if (unlikely(inode->i_flags & S_NOQUOTA)) { + inode_qmblk_unlock(inode->i_sb); + return NULL; + } + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) + vzquota_inode_qmblk_recalc(inode, &data->qlnk); + + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != VZ_QUOTA_BAD) { + if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { + /* + * Note that in the current implementation, + * inode_qmblk_lock can theoretically be dropped here. + * This place is serialized with quota_off because + * quota_off fails when there are extra dentry + * references and syncs inodes before removing quota + * information from them. + * However, quota usage information should stop being + * updated immediately after vzquota_off. + */ + qmblk_data_write_lock(qmblk); + } else { + inode_qmblk_unlock(inode->i_sb); + qmblk = NULL; + } + } else { + inode_qmblk_unlock(inode->i_sb); + } + return qmblk; +} + +void vzquota_data_unlock(struct inode *inode, + struct vz_quota_datast *data) +{ + qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&data->qlnk); +} + +#if defined(CONFIG_VZ_QUOTA_UGID) +/** + * vzquota_inode_transfer_call - call from vzquota_transfer + */ +int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) +{ + struct vz_quota_master *qmblk; + struct vz_quota_datast data; + struct vz_quota_ilink qlnew; + int mask; + int ret; + + might_sleep(); + vzquota_qlnk_init(&qlnew); +start: + qmblk = vzquota_inode_data(inode, &data); + ret = NO_QUOTA; + if (qmblk == VZ_QUOTA_BAD) + goto out_destr; + ret = QUOTA_OK; + if (qmblk == NULL) + goto out_destr; + qmblk_get(qmblk); + + ret = QUOTA_OK; + if (!(qmblk->dq_flags & VZDQUG_ON)) + /* no ugid quotas */ + goto out_unlock; + + mask = 0; + if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) + mask |= 1 << USRQUOTA; + if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) + mask |= 1 << GRPQUOTA; + while (1) { + if (vzquota_qlnk_is_empty(&qlnew) && + vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) + break; + if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && + qlnew.qmblk == qmblk) + goto finish; + if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) + break; + } + + /* prepare for restart */ + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); + goto start; + +finish: + /* all references obtained successfully */ + ret = vzquota_transfer_usage(inode, mask, &qlnew); + if (!ret) { + vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS); + } +out_unlock: + vzquota_data_unlock(inode, &data); + qmblk_put(qmblk); +out_destr: + vzquota_qlnk_destroy(&qlnew); + return ret; +} +#endif + +int vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct vz_quota_master *qmblk; + struct vz_quota_ilink qlnk1, qlnk2, qlnk3; + int c, ret; + + if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) + return -1; + + might_sleep(); + + vzquota_qlnk_init(&qlnk1); + vzquota_qlnk_init(&qlnk2); + vzquota_qlnk_init(&qlnk3); + inode_qmblk_lock(inode->i_sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); + __vzquota_inode_init(new_dir, 
VZ_QUOTAO_INICAL); + + do { + c = 0; + if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || + !VZ_QUOTA_IS_ACTUAL(inode)) { + vzquota_inode_qmblk_recalc(inode, &qlnk1); + c++; + } + if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) || + !VZ_QUOTA_IS_ACTUAL(new_dir)) { + vzquota_inode_qmblk_recalc(new_dir, &qlnk2); + c++; + } + } while (c); + + ret = 0; + qmblk = INODE_QLNK(inode)->qmblk; + if (qmblk != INODE_QLNK(new_dir)->qmblk) { + ret = -1; + while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) || + !VZ_QUOTA_IS_ACTUAL(old_dir)) + vzquota_inode_qmblk_recalc(old_dir, &qlnk3); + if (qmblk != VZ_QUOTA_BAD && + !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && + qmblk->dq_root_dentry->d_inode == inode && + VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk, + inode->i_sb) && + VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk, + inode->i_sb)) + /* quota root rename is allowed */ + ret = 0; + } + + inode_qmblk_unlock(inode->i_sb); + vzquota_qlnk_destroy(&qlnk3); + vzquota_qlnk_destroy(&qlnk2); + vzquota_qlnk_destroy(&qlnk1); + return ret; +} + +/* + * Scan parent subdirs and find busy dentries names/path + * @parent: parent dentry + * @buf: buffer to store path. + */ +static void vzdquota_read_busy_dentries(struct dentry * parent, + struct vfsmount *vfsmnt, char *buf, int buflen) +{ + struct dentry *this_parent = parent; + struct list_head *next; + char *res, *end, *start; + struct vfsmount *rootmnt; + struct dentry *root; + int len; + + if (!buf || buflen <= 0) + return; + + /* From d_path() ... */ + read_lock(&current->fs->lock); + rootmnt = mntget(current->fs->rootmnt); + root = dget(current->fs->root); + read_unlock(&current->fs->lock); + + spin_lock(&dcache_lock); + + end = buf + buflen; + start = buf; +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry; + int subdirs; + + dentry = list_entry(tmp, struct dentry, d_u.d_child); + next = tmp->next; + subdirs = !list_empty(&dentry->d_subdirs); + + if (atomic_read(&dentry->d_count) && !subdirs) { + if (!buflen) + goto out; + /* + * Note: __d_path will store filename at the + * end of buf. + */ + res = __d_path(dentry, vfsmnt, root, rootmnt, + buf, buflen); + /* Exit if name is too long */ + if (IS_ERR(res)) + goto out; + + /* + * Move the string obtained by __d_path, + * behind the last dentry path in buf. + */ + len = end - res; + BUG_ON(len <= 0); + + memmove(buf, res, len); + + /* Trick: replace \0 by \n */ + if (buf != start) + *(char *)(buf - 1) = '\n'; + + buf += len; + buflen -= len; + } + + /* + * Descend a level if the d_subdirs list is non-empty. + */ + if (subdirs) { + this_parent = dentry; + goto repeat; + } + } + /* + * All done at this level ... ascend and resume the search. + */ + if (this_parent != parent) { + next = this_parent->d_u.d_child.next; + this_parent = this_parent->d_parent; + goto resume; + } +out: + /* From d_path() ... */ + spin_unlock(&dcache_lock); + dput(root); + mntput(rootmnt); +} + +/* ---------------------------------------------------------------------- + * + * qmblk-related parts of on/off operations + * + * --------------------------------------------------------------------- */ + +/** + * vzquota_check_dtree - check dentry tree if quota on/off is allowed + * + * This function doesn't allow quota to be turned on/off if some dentries in + * the tree have external references.
+ * In addition to technical reasons, it enforces user-space correctness: + * current usage (taken from or reported to the user space) can be meaningful + * and accurate only if the tree is not being modified. + * Side effect: additional vfsmount structures referencing the tree (bind + * mounts of tree nodes to some other places) are not allowed at on/off time. + * + * Store busy dentries path to the buf (if passed) in case of vzquota_off + * ioctl fail. + */ +int vzquota_check_dtree(struct vz_quota_master *qmblk, int off, + char *buf, int buflen) +{ + struct dentry *dentry; + int err, count; + + err = -EBUSY; + dentry = qmblk->dq_root_dentry; + + if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root) + goto unhashed; + + /* attempt to shrink */ + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + inode_qmblk_unlock(dentry->d_sb); + shrink_dcache_parent(dentry); + inode_qmblk_lock(dentry->d_sb); + spin_lock(&dcache_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dcache_lock); + vzdquota_read_busy_dentries(dentry, qmblk->dq_root_mnt, + buf, buflen); + spin_lock(&dcache_lock); + goto out; + } + + count = 1; + if (dentry == dentry->d_sb->s_root) + count += 2; /* sb and mnt refs */ + if (atomic_read(&dentry->d_count) < count) { + printk(KERN_ERR "%s: too small count %d vs %d.\n", + __FUNCTION__, + atomic_read(&dentry->d_count), count); + goto out; + } + if (atomic_read(&dentry->d_count) > count) + goto out; + } + + err = 0; +out: + return err; + +unhashed: + /* + * Quota root is removed. + * Allow to turn quota off, but not on. + */ + if (off) + err = 0; + goto out; +} + +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk, char __user *ubuf) +{ + struct vz_quota_ilink qlnk; + struct vz_quota_master *qold, *qnew; + int err; + char *buf; + + buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL; + + might_sleep(); + + qold = NULL; + qnew = vzquota_alloc_fake(); + if (qnew == NULL) { + free_page((unsigned long)buf); + return -ENOMEM; + } + + vzquota_qlnk_init(&qlnk); + inode_qmblk_lock(sb); + __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); + + spin_lock(&dcache_lock); + while (1) { + err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE); + if (err) + break; + if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) + break; + } + set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON); + spin_unlock(&dcache_lock); + + if (!err) { + qold = __VZ_QUOTA_NOQUOTA(sb); + qold->dq_flags |= VZDQ_NOACT; + __VZ_QUOTA_NOQUOTA(sb) = qnew; + } + + inode_qmblk_unlock(sb); + vzquota_qlnk_destroy(&qlnk); + if (qold != NULL) + qmblk_put(qold); + + if (buf) { + (void)copy_to_user(ubuf, buf, PAGE_SIZE); + free_page((unsigned long)buf); + } + return err; +} + +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, + char __user *ubuf, int force) +{ + int ret; + char *buf; + + buf = (ubuf != NULL) ? 
(char *)__get_free_page(GFP_KERNEL) : NULL; + + ret = 0; + inode_qmblk_lock(sb); + + spin_lock(&dcache_lock); + if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force) + ret = -EBUSY; + spin_unlock(&dcache_lock); + + if (!ret) + qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; + inode_qmblk_unlock(sb); + + if (buf) { + (void)copy_to_user(ubuf, buf, PAGE_SIZE); + free_page((unsigned long)buf); + } + return ret; +} + + +/* ---------------------------------------------------------------------- + * + * External interfaces + * + * ---------------------------------------------------------------------*/ + +static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_QUOTA_NEW_CTL: { + struct vzctl_quotactl qb; + + err = -EFAULT; + if (copy_from_user(&qb, (void __user *)arg, sizeof(qb))) + break; + err = do_vzquotactl(qb.cmd, qb.quota_id, + qb.qstat, qb.ve_root, 0); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_QUOTA_UGID_CTL: { + struct vzctl_quotaugidctl qub; + + err = -EFAULT; + if (copy_from_user(&qub, (void __user *)arg, sizeof(qub))) + break; + err = do_vzquotaugidctl(qub.cmd, qub.quota_id, + qub.ugid_index, qub.ugid_size, qub.addr, 0); + break; + } +#endif + default: + err = -ENOTTY; + } + return err; +} + +#ifdef CONFIG_COMPAT +static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case VZCTL_COMPAT_QUOTA_CTL: { + struct compat_vzctl_quotactl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = do_vzquotactl(cs.cmd, cs.quota_id, + compat_ptr(cs.qstat), + compat_ptr(cs.ve_root), 1); + break; + } +#ifdef CONFIG_VZ_QUOTA_UGID + case VZCTL_COMPAT_QUOTA_UGID_CTL: { + struct compat_vzctl_quotaugidctl cs; + + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index, + cs.ugid_size, compat_ptr(cs.addr), 1); + break; + } +#endif + default: + err = -ENOIOCTLCMD; + } + return err; +} +#endif + +static struct vzioctlinfo vzdqcalls = { + .type = VZDQCTLTYPE, + .ioctl = vzquota_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzquota_ioctl, +#endif + .owner = THIS_MODULE, +}; + +/** + * vzquota_dstat - get quota usage info for virtual superblock + */ +static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) +{ + struct vz_quota_master *qmblk; + + qmblk = vzquota_find_qmblk(super); + if (qmblk == NULL) + return -ENOENT; + if (qmblk == VZ_QUOTA_BAD) { + memset(qstat, 0, sizeof(*qstat)); + return 0; + } + + qmblk_data_read_lock(qmblk); + memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); + qmblk_data_read_unlock(qmblk); + qmblk_put(qmblk); + return 0; +} + + +/* ---------------------------------------------------------------------- + * + * Init/exit helpers + * + * ---------------------------------------------------------------------*/ + +static int vzquota_cache_init(void) +{ + int i; + + vzquota_cachep = kmem_cache_create("vz_quota_master", + sizeof(struct vz_quota_master), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (vzquota_cachep == NULL) { + printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); + goto nomem2; + } + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + INIT_LIST_HEAD(&vzquota_hash_table[i]); + + return 0; + +nomem2: + return -ENOMEM; +} + +static void vzquota_cache_release(void) +{ + int i; + + /* sanity check */ + for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) + if (!list_empty(&vzquota_hash_table[i])) + BUG(); + + /* release 
caches */ + kmem_cache_destroy(vzquota_cachep); + vzquota_cachep = NULL; +} + +static int quota_notifier_call(struct vnotifier_block *self, + unsigned long n, void *data, int err) +{ + struct virt_info_quota *viq; + struct super_block *sb; + + viq = (struct virt_info_quota *)data; + switch (n) { + case VIRTINFO_QUOTA_ON: + err = NOTIFY_BAD; + if (!try_module_get(THIS_MODULE)) + break; + sb = viq->super; + memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); + INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); + INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_OFF: + module_put(THIS_MODULE); + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_GETSTAT: + err = NOTIFY_BAD; + if (vzquota_dstat(viq->super, viq->qstat)) + break; + err = NOTIFY_OK; + break; + case VIRTINFO_QUOTA_DISABLE: + err = NOTIFY_OK; + vzquota_inode_off((struct inode *)data); + break; + } + return err; +} + +struct vnotifier_block quota_notifier_block = { + .notifier_call = quota_notifier_call, + .priority = INT_MAX, +}; + +/* ---------------------------------------------------------------------- + * + * Init/exit procedures + * + * ---------------------------------------------------------------------*/ + +static int __init vzquota_init(void) +{ + int err; + + if ((err = vzquota_cache_init()) != 0) + goto out_cache; + + if ((err = vzquota_proc_init()) != 0) + goto out_proc; + +#ifdef CONFIG_VZ_QUOTA_UGID + if ((err = vzquota_ugid_init()) != 0) + goto out_ugid; +#endif + + init_MUTEX(&vz_quota_sem); + vzioctl_register(&vzdqcalls); + virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block); +#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) + vzaquota_init(); +#endif + + return 0; + +#ifdef CONFIG_VZ_QUOTA_UGID +out_ugid: + vzquota_proc_release(); +#endif +out_proc: + vzquota_cache_release(); +out_cache: + return err; +} + +#if defined(VZ_QUOTA_UNLOAD) +static void __exit vzquota_release(void) +{ + virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block); + vzioctl_unregister(&vzdqcalls); +#ifdef CONFIG_VZ_QUOTA_UGID +#ifdef CONFIG_PROC_FS + vzaquota_fini(); +#endif + vzquota_ugid_release(); +#endif + vzquota_proc_release(); + vzquota_cache_release(); +} +#endif + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Disk Quota"); +MODULE_LICENSE("GPL v2"); + +module_init(vzquota_init) +#if defined(VZ_QUOTA_UNLOAD) +module_exit(vzquota_release) +#endif diff -uprN linux-2.6.24/grsecurity/Kconfig linux-2.6.24.ovz/grsecurity/Kconfig --- linux-2.6.24/grsecurity/Kconfig 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,129 @@ +# +# grsecurity configuration +# + +menu "Grsecurity" + +config GRKERNSEC + bool "Grsecurity" + help + If you say Y here, you will be able to configure many features + that will enhance the security of your system. It is highly + recommended that you say Y here and read through the help + for each option so that you fully understand the features and + can evaluate their usefulness for your machine. + +menu "Executable Protections" +depends on GRKERNSEC + +config GRKERNSEC_TPE + bool "Trusted Path Execution (TPE)" + help + If you say Y here, you will be able to choose a gid to add to the + supplementary groups of users you want to mark as "untrusted." + These users will not be able to execute any files that are not in + root-owned directories writable only by root. If the sysctl option + is enabled, a sysctl option with name "tpe" is created.
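The policy described in the help text above reduces to a small directory test. The following is an illustrative sketch only, not part of the patch; the helper name is hypothetical, and the enforcement actually added by this patch is gr_tpe_allow() in grsecurity/grsec_tpe.c:

#include <linux/fs.h>
#include <linux/stat.h>

/*
 * A directory is "trusted" for TPE purposes when it is owned by root and
 * writable by no one except root (neither group- nor world-writable).
 * Execution from anywhere else is refused for users in the untrusted group.
 */
static int tpe_dir_is_trusted(const struct inode *dir)
{
	return dir->i_uid == 0 && !(dir->i_mode & (S_IWGRP | S_IWOTH));
}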
+ +config GRKERNSEC_TPE_ALL + bool "Partially restrict non-root users" + depends on GRKERNSEC_TPE + help + If you say Y here, All non-root users other than the ones in the + group specified in the main TPE option will only be allowed to + execute files in directories they own that are not group or + world-writable, or in directories owned by root and writable only by + root. If the sysctl option is enabled, a sysctl option with name + "tpe_restrict_all" is created. + +config GRKERNSEC_TPE_INVERT + bool "Invert GID option" + depends on GRKERNSEC_TPE + help + If you say Y here, the group you specify in the TPE configuration will + decide what group TPE restrictions will be *disabled* for. This + option is useful if you want TPE restrictions to be applied to most + users on the system. + +config GRKERNSEC_TPE_GID + int "GID for untrusted users" + depends on GRKERNSEC_TPE && !GRKERNSEC_TPE_INVERT + default 1005 + help + If you have selected the "Invert GID option" above, setting this + GID determines what group TPE restrictions will be *disabled* for. + If you have not selected the "Invert GID option" above, setting this + GID determines what group TPE restrictions will be *enabled* for. + If the sysctl option is enabled, a sysctl option with name "tpe_gid" + is created. + +config GRKERNSEC_TPE_GID + int "GID for trusted users" + depends on GRKERNSEC_TPE && GRKERNSEC_TPE_INVERT + default 1005 + help + If you have selected the "Invert GID option" above, setting this + GID determines what group TPE restrictions will be *disabled* for. + If you have not selected the "Invert GID option" above, setting this + GID determines what group TPE restrictions will be *enabled* for. + If the sysctl option is enabled, a sysctl option with name "tpe_gid" + is created. + +endmenu +menu "Sysctl support" +depends on GRKERNSEC && SYSCTL + +config GRKERNSEC_SYSCTL + bool "Sysctl support" + help + If you say Y here, you will be able to change the options that + grsecurity runs with at bootup, without having to recompile your + kernel. You can echo values to files in /proc/sys/kernel/grsecurity + to enable (1) or disable (0) various features. All the sysctl entries + are mutable until the "grsec_lock" entry is set to a non-zero value. + All features enabled in the kernel configuration are disabled at boot + if you do not say Y to the "Turn on features by default" option. + All options should be set at startup, and the grsec_lock entry should + be set to a non-zero value after all the options are set. + *THIS IS EXTREMELY IMPORTANT* + +config GRKERNSEC_SYSCTL_ON + bool "Turn on features by default" + depends on GRKERNSEC_SYSCTL + help + If you say Y here, instead of having all features enabled in the + kernel configuration disabled at boot time, the features will be + enabled at boot time. It is recommended you say Y here unless + there is some reason you would want all sysctl-tunable features to + be disabled by default. As mentioned elsewhere, it is important + to enable the grsec_lock entry once you have finished modifying + the sysctl entries. + +endmenu + +menu "Logging Options" +depends on GRKERNSEC + +config GRKERNSEC_FLOODTIME + int "Seconds in between log messages (minimum)" + default 10 + help + This option allows you to enforce the number of seconds between + grsecurity log messages. The default should be suitable for most + people, however, if you choose to change it, choose a value small enough + to allow informative logs to be produced, but large enough to + prevent flooding. 
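Taken together with GRKERNSEC_FLOODBURST below, this gives a simple rate limit: at most CONFIG_GRKERNSEC_FLOODBURST alert messages are logged per CONFIG_GRKERNSEC_FLOODTIME-second window, after which further alerts are dropped until the window expires. A minimal sketch of that interaction, assuming a hypothetical helper (the real logic added by this patch is gr_log_start() in grsecurity/grsec_log.c):

#include <linux/jiffies.h>

/*
 * Return 1 if another alert may be emitted in the current flood window,
 * 0 if the burst limit has been reached and the message should be dropped.
 */
static int grsec_flood_allow(unsigned long *window_start, unsigned int *emitted)
{
	if (!*window_start ||
	    time_after(jiffies, *window_start + CONFIG_GRKERNSEC_FLOODTIME * HZ)) {
		*window_start = jiffies;	/* open a new window */
		*emitted = 0;
	}
	if (*emitted >= CONFIG_GRKERNSEC_FLOODBURST)
		return 0;			/* flooding: drop this alert */
	(*emitted)++;
	return 1;
}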
+ +config GRKERNSEC_FLOODBURST + int "Number of messages in a burst (maximum)" + default 4 + help + This option allows you to choose the maximum number of messages allowed + within the flood time interval you chose in a separate option. The + default should be suitable for most people, however if you find that + many of your logs are being interpreted as flooding, you may want to + raise this value. + +endmenu + +endmenu diff -uprN linux-2.6.24/grsecurity/Makefile linux-2.6.24.ovz/grsecurity/Makefile --- linux-2.6.24/grsecurity/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,15 @@ +# grsecurity's ACL system was originally written in 2001 by Michael Dalton +# during 2001-2005 it has been completely redesigned by Brad Spengler +# into an RBAC system +# +# All code in this directory and various hooks inserted throughout the kernel +# are copyright Brad Spengler, and released under the GPL v2 or higher + +obj-y = grsec_tpe.o grsec_sysctl.o + +obj-$(CONFIG_GRKERNSEC) += grsec_init.o gracl.o grsec_log.o + +ifndef CONFIG_GRKERNSEC +obj-y += grsec_disabled.o +endif + diff -uprN linux-2.6.24/grsecurity/gracl.c linux-2.6.24.ovz/grsecurity/gracl.c --- linux-2.6.24/grsecurity/gracl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/gracl.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,137 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +extern char *gr_shared_page[4]; + +static char * +gen_full_path(struct dentry *dentry, struct vfsmount *vfsmnt, + struct dentry *root, struct vfsmount *rootmnt, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + int namelen = 0; + + *--end = '\0'; + + retval = end - 1; + *retval = '/'; + + if (dentry == root && vfsmnt == rootmnt) + return retval; + if (dentry != vfsmnt->mnt_root && !IS_ROOT(dentry)) { + namelen = strlen(dentry->d_name.name); + buflen -= namelen; + if (buflen < 2) + goto err; + if (dentry->d_parent != root || vfsmnt != rootmnt) + buflen--; + } + + retval = __d_path(dentry->d_parent, vfsmnt, root, rootmnt, buf, buflen); + if (unlikely(IS_ERR(retval))) +err: + retval = strcpy(buf, ""); + else if (namelen != 0) { + end = buf + buflen - 1; // accounts for null termination + if (dentry->d_parent != root || vfsmnt != rootmnt) + *end++ = '/'; // accounted for above with buflen-- + memcpy(end, dentry->d_name.name, namelen); + } + + return retval; +} + +static char * +d_real_path(const struct dentry *dentry, const struct vfsmount *vfsmnt, + char *buf, int buflen) +{ + char *res; + struct dentry *root; + struct vfsmount *rootmnt; + + /* we can't use real_root, real_root_mnt, because they belong only to the RBAC system */ +#ifdef CONFIG_VE + /* Don't use child_reaper, because it's VE0 process */ + root = dget(get_exec_env()->fs_root); + rootmnt = mntget(get_exec_env()->fs_rootmnt); +#else + read_lock(&child_reaper->fs->lock); + root = dget(child_reaper->fs->root); + rootmnt = mntget(child_reaper->fs->rootmnt); + read_unlock(&child_reaper->fs->lock); +#endif + + spin_lock(&dcache_lock); + res = gen_full_path((struct dentry *)dentry, (struct vfsmount *)vfsmnt, root, rootmnt, buf, buflen); + spin_unlock(&dcache_lock); + + dput(root); + mntput(rootmnt); + return res; +} + +char * +gr_to_filename(const struct dentry *dentry, const struct vfsmount 
*mnt) +{ + return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[0], smp_processor_id()), + PAGE_SIZE); +} + +char * +gr_to_filename2(const struct dentry *dentry, const struct vfsmount *mnt) +{ + return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[2], smp_processor_id()), + PAGE_SIZE); +} + +char * +gr_to_filename3(const struct dentry *dentry, const struct vfsmount *mnt) +{ + return d_real_path(dentry, mnt, per_cpu_ptr(gr_shared_page[3], smp_processor_id()), + PAGE_SIZE); +} + +int +gr_acl_handle_mmap(const struct file *file, const unsigned long prot) +{ + if (unlikely(!file || !(prot & PROT_EXEC))) + return 1; + + if (!gr_tpe_allow(file)) + return 0; + return 1; +} + +int +gr_acl_handle_mprotect(const struct file *file, const unsigned long prot) +{ + if (unlikely(!file || !(prot & PROT_EXEC))) + return 1; + + if (!gr_tpe_allow(file)) + return 0; + return 1; +} diff -uprN linux-2.6.24/grsecurity/grsec_disabled.c linux-2.6.24.ovz/grsecurity/grsec_disabled.c --- linux-2.6.24/grsecurity/grsec_disabled.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/grsec_disabled.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,39 @@ +#include +#include +#include + +void +gr_copy_label(struct task_struct *tsk) +{ + return; +} + +int +gr_acl_handle_mmap(const struct file *file, const unsigned long prot, + unsigned int *vm_flags) +{ + return 1; +} + +void +grsecurity_init(void) +{ + return; +} + +void +gr_acl_handle_exit(void) +{ + return; +} + +int +gr_acl_handle_mprotect(const struct file *file, const unsigned long prot) +{ + return 1; +} + +void grsecurity_setup(void) +{ +} +EXPORT_SYMBOL(grsecurity_setup); diff -uprN linux-2.6.24/grsecurity/grsec_init.c linux-2.6.24.ovz/grsecurity/grsec_init.c --- linux-2.6.24/grsecurity/grsec_init.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/grsec_init.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_VE +#include +#else +int grsec_enable_tpe; +int grsec_tpe_gid; +int grsec_enable_tpe_all; +int grsec_lock; +#endif + +spinlock_t grsec_alert_lock = SPIN_LOCK_UNLOCKED; + +unsigned long grsec_alert_wtime = 0; +unsigned long grsec_alert_fyet = 0; + +spinlock_t grsec_audit_lock = SPIN_LOCK_UNLOCKED; + +char *gr_shared_page[4]; + +char *gr_alert_log_fmt; +char *gr_audit_log_fmt; + +char *gr_alert_log_buf; +char *gr_audit_log_buf; + +void grsecurity_setup(void) +{ +#if !defined(CONFIG_GRKERNSEC_SYSCTL) || defined(CONFIG_GRKERNSEC_SYSCTL_ON) +#ifndef CONFIG_GRKERNSEC_SYSCTL + grsec_lock = 1; +#endif +#ifdef CONFIG_GRKERNSEC_TPE + grsec_enable_tpe = 1; + grsec_tpe_gid = CONFIG_GRKERNSEC_TPE_GID; +#ifdef CONFIG_GRKERNSEC_TPE_ALL + grsec_enable_tpe_all = 1; +#endif +#endif +#endif +} +EXPORT_SYMBOL(grsecurity_setup); + +void +grsecurity_init(void) +{ + int j; + /* create the per-cpu shared pages */ + + for (j = 0; j < 4; j++) { + gr_shared_page[j] = (char *)__alloc_percpu(PAGE_SIZE); + if (gr_shared_page[j] == NULL) { + panic("Unable to allocate grsecurity shared page"); + return; + } + } + + /* allocate log buffers */ + gr_alert_log_fmt = kmalloc(512, GFP_KERNEL); + if (!gr_alert_log_fmt) { + panic("Unable to allocate grsecurity alert log format buffer"); + return; + } + gr_audit_log_fmt = kmalloc(512, GFP_KERNEL); + if (!gr_audit_log_fmt) { + panic("Unable to allocate grsecurity audit log format buffer"); + return; + } + gr_alert_log_buf = (char *) get_zeroed_page(GFP_KERNEL); + if (!gr_alert_log_buf) { + panic("Unable to 
allocate grsecurity alert log buffer"); + return; + } + gr_audit_log_buf = (char *) get_zeroed_page(GFP_KERNEL); + if (!gr_audit_log_buf) { + panic("Unable to allocate grsecurity audit log buffer"); + return; + } + grsecurity_setup(); + + return; +} diff -uprN linux-2.6.24/grsecurity/grsec_log.c linux-2.6.24.ovz/grsecurity/grsec_log.c --- linux-2.6.24/grsecurity/grsec_log.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/grsec_log.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include + +#define BEGIN_LOCKS(x) \ + if (x != GR_DO_AUDIT) \ + spin_lock(&grsec_alert_lock); \ + else \ + spin_lock(&grsec_audit_lock) + +#define END_LOCKS(x) \ + if (x != GR_DO_AUDIT) \ + spin_unlock(&grsec_alert_lock); \ + else \ + spin_unlock(&grsec_audit_lock); + +enum { + FLOODING, + NO_FLOODING +}; + +extern char *gr_alert_log_fmt; +extern char *gr_audit_log_fmt; +extern char *gr_alert_log_buf; +extern char *gr_audit_log_buf; + +static int gr_log_start(int audit) +{ + char *loglevel = (audit == GR_DO_AUDIT) ? KERN_INFO : KERN_ALERT; + char *fmt = (audit == GR_DO_AUDIT) ? gr_audit_log_fmt : gr_alert_log_fmt; + char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; + + if (audit == GR_DO_AUDIT) + goto set_fmt; + + if (!grsec_alert_wtime || jiffies - grsec_alert_wtime > CONFIG_GRKERNSEC_FLOODTIME * HZ) { + grsec_alert_wtime = jiffies; + grsec_alert_fyet = 0; + } else if ((jiffies - grsec_alert_wtime < CONFIG_GRKERNSEC_FLOODTIME * HZ) && (grsec_alert_fyet < CONFIG_GRKERNSEC_FLOODBURST)) { + grsec_alert_fyet++; + } else if (grsec_alert_fyet == CONFIG_GRKERNSEC_FLOODBURST) { + grsec_alert_wtime = jiffies; + grsec_alert_fyet++; + ve_printk(VE_LOG, KERN_ALERT "grsec: more alerts, logging disabled for %d seconds\n", CONFIG_GRKERNSEC_FLOODTIME); + return FLOODING; + } else return FLOODING; + +set_fmt: + memset(buf, 0, PAGE_SIZE); + sprintf(fmt, "%s%s", loglevel, "grsec: "); + strcpy(buf, fmt); + + return NO_FLOODING; +} + +static void gr_log_middle(int audit, const char *msg, va_list ap) +{ + char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; + unsigned int len = strlen(buf); + + vsnprintf(buf + len, PAGE_SIZE - len - 1, msg, ap); + + return; +} + +static void gr_log_middle_varargs(int audit, const char *msg, ...) +{ + char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; + unsigned int len = strlen(buf); + va_list ap; + + va_start(ap, msg); + vsnprintf(buf + len, PAGE_SIZE - len - 1, msg, ap); + va_end(ap); + + return; +} + +static void gr_log_end(int audit) +{ + char *buf = (audit == GR_DO_AUDIT) ? gr_audit_log_buf : gr_alert_log_buf; + unsigned int len = strlen(buf); + + snprintf(buf + len, PAGE_SIZE - len - 1, DEFAULTSECMSG, DEFAULTSECARGS(current)); + ve_printk(VE_LOG, "%s\n", buf); + + return; +} + +void gr_log_varargs(int audit, const char *msg, int argtypes, ...) 
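+/*
+ * Main logging entry point: takes the alert or audit lock, lets
+ * gr_log_start() apply the flood limit, formats the variadic arguments
+ * (only the GR_FILENAME layout gets special handling in this VZ port)
+ * and emits the finished record through gr_log_end().
+ */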
+{ + int logtype; + struct dentry *dentry; + struct vfsmount *mnt; + va_list ap; + + BEGIN_LOCKS(audit); + logtype = gr_log_start(audit); + if (logtype == FLOODING) { + END_LOCKS(audit); + return; + } + va_start(ap, argtypes); + switch (argtypes) { + /* + * Only GR_FILENAME is now supported in VZ + */ + case GR_FILENAME: + dentry = va_arg(ap, struct dentry *); + mnt = va_arg(ap, struct vfsmount *); + gr_log_middle_varargs(audit, msg, gr_to_filename(dentry, mnt)); + break; + default: + gr_log_middle(audit, msg, ap); + } + va_end(ap); + gr_log_end(audit); + END_LOCKS(audit); +} diff -uprN linux-2.6.24/grsecurity/grsec_sysctl.c linux-2.6.24.ovz/grsecurity/grsec_sysctl.c --- linux-2.6.24/grsecurity/grsec_sysctl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/grsec_sysctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include + +int +gr_handle_sysctl_mod(const char *dirname, const char *name, const int op) +{ +#ifdef CONFIG_GRKERNSEC_SYSCTL + if (!strcmp(dirname, "grsecurity") && grsec_lock && (op & 002)) { + gr_log_str(GR_DONT_AUDIT, GR_SYSCTL_MSG, name); + return -EACCES; + } +#endif + return 0; +} + +#ifdef CONFIG_GRKERNSEC_SYSCTL +static int grsec_proc_dointvec(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; +#ifdef CONFIG_VE + struct ctl_table fake_table; + struct ve_struct *env = get_exec_env(); + + if (!ve_is_super(env)) { + memcpy(&fake_table, ctl, sizeof(struct ctl_table)); + fake_table.data = (char *)((unsigned long)&env->grsec + + (unsigned long)ctl->data - + (unsigned long)&get_ve0()->grsec); + ctl = &fake_table; + } +#endif + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + return ret; +} + +enum {GS_TPE = 1, GS_TPE_GID, GS_TPE_ALL, GS_LOCK}; + +ctl_table grsecurity_table[] = { +#ifdef CONFIG_GRKERNSEC_TPE + { + .ctl_name = GS_TPE, + .procname = "tpe", + .data = &ve0.grsec.enable_tpe, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &grsec_proc_dointvec, + .virt_handler = 1, + }, + { + .ctl_name = GS_TPE_GID, + .procname = "tpe_gid", + .data = &ve0.grsec.tpe_gid, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &grsec_proc_dointvec, + .virt_handler = 1, + }, +#endif +#ifdef CONFIG_GRKERNSEC_TPE_ALL + { + .ctl_name = GS_TPE_ALL, + .procname = "tpe_restrict_all", + .data = &ve0.grsec.enable_tpe_all, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &grsec_proc_dointvec, + .virt_handler = 1, + }, +#endif + { + .ctl_name = GS_LOCK, + .procname = "grsec_lock", + .data = &ve0.grsec.lock, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &grsec_proc_dointvec, + .virt_handler = 1, + }, + { .ctl_name = 0 } +}; +#endif diff -uprN linux-2.6.24/grsecurity/grsec_tpe.c linux-2.6.24.ovz/grsecurity/grsec_tpe.c --- linux-2.6.24/grsecurity/grsec_tpe.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/grsecurity/grsec_tpe.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include + +extern int gr_acl_tpe_check(void); + +int +gr_tpe_allow(const struct file *file) +{ +#ifdef CONFIG_GRKERNSEC_TPE + struct inode *inode = file->f_dentry->d_parent->d_inode; + + if (current->uid && ((grsec_enable_tpe && +#ifdef CONFIG_GRKERNSEC_TPE_INVERT + !in_group_p(grsec_tpe_gid) +#else + in_group_p(grsec_tpe_gid) +#endif + )) && + (inode->i_uid || (!inode->i_uid && ((inode->i_mode & S_IWGRP) || + (inode->i_mode & S_IWOTH))))) { + gr_log_fs_generic(GR_DONT_AUDIT, GR_EXEC_TPE_MSG, 
file->f_dentry, file->f_vfsmnt); + return 0; + } +#ifdef CONFIG_GRKERNSEC_TPE_ALL + if (current->uid && grsec_enable_tpe && grsec_enable_tpe_all && + ((inode->i_uid && (inode->i_uid != current->uid)) || + (inode->i_mode & S_IWGRP) || (inode->i_mode & S_IWOTH))) { + gr_log_fs_generic(GR_DONT_AUDIT, GR_EXEC_TPE_MSG, file->f_dentry, file->f_vfsmnt); + return 0; + } +#endif +#endif + return 1; +} diff -uprN linux-2.6.24/include/asm-ia64/mman.h linux-2.6.24.ovz/include/asm-ia64/mman.h --- linux-2.6.24/include/asm-ia64/mman.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-ia64/mman.h 2008-03-25 18:53:59.000000000 -0500 @@ -18,6 +18,7 @@ #define MAP_NORESERVE 0x04000 /* don't check for reservations */ #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff -uprN linux-2.6.24/include/asm-ia64/pgalloc.h linux-2.6.24.ovz/include/asm-ia64/pgalloc.h --- linux-2.6.24/include/asm-ia64/pgalloc.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-ia64/pgalloc.h 2008-03-25 18:53:59.000000000 -0500 @@ -20,11 +20,13 @@ #include #include +#include + #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); } static inline void pgd_free(pgd_t * pgd) @@ -41,7 +43,7 @@ pgd_populate(struct mm_struct *mm, pgd_t static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); } static inline void pud_free(pud_t * pud) @@ -59,7 +61,7 @@ pud_populate(struct mm_struct *mm, pud_t static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); } static inline void pmd_free(pmd_t * pmd) @@ -84,7 +86,7 @@ pmd_populate_kernel(struct mm_struct *mm static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr) { - void *pg = quicklist_alloc(0, GFP_KERNEL, NULL); + void *pg = quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); return pg ? virt_to_page(pg) : NULL; } diff -uprN linux-2.6.24/include/asm-ia64/processor.h linux-2.6.24.ovz/include/asm-ia64/processor.h --- linux-2.6.24/include/asm-ia64/processor.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-ia64/processor.h 2008-03-25 18:53:59.000000000 -0500 @@ -297,7 +297,7 @@ struct thread_struct { regs->loadrs = 0; \ regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!get_dumpable(current->mm))) { \ + if (unlikely(!get_dumpable(current->mm) || !current->mm->vps_dumpable)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. 
\ diff -uprN linux-2.6.24/include/asm-ia64/unistd.h linux-2.6.24.ovz/include/asm-ia64/unistd.h --- linux-2.6.24/include/asm-ia64/unistd.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-ia64/unistd.h 2008-03-25 18:53:59.000000000 -0500 @@ -299,11 +299,23 @@ #define __NR_signalfd 1307 #define __NR_timerfd 1308 #define __NR_eventfd 1309 +#define __NR_fairsched_vcpus 1499 +#define __NR_fairsched_mknod 1500 +#define __NR_fairsched_rmnod 1501 +#define __NR_fairsched_chwt 1502 +#define __NR_fairsched_mvpr 1503 +#define __NR_fairsched_rate 1504 +#define __NR_getluid 1505 +#define __NR_setluid 1506 +#define __NR_setublimit 1507 +#define __NR_ubstat 1508 +#define __NR_lchmod 1509 +#define __NR_lutime 1510 #ifdef __KERNEL__ -#define NR_syscalls 286 /* length of syscall table */ +#define NR_syscalls 487 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about diff -uprN linux-2.6.24/include/asm-powerpc/elf.h linux-2.6.24.ovz/include/asm-powerpc/elf.h --- linux-2.6.24/include/asm-powerpc/elf.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-powerpc/elf.h 2008-03-25 18:53:59.000000000 -0500 @@ -280,7 +280,8 @@ extern int ucache_bsize; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int executable_stack, + unsigned long map_address); #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); /* diff -uprN linux-2.6.24/include/asm-powerpc/mman.h linux-2.6.24.ovz/include/asm-powerpc/mman.h --- linux-2.6.24/include/asm-powerpc/mman.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-powerpc/mman.h 2008-03-25 18:53:59.000000000 -0500 @@ -23,5 +23,6 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ #endif /* _ASM_POWERPC_MMAN_H */ diff -uprN linux-2.6.24/include/asm-powerpc/pgalloc-64.h linux-2.6.24.ovz/include/asm-powerpc/pgalloc-64.h --- linux-2.6.24/include/asm-powerpc/pgalloc-64.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-powerpc/pgalloc-64.h 2008-03-25 18:53:59.000000000 -0500 @@ -22,7 +22,8 @@ extern struct kmem_cache *pgtable_cache[ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); + return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], + GFP_KERNEL_UBC | __GFP_SOFT_UBC); } static inline void pgd_free(pgd_t *pgd) @@ -37,7 +38,7 @@ static inline void pgd_free(pgd_t *pgd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], - GFP_KERNEL|__GFP_REPEAT); + GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); } static inline void pud_free(pud_t *pud) @@ -81,16 +82,21 @@ static inline void pmd_free(pmd_t *pmd) kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); } +static inline pte_t *do_pte_alloc(gfp_t flags) +{ + return (pte_t *)__get_free_page(flags); +} + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + return do_pte_alloc(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = pte_alloc_one_kernel(mm, address); + pte_t *pte = do_pte_alloc(GFP_KERNEL_UBC | __GFP_SOFT_UBC | __GFP_ZERO); return pte ? 
virt_to_page(pte) : NULL; } diff -uprN linux-2.6.24/include/asm-powerpc/systbl.h linux-2.6.24.ovz/include/asm-powerpc/systbl.h --- linux-2.6.24/include/asm-powerpc/systbl.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-powerpc/systbl.h 2008-03-25 18:53:59.000000000 -0500 @@ -303,7 +303,7 @@ SYSCALL_SPU(readlinkat) SYSCALL_SPU(fchmodat) SYSCALL_SPU(faccessat) COMPAT_SYS_SPU(get_robust_list) -COMPAT_SYS_SPU(set_robust_list) +COMPAT_SYS_SPU(set_robust_list) /* 300 */ COMPAT_SYS_SPU(move_pages) SYSCALL_SPU(getcpu) COMPAT_SYS(epoll_pwait) @@ -313,3 +313,19 @@ COMPAT_SYS_SPU(timerfd) SYSCALL_SPU(eventfd) COMPAT_SYS_SPU(sync_file_range2) COMPAT_SYS(fallocate) +SYS_SKIP(310, 400) +SYSCALL(ni_syscall) +SYS_SKIP_END() +SYSCALL(fairsched_mknod) /* 400 */ +SYSCALL(fairsched_rmnod) +SYSCALL(fairsched_chwt) +SYSCALL(fairsched_mvpr) +SYSCALL(fairsched_rate) +SYSCALL(fairsched_vcpus) +SYS_SKIP(406, 410) +SYSCALL(ni_syscall) +SYS_SKIP_END() +SYSCALL(getluid) /* 410 */ +SYSCALL(setluid) +SYSCALL(setublimit) +SYSCALL(ubstat) diff -uprN linux-2.6.24/include/asm-powerpc/unistd.h linux-2.6.24.ovz/include/asm-powerpc/unistd.h --- linux-2.6.24/include/asm-powerpc/unistd.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-powerpc/unistd.h 2008-03-25 18:53:59.000000000 -0500 @@ -333,9 +333,14 @@ #define __NR_sync_file_range2 308 #define __NR_fallocate 309 +#define __NR_getluid 410 +#define __NR_setluid 411 +#define __NR_setublimit 412 +#define __NR_ubstat 413 + #ifdef __KERNEL__ -#define __NR_syscalls 310 +#define __NR_syscalls 414 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff -uprN linux-2.6.24/include/asm-sparc64/mman.h linux-2.6.24.ovz/include/asm-sparc64/mman.h --- linux-2.6.24/include/asm-sparc64/mman.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-sparc64/mman.h 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,7 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ /* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system * XXX calls. diff -uprN linux-2.6.24/include/asm-sparc64/pgalloc.h linux-2.6.24.ovz/include/asm-sparc64/pgalloc.h --- linux-2.6.24/include/asm-sparc64/pgalloc.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-sparc64/pgalloc.h 2008-03-25 18:53:59.000000000 -0500 @@ -17,7 +17,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC, NULL); } static inline void pgd_free(pgd_t *pgd) @@ -29,7 +29,7 @@ static inline void pgd_free(pgd_t *pgd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_REPEAT, NULL); } static inline void pmd_free(pmd_t *pmd) @@ -46,7 +46,7 @@ static inline pte_t *pte_alloc_one_kerne static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - void *pg = quicklist_alloc(0, GFP_KERNEL, NULL); + void *pg = quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_REPEAT, NULL); return pg ? 
virt_to_page(pg) : NULL; } diff -uprN linux-2.6.24/include/asm-sparc64/thread_info.h linux-2.6.24.ovz/include/asm-sparc64/thread_info.h --- linux-2.6.24/include/asm-sparc64/thread_info.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-sparc64/thread_info.h 2008-03-25 18:53:59.000000000 -0500 @@ -162,14 +162,14 @@ register struct thread_info *current_thr struct thread_info *ret; \ \ ret = (struct thread_info *) \ - __get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER); \ + __get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER);\ if (ret) \ memset(ret, 0, PAGE_SIZE<<__THREAD_INFO_ORDER); \ ret; \ }) #else #define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER)) + ((struct thread_info *)__get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER)) #endif #define free_thread_info(ti) \ @@ -236,6 +236,7 @@ register struct thread_info *current_thr #define TIF_ABI_PENDING 12 #define TIF_MEMDIE 13 #define TIF_POLLING_NRFLAG 14 +#define TIF_FREEZE 15 /* Freeze request (atomic PF_FREEZE) */ #define _TIF_SYSCALL_TRACE (1< -# define STACK_TOP TASK_SIZE +# define STACK_TOP (TASK_SIZE - PAGE_SIZE) /* +1 page for vdso */ # ifdef CONFIG_X86_32 # define STACK_TOP_MAX STACK_TOP # else diff -uprN linux-2.6.24/include/asm-x86/elf.h linux-2.6.24.ovz/include/asm-x86/elf.h --- linux-2.6.24/include/asm-x86/elf.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/elf.h 2008-03-25 18:53:59.000000000 -0500 @@ -262,7 +262,7 @@ extern void __kernel_vsyscall; /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ -do if (vdso_enabled) { \ +do if (vdso_enabled && sysctl_at_vsyscall) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } while (0) @@ -283,7 +283,8 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int executable_stack, + unsigned long map_address); #endif /* __KERNEL__ */ diff -uprN linux-2.6.24/include/asm-x86/ia32.h linux-2.6.24.ovz/include/asm-x86/ia32.h --- linux-2.6.24/include/asm-x86/ia32.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/ia32.h 2008-03-25 18:53:59.000000000 -0500 @@ -156,7 +156,7 @@ struct ustat32 { char f_fpack[6]; }; -#define IA32_STACK_TOP IA32_PAGE_OFFSET +#define IA32_STACK_TOP (IA32_PAGE_OFFSET - PAGE_SIZE * 2) #ifdef __KERNEL__ struct user_desc; diff -uprN linux-2.6.24/include/asm-x86/mman.h linux-2.6.24.ovz/include/asm-x86/mman.h --- linux-2.6.24/include/asm-x86/mman.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/mman.h 2008-03-25 18:53:59.000000000 -0500 @@ -12,6 +12,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ diff -uprN linux-2.6.24/include/asm-x86/nmi_32.h linux-2.6.24.ovz/include/asm-x86/nmi_32.h --- linux-2.6.24/include/asm-x86/nmi_32.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/nmi_32.h 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,10 @@ extern void release_perfctr_nmi(unsigned extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); +typedef int (*nmi_callback_t)(struct pt_regs * 
regs, int cpu); +void set_nmi_ipi_callback(nmi_callback_t callback); +void unset_nmi_ipi_callback(void); + extern void setup_apic_nmi_watchdog (void *); extern void stop_apic_nmi_watchdog (void *); extern void disable_timer_nmi_watchdog(void); @@ -33,7 +37,7 @@ extern int nmi_watchdog_tick (struct pt_ extern atomic_t nmi_active; extern unsigned int nmi_watchdog; -#define NMI_DISABLED -1 +#define NMI_DISABLED -1 #define NMI_NONE 0 #define NMI_IO_APIC 1 #define NMI_LOCAL_APIC 2 diff -uprN linux-2.6.24/include/asm-x86/nmi_64.h linux-2.6.24.ovz/include/asm-x86/nmi_64.h --- linux-2.6.24/include/asm-x86/nmi_64.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/nmi_64.h 2008-03-25 18:53:59.000000000 -0500 @@ -35,6 +35,11 @@ static inline void unset_nmi_pm_callback } #endif /* CONFIG_PM */ + +typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu); +void set_nmi_ipi_callback(nmi_callback_t callback); +void unset_nmi_ipi_callback(void); + extern void default_do_nmi(struct pt_regs *); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); diff -uprN linux-2.6.24/include/asm-x86/pgalloc_64.h linux-2.6.24.ovz/include/asm-x86/pgalloc_64.h --- linux-2.6.24/include/asm-x86/pgalloc_64.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/pgalloc_64.h 2008-03-25 18:53:59.000000000 -0500 @@ -25,12 +25,14 @@ static inline void pmd_free(pmd_t *pmd) static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); } static inline void pud_free (pud_t *pud) @@ -60,7 +62,8 @@ static inline void pgd_list_del(pgd_t *p static inline pgd_t *pgd_alloc(struct mm_struct *mm) { unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); if (!pgd) return NULL; pgd_list_add(pgd); @@ -91,7 +94,8 @@ static inline pte_t *pte_alloc_one_kerne static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + void *p = (void *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| + __GFP_SOFT_UBC); if (!p) return NULL; return virt_to_page(p); diff -uprN linux-2.6.24/include/asm-x86/processor_64.h linux-2.6.24.ovz/include/asm-x86/processor_64.h --- linux-2.6.24/include/asm-x86/processor_64.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/processor_64.h 2008-03-25 18:53:59.000000000 -0500 @@ -142,7 +142,7 @@ static inline void clear_in_cr4 (unsigne /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) +#define IA32_PAGE_OFFSET 0xc0000000 #define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
IA32_PAGE_OFFSET : TASK_SIZE64) diff -uprN linux-2.6.24/include/asm-x86/thread_info_32.h linux-2.6.24.ovz/include/asm-x86/thread_info_32.h --- linux-2.6.24/include/asm-x86/thread_info_32.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/thread_info_32.h 2008-03-25 18:53:59.000000000 -0500 @@ -96,10 +96,10 @@ static inline struct thread_info *curren /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE #define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL| __GFP_ZERO, get_order(THREAD_SIZE))) + __get_free_pages(GFP_KERNEL_UBC| __GFP_ZERO, get_order(THREAD_SIZE))) #else #define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE))) + __get_free_pages(GFP_KERNEL_UBC, get_order(THREAD_SIZE))) #endif #define free_thread_info(info) free_pages((unsigned long)(info), get_order(THREAD_SIZE)) diff -uprN linux-2.6.24/include/asm-x86/thread_info_64.h linux-2.6.24.ovz/include/asm-x86/thread_info_64.h --- linux-2.6.24/include/asm-x86/thread_info_64.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/asm-x86/thread_info_64.h 2008-03-25 18:53:59.000000000 -0500 @@ -33,6 +33,7 @@ struct thread_info { mm_segment_t addr_limit; struct restart_block restart_block; + void *sysenter_return; }; #endif @@ -78,14 +79,15 @@ static inline struct thread_info *stack_ ({ \ struct thread_info *ret; \ \ - ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)); \ + ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,\ + THREAD_ORDER)); \ if (ret) \ memset(ret, 0, THREAD_SIZE); \ ret; \ }) #else #define alloc_thread_info(tsk) \ - ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) + ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,THREAD_ORDER)) #endif #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) @@ -123,6 +125,7 @@ static inline struct thread_info *stack_ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FREEZE 23 /* is freezing for suspend */ +#define TIF_RESUME 24 #define _TIF_SYSCALL_TRACE (1<mm->context.vdso) #define VSYSCALL32_END (VSYSCALL32_BASE + PAGE_SIZE) #define VSYSCALL32_EHDR ((const struct elf32_hdr *) VSYSCALL32_BASE) +#define __VSYSCALL32_BASE ((unsigned long)(IA32_PAGE_OFFSET - PAGE_SIZE)) +#define __VSYSCALL32_END (__VSYSCALL32_BASE + PAGE_SIZE) + #define VSYSCALL32_VSYSCALL ((void *)VSYSCALL32_BASE + 0x400) -#define VSYSCALL32_SYSEXIT ((void *)VSYSCALL32_BASE + 0x410) +#define VSYSCALL32_SYSEXIT ((void *)VSYSCALL32_BASE + 0x420) #define VSYSCALL32_SIGRETURN ((void __user *)VSYSCALL32_BASE + 0x500) #define VSYSCALL32_RTSIGRETURN ((void __user *)VSYSCALL32_BASE + 0x600) #endif diff -uprN linux-2.6.24/include/bc/beancounter.h linux-2.6.24.ovz/include/bc/beancounter.h --- linux-2.6.24/include/bc/beancounter.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/beancounter.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,451 @@ +/* + * include/bc/beancounter.h + * + * Copyright (C) 1999-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Andrey Savochkin saw@sw-soft.com + * + */ + +#ifndef _LINUX_BEANCOUNTER_H +#define _LINUX_BEANCOUNTER_H + +/* + * Generic ratelimiting stuff. 
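+ *
+ * ub_rate_info is a small leaky-bucket limiter: roughly, up to 'burst'
+ * events are admitted back to back, after which about one event per
+ * 'interval' jiffies gets through.  Illustrative use, following the same
+ * pattern as ub_debug_trace() in bc/debug.h:
+ *
+ *	static struct ub_rate_info ri = { .burst = 4, .interval = 10 * HZ };
+ *
+ *	if (ub_ratelimit(&ri))
+ *		printk(KERN_WARNING "noisy event\n");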
+ */ + +struct ub_rate_info { + int burst; + int interval; /* jiffy_t per event */ + int bucket; /* kind of leaky bucket */ + unsigned long last; /* last event */ +}; + +/* Return true if rate limit permits. */ +int ub_ratelimit(struct ub_rate_info *); + + +/* + * This magic is used to distinuish user beancounter and pages beancounter + * in struct page. page_ub and page_bc are placed in union and MAGIC + * ensures us that we don't use pbc as ubc in ub_page_uncharge(). + */ +#define UB_MAGIC 0x62756275 + +/* + * Resource list. + */ + +#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including + * struct task, page directories, etc. + */ +#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */ +#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially + * private pages as private and used. + */ +#define UB_SHMPAGES 3 /* IPC SHM segment size. */ +#define UB_DUMMY 4 /* Dummy resource (compatibility) */ +#define UB_NUMPROC 5 /* Number of processes. */ +#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */ +#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation, + * checked against PRIVVMPAGES. + */ +#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill. + * Only limit is used, no accounting. + */ +#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */ +#define UB_NUMFLOCK 10 /* Number of file locks. */ +#define UB_NUMPTY 11 /* Number of PTYs. */ +#define UB_NUMSIGINFO 12 /* Number of siginfos. */ +#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */ +#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */ +#define UB_OTHERSOCKBUF 15 /* Total size of other socket + * send buffers (all buffers for PF_UNIX). + */ +#define UB_DGRAMRCVBUF 16 /* Total size of other socket + * receive buffers. + */ +#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */ +#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */ +#define UB_NUMFILE 19 /* Number of open files. */ + +#define UB_RESOURCES_COMPAT 24 + +/* Add new resources here */ + +#define UB_NUMXTENT 23 +#define UB_RESOURCES 24 + +#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) +#define UB_TMPFSPAGES (UB_RESOURCES + 1) +#define UB_SWAPPAGES (UB_RESOURCES + 2) +#define UB_HELDPAGES (UB_RESOURCES + 3) + +struct ubparm { + /* + * A barrier over which resource allocations are failed gracefully. + * If the amount of consumed memory is over the barrier further sbrk() + * or mmap() calls fail, the existing processes are not killed. + */ + unsigned long barrier; + /* hard resource limit */ + unsigned long limit; + /* consumed resources */ + unsigned long held; + /* maximum amount of consumed resources through the last period */ + unsigned long maxheld; + /* minimum amount of consumed resources through the last period */ + unsigned long minheld; + /* count of failed charges */ + unsigned long failcnt; +}; + +/* + * Kernel internal part. + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. 
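+ * In other words, it is the largest value a struct ubparm barrier or
+ * limit can hold, which in practice means "no restriction".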
+ */ +#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) + + +/* + * Resource management structures + * Serialization issues: + * beancounter list management is protected via ub_hash_lock + * task pointers are set only for current task and only once + * refcount is managed atomically + * value and limit comparison and change are protected by per-ub spinlock + */ + +struct page_beancounter; +struct task_beancounter; +struct sock_beancounter; + +struct page_private { + unsigned long ubp_unused_privvmpages; + unsigned long ubp_tmpfs_respages; + unsigned long ubp_swap_pages; + unsigned long long ubp_held_pages; +}; + +struct sock_private { + unsigned long ubp_rmem_thres; + unsigned long ubp_wmem_pressure; + unsigned long ubp_maxadvmss; + unsigned long ubp_rmem_pressure; + int ubp_tw_count; +#define UB_RMEM_EXPAND 0 +#define UB_RMEM_KEEP 1 +#define UB_RMEM_SHRINK 2 + struct list_head ubp_other_socks; + struct list_head ubp_tcp_socks; + atomic_t ubp_orphan_count; +}; + +struct ub_percpu_struct { + unsigned long unmap; + unsigned long swapin; +#ifdef CONFIG_BC_IO_ACCOUNTING + unsigned long long bytes_wrote; + unsigned long long bytes_read; + unsigned long long bytes_cancelled; +#endif +#ifdef CONFIG_BC_DEBUG_KMEM + long pages_charged; + long vmalloc_charged; + long pbcs; +#endif + unsigned long sync; + unsigned long sync_done; + + unsigned long fsync; + unsigned long fsync_done; + + unsigned long fdsync; + unsigned long fdsync_done; + + unsigned long frsync; + unsigned long frsync_done; + + unsigned long write; + unsigned long read; + unsigned long long wchar; + unsigned long long rchar; +}; + +struct user_beancounter +{ + unsigned long ub_magic; + atomic_t ub_refcount; + struct list_head ub_list; + struct hlist_node ub_hash; + + union { + struct rcu_head rcu; + struct execute_work cleanup; + }; + + spinlock_t ub_lock; + uid_t ub_uid; + + struct ub_rate_info ub_limit_rl; + int ub_oom_noproc; + + struct page_private ppriv; +#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages +#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages +#define ub_swap_pages ppriv.ubp_swap_pages +#define ub_held_pages ppriv.ubp_held_pages + struct sock_private spriv; +#define ub_rmem_thres spriv.ubp_rmem_thres +#define ub_maxadvmss spriv.ubp_maxadvmss +#define ub_rmem_pressure spriv.ubp_rmem_pressure +#define ub_wmem_pressure spriv.ubp_wmem_pressure +#define ub_tcp_sk_list spriv.ubp_tcp_socks +#define ub_other_sk_list spriv.ubp_other_socks +#define ub_orphan_count spriv.ubp_orphan_count +#define ub_tw_count spriv.ubp_tw_count + struct ub_iopriv iopriv; + + struct user_beancounter *parent; + void *private_data; + unsigned long ub_aflags; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; +#endif + + /* resources statistic and settings */ + struct ubparm ub_parms[UB_RESOURCES]; + /* resources statistic for last interval */ + struct ubparm ub_store[UB_RESOURCES]; + + struct ub_percpu_struct *ub_percpu; +#ifdef CONFIG_BC_IO_ACCOUNTING + /* these are protected with pb_lock */ + unsigned long long bytes_wrote; + unsigned long long bytes_dirtied; + unsigned long long bytes_dirty_missed; + unsigned long io_pb_held; +#endif +#ifdef CONFIG_BC_DEBUG_KMEM + struct list_head ub_cclist; +#endif +}; + +enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE }; + +#define UB_AFLAG_NOTIF_PAGEIN 0 + +static inline +struct user_beancounter *top_beancounter(struct user_beancounter *ub) +{ + while (ub->parent != NULL) + ub = ub->parent; + return ub; +} + +static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) 
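+/* non-zero once consumption of the resource has crossed its barrier */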
+{ + return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; +} + +static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) +{ + return (ub->ub_parms[resource].held > + ((ub->ub_parms[resource].barrier) >> 1)); +} + +static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3); +} + +static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource) +{ + struct ubparm *p; + p = ub->ub_parms + resource; + return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024; +} + +#ifndef CONFIG_BEANCOUNTERS + +#define ub_percpu_add(ub, f, v) do { } while (0) +#define ub_percpu_sub(ub, f, v) do { } while (0) +#define ub_percpu_inc(ub, f) do { } while (0) +#define ub_percpu_dec(ub, f) do { } while (0) + +#define mm_ub(mm) (NULL) + +extern inline struct user_beancounter *get_beancounter_byuid + (uid_t uid, int create) { return NULL; } +extern inline struct user_beancounter *get_beancounter + (struct user_beancounter *ub) { return NULL; } +extern inline void put_beancounter(struct user_beancounter *ub) { } + +static inline void ub_init_late(void) { }; +static inline void ub_init_early(void) { }; + +static inline int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, + enum ub_severity strict) { return 0; } +static inline void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) { } + +#else /* CONFIG_BEANCOUNTERS */ + +#define ub_percpu_add(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1) + +#define ub_percpu_sub(ub, field, v) do { \ + per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v); \ + put_cpu(); \ + } while (0) +#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1) + +#define mm_ub(mm) ((mm)->mm_ub) +/* + * Charge/uncharge operations + */ + +extern int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict); + +extern void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val); + +extern void put_beancounter_safe(struct user_beancounter *ub); +extern void __put_beancounter(struct user_beancounter *ub); + +extern void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held); + +extern const char *ub_rnames[]; +/* + * Put a beancounter reference + */ + +static inline void put_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return; + + /* FIXME - optimize not to disable interrupts and make call */ + __put_beancounter(ub); +} + +/* fast put, refcount can't reach zero */ +static inline void __put_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_sub(n, &ub->ub_refcount); +} + +static inline void put_beancounter_batch(struct user_beancounter *ub, int n) +{ + if (n > 1) + __put_beancounter_batch(ub, n - 1); + __put_beancounter(ub); +} + +/* + * Create a new beancounter reference + */ +extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); + +static inline +struct user_beancounter *get_beancounter(struct user_beancounter *ub) +{ + if (unlikely(ub == NULL)) + return NULL; + + atomic_inc(&ub->ub_refcount); + return ub; +} + +static inline +struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub) +{ + return atomic_inc_not_zero(&ub->ub_refcount) 
? ub : NULL; +} + +static inline void get_beancounter_batch(struct user_beancounter *ub, int n) +{ + atomic_add(n, &ub->ub_refcount); +} + +extern struct user_beancounter *get_subbeancounter_byid( + struct user_beancounter *, + int id, int create); + +extern void ub_init_late(void); +extern void ub_init_early(void); + +extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size); + +/* + * Resource charging + * Change user's account and compare against limits + */ + +static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) +{ + if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) + ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; + if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) + ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; +} + +int charge_beancounter(struct user_beancounter *ub, int resource, + unsigned long val, enum ub_severity strict); +void uncharge_beancounter(struct user_beancounter *ub, int resource, + unsigned long val); +void __charge_beancounter_notop(struct user_beancounter *ub, int resource, + unsigned long val); +void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource, + unsigned long val); + +static inline void charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + if (ub->parent != NULL) + __charge_beancounter_notop(ub, resource, val); +} + +static inline void uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + if (ub->parent != NULL) + __uncharge_beancounter_notop(ub, resource, val); +} + +#endif /* CONFIG_BEANCOUNTERS */ + +#ifndef CONFIG_BC_RSS_ACCOUNTING +static inline void ub_ini_pbc(void) { } +#else +extern void ub_init_pbc(void); +#endif +#endif /* __KERNEL__ */ +#endif /* _LINUX_BEANCOUNTER_H */ diff -uprN linux-2.6.24/include/bc/dcache.h linux-2.6.24.ovz/include/bc/dcache.h --- linux-2.6.24/include/bc/dcache.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/dcache.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,47 @@ +/* + * include/bc/dcache.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DCACHE_H_ +#define __BC_DCACHE_H_ + +#include + +/* + * UB_DCACHESIZE accounting + */ + +struct dentry_beancounter +{ + /* + * d_inuse = + * + + * + * + * d_inuse == -1 means that dentry is unused + * state change -1 => 0 causes charge + * state change 0 => -1 causes uncharge + */ + atomic_t d_inuse; + /* charged size, including name length if name is not inline */ + unsigned long d_ubsize; + struct user_beancounter *d_ub; +}; + +#ifdef CONFIG_BEANCOUNTERS +#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) +#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) +#define INUSE_INIT 0 + +extern int ub_dentry_on; +#else +#define ub_dget_testone(d) (0) +#define ub_dput_testzero(d) (0) +#endif +#endif diff -uprN linux-2.6.24/include/bc/dcache_op.h linux-2.6.24.ovz/include/bc/dcache_op.h --- linux-2.6.24/include/bc/dcache_op.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/dcache_op.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,100 @@ +/* + * include/bc/dcache_op.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
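+ *
+ * Inline wrappers for UB_DCACHESIZE charging: the real work is done by
+ * the __ub_dentry_*() functions, the wrappers below only short-circuit
+ * when dentry accounting is off or nothing is pending.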
+ * + */ + +#ifndef __BC_DCACHE_OP_H_ +#define __BC_DCACHE_OP_H_ + +struct dentry; + +#ifdef CONFIG_BEANCOUNTERS + +#include +#include +#include + +extern int ub_dentry_alloc_barrier; +extern spinlock_t dcache_lock; + +static inline int ub_dentry_alloc(struct dentry *d) +{ + extern int __ub_dentry_alloc(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_alloc(d); +} + +static inline void ub_dentry_alloc_start(void) +{ + extern void __ub_dentry_alloc_start(void); + + if (ub_dentry_alloc_barrier) + __ub_dentry_alloc_start(); +} + +static inline void ub_dentry_alloc_end(void) +{ + extern void __ub_dentry_alloc_end(void); + + if (current->task_bc.dentry_alloc) + __ub_dentry_alloc_end(); +} + +static inline int ub_dentry_charge(struct dentry *d) +{ + extern int __ub_dentry_charge(struct dentry *); + + if (!ub_dentry_on) + return 0; + return __ub_dentry_charge(d); +} + +static inline void ub_dentry_charge_nofail(struct dentry *d) +{ + extern void __ub_dentry_charge_nofail(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_charge_nofail(d); +} + +static inline void ub_dentry_uncharge_locked(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + __ub_dentry_uncharge(d); +} + +static inline void ub_dentry_uncharge(struct dentry *d) +{ + extern void __ub_dentry_uncharge(struct dentry *); + + if (!ub_dentry_on) + return; + spin_lock(&dcache_lock); + __ub_dentry_uncharge(d); + spin_unlock(&dcache_lock); +} + +#else /* CONFIG_BEANCOUNTERS */ + +static inline int ub_dentry_alloc(struct dentry *d) { return 0; } +static inline void ub_dentry_alloc_start(void) { } +static inline void ub_dentry_alloc_end(void) { } +static inline int ub_dentry_charge(struct dentry *d) { return 0; } +static inline void ub_dentry_charge_nofail(struct dentry *d) { } +static inline void ub_dentry_uncharge_locked(struct dentry *d) { } +static inline void ub_dentry_uncharge(struct dentry *d) { } + +#endif /* CONFIG_BEANCOUNTERS */ + +#endif /* __dcache_op.h_ */ diff -uprN linux-2.6.24/include/bc/debug.h linux-2.6.24.ovz/include/bc/debug.h --- linux-2.6.24/include/bc/debug.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/debug.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,109 @@ +/* + * include/bc/debug.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DEBUG_H_ +#define __BC_DEBUG_H_ + +/* + * general debugging + */ + +#define UBD_ALLOC 0x1 +#define UBD_CHARGE 0x2 +#define UBD_LIMIT 0x4 +#define UBD_TRACE 0x8 + +/* + * ub_net debugging + */ + +#define UBD_NET_SOCKET 0x10 +#define UBD_NET_SLEEP 0x20 +#define UBD_NET_SEND 0x40 +#define UBD_NET_RECV 0x80 + +/* + * Main routines + */ + +#define UB_DEBUG (0) +#define DEBUG_RESOURCE (0ULL) + +#define ub_dbg_cond(__cond, __str, args...) \ + do { \ + if ((__cond) != 0) \ + printk(__str, ##args); \ + } while(0) + +#define ub_debug(__section, __str, args...) \ + ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) + +#define ub_debug_resource(__resource, __str, args...) 
\ + ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ + (DEBUG_RESOURCE & (1 << (__resource))), \ + __str, ##args) + +#if UB_DEBUG & UBD_TRACE +#define ub_debug_trace(__cond, __b, __r) \ + do { \ + static struct ub_rate_info ri = { __b, __r }; \ + if ((__cond) != 0 && ub_ratelimit(&ri)) \ + dump_stack(); \ + } while(0) +#else +#define ub_debug_trace(__cond, __burst, __rate) +#endif + +#ifdef CONFIG_BC_DEBUG_KMEM +#include + +struct user_beancounter; +struct ub_cache_counter { + struct list_head ulist; + struct ub_cache_counter *next; + struct user_beancounter *ub; + struct kmem_cache *cachep; + unsigned long counter; +}; + +extern spinlock_t cc_lock; +extern void init_cache_counters(void); +extern void ub_free_counters(struct user_beancounter *); +extern void ub_kmemcache_free(struct kmem_cache *cachep); + +struct vm_struct; +#define inc_vmalloc_charged(vm, flags) do { \ + if (flags & __GFP_UBC) \ + ub_percpu_add(get_exec_ub(), vmalloc_charged, \ + vm->nr_pages); \ + } while (0) +#define dec_vmalloc_charged(vm) do { \ + struct user_beancounter *ub; \ + ub = page_ub(vm->pages[0]); \ + if (ub != NULL) \ + ub_percpu_sub(ub, vmalloc_charged, \ + vm->nr_pages); \ + } while (0) + +#define inc_pbc_count(ub) ub_percpu_inc(ub, pbcs) +#define dec_pbc_count(ub) ub_percpu_dec(ub, pbcs) +#else +#define init_cache_counters() do { } while (0) +#define inc_vmalloc_charged(vm, f) do { } while (0) +#define dec_vmalloc_charged(vm) do { } while (0) + +#define inc_pbc_count(ub) do { } while (0) +#define dec_pbc_count(ub) do { } while (0) + +#define ub_free_counters(ub) do { } while (0) +#define ub_kmemcache_free(cachep) do { } while (0) +#endif + +#endif diff -uprN linux-2.6.24/include/bc/decl.h linux-2.6.24.ovz/include/bc/decl.h --- linux-2.6.24/include/bc/decl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/decl.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,41 @@ +/* + * include/bc/decl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_DECL_H_ +#define __BC_DECL_H_ + +#ifdef __KERNEL__ + +/* + * Naming convension: + * ub__ + */ + +#ifdef CONFIG_BEANCOUNTERS + +#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; +#define UB_DECLARE_VOID_FUNC(decl) extern void decl; + +#else /* CONFIG_BEANCOUNTERS */ + +#define UB_DECLARE_FUNC(ret_type, decl) \ + static inline ret_type decl \ + { \ + return (ret_type)0; \ + } +#define UB_DECLARE_VOID_FUNC(decl) \ + static inline void decl \ + { \ + } + +#endif /* CONFIG_BEANCOUNTERS */ +#endif + +#endif diff -uprN linux-2.6.24/include/bc/hash.h linux-2.6.24.ovz/include/bc/hash.h --- linux-2.6.24/include/bc/hash.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/hash.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,36 @@ +/* + * include/bc/hash.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
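+ *
+ * Beancounters sit both on a hash table (ub_hash, protected by
+ * ub_hash_lock) and on the global ub_list_head list that
+ * for_each_beancounter() walks under RCU.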
+ * + */ + +#ifndef _LINUX_UBHASH_H +#define _LINUX_UBHASH_H + +#ifdef __KERNEL__ + +#define UB_HASH_SIZE 256 + +extern struct hlist_head ub_hash[]; +extern spinlock_t ub_hash_lock; +extern struct list_head ub_list_head; + +#ifdef CONFIG_BEANCOUNTERS + +/* + * Iterate over beancounters + * @__ubp - beancounter ptr + * Can use break :) + */ +#define for_each_beancounter(__ubp) \ + list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list) \ + +#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash) + +#endif /* CONFIG_BEANCOUNTERS */ +#endif /* __KERNEL__ */ +#endif /* _LINUX_UBHASH_H */ diff -uprN linux-2.6.24/include/bc/io_acct.h linux-2.6.24.ovz/include/bc/io_acct.h --- linux-2.6.24/include/bc/io_acct.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/io_acct.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,113 @@ +/* + * include/bc/io_acct.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Pavel Emelianov + * + */ + +#ifndef __UB_IO_ACCT_H_ +#define __UB_IO_ACCT_H_ + +#ifdef CONFIG_BC_IO_ACCOUNTING +#include +#include + +#define page_iopb(page) ({ \ + struct page_beancounter *pb; \ + pb = page_pbc(page); \ + rmb(); \ + pb; \ + }) + +/* + * IO ub is required in task context only, so if exec_ub is set + * to NULL this means that uses doesn't need to charge some + * resources. nevertheless IO activity must be accounted, so we + * account it to current's task beancounter. + */ + +static inline struct user_beancounter *get_io_ub(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (unlikely(ub == NULL)) + ub = get_task_ub(current); + + return top_beancounter(ub); +} + +extern struct page_beancounter **page_pblist(struct page *); + +extern void ub_io_save_context(struct page *, size_t); +extern void ub_io_release_context(struct page *pg, size_t size); + +#define PAGE_IO_MARK (0x1UL) + +static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb) +{ + if (!((unsigned long)pb & PAGE_IO_MARK)) + return NULL; + + return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK); +} + +static inline void ub_io_account_read(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_read, bytes); +} + +static inline void ub_io_account_write(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_wrote, bytes); +} + +static inline void ub_io_account_dirty(struct page *page, size_t bytes) +{ + ub_io_save_context(page, bytes); +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ + ub_percpu_add(get_io_ub(), bytes_cancelled, bytes); +} + +void ub_init_io(struct kmem_cache *); +#else /* BC_IO_ACCOUNTING */ +#define page_iopb(page) (NULL) +#define page_pblist(page) (&page_pbc(page)) + +static inline void ub_io_release_context(struct page *pg, size_t bytes) +{ +} + +static inline void ub_io_account_dirty(struct page *p, size_t bytes) +{ +} + +static inline void ub_io_account_read(size_t bytes) +{ +} + +static inline void ub_io_account_write(size_t bytes) +{ +} + +static inline void ub_io_account_write_cancelled(size_t bytes) +{ +} + +static inline void ub_init_io(struct kmem_cache *pb_cachep) { }; +#endif + +#ifdef CONFIG_BC_DEBUG_IO +extern void ub_io_release_debug(struct page *pg); +#else +#define ub_io_release_debug(pg) do { } while (0) +#endif +#endif diff -uprN linux-2.6.24/include/bc/io_prio.h linux-2.6.24.ovz/include/bc/io_prio.h --- linux-2.6.24/include/bc/io_prio.h 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6.24.ovz/include/bc/io_prio.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,82 @@ +/* + * include/bc/io_prio.h + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Vasily Tarasov + * + */ + +#ifndef _UB_IO_PRIO_H +#define _UB_IO_PRIO_H + +#include +#include +#include + +#define UB_IOPRIO_MIN 0 +#define UB_IOPRIO_MAX IOPRIO_BE_NR +#define UB_IOPRIO_BASE 4 + +struct ub_iopriv { + struct list_head cfq_bc_head; + rwlock_t cfq_bc_list_lock; + + unsigned int ioprio; +}; + +struct cfq_data; +struct cfq_queue; + +#ifdef CONFIG_BC_IO_SCHED +extern void bc_init_ioprio(struct ub_iopriv *); +extern void bc_fini_ioprio(struct ub_iopriv *); +extern struct cfq_bc_data * bc_find_cfq_bc(struct ub_iopriv *, + struct cfq_data *); +extern struct cfq_bc_data * bc_findcreate_cfq_bc(struct ub_iopriv *, + struct cfq_data *, gfp_t gfp_mask); +extern void bc_cfq_exit_queue(struct cfq_data *); +extern int bc_expired(struct cfq_data *); +extern void bc_schedule_active(struct cfq_data *); +extern void bc_inc_rqnum(struct cfq_queue *); +extern void bc_dec_rqnum(struct cfq_queue *); +extern unsigned long bc_set_ioprio(int, int); +extern struct cfq_bc_data * +__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd); +extern struct user_beancounter *bc_io_switch_context(struct page *); +extern void bc_io_restore_context(struct user_beancounter *); +#else +#include +static inline void bc_init_ioprio(struct ub_iopriv *iopriv) { ; } +static inline void bc_fini_ioprio(struct ub_iopriv *iopriv) { ; } +static inline struct cfq_bc_data * +bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd, gfp_t mask) +{ + return &cfqd->cfq_bc; +} +static inline void bc_cfq_exit_queue(struct cfq_data *cfqd) { ; } +static inline int bc_expired(struct cfq_data *cfqd) { return 0; } +static inline void bc_schedule_active(struct cfq_data *cfqd) +{ + cfqd->active_cfq_bc = &cfqd->cfq_bc; +} +static inline void bc_inc_rqnum(struct cfq_queue *cfqq) { ; } +static inline void bc_dec_rqnum(struct cfq_queue *cfqq) { ; } +static inline unsigned long bc_set_ioprio(int ubid, int ioprio) +{ + return -EINVAL; +} +static inline struct cfq_bc_data * +__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd) +{ + return &cfqd->cfq_bc; +} +static inline struct user_beancounter * +bc_io_switch_context(struct page *page) { return NULL; } +static inline void bc_io_restore_context(struct user_beancounter *ub) { ; } +#endif /* CONFIG_BC_IO_SCHED */ +#endif /* _UB_IO_PRIO_H */ diff -uprN linux-2.6.24/include/bc/kmem.h linux-2.6.24.ovz/include/bc/kmem.h --- linux-2.6.24/include/bc/kmem.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/kmem.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,69 @@ +/* + * include/bc/kmem.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
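+ *
+ * Declarations for UB_KMEMSIZE accounting: pages and slab objects are
+ * charged to a beancounter according to the cache's SLAB_UBC and
+ * SLAB_NO_CHARGE flags and the __GFP_UBC allocation flag (see
+ * should_charge() below).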
+ * + */ + +#ifndef __UB_SLAB_H_ +#define __UB_SLAB_H_ + +#include +#include + +/* + * UB_KMEMSIZE accounting + */ + +#ifdef CONFIG_BC_DEBUG_ITEMS +#define CHARGE_ORDER(__o) (1 << (__o)) +#define CHARGE_SIZE(__s) 1 +#else +#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) +#define CHARGE_SIZE(__s) (__s) +#endif + +#ifdef CONFIG_BEANCOUNTERS +#define page_ub(__page) ((__page)->bc.page_ub) +#else +#define page_ub(__page) NULL +#endif + +struct mm_struct; +struct page; +struct kmem_cache; + +UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) +UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) + +UB_DECLARE_FUNC(int, ub_kmemsize_charge(struct user_beancounter *ub, + unsigned long size, enum ub_severity strict)) +UB_DECLARE_VOID_FUNC(ub_kmemsize_uncharge(struct user_beancounter *ub, + unsigned long size)) + +UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask)) +UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) +UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep, + void *objp, gfp_t flags)) +UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj)) + +#ifdef CONFIG_BEANCOUNTERS +static inline int should_charge(struct kmem_cache *cachep, gfp_t flags) +{ + if (!(cachep->flags & SLAB_UBC)) + return 0; + if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) + return 0; + return 1; +} + +#define should_uncharge(cachep) should_charge(cachep, __GFP_UBC) +#else +#define should_charge(cache, f) 0 +#define should_uncharge(cache) 0 +#endif + +#endif /* __UB_SLAB_H_ */ diff -uprN linux-2.6.24/include/bc/misc.h linux-2.6.24.ovz/include/bc/misc.h --- linux-2.6.24/include/bc/misc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/misc.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,55 @@ +/* + * include/bc/misc.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_MISC_H_ +#define __BC_MISC_H_ + +#include + +struct tty_struct; +struct file; +struct file_lock; +struct sigqueue; + +UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) +UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) +UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) +UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) +UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, + struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) +UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, + struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) +UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task)) +UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) +UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) + +#ifdef CONFIG_BEANCOUNTERS +#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) +#define unset_flock_charged(fl) do { \ + WARN_ON((fl)->fl_charged == 0); \ + (fl)->fl_charged = 0; \ + } while (0) +#define set_mm_ub(mm, tsk) do { \ + (mm)->mm_ub = get_beancounter(tsk ? 
\ + tsk->task_bc.task_ub : get_exec_ub()); \ + } while (0) +#define put_mm_ub(mm) do { \ + put_beancounter((mm)->mm_ub); \ + (mm)->mm_ub = NULL; \ + } while (0) +#else +#define set_flock_charged(fl) do { } while (0) +#define unset_flock_charged(fl) do { } while (0) +#define set_mm_ub(mm, tsk) do { } while (0) +#define put_mm_ub(mm) do { } while (0) +#endif +#endif diff -uprN linux-2.6.24/include/bc/net.h linux-2.6.24.ovz/include/bc/net.h --- linux-2.6.24/include/bc/net.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/net.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,215 @@ +/* + * include/bc/net.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_NET_H_ +#define __BC_NET_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include +#include + +#define bid2sid(__bufid) \ + ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK) + +#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ + ~(SMP_CACHE_BYTES-1))) +#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) + +static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask) +{ +#ifdef CONFIG_BEANCOUNTERS + memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); +#endif + return 0; +} + +static inline void ub_skb_free_bc(struct sk_buff *skb) +{ +} + +#define IS_TCP_SOCK(__family, __type) \ + (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM) + +/* number of sockets */ +UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) +UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) +UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) + +/* management of queue for send space */ +UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) + +/* send space */ +UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) +UB_DECLARE_VOID_FUNC(ub_sock_tcp_unchargesend(struct sock *sk, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) +UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) + +UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) + +/* receive space */ +UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) +UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk, + struct sk_buff *skb, enum ub_severity strict)) + +/* skb destructor */ +UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) + +static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); +} + +static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) +{ + return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, + unsigned long size)) + +static inline int ub_sock_getwres_tcp(struct sock 
*sk, unsigned long size) +{ + return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); +} + +UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize)) + +static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, + unsigned long ressize) +{ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); +} + +static inline void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) +{ + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); +} + +static inline void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) +{ + ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); +} + +static inline int ub_tcpsndbuf_charge(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_HARD); +} + +static inline int ub_tcpsndbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargesend(sk, skb, UB_FORCE); +} + +static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT); +} + +static inline int ub_tcprcvbuf_charge_forced(struct sock *sk, + struct sk_buff *skb) +{ + return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE); +} + +/* Charge size */ +static inline unsigned long skb_charge_datalen(unsigned long chargesize) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned long slabsize; + + chargesize -= sizeof(struct sk_buff); + slabsize = 64; + do { + slabsize <<= 1; + } while (slabsize <= chargesize); + + slabsize >>= 1; + return (slabsize - sizeof(struct skb_shared_info)) & + ~(SMP_CACHE_BYTES-1); +#else + return 0; +#endif +} + +static inline unsigned long skb_charge_size_gen(unsigned long size) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned int slabsize; + + size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); + slabsize = 32; /* min size is 64 because of skb_shared_info */ + do { + slabsize <<= 1; + } while (slabsize < size); + + return slabsize + sizeof(struct sk_buff); +#else + return 0; +#endif + +} + +static inline unsigned long skb_charge_size_const(unsigned long size) +{ +#ifdef CONFIG_BEANCOUNTERS + unsigned int ret; + if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) + ret = 64 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) + ret = 128 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) + ret = 256 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) + ret = 512 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) + ret = 1024 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) + ret = 2048 + sizeof(struct sk_buff); + else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) + ret = 4096 + sizeof(struct sk_buff); + else + ret = skb_charge_size_gen(size); + return ret; +#else + return 0; +#endif +} + + +#define skb_charge_size(__size) \ + (__builtin_constant_p(__size) ? 
\ + skb_charge_size_const(__size) : \ + skb_charge_size_gen(__size)) + +UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) +UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, + struct sock *sk, unsigned long size, int res)) + +#endif diff -uprN linux-2.6.24/include/bc/oom_kill.h linux-2.6.24.ovz/include/bc/oom_kill.h --- linux-2.6.24/include/bc/oom_kill.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/oom_kill.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,26 @@ +#include +#include + +UB_DECLARE_FUNC(int, ub_oom_lock(void)) +UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void)) +UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_unlock(void)) +UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub)) +UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk)) +UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub, + struct task_struct *tsk)) + +#ifdef CONFIG_BEANCOUNTERS +extern int oom_generation; +extern int oom_kill_counter; +#define ub_oom_start() do { \ + current->task_bc.oom_generation = oom_generation; \ + } while (0) +#define ub_oom_task_killed(p) do { \ + oom_kill_counter++; \ + wake_up_process(p); \ + } while (0) +#else +#define ub_oom_start() do { } while (0) +#define ub_oom_task_killed(p) do { } while (0) +#endif diff -uprN linux-2.6.24/include/bc/proc.h linux-2.6.24.ovz/include/bc/proc.h --- linux-2.6.24/include/bc/proc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/proc.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,40 @@ +/* + * include/bc/proc.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PROC_H_ +#define __UB_PROC_H_ + +#include + +struct bc_proc_entry { + char *name; + union { + int (*show)(struct seq_file *, void *); + struct file_operations *fops; + } u; + struct bc_proc_entry *next; + int cookie; +}; + +struct user_beancounter; + +void bc_register_proc_entry(struct bc_proc_entry *); +void bc_register_proc_root_entry(struct bc_proc_entry *); + +static inline struct user_beancounter *seq_beancounter(struct seq_file *f) +{ + return (struct user_beancounter *)(f->private); +} + +extern const char *bc_proc_lu_fmt; +extern const char *bc_proc_lu_lfmt; +extern const char *bc_proc_llu_fmt; +extern const char *bc_proc_lu_lu_fmt; +#endif diff -uprN linux-2.6.24/include/bc/rss_pages.h linux-2.6.24.ovz/include/bc/rss_pages.h --- linux-2.6.24/include/bc/rss_pages.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/rss_pages.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,57 @@ +/* + * include/bc/rss_pages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
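The skb_charge_size_gen()/skb_charge_size_const() helpers from bc/net.h above charge the kmalloc slab that will actually back the skb data, not the requested length. A minimal user-space sketch of that rounding follows; SKB_ALIGN, SHARED_INFO and SKB_HEAD are invented stand-ins for SKB_DATA_ALIGN(), sizeof(struct skb_shared_info) and sizeof(struct sk_buff), whose real values depend on the architecture. It is an illustration only, not part of the patch.

#include <stdio.h>

#define SKB_ALIGN(x)	(((x) + 31UL) & ~31UL)	/* stand-in for SKB_DATA_ALIGN() */
#define SHARED_INFO	192UL			/* assumed sizeof(struct skb_shared_info) */
#define SKB_HEAD	256UL			/* assumed sizeof(struct sk_buff) */

/* mirrors skb_charge_size_gen(): round up to the next kmalloc slab size */
static unsigned long charge_size(unsigned long size)
{
	unsigned long slab = 32;

	size = SKB_ALIGN(size) + SHARED_INFO;
	do {
		slab <<= 1;		/* 64, 128, 256, ... */
	} while (slab < size);

	return slab + SKB_HEAD;		/* amount charged to UB_TCPSNDBUF etc. */
}

int main(void)
{
	/* with these stand-in sizes a 1000-byte request lands in a 2048-byte slab */
	printf("charge for 1000 bytes: %lu\n", charge_size(1000));
	return 0;
}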
+ * + */ + +#ifndef __RSS_PAGES_H_ +#define __RSS_PAGES_H_ + +/* + * Page_beancounters + */ + +struct page; +struct user_beancounter; + +#define PB_MAGIC 0x62700001UL + +struct page_beancounter { + unsigned long pb_magic; + struct page *page; + struct user_beancounter *ub; + union { + struct page_beancounter *next_hash; + struct page_beancounter *page_pb_list; + }; + union { + unsigned refcount; + unsigned io_debug; + }; + union { + struct list_head page_list; + struct list_head io_list; + }; +}; + +#define PB_REFCOUNT_BITS 24 +#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) +#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) +#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) +#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) +#define PB_COUNT_INC(c) ((c)++) +#define PB_COUNT_DEC(c) ((c)--) +#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) + +#define page_pbc(__page) ((__page)->bc.page_pb) + +extern spinlock_t pb_lock; + +struct address_space; +extern int is_shmem_mapping(struct address_space *); + +#endif diff -uprN linux-2.6.24/include/bc/sock.h linux-2.6.24.ovz/include/bc/sock.h --- linux-2.6.24/include/bc/sock.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/sock.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,47 @@ +/* + * include/bc/sock.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_SOCK_H_ +#define __BC_SOCK_H_ + +#include + +struct sock; +struct sk_buff; + +struct skb_beancounter { + struct user_beancounter *ub; + unsigned long charged:27, resource:5; +}; + +struct sock_beancounter { + struct user_beancounter *ub; + /* + * poll_reserv accounts space already charged for future sends. + * It is required to make poll agree with sendmsg. + * Additionally, it makes real charges (with taking bc spinlock) + * in the send path rarer, speeding networking up. + * For TCP (only): changes are protected by socket lock (not bc!) + * For all proto: may be read without serialization in poll. + */ + unsigned long poll_reserv; + unsigned long forw_space; + /* fields below are protected by bc spinlock */ + unsigned long ub_waitspc; /* space waiting for */ + unsigned long ub_wcharged; + struct list_head ub_sock_list; +}; + +#define sock_bc(__sk) (&(__sk)->sk_bc) +#define skb_bc(__skb) (&(__skb)->skb_bc) +#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) +#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) + +#endif diff -uprN linux-2.6.24/include/bc/sock_orphan.h linux-2.6.24.ovz/include/bc/sock_orphan.h --- linux-2.6.24/include/bc/sock_orphan.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/sock_orphan.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,106 @@ +/* + * include/bc/sock_orphan.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
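The PB_* macros in bc/rss_pages.h above pack two fields into the page_beancounter refcount word: a reference count in the low PB_REFCOUNT_BITS bits and a shift value in the bits above them. The standalone program below restates those macros to show the packing round-trip; it is an illustration, not part of the patch.

#include <assert.h>
#include <stdio.h>

#define PB_REFCOUNT_BITS	24
#define PB_SHIFT_GET(c)		((c) >> PB_REFCOUNT_BITS)
#define PB_SHIFT_INC(c)		((c) += (1 << PB_REFCOUNT_BITS))
#define PB_COUNT_GET(c)		((c) & ((1 << PB_REFCOUNT_BITS) - 1))
#define PB_COUNT_INC(c)		((c)++)
#define PB_REFCOUNT_MAKE(s, c)	(((s) << PB_REFCOUNT_BITS) + (c))

int main(void)
{
	unsigned refcount = PB_REFCOUNT_MAKE(2, 5);	/* shift 2, count 5 */

	PB_COUNT_INC(refcount);			/* one more reference */
	PB_SHIFT_INC(refcount);			/* one more halving step */

	assert(PB_SHIFT_GET(refcount) == 3);
	assert(PB_COUNT_GET(refcount) == 6);
	printf("shift=%u count=%u\n", PB_SHIFT_GET(refcount), PB_COUNT_GET(refcount));
	return 0;
}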
+ * + */ + +#ifndef __BC_SOCK_ORPHAN_H_ +#define __BC_SOCK_ORPHAN_H_ + +#include + +#include "bc/beancounter.h" +#include "bc/net.h" + + +static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return &sock_bc(sk)->ub->ub_orphan_count; +#endif + return sk->sk_prot->orphan_count; +} + +static inline void ub_inc_orphan_count(struct sock *sk) +{ + atomic_inc(__ub_get_orphan_count_ptr(sk)); +} + +static inline void ub_dec_orphan_count(struct sock *sk) +{ + atomic_dec(__ub_get_orphan_count_ptr(sk)); +} + +static inline int ub_get_orphan_count(struct sock *sk) +{ + return atomic_read(__ub_get_orphan_count_ptr(sk)); +} + +extern int __ub_too_many_orphans(struct sock *sk, int count); +static inline int ub_too_many_orphans(struct sock *sk, int count) +{ +#ifdef CONFIG_BEANCOUNTERS + if (__ub_too_many_orphans(sk, count)) + return 1; +#endif + return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); +} + +#include + +struct inet_timewait_sock; + +static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + + ub = slab_ub(tw); + if (ub != NULL) + ub->ub_tw_count += incdec; +#endif +} + +static inline int __ub_timewait_check(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + unsigned long mem_max, mem; + int tw_count; + + ub = sock_bc(sk)->ub; + if (ub == NULL) + return 1; + + tw_count = ub->ub_tw_count; + mem_max = sysctl_tcp_max_tw_kmem_fraction * + ((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1); + mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab); + mem *= tw_count; + return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max; +#else + return 1; +#endif +} + +#define ub_timewait_inc(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, 1); \ + } while (0) + +#define ub_timewait_dec(tw, twdr) do { \ + if ((twdr)->ub_managed) \ + ub_timewait_mod(tw, -1); \ + } while (0) + +#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \ + __ub_timewait_check(sk)) + +#endif diff -uprN linux-2.6.24/include/bc/statd.h linux-2.6.24.ovz/include/bc/statd.h --- linux-2.6.24/include/bc/statd.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/statd.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,70 @@ +/* + * include/bc/statd.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
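__ub_timewait_check() in bc/sock_orphan.h above lets a beancounter keep TIME-WAIT sockets only while both their count and their estimated slab usage stay under per-beancounter ceilings derived from UB_KMEMSIZE. The sketch below restates that arithmetic in user space; every number in it is invented, and kmem_cache_objuse() is replaced by a plain per-object size argument.

#include <stdio.h>

static int tw_allowed(unsigned long kmem_limit,	/* UB_KMEMSIZE limit, bytes */
		      unsigned long tw_obj_size,	/* slab usage per tw socket */
		      unsigned long tw_count,
		      unsigned long max_tw_buckets,	/* sysctl_tcp_max_tw_buckets_ub */
		      unsigned long kmem_fraction)	/* sysctl_tcp_max_tw_kmem_fraction */
{
	/* same shape as __ub_timewait_check() above */
	unsigned long mem_max = kmem_fraction * ((kmem_limit >> 10) + 1);
	unsigned long mem = tw_obj_size * tw_count;

	return tw_count < max_tw_buckets && mem < mem_max;
}

int main(void)
{
	/* e.g. 64 MB kmemsize limit, 256-byte tw objects, made-up sysctl values */
	printf("%d\n", tw_allowed(64UL << 20, 256, 20000, 262144, 384));
	return 0;
}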
+ * + */ + +#ifndef __BC_STATD_H_ +#define __BC_STATD_H_ + +/* sys_ubstat commands list */ +#define UBSTAT_READ_ONE 0x010000 +#define UBSTAT_READ_ALL 0x020000 +#define UBSTAT_READ_FULL 0x030000 +#define UBSTAT_UBLIST 0x040000 +#define UBSTAT_UBPARMNUM 0x050000 +#define UBSTAT_GETTIME 0x060000 + +#define UBSTAT_CMD(func) ((func) & 0xF0000) +#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) + +#define TIME_MAX_SEC (LONG_MAX / HZ) +#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) + +typedef unsigned long ubstattime_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstattime_t cur_time; +} ubgettime_t; + +typedef struct { + long maxinterval; + int signum; +} ubnotifrq_t; + +typedef struct { + unsigned long maxheld; + unsigned long failcnt; +} ubstatparm_t; + +typedef struct { + unsigned long barrier; + unsigned long limit; + unsigned long held; + unsigned long maxheld; + unsigned long minheld; + unsigned long failcnt; + unsigned long __unused1; + unsigned long __unused2; +} ubstatparmf_t; + +typedef struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[0]; +} ubstatfull_t; + +#ifdef __KERNEL__ +struct ub_stat_notify { + struct list_head list; + struct task_struct *task; + int signum; +}; +#endif +#endif diff -uprN linux-2.6.24/include/bc/task.h linux-2.6.24.ovz/include/bc/task.h --- linux-2.6.24/include/bc/task.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/task.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,69 @@ +/* + * include/bc/task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __BC_TASK_H_ +#define __BC_TASK_H_ + +struct user_beancounter; + + +#ifdef CONFIG_BEANCOUNTERS +struct task_beancounter { + struct user_beancounter *exec_ub; + struct user_beancounter *saved_ub; + struct user_beancounter *task_ub; + struct user_beancounter *fork_sub; + unsigned long file_precharged, file_quant, file_count; + unsigned long kmem_precharged; + char dentry_alloc, pgfault_handle; + void *task_fnode, *task_freserv; + unsigned long oom_generation; + unsigned long task_data[4]; + unsigned long pgfault_allot; +}; + +#define get_task_ub(__task) ((__task)->task_bc.task_ub) + +extern struct user_beancounter ub0; +#define get_ub0() (&ub0) + +#define ub_save_context(t) do { \ + t->task_bc.saved_ub = t->task_bc.exec_ub; \ + t->task_bc.exec_ub = get_ub0(); \ + } while (0) +#define ub_restore_context(t) do { \ + t->task_bc.exec_ub = t->task_bc.saved_ub; \ + } while (0) + +#define get_exec_ub() (current->task_bc.exec_ub) +#define set_exec_ub(__newub) \ +({ \ + struct user_beancounter *old; \ + struct task_beancounter *tbc; \ + \ + tbc = ¤t->task_bc; \ + old = tbc->exec_ub; \ + tbc->exec_ub = __newub; \ + old; \ +}) + +void ub_init_task_bc(struct task_beancounter *); + +#else /* CONFIG_BEANCOUNTERS */ + +#define get_ub0() (NULL) +#define get_exec_ub() (NULL) +#define get_task_ub(task) (NULL) +#define set_exec_ub(__ub) (NULL) +#define ub_save_context(t) do { } while (0) +#define ub_restore_context(t) do { } while (0) + +#endif /* CONFIG_BEANCOUNTERS */ +#endif /* __task.h_ */ diff -uprN linux-2.6.24/include/bc/tcp.h linux-2.6.24.ovz/include/bc/tcp.h --- linux-2.6.24/include/bc/tcp.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/tcp.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,76 @@ +/* + * include/bc/tcp.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
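The UBSTAT_* constants above encode a sys_ubstat request in a single word: the command occupies bits 16 and up, the resource parameter id the low 16 bits, and UBSTAT_CMD()/UBSTAT_PARMID() split them back apart. A small standalone self-check of that encoding, not part of the patch:

#include <assert.h>

#define UBSTAT_READ_ONE		0x010000
#define UBSTAT_CMD(func)	((func) & 0xF0000)
#define UBSTAT_PARMID(func)	((func) & 0x0FFFF)

int main(void)
{
	unsigned int parm = 7;				/* some resource index */
	unsigned int func = UBSTAT_READ_ONE | parm;	/* 0x010007 */

	assert(UBSTAT_CMD(func) == UBSTAT_READ_ONE);
	assert(UBSTAT_PARMID(func) == 7);
	return 0;
}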
+ * + */ + +#ifndef __BC_TCP_H_ +#define __BC_TCP_H_ + +/* + * UB_NUMXXXSOCK, UB_XXXBUF accounting + */ + +#include +#include + +static inline void ub_tcp_update_maxadvmss(struct sock *sk) +{ +#ifdef CONFIG_BEANCOUNTERS + if (!sock_has_ubc(sk)) + return; + if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) + return; + + sock_bc(sk)->ub->ub_maxadvmss = + skb_charge_size(MAX_HEADER + sizeof(struct iphdr) + + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); +#endif +} + +static inline int ub_tcp_rmem_allows_expand(struct sock *sk) +{ + if (tcp_memory_pressure) + return 0; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) { + struct user_beancounter *ub; + + ub = sock_bc(sk)->ub; + if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) + return 1; + if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) + return 0; + return sk->sk_rcvbuf <= ub->ub_rmem_thres; + } +#endif + return 1; +} + +static inline int ub_tcp_memory_pressure(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; +#endif + return 0; +} + +static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) +{ + if (tcp_memory_pressure) + return 1; +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; +#endif + return 0; +} + +#endif diff -uprN linux-2.6.24/include/bc/vmpages.h linux-2.6.24.ovz/include/bc/vmpages.h --- linux-2.6.24/include/bc/vmpages.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/bc/vmpages.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,152 @@ +/* + * include/bc/vmpages.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __UB_PAGES_H_ +#define __UB_PAGES_H_ + +#include +#include +#include + +/* + * Check whether vma has private or copy-on-write mapping. + * Should match checks in ub_protected_charge(). + */ +#define VM_UB_PRIVATE(__flags, __file) \ + ( ((__flags) & VM_WRITE) ? 
\ + (__file) == NULL || !((__flags) & VM_SHARED) : \ + 0 \ + ) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, + unsigned long size, + unsigned long newflags, + struct vm_area_struct *vma)) + +UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) +UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long num)) +#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) + +UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, + long sz)) + +UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file, + int strict)) +UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, + unsigned long size, + unsigned vm_flags, + struct file *vm_file)) + +struct shmem_inode_info; +UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, + unsigned long sz)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) +UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size)) +#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) + +#ifdef CONFIG_BEANCOUNTERS +#define shmi_ub_set(shi, ub) do { \ + (shi)->shmi_ub = get_beancounter(ub); \ + } while (0) +#define shmi_ub_put(shi) do { \ + put_beancounter((shi)->shmi_ub); \ + (shi)->shmi_ub = NULL; \ + } while (0) +#else +#define shmi_ub_set(shi, ub) do { } while (0) +#define shmi_ub_put(shi) do { } while (0) +#endif + +UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, + unsigned long size)) +UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, + unsigned long size)) +UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, + unsigned long size)) + +UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end)) +#define pages_in_vma(vma) (pages_in_vma_range(vma, \ + vma->vm_start, vma->vm_end)) + +#define UB_PAGE_WEIGHT_SHIFT 24 +#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) + +struct page_beancounter; +#define PBC_COPY_SAME ((struct page_beancounter *) 1) + +/* Mprotect charging result */ +#define PRIVVM_ERROR -1 +#define PRIVVM_NO_CHARGE 0 +#define PRIVVM_TO_PRIVATE 1 +#define PRIVVM_TO_SHARED 2 + +extern void fastcall __ub_update_physpages(struct user_beancounter *ub); +extern void fastcall __ub_update_oomguarpages(struct user_beancounter *ub); +extern void fastcall __ub_update_privvm(struct user_beancounter *ub); + +#ifdef CONFIG_BC_RSS_ACCOUNTING +#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) +PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) +PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) 
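PB_DECLARE_FUNC()/PB_DECLARE_VOID_FUNC() above follow the same pattern as the UB_DECLARE_* macros: with the config option enabled they emit real prototypes, and with it disabled they collapse into inline stubs returning 0, so call sites never need #ifdef CONFIG_BC_RSS_ACCOUNTING. The toy program below shows the same trick with a made-up DEMO_DECLARE_FUNC macro and hook name; it is not part of the patch.

#include <stdio.h>

/* flip this on to get extern prototypes instead of inline stubs */
/* #define CONFIG_DEMO_ACCOUNTING */

#ifdef CONFIG_DEMO_ACCOUNTING
#define DEMO_DECLARE_FUNC(ret, decl)	extern ret decl;
#else
#define DEMO_DECLARE_FUNC(ret, decl)	static inline ret decl { return (ret)0; }
#endif

DEMO_DECLARE_FUNC(int, demo_memory_charge(unsigned long size))

int main(void)
{
	/* with the option off this call is an inline no-op that always succeeds */
	if (demo_memory_charge(4096))
		fprintf(stderr, "charge failed\n");
	else
		printf("charged (or accounting compiled out)\n");
	return 0;
}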
+PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, + struct mm_struct *mm, + struct page_beancounter **pbc)) +PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) +PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, + struct mm_struct *mm)) + +PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) +#endif + +#ifdef CONFIG_BC_SWAP_ACCOUNTING +#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) +#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) +#else +#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} +#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } +#endif + +struct swap_info_struct; +SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) +SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, + struct user_beancounter *ub)) +SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) diff -uprN linux-2.6.24/include/linux/aio.h linux-2.6.24.ovz/include/linux/aio.h --- linux-2.6.24/include/linux/aio.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/aio.h 2008-03-25 18:53:59.000000000 -0500 @@ -245,4 +245,8 @@ static inline struct kiocb *list_kiocb(s extern unsigned long aio_nr; extern unsigned long aio_max_nr; +void wait_for_all_aios(struct kioctx *ctx); +extern struct kmem_cache *kioctx_cachep; +extern void aio_kick_handler(struct work_struct *); + #endif /* __LINUX__AIO_H */ diff -uprN linux-2.6.24/include/linux/auto_fs.h linux-2.6.24.ovz/include/linux/auto_fs.h --- linux-2.6.24/include/linux/auto_fs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/auto_fs.h 2008-03-25 18:53:59.000000000 -0500 @@ -51,6 +51,8 @@ typedef unsigned int autofs_wqt_t; typedef unsigned long autofs_wqt_t; #endif +typedef __u32 autofs_wqt_t_32bit; + /* Packet types */ #define autofs_ptype_missing 0 /* Missing entry (mount request) */ #define autofs_ptype_expire 1 /* Expire entry (umount request) */ @@ -67,6 +69,13 @@ struct autofs_packet_missing { char name[NAME_MAX+1]; }; +struct autofs_packet_missing_32bit { + struct autofs_packet_hdr hdr; + autofs_wqt_t_32bit wait_queue_token; + int len; + char name[NAME_MAX+1]; +} __attribute__ ((__packed__)); + /* v3 expire (via ioctl) */ struct autofs_packet_expire { struct autofs_packet_hdr hdr; @@ -74,6 +83,13 @@ struct autofs_packet_expire { char name[NAME_MAX+1]; }; +/* v3 expire (via ioctl) for 32 bit userspace daemon and x68_64 kernel */ +struct autofs_packet_expire_32bit { + struct autofs_packet_hdr hdr; + int len; + char name[NAME_MAX+1]; +} __attribute__ ((__packed__)); + #define AUTOFS_IOC_READY _IO(0x93,0x60) #define AUTOFS_IOC_FAIL _IO(0x93,0x61) #define AUTOFS_IOC_CATATONIC _IO(0x93,0x62) diff -uprN linux-2.6.24/include/linux/auto_fs4.h linux-2.6.24.ovz/include/linux/auto_fs4.h --- linux-2.6.24/include/linux/auto_fs4.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/auto_fs4.h 2008-03-25 18:53:59.000000000 -0500 @@ -59,11 +59,22 @@ struct autofs_packet_expire_multi { char name[NAME_MAX+1]; }; +/* v4 multi expire (via pipe) for 32 bit userspace daemon and x68_64 kernel */ +struct autofs_packet_expire_multi_32bit { + struct autofs_packet_hdr hdr; + autofs_wqt_t_32bit wait_queue_token; + int len; + 
char name[NAME_MAX+1]; +} __attribute__ ((__packed__)); + union autofs_packet_union { struct autofs_packet_hdr hdr; struct autofs_packet_missing missing; + struct autofs_packet_missing_32bit missing_32bit; struct autofs_packet_expire expire; + struct autofs_packet_expire_32bit expire_32bit; struct autofs_packet_expire_multi expire_multi; + struct autofs_packet_expire_multi_32bit expire_multi_32bit; }; /* autofs v5 common packet struct */ @@ -80,6 +91,20 @@ struct autofs_v5_packet { char name[NAME_MAX+1]; }; +/* autofs v5 packet struct for 32 bit userspace daemon and x68_64 kernel*/ +struct autofs_v5_packet_32bit { + struct autofs_packet_hdr hdr; + autofs_wqt_t_32bit wait_queue_token; + __u32 dev; + __u64 ino; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 tgid; + __u32 len; + char name[NAME_MAX+1]; +} __attribute__ ((__packed__)); + typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; typedef struct autofs_v5_packet autofs_packet_missing_direct_t; @@ -88,6 +113,7 @@ typedef struct autofs_v5_packet autofs_p union autofs_v5_packet_union { struct autofs_packet_hdr hdr; struct autofs_v5_packet v5_packet; + struct autofs_v5_packet_32bit v5_packet_32bit; autofs_packet_missing_indirect_t missing_indirect; autofs_packet_expire_indirect_t expire_indirect; autofs_packet_missing_direct_t missing_direct; diff -uprN linux-2.6.24/include/linux/capability.h linux-2.6.24.ovz/include/linux/capability.h --- linux-2.6.24/include/linux/capability.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/capability.h 2008-03-25 18:53:59.000000000 -0500 @@ -61,6 +61,7 @@ struct vfs_cap_data { }; #ifdef __KERNEL__ +#include /* #define STRICT_CAP_T_TYPECHECKS */ @@ -163,12 +164,9 @@ typedef __u32 kernel_cap_t; #define CAP_NET_BROADCAST 11 -/* Allow interface configuration */ /* Allow administration of IP firewall, masquerading and accounting */ /* Allow setting debug option on sockets */ /* Allow modification of routing tables */ -/* Allow setting arbitrary process / process group ownership on - sockets */ /* Allow binding to any address for transparent proxying */ /* Allow setting TOS (type of service) */ /* Allow setting promiscuous mode */ @@ -199,6 +197,7 @@ typedef __u32 kernel_cap_t; #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ +/* Allow O_DIRECT access */ /* Allow sending USB messages to any device via /proc/bus/usb */ #define CAP_SYS_RAWIO 17 @@ -217,24 +216,19 @@ typedef __u32 kernel_cap_t; /* Allow configuration of the secure attention key */ /* Allow administration of the random device */ -/* Allow examination and configuration of disk quotas */ /* Allow configuring the kernel's syslog (printk behaviour) */ /* Allow setting the domainname */ /* Allow setting the hostname */ /* Allow calling bdflush() */ -/* Allow mount() and umount(), setting up new smb connection */ +/* Allow setting up new smb connection */ /* Allow some autofs root ioctls */ /* Allow nfsservctl */ /* Allow VM86_REQUEST_IRQ */ /* Allow to read/write pci config on alpha */ /* Allow irix_prctl on mips (setstacksize) */ /* Allow flushing all cache on m68k (sys_cacheflush) */ -/* Allow removing semaphores */ -/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores - and shared memory */ /* Allow locking/unlocking of shared memory segment */ /* Allow turning swap on/off */ -/* Allow forged pids on socket credentials passing */ /* Allow setting readahead and flushing buffers on block devices */ /* Allow setting geometry 
in floppy driver */ /* Allow turning DMA on/off in xd driver */ @@ -252,6 +246,8 @@ typedef __u32 kernel_cap_t; arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ +/* Modify data journaling mode on ext[34] filesystem (uses journaling + resources) */ #define CAP_SYS_ADMIN 21 @@ -271,7 +267,7 @@ typedef __u32 kernel_cap_t; /* Override resource limits. Set resource limits. */ /* Override quota limits. */ /* Override reserved space on ext2 filesystem */ -/* Modify data journaling mode on ext3 filesystem (uses journaling +/* Modify data journaling mode on ext[34] filesystem (uses journaling resources) */ /* NOTE: ext2 honors fsuid when checking for resource overrides, so you can override using fsuid too */ @@ -307,8 +303,59 @@ typedef __u32 kernel_cap_t; #define CAP_SETFCAP 31 +/* + * Important note: VZ capabilities do intersect with CAP_AUDIT + * this is due to compatibility reasons. Nothing bad. + * Both VZ and Audit/SELinux caps are disabled in VPSs. + */ + +/* Allow access to all information. In the other case some structures will be + hiding to ensure different Virtual Environment non-interaction on the same + node */ +#define CAP_SETVEID 29 + +#define CAP_VE_ADMIN 30 + #ifdef __KERNEL__ +#ifdef CONFIG_VE + +/* Replacement for CAP_NET_ADMIN: + delegated rights to the Virtual environment of its network administration. + For now the following rights have been delegated: + + Allow setting arbitrary process / process group ownership on sockets + Allow interface configuration + */ +#define CAP_VE_NET_ADMIN CAP_VE_ADMIN + +/* Replacement for CAP_SYS_ADMIN: + delegated rights to the Virtual environment of its administration. + For now the following rights have been delegated: + */ +/* Allow mount/umount/remount */ +/* Allow examination and configuration of disk quotas */ +/* Allow removing semaphores */ +/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + and shared memory */ +/* Allow locking/unlocking of shared memory segment */ +/* Allow forged pids on socket credentials passing */ + +#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN +#else +#define CAP_VE_NET_ADMIN CAP_NET_ADMIN +#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN +#endif + +/* + * Bounding set + */ +#ifdef CONFIG_VE +#define cap_bset (get_exec_env()->ve_cap_bset) +#else +extern kernel_cap_t cap_bset; +#endif + /* * Internal kernel functions only */ @@ -367,13 +414,23 @@ static inline kernel_cap_t cap_invert(ke #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) #define cap_clear(c) do { cap_t(c) = 0; } while(0) +#ifndef CONFIG_VE #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) +#else +#define cap_set_full(c) \ + do { \ + cap_t(c) = ve_is_super(get_exec_env()) ? 
\ + ~0 : \ + cap_bset; \ + } while(0) +#endif #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) int capable(int cap); int __capable(struct task_struct *t, int cap); +extern spinlock_t task_capability_lock; #endif /* __KERNEL__ */ diff -uprN linux-2.6.24/include/linux/cfq-iosched.h linux-2.6.24.ovz/include/linux/cfq-iosched.h --- linux-2.6.24/include/linux/cfq-iosched.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/cfq-iosched.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,151 @@ +#ifndef _LINUX_CFQ_IOSCHED_H +#define _LINUX_CFQ_IOSCHED_H + +#include +#include +#include + +extern struct kmem_cache *cfq_pool; + +#define CFQ_PRIO_LISTS IOPRIO_BE_NR + +/* + * Most of our rbtree usage is for sorting with min extraction, so + * if we cache the leftmost node we don't have to walk down the tree + * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should + * move this into the elevator for the rq sorting as well. + */ +struct cfq_rb_root { + struct rb_root rb; + struct rb_node *left; +}; +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } + +/* + * Per (Device, UBC) queue data + */ +struct cfq_bc_data { + /* for ub.iopriv->cfq_bc_head */ + struct list_head cfq_bc_list; + /* for cfqd->act_cfq_bc_head */ + struct list_head act_cfq_bc_list; + + struct cfq_data *cfqd; + struct ub_iopriv *ub_iopriv; + + /* + * rr list of queues with requests and the count of them + */ + struct cfq_rb_root service_tree; + + int cur_prio; + int cur_end_prio; + + unsigned long rqnum; + unsigned long on_dispatch; + + /* + * async queue for each priority case + */ + struct cfq_queue *async_cfqq[2][CFQ_PRIO_LISTS]; + struct cfq_queue *async_idle_cfqq; +}; + +/* + * Per block device queue structure + */ +struct cfq_data { + struct request_queue *queue; + +#ifndef CONFIG_BC_IO_SCHED + struct cfq_bc_data cfq_bc; +#endif + unsigned int busy_queues; + + int rq_in_driver; + int sync_flight; + int hw_tag; + + /* + * idle window management + */ + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct cfq_queue *active_queue; + struct cfq_io_context *active_cic; + + struct timer_list idle_class_timer; + + sector_t last_position; + unsigned long last_end_request; + + /* + * tunables, see top of file + */ + unsigned int cfq_quantum; + unsigned int cfq_fifo_expire[2]; + unsigned int cfq_back_penalty; + unsigned int cfq_back_max; + unsigned int cfq_slice[2]; + unsigned int cfq_slice_async_rq; + unsigned int cfq_slice_idle; + + struct list_head cic_list; + + /* list of ub that have requests */ + struct list_head act_cfq_bc_head; + /* ub that owns a timeslice at the moment */ + struct cfq_bc_data *active_cfq_bc; + unsigned int cfq_ub_slice; + unsigned long slice_end; + int virt_mode; + int write_virt_mode; +}; + +/* + * Per process-grouping structure + */ +struct cfq_queue { + /* reference count */ + atomic_t ref; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* service_tree member */ + struct rb_node rb_node; + /* service_tree key */ + unsigned long rb_key; + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* requests queued in sort_list */ + int queued[2]; + /* currently allocated requests */ + int allocated[2]; + /* pending metadata requests */ + int meta_pending; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + unsigned long slice_end; + long slice_resid; + + /* 
number of requests that are on the dispatch list or inside driver */ + int dispatched; + + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; + + /* various state flags, see below */ + unsigned int flags; + + struct cfq_bc_data *cfq_bc; +}; + +static void inline cfq_init_cfq_bc(struct cfq_bc_data *cfq_bc) +{ + cfq_bc->service_tree = CFQ_RB_ROOT; +} +#endif /* _LINUX_CFQ_IOSCHED_H */ diff -uprN linux-2.6.24/include/linux/compat.h linux-2.6.24.ovz/include/linux/compat.h --- linux-2.6.24/include/linux/compat.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/compat.h 2008-03-25 18:53:59.000000000 -0500 @@ -233,6 +233,7 @@ extern int put_compat_itimerspec(struct asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int compat_printk(const char *fmt, ...); +extern int ve_compat_printk(int dst, const char *fmt, ...); extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, diff -uprN linux-2.6.24/include/linux/console.h linux-2.6.24.ovz/include/linux/console.h --- linux-2.6.24/include/linux/console.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/console.h 2008-03-25 18:53:59.000000000 -0500 @@ -147,4 +147,22 @@ void vcs_remove_sysfs(struct tty_struct #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 + +#include +#include +#include + +struct printk_aligned { + int v; +} ____cacheline_aligned; +extern struct printk_aligned printk_no_wake_var[NR_CPUS]; +#define __printk_no_wake (printk_no_wake_var[smp_processor_id()].v) +#define printk_no_wake ({ \ + int v; \ + preempt_disable(); \ + v = __printk_no_wake; \ + preempt_enable_no_resched(); \ + v; \ + }) + #endif /* _LINUX_CONSOLE_H */ diff -uprN linux-2.6.24/include/linux/cpt_image.h linux-2.6.24.ovz/include/linux/cpt_image.h --- linux-2.6.24/include/linux/cpt_image.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/cpt_image.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1705 @@ +/* + * + * include/linux/cpt_image.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __CPT_IMAGE_H_ +#define __CPT_IMAGE_H_ 1 + +#define CPT_NULL (~0ULL) +#define CPT_NOINDEX (~0U) + +/* + * Image file layout. + * + * - major header + * - sections[] + * + * Each section is: + * - section header + * - array of objects + * + * All data records are arch independent, 64 bit aligned. 
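The 64-bit alignment mentioned in the layout comment above is presumably what the CPT_ALIGN() macro defined just below provides: record lengths are rounded up to the next multiple of 8 before the following record starts. A trivial standalone check of that rounding:

#include <assert.h>

#define CPT_ALIGN(n)	(((n) + 7) & ~7)

int main(void)
{
	assert(CPT_ALIGN(13) == 16);	/* odd-sized records get padded up */
	assert(CPT_ALIGN(16) == 16);	/* already aligned lengths are unchanged */
	return 0;
}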
+ */ + +enum _cpt_object_type +{ + CPT_OBJ_TASK = 0, + CPT_OBJ_MM, + CPT_OBJ_FS, + CPT_OBJ_FILES, + CPT_OBJ_FILE, + CPT_OBJ_SIGHAND_STRUCT, + CPT_OBJ_SIGNAL_STRUCT, + CPT_OBJ_TTY, + CPT_OBJ_SOCKET, + CPT_OBJ_SYSVSEM_UNDO, + CPT_OBJ_NAMESPACE, + CPT_OBJ_SYSV_SHM, + CPT_OBJ_INODE, + CPT_OBJ_UBC, + CPT_OBJ_SLM_SGREG, + CPT_OBJ_SLM_REGOBJ, + CPT_OBJ_SLM_MM, + CPT_OBJ_MAX, + /* The objects above are stored in memory while checkpointing */ + + CPT_OBJ_VMA = 1024, + CPT_OBJ_FILEDESC, + CPT_OBJ_SIGHANDLER, + CPT_OBJ_SIGINFO, + CPT_OBJ_LASTSIGINFO, + CPT_OBJ_SYSV_SEM, + CPT_OBJ_SKB, + CPT_OBJ_FLOCK, + CPT_OBJ_OPENREQ, + CPT_OBJ_VFSMOUNT, + CPT_OBJ_TRAILER, + CPT_OBJ_SYSVSEM_UNDO_REC, + CPT_OBJ_NET_DEVICE, + CPT_OBJ_NET_IFADDR, + CPT_OBJ_NET_ROUTE, + CPT_OBJ_NET_CONNTRACK, + CPT_OBJ_NET_CONNTRACK_EXPECT, + CPT_OBJ_AIO_CONTEXT, + CPT_OBJ_VEINFO, + CPT_OBJ_EPOLL, + CPT_OBJ_EPOLL_FILE, + CPT_OBJ_SKFILTER, + CPT_OBJ_SIGALTSTACK, + CPT_OBJ_SOCK_MCADDR, + CPT_OBJ_BIND_MNT, + CPT_OBJ_SYSVMSG, + CPT_OBJ_SYSVMSG_MSG, + + CPT_OBJ_X86_REGS = 4096, + CPT_OBJ_X86_64_REGS, + CPT_OBJ_PAGES, + CPT_OBJ_COPYPAGES, + CPT_OBJ_REMAPPAGES, + CPT_OBJ_LAZYPAGES, + CPT_OBJ_NAME, + CPT_OBJ_BITS, + CPT_OBJ_REF, + CPT_OBJ_ITERPAGES, + CPT_OBJ_ITERYOUNGPAGES, + CPT_OBJ_VSYSCALL, + CPT_OBJ_IA64_REGS, + CPT_OBJ_INOTIFY, + CPT_OBJ_INOTIFY_WATCH, + CPT_OBJ_INOTIFY_EVENT, + CPT_OBJ_TASK_AUX, + CPT_OBJ_NET_TUNTAP, +}; + +#define CPT_ALIGN(n) (((n)+7)&~7) + +struct cpt_major_hdr +{ + __u8 cpt_signature[4]; /* Magic number */ + __u16 cpt_hdrlen; /* Length of this header */ + __u16 cpt_image_version; /* Format of this file */ +#define CPT_VERSION_MINOR(a) ((a) & 0xf) +#define CPT_VERSION_8 0 +#define CPT_VERSION_9 0x100 +#define CPT_VERSION_9_1 0x101 +#define CPT_VERSION_16 0x200 +#define CPT_VERSION_18 0x300 +#define CPT_VERSION_20 0x400 +#define CPT_VERSION_24 0x500 + __u16 cpt_os_arch; /* Architecture */ +#define CPT_OS_ARCH_I386 0 +#define CPT_OS_ARCH_EMT64 1 +#define CPT_OS_ARCH_IA64 2 + __u16 __cpt_pad1; + __u32 cpt_ve_features; /* VE features */ + __u32 cpt_ve_features2; /* VE features */ + __u16 cpt_pagesize; /* Page size used by OS */ + __u16 cpt_hz; /* HZ used by OS */ + __u64 cpt_start_jiffies64; /* Jiffies */ + __u32 cpt_start_sec; /* Seconds */ + __u32 cpt_start_nsec; /* Nanoseconds */ + __u32 cpt_cpu_caps[4]; /* CPU capabilities */ + __u32 cpt_kernel_config[4]; /* Kernel config */ + __u64 cpt_iptables_mask; /* Used netfilter modules */ +} __attribute__ ((aligned (8))); + +#define CPT_SIGNATURE0 0x79 +#define CPT_SIGNATURE1 0x1c +#define CPT_SIGNATURE2 0x01 +#define CPT_SIGNATURE3 0x63 + +/* CPU capabilities */ +#define CPT_CPU_X86_CMOV 0 +#define CPT_CPU_X86_FXSR 1 +#define CPT_CPU_X86_SSE 2 +#define CPT_CPU_X86_SSE2 3 +#define CPT_CPU_X86_MMX 4 +#define CPT_CPU_X86_3DNOW 5 +#define CPT_CPU_X86_3DNOW2 6 +#define CPT_CPU_X86_SEP 7 +#define CPT_CPU_X86_EMT64 8 +#define CPT_CPU_X86_IA64 9 +#define CPT_CPU_X86_SYSCALL 10 +#define CPT_CPU_X86_SYSCALL32 11 +#define CPT_CPU_X86_SEP32 12 + +/* Unsupported features */ +#define CPT_EXTERNAL_PROCESS 16 +#define CPT_NAMESPACES 17 +#define CPT_SCHEDULER_POLICY 18 +#define CPT_PTRACED_FROM_VE0 19 +#define CPT_UNSUPPORTED_FSTYPE 20 +#define CPT_BIND_MOUNT 21 +#define CPT_UNSUPPORTED_NETDEV 22 +#define CPT_UNSUPPORTED_MISC 23 + +/* This mask is used to determine whether VE + has some unsupported features or not */ +#define CPT_UNSUPPORTED_MASK 0xffff0000UL + +#define CPT_KERNEL_CONFIG_PAE 0 + +struct cpt_section_hdr +{ + __u64 cpt_next; + __u32 cpt_section; + __u16 cpt_hdrlen; + __u16 
cpt_align; +} __attribute__ ((aligned (8))); + +enum +{ + CPT_SECT_ERROR, /* Error section, content is string */ + CPT_SECT_VEINFO, + CPT_SECT_FILES, /* Files. Content is array of file objects */ + CPT_SECT_TASKS, + CPT_SECT_MM, + CPT_SECT_FILES_STRUCT, + CPT_SECT_FS, + CPT_SECT_SIGHAND_STRUCT, + CPT_SECT_TTY, + CPT_SECT_SOCKET, + CPT_SECT_NAMESPACE, + CPT_SECT_SYSVSEM_UNDO, + CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and + * deleted dentires with inodes not + * referenced inside dumped process. + */ + CPT_SECT_SYSV_SHM, + CPT_SECT_SYSV_SEM, + CPT_SECT_ORPHANS, + CPT_SECT_NET_DEVICE, + CPT_SECT_NET_IFADDR, + CPT_SECT_NET_ROUTE, + CPT_SECT_NET_IPTABLES, + CPT_SECT_NET_CONNTRACK, + CPT_SECT_NET_CONNTRACK_VE0, + CPT_SECT_UTSNAME, + CPT_SECT_TRAILER, + CPT_SECT_UBC, + CPT_SECT_SLM_SGREGS, + CPT_SECT_SLM_REGOBJS, +/* Due to silly mistake we cannot index sections beyond this value */ +#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) + CPT_SECT_EPOLL, + CPT_SECT_VSYSCALL, + CPT_SECT_INOTIFY, + CPT_SECT_SYSV_MSG, + CPT_SECT_MAX +}; + +struct cpt_major_tail +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_lazypages; + __u32 cpt_64bit; + __u64 cpt_sections[CPT_SECT_MAX_INDEX]; + __u32 cpt_nsect; + __u8 cpt_signature[4]; /* Magic number */ +} __attribute__ ((aligned (8))); + + +/* Common object header. */ +struct cpt_object_hdr +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; +} __attribute__ ((aligned (8))); + +enum _cpt_content_type { + CPT_CONTENT_VOID, + CPT_CONTENT_ARRAY, + CPT_CONTENT_DATA, + CPT_CONTENT_NAME, + + CPT_CONTENT_STACK, + CPT_CONTENT_X86_FPUSTATE_OLD, + CPT_CONTENT_X86_FPUSTATE, + CPT_CONTENT_MM_CONTEXT, + CPT_CONTENT_SEMARRAY, + CPT_CONTENT_SEMUNDO, + CPT_CONTENT_NLMARRAY, + CPT_CONTENT_MAX +}; + +/* CPT_OBJ_BITS: encode array of bytes */ +struct cpt_obj_bits +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_REF: a reference to another object */ +struct cpt_obj_ref +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_pos; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VEINFO: various ve specific data */ +struct cpt_veinfo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + /* ipc ctls */ + __u32 shm_ctl_max; + __u32 shm_ctl_all; + __u32 shm_ctl_mni; + __u32 msg_ctl_max; + __u32 msg_ctl_mni; + __u32 msg_ctl_mnb; + __u32 sem_ctl_arr[4]; + + /* start time */ + __u64 start_timespec_delta; + __u64 start_jiffies_delta; + + /* later extension */ + __u32 last_pid; + __u32 pad1; + __u64 reserved[8]; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILE: one struct file */ +struct cpt_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_flags; + __u32 cpt_mode; + __u64 cpt_pos; + __u32 cpt_uid; + __u32 cpt_gid; + + __u32 cpt_i_mode; + __u32 cpt_lflags; +#define CPT_DENTRY_DELETED 1 +#define CPT_DENTRY_ROOT 2 +#define CPT_DENTRY_CLONING 4 +#define CPT_DENTRY_PROC 8 +#define CPT_DENTRY_EPOLL 0x10 +#define CPT_DENTRY_REPLACED 0x20 +#define CPT_DENTRY_INOTIFY 0x40 +#define CPT_DENTRY_FUTEX 0x80 +#define CPT_DENTRY_TUNTAP 0x100 + __u64 cpt_inode; + __u64 cpt_priv; + + __u32 cpt_fown_fd; + __u32 cpt_fown_pid; +#define CPT_FOWN_STRAY_PID 0 + __u32 cpt_fown_uid; + __u32 cpt_fown_euid; + __u32 cpt_fown_signo; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); 
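Taking the cpt_major_hdr definition above at face value, an image file begins with the four signature bytes CPT_SIGNATURE0..3. The sketch below is a hypothetical user-space checker that reads just those bytes from a dump file; a real tool would also validate cpt_hdrlen, the image version and the section table, which are omitted here.

#include <stdio.h>

#define CPT_SIGNATURE0 0x79
#define CPT_SIGNATURE1 0x1c
#define CPT_SIGNATURE2 0x01
#define CPT_SIGNATURE3 0x63

static int looks_like_cpt_image(const char *path)
{
	unsigned char sig[4];
	FILE *f = fopen(path, "rb");

	if (!f || fread(sig, 1, sizeof(sig), f) != sizeof(sig)) {
		if (f)
			fclose(f);
		return 0;
	}
	fclose(f);

	return sig[0] == CPT_SIGNATURE0 && sig[1] == CPT_SIGNATURE1 &&
	       sig[2] == CPT_SIGNATURE2 && sig[3] == CPT_SIGNATURE3;
}

int main(int argc, char **argv)
{
	if (argc > 1)
		printf("%s: %s\n", argv[1],
		       looks_like_cpt_image(argv[1]) ? "cpt image" : "not a cpt image");
	return 0;
}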
+/* Followed by file name, encoded as CPT_OBJ_NAME */ + +struct cpt_epoll_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_epoll_file */ + +struct cpt_epoll_file_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_fd; + __u32 cpt_events; + __u64 cpt_data; + __u32 cpt_revents; + __u32 cpt_ready; +} __attribute__ ((aligned (8))); + +struct cpt_inotify_wd_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_wd; + __u32 cpt_mask; +} __attribute__ ((aligned (8))); +/* Followed by cpt_file_image of inode to watch */ + +struct cpt_inotify_ev_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_wd; + __u32 cpt_mask; + __u32 cpt_cookie; + __u32 cpt_namelen; +} __attribute__ ((aligned (8))); +/* Followed by name */ + +struct cpt_inotify_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_user; + __u32 cpt_max_events; + __u32 cpt_last_wd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */ + + +/* CPT_OBJ_FILEDESC: one file descriptor */ +struct cpt_fd_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_fd; + __u32 cpt_flags; +#define CPT_FD_FLAG_CLOSEEXEC 1 + __u64 cpt_file; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_FILES: one files_struct */ +struct cpt_files_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_max_fds; + __u32 cpt_next_fd; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by array of cpt_fd_image */ + +/* CPT_OBJ_FS: one fs_struct */ +struct cpt_fs_struct_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_umask; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); +/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ + +/* CPT_OBJ_INODE: one struct inode */ +struct cpt_inode_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_dev; + __u64 cpt_ino; + __u32 cpt_mode; + __u32 cpt_nlink; + __u32 cpt_uid; + __u32 cpt_gid; + __u64 cpt_rdev; + __u64 cpt_size; + __u64 cpt_blksize; + __u64 cpt_atime; + __u64 cpt_mtime; + __u64 cpt_ctime; + __u64 cpt_blocks; + __u32 cpt_sb; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +/* CPT_OBJ_VFSMOUNT: one vfsmount */ +struct cpt_vfsmount_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_mntflags; +#define CPT_MNT_BIND 0x80000000 +#define CPT_MNT_EXT 0x40000000 + __u32 cpt_flags; +} __attribute__ ((aligned (8))); + + +struct cpt_flock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_pid; + __u64 cpt_start; + __u64 cpt_end; + __u32 cpt_flags; + __u32 cpt_type; +} __attribute__ ((aligned (8))); + + +struct cpt_tty_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_flags; + __u32 cpt_link; + __u32 cpt_index; + __u32 cpt_drv_type; + __u32 cpt_drv_subtype; + __u32 cpt_drv_flags; + __u8 cpt_packet; + __u8 cpt_stopped; + __u8 cpt_hw_stopped; + __u8 cpt_flow_stopped; + 
+ __u32 cpt_canon_data; + __u32 cpt_canon_head; + __u32 cpt_canon_column; + __u32 cpt_column; + __u8 cpt_ctrl_status; + __u8 cpt_erasing; + __u8 cpt_lnext; + __u8 cpt_icanon; + __u8 cpt_raw; + __u8 cpt_real_raw; + __u8 cpt_closing; + __u8 __cpt_pad1; + __u16 cpt_minimum_to_wake; + __u16 __cpt_pad2; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_c_line; + __u8 cpt_name[64]; + __u16 cpt_ws_row; + __u16 cpt_ws_col; + __u16 cpt_ws_prow; + __u16 cpt_ws_pcol; + __u8 cpt_c_cc[32]; + __u32 cpt_c_iflag; + __u32 cpt_c_oflag; + __u32 cpt_c_cflag; + __u32 cpt_c_lflag; + __u32 cpt_read_flags[4096/32]; +} __attribute__ ((aligned (8))); + +struct cpt_sock_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_parent; + __u32 cpt_index; + + __u64 cpt_ssflags; + __u16 cpt_type; + __u16 cpt_family; + __u8 cpt_sstate; + __u8 cpt_passcred; + __u8 cpt_state; + __u8 cpt_reuse; + + __u8 cpt_zapped; + __u8 cpt_shutdown; + __u8 cpt_userlocks; + __u8 cpt_no_check; + __u8 cpt_debug; + __u8 cpt_rcvtstamp; + __u8 cpt_localroute; + __u8 cpt_protocol; + + __u32 cpt_err; + __u32 cpt_err_soft; + + __u16 cpt_max_ack_backlog; + __u16 __cpt_pad1; + __u32 cpt_priority; + + __u32 cpt_rcvlowat; + __u32 cpt_bound_dev_if; + + __u64 cpt_rcvtimeo; + __u64 cpt_sndtimeo; + __u32 cpt_rcvbuf; + __u32 cpt_sndbuf; + __u64 cpt_flags; + __u64 cpt_lingertime; + __u32 cpt_peer_pid; + __u32 cpt_peer_uid; + + __u32 cpt_peer_gid; + __u32 cpt_laddrlen; + __u32 cpt_laddr[128/4]; + __u32 cpt_raddrlen; + __u32 cpt_raddr[128/4]; + /* AF_UNIX */ + __u32 cpt_peer; + + __u8 cpt_socketpair; + __u8 cpt_deleted; + __u16 __cpt_pad4; + __u32 __cpt_pad5; +/* + struct sk_filter *sk_filter; + */ + + __u64 cpt_stamp; + __u32 cpt_daddr; + __u16 cpt_dport; + __u16 cpt_sport; + + __u32 cpt_saddr; + __u32 cpt_rcv_saddr; + + __u32 cpt_uc_ttl; + __u32 cpt_tos; + + __u32 cpt_cmsg_flags; + __u32 cpt_mc_index; + + __u32 cpt_mc_addr; +/* + struct ip_options *opt; + */ + __u8 cpt_hdrincl; + __u8 cpt_mc_ttl; + __u8 cpt_mc_loop; + __u8 cpt_pmtudisc; + + __u8 cpt_recverr; + __u8 cpt_freebind; + __u16 cpt_idcounter; + __u32 cpt_cork_flags; + + __u32 cpt_cork_fragsize; + __u32 cpt_cork_length; + __u32 cpt_cork_addr; + __u32 cpt_cork_saddr; + __u32 cpt_cork_daddr; + __u32 cpt_cork_oif; + + __u32 cpt_udp_pending; + __u32 cpt_udp_corkflag; + __u16 cpt_udp_encap; + __u16 cpt_udp_len; + __u32 __cpt_pad7; + + __u64 cpt_saddr6[2]; + __u64 cpt_rcv_saddr6[2]; + __u64 cpt_daddr6[2]; + __u32 cpt_flow_label6; + __u32 cpt_frag_size6; + __u32 cpt_hop_limit6; + __u32 cpt_mcast_hops6; + + __u32 cpt_mcast_oif6; + __u8 cpt_rxopt6; + __u8 cpt_mc_loop6; + __u8 cpt_recverr6; + __u8 cpt_sndflow6; + + __u8 cpt_pmtudisc6; + __u8 cpt_ipv6only6; + __u8 cpt_mapped; + __u8 __cpt_pad8; + __u32 cpt_pred_flags; + + __u32 cpt_rcv_nxt; + __u32 cpt_snd_nxt; + + __u32 cpt_snd_una; + __u32 cpt_snd_sml; + + __u32 cpt_rcv_tstamp; + __u32 cpt_lsndtime; + + __u8 cpt_tcp_header_len; + __u8 cpt_ack_pending; + __u8 cpt_quick; + __u8 cpt_pingpong; + __u8 cpt_blocked; + __u8 __cpt_pad9; + __u16 __cpt_pad10; + + __u32 cpt_ato; + __u32 cpt_ack_timeout; + + __u32 cpt_lrcvtime; + __u16 cpt_last_seg_size; + __u16 cpt_rcv_mss; + + __u32 cpt_snd_wl1; + __u32 cpt_snd_wnd; + + __u32 cpt_max_window; + __u32 cpt_pmtu_cookie; + + __u32 cpt_mss_cache; + __u16 cpt_mss_cache_std; + __u16 cpt_mss_clamp; + + __u16 cpt_ext_header_len; + __u16 cpt_ext2_header_len; + __u8 cpt_ca_state; + __u8 cpt_retransmits; + __u8 cpt_reordering; + __u8 cpt_frto_counter; + + __u32 
cpt_frto_highmark; + __u8 cpt_adv_cong; + __u8 cpt_defer_accept; + __u8 cpt_backoff; + __u8 __cpt_pad11; + + __u32 cpt_srtt; + __u32 cpt_mdev; + + __u32 cpt_mdev_max; + __u32 cpt_rttvar; + + __u32 cpt_rtt_seq; + __u32 cpt_rto; + + __u32 cpt_packets_out; + __u32 cpt_left_out; + + __u32 cpt_retrans_out; + __u32 cpt_snd_ssthresh; + + __u32 cpt_snd_cwnd; + __u16 cpt_snd_cwnd_cnt; + __u16 cpt_snd_cwnd_clamp; + + __u32 cpt_snd_cwnd_used; + __u32 cpt_snd_cwnd_stamp; + + __u32 cpt_timeout; + __u32 cpt_ka_timeout; + + __u32 cpt_rcv_wnd; + __u32 cpt_rcv_wup; + + __u32 cpt_write_seq; + __u32 cpt_pushed_seq; + + __u32 cpt_copied_seq; + __u8 cpt_tstamp_ok; + __u8 cpt_wscale_ok; + __u8 cpt_sack_ok; + __u8 cpt_saw_tstamp; + + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + __u8 cpt_nonagle; + __u8 cpt_keepalive_probes; + __u32 cpt_rcv_tsval; + + __u32 cpt_rcv_tsecr; + __u32 cpt_ts_recent; + + __u64 cpt_ts_recent_stamp; + __u16 cpt_user_mss; + __u8 cpt_dsack; + __u8 cpt_eff_sacks; + __u32 cpt_sack_array[2*5]; + __u32 cpt_window_clamp; + + __u32 cpt_rcv_ssthresh; + __u8 cpt_probes_out; + __u8 cpt_num_sacks; + __u16 cpt_advmss; + + __u8 cpt_syn_retries; + __u8 cpt_ecn_flags; + __u16 cpt_prior_ssthresh; + __u32 cpt_lost_out; + + __u32 cpt_sacked_out; + __u32 cpt_fackets_out; + + __u32 cpt_high_seq; + __u32 cpt_retrans_stamp; + + __u32 cpt_undo_marker; + __u32 cpt_undo_retrans; + + __u32 cpt_urg_seq; + __u16 cpt_urg_data; + __u8 cpt_pending; + __u8 cpt_urg_mode; + + __u32 cpt_snd_up; + __u32 cpt_keepalive_time; + + __u32 cpt_keepalive_intvl; + __u32 cpt_linger2; + + __u32 cpt_rcvrtt_rtt; + __u32 cpt_rcvrtt_seq; + + __u32 cpt_rcvrtt_time; + __u32 __cpt_pad12; +} __attribute__ ((aligned (8))); + +struct cpt_sockmc_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u16 cpt_family; + __u16 cpt_mode; + __u32 cpt_ifindex; + __u32 cpt_mcaddr[4]; +} __attribute__ ((aligned (8))); +/* Followed by array of source addresses, each zero padded to 16 bytes */ + +struct cpt_openreq_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_rcv_isn; + __u32 cpt_snt_isn; + + __u16 cpt_rmt_port; + __u16 cpt_mss; + __u8 cpt_family; + __u8 cpt_retrans; + __u8 cpt_snd_wscale; + __u8 cpt_rcv_wscale; + + __u8 cpt_tstamp_ok; + __u8 cpt_sack_ok; + __u8 cpt_wscale_ok; + __u8 cpt_ecn_ok; + __u8 cpt_acked; + __u8 __cpt_pad1; + __u16 __cpt_pad2; + + __u32 cpt_window_clamp; + __u32 cpt_rcv_wnd; + __u32 cpt_ts_recent; + __u32 cpt_iif; + __u64 cpt_expires; + + __u64 cpt_loc_addr[2]; + __u64 cpt_rmt_addr[2]; +/* + struct ip_options *opt; + */ + +} __attribute__ ((aligned (8))); + +struct cpt_skb_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_queue; +#define CPT_SKB_NQ 0 +#define CPT_SKB_RQ 1 +#define CPT_SKB_WQ 2 +#define CPT_SKB_OFOQ 3 + + __u64 cpt_stamp; + __u32 cpt_len; + __u32 cpt_hspace; + __u32 cpt_tspace; + __u32 cpt_h; + __u32 cpt_nh; + __u32 cpt_mac; + + __u64 cpt_cb[5]; + __u32 cpt_mac_len; + __u32 cpt_csum; + __u8 cpt_local_df; + __u8 cpt_pkt_type; + __u8 cpt_ip_summed; + __u8 __cpt_pad1; + __u32 cpt_priority; + __u16 cpt_protocol; + __u16 cpt_security; + __u16 cpt_gso_segs; + __u16 cpt_gso_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvshm_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + + __u32 
cpt_id; + __u32 cpt_mlockuser; + __u64 cpt_segsz; + __u64 cpt_atime; + __u64 cpt_ctime; + __u64 cpt_dtime; + __u64 cpt_creator; + __u64 cpt_last; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvsem_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_otime; + __u64 cpt_ctime; +} __attribute__ ((aligned (8))); +/* Content is array of pairs semval/sempid */ + +struct cpt_sysvsem_undo_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_id; + __u32 cpt_nsem; +} __attribute__ ((aligned (8))); + +struct cpt_sysvmsg_msg_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_type; + __u64 cpt_size; +} __attribute__ ((aligned (8))); + + +struct cpt_sysvmsg_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_key; + __u64 cpt_uid; + __u64 cpt_gid; + __u64 cpt_cuid; + __u64 cpt_cgid; + __u64 cpt_mode; + __u64 cpt_seq; + __u32 cpt_id; + __u32 __cpt_pad1; + + __u64 cpt_stime; + __u64 cpt_rtime; + __u64 cpt_ctime; + __u64 cpt_last_sender; + __u64 cpt_last_receiver; + __u64 cpt_qbytes; +} __attribute__ ((aligned (8))); +/* Content is array of sysv msg */ + + +struct cpt_mm_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start_code; + __u64 cpt_end_code; + __u64 cpt_start_data; + __u64 cpt_end_data; + __u64 cpt_start_brk; + __u64 cpt_brk; + __u64 cpt_start_stack; + __u64 cpt_start_arg; + __u64 cpt_end_arg; + __u64 cpt_start_env; + __u64 cpt_end_env; + __u64 cpt_def_flags; + __u64 cpt_mmub; + __u8 cpt_dumpable; + __u8 cpt_vps_dumpable; + __u8 cpt_used_hugetlb; + __u8 __cpt_pad; + __u32 cpt_vdso; +} __attribute__ ((aligned (8))); + +struct cpt_page_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); + +struct cpt_remappage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_copypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_source; +} __attribute__ ((aligned (8))); + +struct cpt_lazypage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_index; +} __attribute__ ((aligned (8))); + +struct cpt_iterpage_block +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_start; + __u64 cpt_end; +} __attribute__ ((aligned (8))); +/* Followed by array of PFNs */ + +struct cpt_vma_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_file; + __u32 cpt_type; +#define CPT_VMA_TYPE_0 0 +#define CPT_VMA_TYPE_SHM 1 +#define CPT_VMA_VDSO 2 + __u32 cpt_anonvma; + __u64 cpt_anonvmaid; + + __u64 cpt_start; + __u64 cpt_end; + __u64 cpt_flags; + __u64 cpt_pgprot; + __u64 cpt_pgoff; +} __attribute__ ((aligned (8))); + +struct cpt_aio_ctx_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_max_reqs; + __u32 cpt_ring_pages; + __u32 cpt_tail; + __u32 cpt_nr; + __u64 
cpt_mmap_base; + /* Data (io_event's) and struct aio_ring are stored in user space VM */ +} __attribute__ ((aligned (8))); + + +/* Format of MM section. + * + * It is an array of MM objects (mm_struct). Each MM object is + * a header, encoding mm_struct, followed by an array of VMA objects. + * Each VMA consists of a VMA header, encoding vm_area_struct; + * if the VMA contains copied pages, the header is followed by + * an array of start-end tuples, each followed by data. + * + * ATTN: no block/page alignment. Only 64-bit alignment. This might not be good? + */ + +struct cpt_restart_block { + __u64 fn; +#define CPT_RBL_0 0 +#define CPT_RBL_NANOSLEEP 1 +#define CPT_RBL_COMPAT_NANOSLEEP 2 +#define CPT_RBL_POLL 3 +#define CPT_RBL_FUTEX_WAIT 4 + __u64 arg0; + __u64 arg1; + __u64 arg2; + __u64 arg3; +} __attribute__ ((aligned (8))); + +struct cpt_siginfo_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_qflags; + __u32 cpt_signo; + __u32 cpt_errno; + __u32 cpt_code; + + __u64 cpt_sigval; + __u32 cpt_pid; + __u32 cpt_uid; + __u64 cpt_utime; + __u64 cpt_stime; + + __u64 cpt_user; +} __attribute__ ((aligned (8))); + +/* Portable representations of segment registers */ + +#define CPT_SEG_ZERO 0 +#define CPT_SEG_TLS1 1 +#define CPT_SEG_TLS2 2 +#define CPT_SEG_TLS3 3 +#define CPT_SEG_USER32_DS 4 +#define CPT_SEG_USER32_CS 5 +#define CPT_SEG_USER64_DS 6 +#define CPT_SEG_USER64_CS 7 +#define CPT_SEG_LDT 256 + +struct cpt_x86_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_debugreg[8]; + __u32 cpt_fs; + __u32 cpt_gs; + + __u32 cpt_ebx; + __u32 cpt_ecx; + __u32 cpt_edx; + __u32 cpt_esi; + __u32 cpt_edi; + __u32 cpt_ebp; + __u32 cpt_eax; + __u32 cpt_xds; + __u32 cpt_xes; + __u32 cpt_orig_eax; + __u32 cpt_eip; + __u32 cpt_xcs; + __u32 cpt_eflags; + __u32 cpt_esp; + __u32 cpt_xss; + __u32 pad; +}; + +struct cpt_x86_64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_debugreg[8]; + + __u64 cpt_fsbase; + __u64 cpt_gsbase; + __u32 cpt_fsindex; + __u32 cpt_gsindex; + __u32 cpt_ds; + __u32 cpt_es; + + __u64 cpt_r15; + __u64 cpt_r14; + __u64 cpt_r13; + __u64 cpt_r12; + __u64 cpt_rbp; + __u64 cpt_rbx; + __u64 cpt_r11; + __u64 cpt_r10; + __u64 cpt_r9; + __u64 cpt_r8; + __u64 cpt_rax; + __u64 cpt_rcx; + __u64 cpt_rdx; + __u64 cpt_rsi; + __u64 cpt_rdi; + __u64 cpt_orig_rax; + __u64 cpt_rip; + __u64 cpt_cs; + __u64 cpt_eflags; + __u64 cpt_rsp; + __u64 cpt_ss; +}; + +struct cpt_ia64_regs +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 gr[128]; + __u64 fr[256]; + __u64 br[8]; + __u64 nat[2]; + + __u64 ar_bspstore; + __u64 num_regs; + __u64 loadrs; + __u64 ar_bsp; + __u64 ar_unat; + __u64 ar_pfs; + __u64 ar_ccv; + __u64 ar_fpsr; + __u64 ar_csd; + __u64 ar_ssd; + __u64 ar_ec; + __u64 ar_lc; + __u64 ar_rsc; + __u64 ar_rnat; + + __u64 cr_iip; + __u64 cr_ipsr; + + __u64 cfm; + __u64 pr; + + __u64 ibr[8]; + __u64 dbr[8]; +}; + + +struct cpt_task_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_state; + __u64 cpt_flags; + __u64 cpt_ptrace; + __u32 cpt_prio; + __u32 cpt_static_prio; + __u32 cpt_policy; + __u32 cpt_rt_priority; + + /* struct thread_info */ + __u64 cpt_exec_domain; + __u64 cpt_thrflags; + __u64 cpt_thrstatus; + __u64 cpt_addr_limit; + + __u64 cpt_personality; + + __u64 cpt_mm; + __u64 cpt_files; + __u64 cpt_fs; + __u64 cpt_signal; + __u64 cpt_sighand; + __u64 cpt_sigblocked; +
__u64 cpt_sigrblocked; + __u64 cpt_sigpending; + __u64 cpt_namespace; + __u64 cpt_sysvsem_undo; + __u32 cpt_pid; + __u32 cpt_tgid; + __u32 cpt_ppid; + __u32 cpt_rppid; + __u32 cpt_pgrp; + __u32 cpt_session; + __u32 cpt_old_pgrp; + __u32 __cpt_pad; + __u32 cpt_leader; + __u8 cpt_pn_state; + __u8 cpt_stopped_state; + __u8 cpt_sigsuspend_state; + __u8 cpt_64bit; + __u64 cpt_set_tid; + __u64 cpt_clear_tid; + __u32 cpt_exit_code; + __u32 cpt_exit_signal; + __u32 cpt_pdeath_signal; + __u32 cpt_user; + __u32 cpt_uid; + __u32 cpt_euid; + __u32 cpt_suid; + __u32 cpt_fsuid; + __u32 cpt_gid; + __u32 cpt_egid; + __u32 cpt_sgid; + __u32 cpt_fsgid; + __u32 cpt_ngids; + __u32 cpt_gids[32]; + __u8 cpt_prctl_uac; + __u8 cpt_prctl_fpemu; + __u16 __cpt_pad1; + __u64 cpt_ecap; + __u64 cpt_icap; + __u64 cpt_pcap; + __u8 cpt_comm[16]; + __u64 cpt_tls[3]; + struct cpt_restart_block cpt_restart; + __u64 cpt_it_real_value; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_real_incr; /* V8: jiffies, V9..: nsec */ + __u64 cpt_it_prof_value; + __u64 cpt_it_prof_incr; + __u64 cpt_it_virt_value; + __u64 cpt_it_virt_incr; + + __u16 cpt_used_math; + __u8 cpt_keepcap; + __u8 cpt_did_exec; + __u32 cpt_ptrace_message; + + __u64 cpt_utime; + __u64 cpt_stime; + __u64 cpt_starttime; /* V8: jiffies, V9...: timespec */ + __u64 cpt_nvcsw; + __u64 cpt_nivcsw; + __u64 cpt_min_flt; + __u64 cpt_maj_flt; + + __u64 cpt_sigsuspend_blocked; + __u64 cpt_cutime, cpt_cstime; + __u64 cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_cmin_flt, cpt_cmaj_flt; + +#define CPT_RLIM_NLIMITS 16 + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; + + __u64 cpt_task_ub; + __u64 cpt_exec_ub; + __u64 cpt_mm_ub; + __u64 cpt_fork_sub; +} __attribute__ ((aligned (8))); + +struct cpt_sigaltstack_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_stack; + __u32 cpt_stacksize; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_task_aux_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_robust_list; + __u64 __cpt_future[16]; +} __attribute__ ((aligned (8))); + + +struct cpt_signal_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_leader; + __u8 cpt_pgrp_type; + __u8 cpt_old_pgrp_type; + __u8 cpt_session_type; +#define CPT_PGRP_NORMAL 0 +#define CPT_PGRP_ORPHAN 1 +#define CPT_PGRP_STRAY 2 + __u8 __cpt_pad1; + __u64 cpt_pgrp; + __u64 cpt_old_pgrp; + __u64 cpt_session; + __u64 cpt_sigpending; + __u64 cpt_ctty; + + __u32 cpt_curr_target; + __u32 cpt_group_exit; + __u32 cpt_group_exit_code; + __u32 cpt_group_exit_task; + __u32 cpt_notify_count; + __u32 cpt_group_stop_count; + __u32 cpt_stop_state; + __u32 __cpt_pad2; + + __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; + __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; + __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; + + __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; + __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; +} __attribute__ ((aligned (8))); +/* Followed by list of posix timers. */ + +struct cpt_sighand_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + +} __attribute__ ((aligned (8))); +/* Followed by list of sighandles. 
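Every record defined in this file starts with the same 16-byte preamble (cpt_next, cpt_object, cpt_hdrlen, cpt_content), which is what makes a dump walkable without decoding every record type. The sketch below shows how a user-space reader might follow that chain; the view struct is a local re-declaration, and treating cpt_next as the absolute file offset of the following record is an assumption made purely for illustration and should be checked against the dump writer.

/* Minimal sketch: walk a checkpoint image by its common object headers.
 * Illustration only; not part of the kernel interface. */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct cpt_hdr_view {
	uint64_t cpt_next;	/* offset of the next object (assumed absolute) */
	uint32_t cpt_object;	/* object type */
	uint16_t cpt_hdrlen;	/* length of the typed header */
	uint16_t cpt_content;	/* content kind of the payload that follows */
};

static void walk_objects(int fd, uint64_t start, uint64_t end)
{
	uint64_t pos = start;

	while (pos < end) {
		struct cpt_hdr_view h;

		if (pread(fd, &h, sizeof(h), pos) != (ssize_t)sizeof(h))
			break;
		printf("object %u at %llu (hdrlen %u, content %u)\n",
		       h.cpt_object, (unsigned long long)pos,
		       (unsigned)h.cpt_hdrlen, (unsigned)h.cpt_content);
		if (h.cpt_next <= pos)	/* refuse to walk backwards or loop */
			break;
		pos = h.cpt_next;
	}
}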
*/ + +struct cpt_sighandler_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_signo; + __u32 __cpt_pad1; + __u64 cpt_handler; + __u64 cpt_restorer; + __u64 cpt_flags; + __u64 cpt_mask; +} __attribute__ ((aligned (8))); + +struct cpt_netdev_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u32 cpt_flags; + __u8 cpt_name[16]; +} __attribute__ ((aligned (8))); + +struct cpt_tuntap_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_owner; + __u32 cpt_attached; + __u64 cpt_flags; + __u64 cpt_bindfile; + __u64 cpt_if_flags; + __u8 cpt_dev_addr[6]; + __u16 cpt_pad; + __u32 cpt_chr_filter[2]; + __u32 cpt_net_filter[2]; +} __attribute__ ((aligned (8))); + +struct cpt_ifaddr_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_index; + __u8 cpt_family; + __u8 cpt_masklen; + __u8 cpt_flags; + __u8 cpt_scope; + __u32 cpt_address[4]; + __u32 cpt_peer[4]; + __u32 cpt_broadcast[4]; + __u8 cpt_label[16]; + __u32 cpt_valid_lft; + __u32 cpt_prefered_lft; +} __attribute__ ((aligned (8))); + +struct cpt_ipct_tuple +{ + __u32 cpt_src; + __u16 cpt_srcport; + __u16 __cpt_pad1; + + __u32 cpt_dst; + __u16 cpt_dstport; + __u8 cpt_protonum; + __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ +} __attribute__ ((aligned (8))); + +struct cpt_nat_manip +{ + __u8 cpt_direction; + __u8 cpt_hooknum; + __u8 cpt_maniptype; + __u8 __cpt_pad1; + + __u32 cpt_manip_addr; + __u16 cpt_manip_port; + __u16 __cpt_pad2; + __u32 __cpt_pad3; +} __attribute__ ((aligned (8))); + +struct cpt_nat_seq +{ + __u32 cpt_correction_pos; + __u32 cpt_offset_before; + __u32 cpt_offset_after; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +struct cpt_ip_connexpect_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_timeout; + __u32 cpt_sibling_conntrack; /* Index of child conntrack */ + __u32 cpt_seq; /* id in 2.6.15 */ + + struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ + struct cpt_ipct_tuple cpt_tuple; + struct cpt_ipct_tuple cpt_mask; + + /* union ip_conntrack_expect_help. Used by ftp, irc, amanda */ + __u32 cpt_help[3]; /* NU 2.6.15 */ + __u16 cpt_manip_proto; + __u8 cpt_dir; + __u8 cpt_flags; +} __attribute__ ((aligned (8))); + +struct cpt_ip_conntrack_image +{ + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + struct cpt_ipct_tuple cpt_tuple[2]; + __u64 cpt_status; + __u64 cpt_timeout; + __u32 cpt_index; + __u8 cpt_ct_helper; + __u8 cpt_nat_helper; + __u16 cpt_pad1; + + /* union ip_conntrack_proto. Used by tcp and icmp. */ + __u32 cpt_proto_data[12]; + + /* union ip_conntrack_help. Used by ftp and pptp helper. + * We do not support pptp... 
+ */ + __u32 cpt_help_data[6]; + + /* nat info */ + __u32 cpt_initialized; /* NU 2.6.15 */ + __u32 cpt_num_manips; /* NU 2.6.15 */ + struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ + + struct cpt_nat_seq cpt_nat_seq[2]; + + __u32 cpt_masq_index; + __u32 cpt_id; + __u32 cpt_mark; +} __attribute__ ((aligned (8))); + +struct cpt_ubparm +{ + __u64 barrier; + __u64 limit; + __u64 held; + __u64 maxheld; + __u64 minheld; + __u64 failcnt; +} __attribute__ ((aligned (8))); + +struct cpt_beancounter_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u64 cpt_parent; + __u32 cpt_id; + __u32 __cpt_pad; + struct cpt_ubparm cpt_parms[32 * 2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_sgreg_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; + __u32 cpt_id; + __u16 cpt_resource; + __u8 cpt_regname[32]; + __u8 __cpt_pad2[2]; +} __attribute__ ((aligned (8))); + +struct cpt_slm_obj_image { + __u64 cpt_next; + __u32 cpt_object; + __u16 cpt_hdrlen; + __u16 cpt_content; + + __u32 cpt_size; + __u32 __cpt_pad1; +} __attribute__ ((aligned (8))); + +#ifdef __KERNEL__ + +static inline void __user * cpt_ptr_import(__u64 ptr) +{ + return (void*)(unsigned long)ptr; +} + +static inline __u64 cpt_ptr_export(void __user *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) +{ + memcpy(sig, &ptr, sizeof(*sig)); +} + +static inline __u64 cpt_sigset_export(sigset_t *sig) +{ + return *(__u64*)sig; +} + +static inline __u64 cpt_timespec_export(struct timespec *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_nsec; +} + +static inline void cpt_timespec_import(struct timespec *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_nsec = (val&0xFFFFFFFF); +} + +static inline __u64 cpt_timeval_export(struct timeval *tv) +{ + return (((u64)tv->tv_sec) << 32) + tv->tv_usec; +} + +static inline void cpt_timeval_import(struct timeval *tv, __u64 val) +{ + tv->tv_sec = val>>32; + tv->tv_usec = (val&0xFFFFFFFF); +} + +#endif + +#endif /* __CPT_IMAGE_H_ */ diff -uprN linux-2.6.24/include/linux/cpt_ioctl.h linux-2.6.24.ovz/include/linux/cpt_ioctl.h --- linux-2.6.24/include/linux/cpt_ioctl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/cpt_ioctl.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,43 @@ +/* + * + * include/linux/cpt_ioctl.h + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
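The cpt_timespec_export()/cpt_timeval_export() helpers above pack a timestamp into a single __u64 with the seconds in the upper 32 bits and the nanoseconds (or microseconds) in the lower 32 bits, so only 32 bits of seconds survive the round trip. A tiny stand-alone decoder, just to make the layout concrete (the sample value is invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* what cpt_timespec_export() would produce for 1206485639.500000000 s */
	uint64_t packed = ((uint64_t)1206485639 << 32) + 500000000;
	uint32_t sec  = packed >> 32;
	uint32_t nsec = packed & 0xFFFFFFFF;

	printf("%u.%09u\n", sec, nsec);	/* prints 1206485639.500000000 */
	return 0;
}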
+ * + */ + +#ifndef _CPT_IOCTL_H_ +#define _CPT_IOCTL_H_ 1 + +#include +#include + +#define CPTCTLTYPE '-' +#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) +#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) +#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) +#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) +#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) +#define CPT_DUMP _IO(CPTCTLTYPE, 6) +#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) +#define CPT_RESUME _IO(CPTCTLTYPE, 8) +#define CPT_KILL _IO(CPTCTLTYPE, 9) +#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) +#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) +#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) +#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) +#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) +#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) +#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) +#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) +#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) +#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) +#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) +#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) + +#define CPT_ITER _IOW(CPTCTLTYPE, 23, int) + +#endif diff -uprN linux-2.6.24/include/linux/dcache.h linux-2.6.24.ovz/include/linux/dcache.h --- linux-2.6.24/include/linux/dcache.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/dcache.h 2008-03-25 18:53:59.000000000 -0500 @@ -9,6 +9,8 @@ #include #include +#include + struct nameidata; struct vfsmount; @@ -111,6 +113,9 @@ struct dentry { struct dcookie_struct *d_cookie; /* cookie, if any */ #endif int d_mounted; +#ifdef CONFIG_BEANCOUNTERS + struct dentry_beancounter dentry_bc; +#endif unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; @@ -174,9 +179,13 @@ d_iput: no no no yes #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ #define DCACHE_UNHASHED 0x0010 +#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ + +extern void mark_tree_virtual(struct vfsmount *m, struct dentry *d); #define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ +extern struct kmem_cache *dentry_cache; extern spinlock_t dcache_lock; extern seqlock_t rename_lock; @@ -300,7 +309,12 @@ extern int d_validate(struct dentry *, s */ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); +extern int d_root_check(struct dentry *, struct vfsmount *); + extern char * d_path(struct dentry *, struct vfsmount *, char *, int); +extern char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, + struct dentry *root, struct vfsmount *rootmnt, + char *buffer, int buflen); /* Allocation counts.. 
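The CPT_* ioctls above are the whole user-space interface to the in-kernel checkpointer: a tool opens the checkpoint control file, points the kernel at a dump file descriptor, and then drives suspend/dump/resume. The sequence below is only a sketch; the "/proc/cpt" path, the minimal error handling and the exact ordering are assumptions made for illustration, and the real user-space tools do considerably more (status/lock/error descriptors, iterative and lazy migration, restore via the matching rst interface).

/* Hypothetical minimal checkpoint of container <veid> into <dumpfile>. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cpt_ioctl.h>

static int checkpoint_ve(unsigned int veid, const char *dumpfile)
{
	int cpt = open("/proc/cpt", O_RDWR);	/* assumed control node */
	int dump = open(dumpfile, O_CREAT | O_WRONLY | O_TRUNC, 0600);

	if (cpt < 0 || dump < 0) {
		if (cpt >= 0)
			close(cpt);
		if (dump >= 0)
			close(dump);
		return -1;
	}
	if (ioctl(cpt, CPT_SET_VEID, veid) ||
	    ioctl(cpt, CPT_SET_DUMPFD, dump) ||
	    ioctl(cpt, CPT_SUSPEND, 0) ||
	    ioctl(cpt, CPT_DUMP, 0) ||
	    ioctl(cpt, CPT_RESUME, 0)) {	/* or CPT_KILL when migrating away */
		perror("cpt ioctl");
		close(dump);
		close(cpt);
		return -1;
	}
	close(dump);
	close(cpt);
	return 0;
}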
*/ @@ -320,6 +334,12 @@ extern char * d_path(struct dentry *, st static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { +#ifdef CONFIG_BEANCOUNTERS + preempt_disable(); + if (ub_dentry_on && ub_dget_testone(dentry)) + BUG(); + preempt_enable_no_resched(); +#endif BUG_ON(!atomic_read(&dentry->d_count)); atomic_inc(&dentry->d_count); } @@ -363,6 +383,8 @@ extern struct dentry *lookup_create(stru extern int sysctl_vfs_cache_pressure; +extern int check_area_access_ve(struct dentry *, struct vfsmount *); +extern int check_area_execute_ve(struct dentry *, struct vfsmount *); #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ diff -uprN linux-2.6.24/include/linux/device.h linux-2.6.24.ovz/include/linux/device.h --- linux-2.6.24/include/linux/device.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/device.h 2008-03-25 18:53:59.000000000 -0500 @@ -279,6 +279,8 @@ class_set_devdata (struct class_device * dev->class_data = data; } +extern struct class net_class; + extern int __must_check class_device_register(struct class_device *); extern void class_device_unregister(struct class_device *); diff -uprN linux-2.6.24/include/linux/devpts_fs.h linux-2.6.24.ovz/include/linux/devpts_fs.h --- linux-2.6.24/include/linux/devpts_fs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/devpts_fs.h 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,16 @@ int devpts_pty_new(struct tty_struct *tt struct tty_struct *devpts_get_tty(int number); /* get tty structure */ void devpts_pty_kill(int number); /* unlink */ +struct devpts_config { + int setuid; + int setgid; + uid_t uid; + gid_t gid; + umode_t mode; +}; + +extern struct devpts_config devpts_config; +extern struct file_system_type devpts_fs_type; #else /* Dummy stubs in the no-pty case */ diff -uprN linux-2.6.24/include/linux/dmi.h linux-2.6.24.ovz/include/linux/dmi.h --- linux-2.6.24/include/linux/dmi.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/dmi.h 2008-03-25 18:53:59.000000000 -0500 @@ -79,7 +79,6 @@ extern void dmi_scan_machine(void); extern int dmi_get_year(int field); extern int dmi_name_in_vendors(const char *str); extern int dmi_available; -extern char *dmi_get_slot(int slot); #else @@ -90,7 +89,6 @@ static inline const struct dmi_device * static inline int dmi_get_year(int year) { return 0; } static inline int dmi_name_in_vendors(const char *s) { return 0; } #define dmi_available 0 -static inline char *dmi_get_slot(int slot) { return NULL; } #endif diff -uprN linux-2.6.24/include/linux/drbd.h linux-2.6.24.ovz/include/linux/drbd.h --- linux-2.6.24/include/linux/drbd.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/drbd.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,284 @@ +/* + drbd.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2007, LINBIT Information Technologies GmbH. + Copyright (C) 2001-2007, Philipp Reisner . + Copyright (C) 2001-2007, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#ifndef DRBD_H +#define DRBD_H +#include + +#include + +#ifdef __KERNEL__ +#include +#else +#include +#include +#include +#endif + +enum io_error_handler { + PassOn, /* FIXME should the better be named "Ignore"? */ + CallIOEHelper, + Detach +}; + +enum fencing_policy { + DontCare, + Resource, + Stonith +}; + +enum disconnect_handler { + Reconnect, + DropNetConf, + FreezeIO +}; + +enum after_sb_handler { + Disconnect, + DiscardYoungerPri, + DiscardOlderPri, + DiscardZeroChg, + DiscardLeastChg, + DiscardLocal, + DiscardRemote, + Consensus, + DiscardSecondary, + CallHelper, + Violently +}; + +/* KEEP the order, do not delete or insert! + * Or change the API_VERSION, too. */ +enum ret_codes { + RetCodeBase=100, + NoError, // 101 ... + LAAlreadyInUse, + OAAlreadyInUse, + LDNameInvalid, + MDNameInvalid, + LDAlreadyInUse, + LDNoBlockDev, + MDNoBlockDev, + LDOpenFailed, + MDOpenFailed, + LDDeviceTooSmall, + MDDeviceTooSmall, + LDNoConfig, + LDMounted, + MDMounted, + LDMDInvalid, + LDDeviceTooLarge, + MDIOError, + MDInvalid, + CRAMAlgNotAvail, + CRAMAlgNotDigest, + KMallocFailed, + DiscardNotAllowed, + HaveDiskConfig, + HaveNetConfig, + UnknownMandatoryTag, + MinorNotKnown, + StateNotAllowed, + GotSignal, // EINTR + NoResizeDuringResync, + APrimaryNodeNeeded, + SyncAfterInvalid, + SyncAfterCycle, + PauseFlagAlreadySet, + PauseFlagAlreadyClear, + DiskLowerThanOutdated, + UnknownNetLinkPacket, + HaveNoDiskConfig, + ProtocolCRequired, + + /* insert new ones above this line */ + AfterLastRetCode +}; + +#define DRBD_PROT_A 1 +#define DRBD_PROT_B 2 +#define DRBD_PROT_C 3 + +typedef enum { + Unknown=0, + Primary=1, // role + Secondary=2, // role + role_mask=3, +} drbd_role_t; + +/* The order of these constants is important. + * The lower ones (=WFReportParams ==> There is a socket + * + * THINK + * Skipped should be < Connected, + * so writes on a Primary after Skipped sync are not mirrored either ? + */ +typedef enum { + StandAlone, + Disconnecting, // Temporal state on the way to StandAlone. + Unconnected, // >= Unconnected -> inc_net() succeeds + Timeout, /// These temporal states are all used on the way + BrokenPipe, /// from >= Connected to Unconnected. + NetworkFailure, /// The 'disconnect reason' states + ProtocolError, /// + TearDown, /// I do not allow to change beween them. + WFConnection, + WFReportParams, // we have a socket + Connected, // we have introduced each other + StartingSyncS, // starting full sync by IOCTL. + StartingSyncT, // stariing full sync by IOCTL. + WFBitMapS, + WFBitMapT, + WFSyncUUID, + SyncSource, // The distance between original state and pause + SyncTarget, // state must be the same for source and target. (+2) + PausedSyncS, // All SyncStates are tested with this comparison + PausedSyncT, // xx >= SyncSource && xx <= PausedSyncT + conn_mask=31 +} drbd_conns_t; + +typedef enum { + Diskless, + Attaching, /* In the process of reading the meta-data */ + Failed, /* Becomes Diskless as soon as we told it the peer */ + /* when >= Failed it is legal to access mdev->bc */ + Negotiating, /* Late attaching state, we need to talk to the peer... */ + Inconsistent, + Outdated, + DUnknown, /* Only used for the peer, never for myself */ + Consistent, /* Might be Outdated, might be UpToDate ... */ + UpToDate, /* Only this disk state allows applications' IO ! 
*/ + disk_mask=15 +} drbd_disks_t; + +typedef union { + struct { + unsigned role : 2 ; // 3/4 primary/secondary/unknown + unsigned peer : 2 ; // 3/4 primary/secondary/unknown + unsigned conn : 5 ; // 17/32 cstates + unsigned disk : 4 ; // 8/16 from Diskless to UpToDate + unsigned pdsk : 4 ; // 8/16 from Diskless to UpToDate + unsigned susp : 1 ; // 2/2 IO suspended no/yes + unsigned aftr_isp : 1 ; // isp .. imposed sync pause + unsigned peer_isp : 1 ; + unsigned user_isp : 1 ; + unsigned _pad : 11; // 0 unused + }; + unsigned int i; +} drbd_state_t; + +typedef enum { + SS_CW_NoNeed=4, + SS_CW_Success=3, + SS_NothingToDo=2, + SS_Success=1, + SS_UnknownError=0, // Used to sleep longer in _drbd_request_state + SS_TwoPrimaries=-1, + SS_NoUpToDateDisk=-2, + SS_BothInconsistent=-4, + SS_SyncingDiskless=-5, + SS_ConnectedOutdates=-6, + SS_PrimaryNOP=-7, + SS_ResyncRunning=-8, + SS_AlreadyStandAlone=-9, + SS_CW_FailedByPeer=-10, + SS_CanNotOutdateDL=-11, + SS_DeviceInUse=-12 +} set_st_err_t; + +/* from drbd_strings.c */ +extern const char* conns_to_name(drbd_conns_t); +extern const char* roles_to_name(drbd_role_t); +extern const char* disks_to_name(drbd_disks_t); +extern const char* set_st_err_name(set_st_err_t); + +#ifndef BDEVNAME_SIZE +# define BDEVNAME_SIZE 32 +#endif + +#define SHARED_SECRET_MAX 64 + +enum MetaDataFlags { + __MDF_Consistent, + __MDF_PrimaryInd, + __MDF_ConnectedInd, + __MDF_FullSync, + __MDF_WasUpToDate, + __MDF_PeerOutDated // or less/lower. +}; +#define MDF_Consistent (1<<__MDF_Consistent) +#define MDF_PrimaryInd (1<<__MDF_PrimaryInd) +#define MDF_ConnectedInd (1<<__MDF_ConnectedInd) +#define MDF_FullSync (1<<__MDF_FullSync) +#define MDF_WasUpToDate (1<<__MDF_WasUpToDate) +#define MDF_PeerOutDated (1<<__MDF_PeerOutDated) + +enum UuidIndex { + Current, + Bitmap, + History_start, + History_end, + UUID_SIZE, // In the packet we store the number of dirty bits here + UUID_FLAGS, // In the packet we store flags here. + EXT_UUID_SIZE // Everything. +}; + +#define UUID_JUST_CREATED ((__u64)4) + +#define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) + +/* these are of type "int" */ +#define DRBD_MD_INDEX_INTERNAL -1 +#define DRBD_MD_INDEX_FLEX_EXT -2 +#define DRBD_MD_INDEX_FLEX_INT -3 + +// Start of the new netlink/connector stuff + +#define DRBD_NL_CREATE_DEVICE 0x01 +#define DRBD_NL_SET_DEFAULTS 0x02 + +// The following line should be moved over to linux/connector.h +// when the time comes +#define CN_IDX_DRBD 0x4 +#define CN_VAL_DRBD 0x1 + +struct drbd_nl_cfg_req { + int packet_type; + int drbd_minor; + int flags; + unsigned short tag_list[]; +}; + +struct drbd_nl_cfg_reply { + int packet_type; + int minor; + int ret_code; // enum ret_code or set_st_err_t + unsigned short tag_list[]; // only used with get_* calls +}; + +#endif diff -uprN linux-2.6.24/include/linux/drbd_config.h linux-2.6.24.ovz/include/linux/drbd_config.h --- linux-2.6.24/include/linux/drbd_config.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/drbd_config.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,72 @@ +/* + drbd_config.h + DRBD's compile time configuration. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
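The drbd_state_t union above is why DRBD can treat the whole device state as one 32-bit word: the bitfields are written individually, while the .i member is what travels over the wire and gets compared atomically. Below is a stripped-down, self-contained copy of that pattern; the type name is invented, the numeric values are simply what the role/conns/disks enums above evaluate to, and the exact packed value of .i remains implementation-defined.

#include <stdio.h>

/* Trimmed re-declaration of the drbd_state_t layout, for illustration only
 * (needs a compiler that accepts anonymous structs, as the kernel does). */
typedef union {
	struct {
		unsigned role : 2;
		unsigned peer : 2;
		unsigned conn : 5;
		unsigned disk : 4;
		unsigned pdsk : 4;
		unsigned susp : 1;
		unsigned aftr_isp : 1;
		unsigned peer_isp : 1;
		unsigned user_isp : 1;
		unsigned _pad : 11;
	};
	unsigned int i;
} example_state_t;

int main(void)
{
	example_state_t s;

	s.i = 0;
	s.role = 1;	/* Primary in the enum above */
	s.peer = 2;	/* Secondary */
	s.conn = 10;	/* Connected */
	s.disk = 8;	/* UpToDate */

	printf("packed state word: 0x%08x\n", s.i);
	return 0;
}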
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef DRBD_CONFIG_H +#define DRBD_CONFIG_H + +extern const char * drbd_buildtag(void); + +#define REL_VERSION "8.0.0" +#define API_VERSION 86 +#define PRO_VERSION 86 + +// undef if you need the workaround in drbd_receiver +#define HAVE_UML_TO_VIRT 1 + +#define DBG_ALL_SYMBOLS // no static functs, improves quality of OOPS traces + +//#define DBG_SPINLOCKS // enables MUST_HOLD macro (assertions for spinlocks) +//#define DBG_ASSERTS // drbd_assert_breakpoint() function +#define DUMP_MD 2 // Dump even all cstate changes (I like it!) +//#define PARANOIA // some extra checks + +// Dump every hour the usage / not usage of zero copy IO +//#define SHOW_SENDPAGE_USAGE + +// Define this to enable dynamic tracing controlled by module parameters +// at run time. This enables ALL use of dynamic tracing including packet +// and bio dumping, etc +#define ENABLE_DYNAMIC_TRACE + +// You can disable the use of the sendpage() call (= zero copy +// IO ) If you have the feeling that this might be the cause +// for troubles. +// #define DRBD_DISABLE_SENDPAGE + +// Enable fault insertion code +#define DRBD_ENABLE_FAULTS + +// RedHat's 2.6.9 kernels have the gfp_t type. Mainline has this feature +// since 2.6.16. If you build for RedHat enable the line below. +#define KERNEL_HAS_GFP_T + +// kernel.org has atomic_add_return since 2.6.10. some vendor kernels +// have it backported, though. Others don't. +//#define NEED_BACKPORT_OF_ATOMIC_ADD + +// 2.6.something has deprecated kmem_cache_t +// some older still use it. +// some have it defined as struct kmem_cache_s, some as struct kmem_cache +//#define USE_KMEM_CACHE_S + +// 2.6.something has sock_create_kern (SE-linux security context stuff) +// some older distribution kernels don't. +//#define DEFINE_SOCK_CREATE_KERN + +#endif diff -uprN linux-2.6.24/include/linux/drbd_limits.h linux-2.6.24.ovz/include/linux/drbd_limits.h --- linux-2.6.24/include/linux/drbd_limits.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/drbd_limits.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,124 @@ +/* + drbd_limits.h + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. +*/ + +/* + * Our current limitations. + * Some of them are hard limits, + * some of them are arbitrary range limits, that make it easier to provide + * feedback about nonsense settings for certain configurable values. 
+ */ + +#ifndef DRBD_LIMITS_H +#define DRBD_LIMITS_H 1 + +#define DEBUG_RANGE_CHECK 0 + +#define DRBD_MINOR_COUNT_MIN 1 +#define DRBD_MINOR_COUNT_MAX 255 + +#define DRBD_DIALOG_REFRESH_MIN 0 +#define DRBD_DIALOG_REFRESH_MAX 600 + +/* valid port number */ +#define DRBD_PORT_MIN 1 +#define DRBD_PORT_MAX 0xffff + +/* startup { */ + /* if you want more than 3.4 days, disable */ +#define DRBD_WFC_TIMEOUT_MIN 0 +#define DRBD_WFC_TIMEOUT_MAX 300000 +#define DRBD_WFC_TIMEOUT_DEF 0 + +#define DRBD_DEGR_WFC_TIMEOUT_MIN 0 +#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 +#define DRBD_DEGR_WFC_TIMEOUT_DEF 60 + +/* }*/ + +/* net { */ + /* timeout, unit centi seconds + * more than one minute timeout is not usefull */ +#define DRBD_TIMEOUT_MIN 1 +#define DRBD_TIMEOUT_MAX 600 +#define DRBD_TIMEOUT_DEF 60 // 6 seconds + + /* active connection retries when WFConnection */ +#define DRBD_CONNECT_INT_MIN 1 +#define DRBD_CONNECT_INT_MAX 120 +#define DRBD_CONNECT_INT_DEF 10 //seconds + + /* keep-alive probes when idle */ +#define DRBD_PING_INT_MIN 1 +#define DRBD_PING_INT_MAX 120 +#define DRBD_PING_INT_DEF 10 + + /* timeout for the ping packets.*/ +#define DRBD_PING_TIMEO_MIN 1 +#define DRBD_PING_TIMEO_MAX 100 +#define DRBD_PING_TIMEO_DEF 5 + + /* max number of write requests between write barriers */ +#define DRBD_MAX_EPOCH_SIZE_MIN 1 +#define DRBD_MAX_EPOCH_SIZE_MAX 20000 +#define DRBD_MAX_EPOCH_SIZE_DEF 2048 + + /* I don't think that a tcp send buffer of more than 10M is usefull */ +#define DRBD_SNDBUF_SIZE_MIN 1 +#define DRBD_SNDBUF_SIZE_MAX 10000000 +#define DRBD_SNDBUF_SIZE_DEF (2*65535) + + /* @4k PageSize -> 128kB - 512MB */ +#define DRBD_MAX_BUFFERS_MIN 32 +#define DRBD_MAX_BUFFERS_MAX 131072 +#define DRBD_MAX_BUFFERS_DEF 2048 + + /* @4k PageSize -> 4kB - 512MB */ +#define DRBD_UNPLUG_WATERMARK_MIN 1 +#define DRBD_UNPLUG_WATERMARK_MAX 131072 +#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) + + /* 0 is disabled. + * 200 should be more than enough even for very short timeouts */ +#define DRBD_KO_COUNT_MIN 0 +#define DRBD_KO_COUNT_MAX 200 +#define DRBD_KO_COUNT_DEF 0 +/* } */ + +/* syncer { */ + /* FIXME allow rate to be zero? */ +#define DRBD_RATE_MIN 1 +#define DRBD_RATE_MAX 700000 +#define DRBD_RATE_DEF 250 // kb/second + + /* less than 7 would hit performance unneccessarily. + * 3833 is the largest prime that still does fit + * into 64 sectors of activity log */ +#define DRBD_AL_EXTENTS_MIN 7 +#define DRBD_AL_EXTENTS_MAX 3833 +#define DRBD_AL_EXTENTS_DEF 127 + +#define DRBD_AFTER_MIN -1 +#define DRBD_AFTER_MAX 255 +#define DRBD_AFTER_DEF -1 + +/* } */ + +/* drbdsetup XY resize -d Z + * you are free to reduce the device size to nothing, if you want to. + * but more than 3998G are currently not possible */ +/* DRBD_MAX_SECTORS */ +#define DRBD_DISK_SIZE_SECT_MIN 0 +#define DRBD_DISK_SIZE_SECT_MAX ((128LLU*1024*2 - 72)*512LLU*8*8) +#define DRBD_DISK_SIZE_SECT_DEF 0 // = disabled = no user size... + +#define DRBD_ON_IO_ERROR_DEF PassOn +#define DRBD_FENCING_DEF DontCare +#define DRBD_AFTER_SB_0P_DEF Disconnect +#define DRBD_AFTER_SB_1P_DEF Disconnect +#define DRBD_AFTER_SB_2P_DEF Disconnect +#define DRBD_RR_CONFLICT_DEF Disconnect + +#undef RANGE +#endif diff -uprN linux-2.6.24/include/linux/drbd_nl.h linux-2.6.24.ovz/include/linux/drbd_nl.h --- linux-2.6.24/include/linux/drbd_nl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/drbd_nl.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,97 @@ +/* + PAKET( name, + TYPE ( pn, pr, member ) + ... 
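The PACKET/INTEGER/INT64/BIT/STRING entries that follow are not ordinary declarations: drbd_nl.h is re-included by drbd_tag_magic.h (further below) with those macros redefined each time, so a single list yields the packet enums, the per-packet size table and the tag descriptions, and they can never drift apart. This is the classic X-macro technique; here it is reduced to a self-contained toy (all names invented, and DRBD re-includes the header instead of using a list macro):

#include <stdio.h>

/* One authoritative field list ... */
#define FIELD_LIST \
	FIELD(timeout,  int) \
	FIELD(max_size, long)

/* ... expanded once into an enum of tags ... */
enum example_tags {
#define FIELD(name, type) TAG_ ## name,
	FIELD_LIST
#undef FIELD
	TAG_count
};

/* ... and once more into a name table that stays in sync automatically. */
static const char *example_tag_names[] = {
#define FIELD(name, type) #name,
	FIELD_LIST
#undef FIELD
};

int main(void)
{
	int i;

	for (i = 0; i < TAG_count; i++)
		printf("%d: %s\n", i, example_tag_names[i]);
	return 0;
}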
+ ) + + You may never reissue one of the pn arguments +*/ + +#if !defined(PACKET) || !defined(STRING) || !defined(INTEGER) || !defined(BIT) || !defined(INT64) +#error "The macros PACKET, STRING, INTEGER, INT64 and BIT needs to be defined" +#endif + +PACKET(primary, 1, + BIT( 1, T_MAY_IGNORE, overwrite_peer) +) + +PACKET(secondary, 2, ) + +PACKET(disk_conf, 3, + INT64( 2, T_MAY_IGNORE, disk_size) + STRING( 3, T_MANDATORY, backing_dev, 32) + STRING( 4, T_MANDATORY, meta_dev, 32) + INTEGER( 5, T_MANDATORY, meta_dev_idx) + INTEGER( 6, T_MAY_IGNORE, on_io_error) + INTEGER( 7, T_MAY_IGNORE, fencing) + BIT( 37, T_MAY_IGNORE, use_bmbv) +) + +PACKET(detach, 4,) + +PACKET(net_conf, 5, + STRING( 8, T_MANDATORY, my_addr, 128) + STRING( 9, T_MANDATORY, peer_addr, 128) + STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) + STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) + INTEGER( 14, T_MAY_IGNORE, timeout) + INTEGER( 15, T_MANDATORY, wire_protocol) + INTEGER( 16, T_MAY_IGNORE, try_connect_int) + INTEGER( 17, T_MAY_IGNORE, ping_int) + INTEGER( 18, T_MAY_IGNORE, max_epoch_size) + INTEGER( 19, T_MAY_IGNORE, max_buffers) + INTEGER( 20, T_MAY_IGNORE, unplug_watermark) + INTEGER( 21, T_MAY_IGNORE, sndbuf_size) + INTEGER( 22, T_MAY_IGNORE, ko_count) + INTEGER( 24, T_MAY_IGNORE, after_sb_0p) + INTEGER( 25, T_MAY_IGNORE, after_sb_1p) + INTEGER( 26, T_MAY_IGNORE, after_sb_2p) + INTEGER( 39, T_MAY_IGNORE, rr_conflict) + INTEGER( 40, T_MAY_IGNORE, ping_timeo) + BIT( 27, T_MAY_IGNORE, want_lose) + BIT( 28, T_MAY_IGNORE, two_primaries) +) + +PACKET(disconnect, 6, ) + +PACKET(resize, 7, + INT64( 29, T_MAY_IGNORE, resize_size) +) + +PACKET(syncer_conf, 8, + INTEGER( 30, T_MAY_IGNORE, rate) + INTEGER( 31, T_MAY_IGNORE, after) + INTEGER( 32, T_MAY_IGNORE, al_extents) +) + +PACKET(invalidate, 9, ) +PACKET(invalidate_peer, 10, ) +PACKET(pause_sync, 11, ) +PACKET(resume_sync, 12, ) +PACKET(suspend_io, 13, ) +PACKET(resume_io, 14, ) +PACKET(outdate, 15, ) +PACKET(get_config, 16, ) +PACKET(get_state, 17, + INTEGER( 33, T_MAY_IGNORE, state_i) +) + +PACKET(get_uuids, 18, + STRING( 34, T_MAY_IGNORE, uuids, (UUID_SIZE*sizeof(__u64))) + INTEGER( 35, T_MAY_IGNORE, uuids_flags) +) + +PACKET(get_timeout_flag, 19, + BIT( 36, T_MAY_IGNORE, use_degraded) +) + +PACKET(call_helper, 20, + STRING( 38, T_MAY_IGNORE, helper, 32) +) + +#undef PACKET +#undef INTEGER +#undef INT64 +#undef BIT +#undef STRING + diff -uprN linux-2.6.24/include/linux/drbd_tag_magic.h linux-2.6.24.ovz/include/linux/drbd_tag_magic.h --- linux-2.6.24/include/linux/drbd_tag_magic.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/drbd_tag_magic.h 2008-03-25 18:54:00.000000000 -0500 @@ -0,0 +1,77 @@ +#ifndef DRBD_TAG_MAGIC_H +#define DRBD_TAG_MAGIC_H + +#define TT_END 0 +#define TT_REMOVED 0xE000 + +// declare packet_type enums +enum packet_types { +#define PACKET(name, number, fields) P_ ## name = number, +#define INTEGER(pn,pr,member) +#define INT64(pn,pr,member) +#define BIT(pn,pr,member) +#define STRING(pn,pr,member,len) +#include "drbd_nl.h" + P_nl_after_last_packet, +}; + +// These struct are used to deduce the size of the tag lists: +#define PACKET(name, number ,fields) struct name ## _tag_len_struct { fields }; +#define INTEGER(pn,pr,member) int member; int tag_and_len ## member; +#define INT64(pn,pr,member) __u64 member; int tag_and_len ## member; +#define BIT(pn,pr,member) unsigned char member : 1; int tag_and_len ## member; +#define STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; \ + int 
tag_and_len ## member; +#include "linux/drbd_nl.h" + +// declare tag-list-sizes +const int tag_list_sizes[] = { +#define PACKET(name,number,fields) 2 fields , +#define INTEGER(pn,pr,member) +4+4 +#define INT64(pn,pr,member) +4+8 +#define BIT(pn,pr,member) +4+1 +#define STRING(pn,pr,member,len) +4+len +#include "drbd_nl.h" +}; + +/* The two highest bits are used for the tag type */ +#define TT_MASK 0xC000 +#define TT_INTEGER 0x0000 +#define TT_INT64 0x4000 +#define TT_BIT 0x8000 +#define TT_STRING 0xC000 +/* The next bit indicates if processing of the tag is mandatory */ +#define T_MANDATORY 0x2000 +#define T_MAY_IGNORE 0x0000 +#define TN_MASK 0x1fff +/* The remaining 13 bits are used to enumerate the tags */ + +#define tag_type(T) ((T) & TT_MASK) +#define tag_number(T) ((T) & TN_MASK) + +// declare tag enums +#define PACKET(name, number, fields) fields +enum drbd_tags { +#define INTEGER(pn,pr,member) T_ ## member = pn | TT_INTEGER | pr , +#define INT64(pn,pr,member) T_ ## member = pn | TT_INT64 | pr , +#define BIT(pn,pr,member) T_ ## member = pn | TT_BIT | pr , +#define STRING(pn,pr,member,len) T_ ## member = pn | TT_STRING | pr , +#include "drbd_nl.h" +}; + +struct tag { + const char* name; + int type_n_flags; +}; + +// declare tag names +#define PACKET(name, number, fields) fields +const struct tag tag_descriptions[] = { +#define INTEGER(pn,pr,member) [ pn ] = { #member, TT_INTEGER | pr }, +#define INT64(pn,pr,member) [ pn ] = { #member, TT_INT64 | pr }, +#define BIT(pn,pr,member) [ pn ] = { #member, TT_BIT | pr }, +#define STRING(pn,pr,member,len) [ pn ] = { #member, TT_STRING | pr }, +#include "drbd_nl.h" +}; + +#endif diff -uprN linux-2.6.24/include/linux/elevator.h linux-2.6.24.ovz/include/linux/elevator.h --- linux-2.6.24/include/linux/elevator.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/elevator.h 2008-03-25 18:53:59.000000000 -0500 @@ -56,6 +56,11 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; void (*trim)(struct io_context *); + /* In the original cfq design the task holds a cfqq refcount and puts it + * on exit via the io context. Now async cfqqs are held by the UB, + * so we need some way to put these queues. Use this function. + */ + void (*put_queue)(struct cfq_queue *); }; #define ELV_NAME_MAX (16) diff -uprN linux-2.6.24/include/linux/elfcore.h linux-2.6.24.ovz/include/linux/elfcore.h --- linux-2.6.24/include/linux/elfcore.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/elfcore.h 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,8 @@ #include #include +extern int sysctl_at_vsyscall; + struct elf_siginfo { int si_signo; /* signal number */ diff -uprN linux-2.6.24/include/linux/eventpoll.h linux-2.6.24.ovz/include/linux/eventpoll.h --- linux-2.6.24/include/linux/eventpoll.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/eventpoll.h 2008-03-25 18:53:59.000000000 -0500 @@ -60,6 +60,88 @@ static inline void eventpoll_init_file(s spin_lock_init(&file->f_ep_lock); } +struct epoll_filefd { + struct file *file; + int fd; +}; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and represents the main data structure for the eventpoll + * interface. + */ +struct eventpoll { + /* Protects access to this structure */ + spinlock_t lock; + + /* + * This mutex is used to ensure that files are not removed + * while epoll is using them.
This is held during the event + * collection loop, the file cleanup path, the epoll file exit + * code and the ctl operations. + */ + struct mutex mtx; + + /* Wait queue used by sys_epoll_wait() */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* List of ready file descriptors */ + struct list_head rdllist; + + /* RB tree root used to store monitored fd structs */ + struct rb_root rbr; + + /* + * This is a singly linked list that chains all the "struct epitem" that + * happened while transferring ready events to userspace w/out + * holding ->lock. + */ + struct epitem *ovflist; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the "rbr" RB tree. + */ +struct epitem { + /* RB tree node used to link this structure to the eventpoll RB tree */ + struct rb_node rbn; + + /* List header used to link this structure to the eventpoll ready list */ + struct list_head rdllink; + + /* + * Works together with "struct eventpoll"->ovflist in keeping the + * singly linked chain of items. + */ + struct epitem *next; + + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + + /* Number of active wait queues attached to poll operations */ + int nwait; + + /* List containing poll wait queues */ + struct list_head pwqlist; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* List header used to link this item to the "struct file" items list */ + struct list_head fllink; + + /* The structure that describes the interested events and the source fd */ + struct epoll_event event; +}; + +extern struct semaphore epsem; +struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); @@ -92,6 +174,8 @@ static inline void eventpoll_release(str eventpoll_release_file(file); } +extern struct mutex epmutex; + #else static inline void eventpoll_init_file(struct file *file) {} diff -uprN linux-2.6.24/include/linux/fairsched.h linux-2.6.24.ovz/include/linux/fairsched.h --- linux-2.6.24/include/linux/fairsched.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/fairsched.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,86 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2008 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
+ * + */ + +#ifndef __LINUX_FAIRSCHED_H__ +#define __LINUX_FAIRSCHED_H__ + +#define FAIRSCHED_SET_RATE 0 +#define FAIRSCHED_DROP_RATE 1 +#define FAIRSCHED_GET_RATE 2 + +#ifdef __KERNEL__ + +/* refcnt change protected with tasklist write lock */ +struct fairsched_node { + struct task_group *tg; + int refcnt; + unsigned id; + struct list_head nodelist; + + unsigned weight; + unsigned char rate_limited; + unsigned rate; +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif +}; + +#ifdef CONFIG_VZ_FAIRSCHED + +#define FAIRSCHED_INIT_NODE_ID INT_MAX + +extern struct fairsched_node fairsched_init_node; + +void fairsched_init_early(void); +void fairsched_init_late(void); + +static inline int task_fairsched_node_id(struct task_struct *p) +{ + return p->fsched_node->id; +} + +/* must called with tasklist write locked */ +static inline void get_task_fairsched_node(struct task_struct *p) +{ + p->fsched_node->refcnt++; +} +static inline void put_task_fairsched_node(struct task_struct *p) +{ + p->fsched_node->refcnt--; +} + +#define INIT_VZ_FAIRSCHED .fsched_node = &fairsched_init_node, + +#define FSCHWEIGHT_MAX ((1 << 16) - 1) +#define FSCHRATE_SHIFT 10 +#define FSCH_TIMESLICE 16 + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid); +asmlinkage int sys_fairsched_rmnod(unsigned int id); +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid); +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus); +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight); +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate); + +#else /* CONFIG_VZ_FAIRSCHED */ + +static inline void fairsched_init_early(void) { } +static inline void fairsched_init_late(void) { } +static inline int task_fairsched_node_id(struct task_struct *p) { return 0; } +static inline void get_task_fairsched_node(struct task_struct *p) { } +static inline void put_task_fairsched_node(struct task_struct *p) { } + +#define INIT_VZ_FAIRSCHED + +#endif /* CONFIG_VZ_FAIRSCHED */ +#endif /* __KERNEL__ */ + +#endif /* __LINUX_FAIRSCHED_H__ */ diff -uprN linux-2.6.24/include/linux/faudit.h linux-2.6.24.ovz/include/linux/faudit.h --- linux-2.6.24/include/linux/faudit.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/faudit.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,45 @@ +/* + * include/linux/faudit.h + * + * Copyright (C) 2005 SWSoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef __FAUDIT_H_ +#define __FAUDIT_H_ + +#include + +struct vfsmount; +struct dentry; +struct super_block; +struct kstatfs; +struct kstat; +struct pt_regs; + +struct faudit_regs_arg { + int err; + struct pt_regs *regs; +}; + +struct faudit_stat_arg { + int err; + struct vfsmount *mnt; + struct dentry *dentry; + struct kstat *stat; +}; + +struct faudit_statfs_arg { + int err; + struct super_block *sb; + struct kstatfs *stat; +}; + +#define VIRTINFO_FAUDIT (0) +#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) +#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) + +#endif diff -uprN linux-2.6.24/include/linux/fs.h linux-2.6.24.ovz/include/linux/fs.h --- linux-2.6.24/include/linux/fs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/fs.h 2008-03-25 18:53:59.000000000 -0500 @@ -49,6 +49,7 @@ struct inodes_stat_t { extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; +extern int odirect_enable; #ifdef CONFIG_DNOTIFY extern int dir_notify_enable; @@ -68,6 +69,7 @@ extern int dir_notify_enable; #define FMODE_LSEEK 4 #define FMODE_PREAD 8 #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ +#define FMODE_QUOTACTL 4 /* File is being opened for execution. Primary users of this flag are distributed filesystems that can use it to achieve correct ETXTBUSY @@ -93,6 +95,8 @@ extern int dir_notify_enable; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ +#define FS_MANGLE_PROC 128 /* hide some /proc/mounts info inside VE */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() * during rename() internally. @@ -364,6 +368,9 @@ struct iattr { * Includes for diskquotas. 
*/ #include +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) +#include +#endif /** * enum positive_aop_returns - aop return codes with specific semantics @@ -623,6 +630,9 @@ struct inode { #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_ilink i_qlnk; +#endif struct list_head i_devices; union { struct pipe_inode_info *i_pipe; @@ -678,6 +688,8 @@ enum inode_i_mutex_lock_class extern void inode_double_lock(struct inode *inode1, struct inode *inode2); extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); +extern struct kmem_cache *inode_cachep; + /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic @@ -794,6 +806,7 @@ struct file { struct fown_struct f_owner; unsigned int f_uid, f_gid; struct file_ra_state f_ra; + struct user_beancounter *f_ub; u64 f_version; #ifdef CONFIG_SECURITY @@ -808,7 +821,9 @@ struct file { spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; + struct ve_struct *owner_env; }; + extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); @@ -874,6 +889,9 @@ struct file_lock { struct file *fl_file; unsigned char fl_flags; unsigned char fl_type; +#ifdef CONFIG_BEANCOUNTERS + unsigned char fl_charged; +#endif loff_t fl_start; loff_t fl_end; @@ -1188,6 +1206,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); + struct file * (*get_host)(struct file *); }; struct inode_operations { @@ -1261,6 +1280,7 @@ struct super_operations { #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + struct inode *(*get_quota_root)(struct super_block *); #endif }; @@ -1420,8 +1440,14 @@ struct file_system_type { struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; struct lock_class_key i_alloc_sem_key; + + struct file_system_type *proto; + struct ve_struct *owner_env; }; +void get_filesystem(struct file_system_type *fs); +void put_filesystem(struct file_system_type *fs); + extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int), @@ -1462,9 +1488,14 @@ extern int register_filesystem(struct fi extern int unregister_filesystem(struct file_system_type *); extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); #define kern_mount(type) kern_mount_data(type, NULL) +extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *, + struct file_system_type **, struct vfsmount **); +extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *); +extern void umount_ve_fs_type(struct file_system_type *local_fs_type); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern void umount_tree(struct vfsmount *, int, struct list_head *); +#define kern_umount mntput extern void release_mounts(struct list_head *); extern long do_mount(char *, char *, char *, unsigned long, void *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); @@ 
-1474,6 +1505,7 @@ extern struct vfsmount *collect_mounts(s extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int faudit_statfs(struct super_block *, struct kstatfs *); /* /sys/fs */ extern struct kset fs_subsys; @@ -1613,7 +1645,7 @@ extern void chrdev_show(struct seq_file #define BLKDEV_MAJOR_HASH_SIZE 255 extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); -extern struct block_device *lookup_bdev(const char *); +extern struct block_device *lookup_bdev(const char *, int mode); extern struct block_device *open_bdev_excl(const char *, int, void *); extern void close_bdev_excl(struct block_device *); extern void blkdev_show(struct seq_file *,off_t); @@ -1648,7 +1680,8 @@ extern int check_disk_change(struct bloc extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes_check(struct super_block *, int check); +#define invalidate_inodes(sb) invalidate_inodes_check(sb, 0) unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end, bool be_atomic); @@ -2091,10 +2124,20 @@ static inline void free_secdata(void *se { } #endif /* CONFIG_SECURITY */ +static inline void *file_private(struct file *file) +{ + struct file *host = file; + + while (host->f_op->get_host) { + host = host->f_op->get_host(host); + BUG_ON(host->f_mapping != file->f_mapping); + } + return host->private_data; +} + struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); - #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff -uprN linux-2.6.24/include/linux/futex.h linux-2.6.24.ovz/include/linux/futex.h --- linux-2.6.24/include/linux/futex.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/futex.h 2008-03-25 18:53:59.000000000 -0500 @@ -110,7 +110,7 @@ struct robust_list_head { #ifdef __KERNEL__ long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, u32 __user *uaddr2, u32 val2, u32 val3); - +long futex_wait_restart(struct restart_block *restart); extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); diff -uprN linux-2.6.24/include/linux/genhd.h linux-2.6.24.ovz/include/linux/genhd.h --- linux-2.6.24/include/linux/genhd.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/genhd.h 2008-03-25 18:53:59.000000000 -0500 @@ -435,6 +435,7 @@ static inline struct block_device *bdget return bdget(MKDEV(disk->major, disk->first_minor) + index); } +extern struct kset block_subsys; #endif #else /* CONFIG_BLOCK */ diff -uprN linux-2.6.24/include/linux/gfp.h linux-2.6.24.ovz/include/linux/gfp.h --- linux-2.6.24/include/linux/gfp.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/gfp.h 2008-03-25 18:53:59.000000000 -0500 @@ -50,20 +50,25 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ +#define __GFP_UBC ((__force gfp_t)0x200000u)/* charge kmem in buddy and slab */ +#define __GFP_SOFT_UBC ((__force gfp_t)0x400000u)/* use soft charging */ -#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room 
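The new __GFP_UBC and __GFP_SOFT_UBC bits above let an individual allocation opt in to kernel-memory accounting against the current beancounter (hard or soft charging respectively); the GFP_KERNEL_UBC, GFP_ATOMIC_UBC and GFP_USER_UBC composites added in the same hunk are simply the usual masks with __GFP_UBC folded in. A hedged kernel-side sketch follows; the surrounding structure and function are invented purely to show the flag in use.

/* Illustration only: an allocation charged to the caller's user beancounter.
 * "struct my_ctx" and my_ctx_alloc() are made up for this example. */
#include <linux/gfp.h>
#include <linux/slab.h>

struct my_ctx {
	int id;
};

static struct my_ctx *my_ctx_alloc(void)
{
	/* GFP_KERNEL | __GFP_UBC == GFP_KERNEL_UBC: may sleep, charged to UB;
	 * use __GFP_SOFT_UBC instead for soft (best-effort) charging. */
	return kmalloc(sizeof(struct my_ctx), GFP_KERNEL | __GFP_UBC);
}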
for __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ #define GFP_ATOMIC (__GFP_HIGH) +#define GFP_ATOMIC_UBC (__GFP_HIGH | __GFP_UBC) #define GFP_NOIO (__GFP_WAIT) #define GFP_NOFS (__GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) +#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ diff -uprN linux-2.6.24/include/linux/grinternal.h linux-2.6.24.ovz/include/linux/grinternal.h --- linux-2.6.24/include/linux/grinternal.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/grinternal.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,91 @@ +#ifndef __GRINTERNAL_H +#define __GRINTERNAL_H + +#ifdef CONFIG_GRKERNSEC + +#include + +extern char *gr_to_filename(const struct dentry *dentry, + const struct vfsmount *mnt); +extern char *gr_to_filename2(const struct dentry *dentry, + const struct vfsmount *mnt); +extern char *gr_to_filename3(const struct dentry *dentry, + const struct vfsmount *mnt); + +#ifdef CONFIG_VE +#include +#define grsec_enable_tpe (get_exec_env()->grsec.enable_tpe) +#define grsec_tpe_gid (get_exec_env()->grsec.tpe_gid) +#define grsec_enable_tpe_all (get_exec_env()->grsec.enable_tpe_all) +#define grsec_lock (get_exec_env()->grsec.lock) +#else +extern int grsec_enable_tpe; +extern int grsec_tpe_gid; +extern int grsec_enable_tpe_all; +extern int grsec_lock; +#endif + +extern spinlock_t grsec_alert_lock; +extern unsigned long grsec_alert_wtime; +extern unsigned long grsec_alert_fyet; + +extern spinlock_t grsec_audit_lock; + +#define gr_task_fullpath(tsk) ("") + +#define gr_parent_task_fullpath(tsk) ("") + +#define DEFAULTSECARGS(task) gr_task_fullpath(task), task->comm, \ + task->pid, task->uid, \ + task->euid, task->gid, task->egid, \ + gr_parent_task_fullpath(task), \ + task->parent->comm, task->parent->pid, \ + task->parent->uid, task->parent->euid, \ + task->parent->gid, task->parent->egid + +enum { + GR_DO_AUDIT, + GR_DONT_AUDIT, + GR_DONT_AUDIT_GOOD +}; + +enum { + GR_TTYSNIFF, + GR_RBAC, + GR_RBAC_STR, + GR_STR_RBAC, + GR_RBAC_MODE2, + GR_RBAC_MODE3, + GR_FILENAME, + GR_NOARGS, + GR_ONE_INT, + GR_ONE_INT_TWO_STR, + GR_ONE_STR, + GR_STR_INT, + GR_TWO_INT, + GR_THREE_INT, + GR_FIVE_INT_TWO_STR, + GR_TWO_STR, + GR_THREE_STR, + GR_FOUR_STR, + GR_STR_FILENAME, + GR_FILENAME_STR, + GR_FILENAME_TWO_INT, + GR_FILENAME_TWO_INT_STR, + GR_TEXTREL, + GR_PTRACE, + GR_RESOURCE, + GR_CAP, + GR_SIG, + GR_CRASH1, + GR_CRASH2, + GR_PSACCT +}; + +#define gr_log_fs_generic(audit, msg, dentry, mnt) gr_log_varargs(audit, msg, GR_FILENAME, dentry, mnt) +#define gr_log_str(audit, msg, str) gr_log_varargs(audit, msg, GR_ONE_STR, str) + +extern void gr_log_varargs(int audit, const char *msg, int argtypes, ...); + +#endif +#endif diff -uprN linux-2.6.24/include/linux/grmsg.h linux-2.6.24.ovz/include/linux/grmsg.h --- linux-2.6.24/include/linux/grmsg.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/grmsg.h 
2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,3 @@ +#define DEFAULTSECMSG "%.256s[%.16s:%d] uid/euid:%u/%u gid/egid:%u/%u, parent %.256s[%.16s:%d] uid/euid:%u/%u gid/egid:%u/%u" +#define GR_EXEC_TPE_MSG "denied untrusted exec of %.950s by " +#define GR_SYSCTL_MSG "denied modification of grsecurity sysctl value : %.32s by " diff -uprN linux-2.6.24/include/linux/grsecurity.h linux-2.6.24.ovz/include/linux/grsecurity.h --- linux-2.6.24/include/linux/grsecurity.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/grsecurity.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,13 @@ +#ifndef GR_SECURITY_H +#define GR_SECURITY_H +#include + +extern int gr_tpe_allow(const struct file *file); +extern void gr_copy_label(struct task_struct *tsk); +extern int gr_acl_handle_mmap(const struct file *file, + const unsigned long prot); +extern int gr_acl_handle_mprotect(const struct file *file, + const unsigned long prot); +extern void gr_acl_handle_exit(void); + +#endif diff -uprN linux-2.6.24/include/linux/hardirq.h linux-2.6.24.ovz/include/linux/hardirq.h --- linux-2.6.24/include/linux/hardirq.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/hardirq.h 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,9 @@ #include #include +#include +#include + /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: @@ -113,6 +116,24 @@ static inline void account_system_vtime( } #endif +#define save_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_save_context(tsk); \ + ub_save_context(tsk); \ + } \ + } while (0) + +#define restore_context() do { \ + struct task_struct *tsk; \ + if (hardirq_count() == HARDIRQ_OFFSET) { \ + tsk = current; \ + ve_restore_context(tsk); \ + ub_restore_context(tsk); \ + } \ + } while (0) + /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -123,6 +144,7 @@ static inline void account_system_vtime( do { \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ + save_context(); \ trace_hardirq_enter(); \ } while (0) @@ -138,6 +160,7 @@ extern void irq_enter(void); do { \ trace_hardirq_exit(); \ account_system_vtime(current); \ + restore_context(); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff -uprN linux-2.6.24/include/linux/hrtimer.h linux-2.6.24.ovz/include/linux/hrtimer.h --- linux-2.6.24/include/linux/hrtimer.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/hrtimer.h 2008-03-25 18:53:59.000000000 -0500 @@ -300,10 +300,13 @@ hrtimer_forward(struct hrtimer *timer, k /* Precise sleep: */ extern long hrtimer_nanosleep(struct timespec *rqtp, - struct timespec *rmtp, + struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid); extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); +#ifdef CONFIG_COMPAT +long compat_nanosleep_restart(struct restart_block *restart); +#endif extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); diff -uprN linux-2.6.24/include/linux/if_bridge.h linux-2.6.24.ovz/include/linux/if_bridge.h --- linux-2.6.24/include/linux/if_bridge.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/if_bridge.h 2008-03-25 18:53:59.000000000 -0500 @@ -44,6 +44,7 @@ #define BRCTL_SET_PORT_PRIORITY 16 #define BRCTL_SET_PATH_COST 17 #define BRCTL_GET_FDB_ENTRIES 18 +#define BRCTL_SET_VIA_ORIG_DEV 19 #define 
BR_STATE_DISABLED 0 #define BR_STATE_LISTENING 1 @@ -72,6 +73,7 @@ struct __bridge_info __u32 tcn_timer_value; __u32 topology_change_timer_value; __u32 gc_timer_value; + __u8 via_phys_dev; }; struct __port_info @@ -104,9 +106,12 @@ struct __fdb_entry #include +#define BR_ALREADY_SEEN 1 + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb); +extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); extern int (*br_should_route_hook)(struct sk_buff *skb); #endif diff -uprN linux-2.6.24/include/linux/if_tun.h linux-2.6.24.ovz/include/linux/if_tun.h --- linux-2.6.24/include/linux/if_tun.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/if_tun.h 2008-03-25 18:53:59.000000000 -0500 @@ -18,10 +18,14 @@ #ifndef __IF_TUN_H #define __IF_TUN_H +#include +#include + /* Uncomment to enable debugging */ /* #define TUN_DEBUG 1 */ #ifdef __KERNEL__ +#include #ifdef TUN_DEBUG #define DBG if(tun->debug)printk @@ -35,6 +39,7 @@ struct tun_struct { struct list_head list; unsigned long flags; int attached; + void *bind_file; uid_t owner; gid_t group; @@ -92,4 +97,10 @@ struct tun_pi { }; #define TUN_PKT_STRIP 0x0001 +extern int tun_net_open(struct net_device *dev); +extern int tun_chr_open(struct inode *inode, struct file * file); +extern void tun_net_init(struct net_device *dev); +extern void tun_setup(struct net_device *dev); +extern struct list_head tun_dev_list; + #endif /* __IF_TUN_H */ diff -uprN linux-2.6.24/include/linux/if_vlan.h linux-2.6.24.ovz/include/linux/if_vlan.h --- linux-2.6.24/include/linux/if_vlan.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/if_vlan.h 2008-03-25 18:53:59.000000000 -0500 @@ -79,6 +79,9 @@ struct vlan_group { struct hlist_node hlist; /* linked list */ struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS]; struct rcu_head rcu; +#ifdef CONFIG_VE + struct ve_struct *owner; +#endif }; static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, int vlan_id) diff -uprN linux-2.6.24/include/linux/inetdevice.h linux-2.6.24.ovz/include/linux/inetdevice.h --- linux-2.6.24/include/linux/inetdevice.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/inetdevice.h 2008-03-25 18:53:59.000000000 -0500 @@ -18,6 +18,12 @@ struct ipv4_devconf }; extern struct ipv4_devconf ipv4_devconf; +extern struct ipv4_devconf ipv4_devconf_dflt; +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_ipv4_devconf (*(get_exec_env()->_ipv4_devconf)) +#else +#define ve_ipv4_devconf ipv4_devconf +#endif struct in_device { @@ -44,7 +50,7 @@ struct in_device }; #define IPV4_DEVCONF(cnf, attr) ((cnf).data[NET_IPV4_CONF_ ## attr - 1]) -#define IPV4_DEVCONF_ALL(attr) IPV4_DEVCONF(ipv4_devconf, attr) +#define IPV4_DEVCONF_ALL(attr) IPV4_DEVCONF(ve_ipv4_devconf, attr) static inline int ipv4_devconf_get(struct in_device *in_dev, int index) { @@ -136,6 +142,7 @@ extern __be32 inet_select_addr(const st extern __be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, int scope); extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); extern void inet_forward_change(void); +extern void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); static __inline__ int inet_ifa_match(__be32 addr, struct in_ifaddr *ifa) { @@ -204,6 +211,16 @@ static inline void in_dev_put(struct in_ 
#define __in_dev_put(idev) atomic_dec(&(idev)->refcnt) #define in_dev_hold(idev) atomic_inc(&(idev)->refcnt) +struct ve_struct; +#ifdef CONFIG_INET +extern int devinet_sysctl_init(struct ve_struct *); +extern void devinet_sysctl_fini(struct ve_struct *); +extern void devinet_sysctl_free(struct ve_struct *); +#else +static inline int devinet_sysctl_init(struct ve_struct *ve) { return 0; } +static inline void devinet_sysctl_fini(struct ve_struct *ve) { ; } +static inline void devinet_sysctl_free(struct ve_struct *ve) { ; } +#endif #endif /* __KERNEL__ */ static __inline__ __be32 inet_make_mask(int logmask) diff -uprN linux-2.6.24/include/linux/init_task.h linux-2.6.24.ovz/include/linux/init_task.h --- linux-2.6.24/include/linux/init_task.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/init_task.h 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -69,10 +70,17 @@ .rlim = INIT_RLIMITS, \ } +#ifdef CONFIG_VE +/* one for ve0, one for init_task */ +#define INIT_NSPROXY_COUNT ATOMIC_INIT(2) +#else +#define INIT_NSPROXY_COUNT ATOMIC_INIT(1) +#endif + extern struct nsproxy init_nsproxy; #define INIT_NSPROXY(nsproxy) { \ .pid_ns = &init_pid_ns, \ - .count = ATOMIC_INIT(1), \ + .count = INIT_NSPROXY_COUNT, \ .uts_ns = &init_uts_ns, \ .mnt_ns = NULL, \ INIT_NET_NS(net_ns) \ @@ -173,6 +181,7 @@ extern struct group_info init_groups; .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_VZ_FAIRSCHED \ } diff -uprN linux-2.6.24/include/linux/inotify.h linux-2.6.24.ovz/include/linux/inotify.h --- linux-2.6.24/include/linux/inotify.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/inotify.h 2008-03-25 18:53:59.000000000 -0500 @@ -67,6 +67,7 @@ struct inotify_event { #include #include +#include /* * struct inotify_watch - represents a watch request on a specific inode @@ -84,6 +85,8 @@ struct inotify_watch { struct list_head i_list; /* entry in inode's list */ atomic_t count; /* reference count */ struct inotify_handle *ih; /* associated inotify handle */ + struct dentry *dentry; + struct vfsmount *mnt; struct inode *inode; /* associated inode */ __s32 wd; /* watch descriptor */ __u32 mask; /* event mask for this watch */ @@ -120,6 +123,8 @@ extern __s32 inotify_find_update_watch(s u32); extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *, struct inode *, __u32); +extern __s32 inotify_add_watch_dget(struct inotify_handle *, struct inotify_watch *, + struct dentry *, struct vfsmount *, __u32); extern __s32 inotify_clone_watch(struct inotify_watch *, struct inotify_watch *); extern void inotify_evict_watch(struct inotify_watch *); extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *); @@ -129,6 +134,66 @@ extern void inotify_remove_watch_locked( extern void get_inotify_watch(struct inotify_watch *); extern void put_inotify_watch(struct inotify_watch *); +/* + * struct inotify_handle - represents an inotify instance + * + * This structure is protected by the mutex 'mutex'. 
+ */ +struct inotify_handle { + struct idr idr; /* idr mapping wd -> watch */ + struct mutex mutex; /* protects this bad boy */ + struct list_head watches; /* list of watches */ + atomic_t count; /* reference count */ + u32 last_wd; /* the last wd allocated */ + const struct inotify_operations *in_ops; /* inotify caller operations */ +}; + + +/* + * struct inotify_device - represents an inotify instance + * + * This structure is protected by the mutex 'mutex'. + */ +struct inotify_device { + wait_queue_head_t wq; /* wait queue for i/o */ + struct mutex ev_mutex; /* protects event queue */ + struct mutex up_mutex; /* synchronizes watch updates */ + struct list_head events; /* list of queued events */ + atomic_t count; /* reference count */ + struct user_struct *user; /* user who opened this dev */ + struct inotify_handle *ih; /* inotify handle */ + unsigned int queue_size; /* size of the queue (bytes) */ + unsigned int event_count; /* number of pending events */ + unsigned int max_events; /* maximum number of events */ +}; + +/* + * struct inotify_kernel_event - An inotify event, originating from a watch and + * queued for user-space. A list of these is attached to each instance of the + * device. In read(), this list is walked and all events that can fit in the + * buffer are returned. + * + * Protected by dev->ev_mutex of the device in which we are queued. + */ +struct inotify_kernel_event { + struct inotify_event event; /* the user-space event */ + struct list_head list; /* entry in inotify_device's list */ + char *name; /* filename, if any */ +}; + +/* + * struct inotify_user_watch - our version of an inotify_watch, we add + * a reference to the associated inotify_device. + */ +struct inotify_user_watch { + struct inotify_device *dev; /* associated device */ + struct inotify_watch wdata; /* inotify watch data */ +}; + +int inotify_create_watch(struct inotify_device *dev, struct dentry *d, + struct vfsmount *mnt, u32 mask); + + #else static inline void inotify_d_instantiate(struct dentry *dentry, diff -uprN linux-2.6.24/include/linux/ioprio.h linux-2.6.24.ovz/include/linux/ioprio.h --- linux-2.6.24/include/linux/ioprio.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/ioprio.h 2008-03-25 18:53:59.000000000 -0500 @@ -38,6 +38,7 @@ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, + IOPRIO_WHO_UBC = 1000, }; /* diff -uprN linux-2.6.24/include/linux/ipv6.h linux-2.6.24.ovz/include/linux/ipv6.h --- linux-2.6.24/include/linux/ipv6.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/ipv6.h 2008-03-25 18:53:59.000000000 -0500 @@ -457,12 +457,13 @@ static inline struct raw6_sock *raw6_sk( #define inet_v6_ipv6only(__sk) 0 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\ +#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif,__ve)\ (((__sk)->sk_hash == (__hash)) && \ ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ ((__sk)->sk_family == AF_INET6) && \ ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ + ve_accessible_strict((__sk)->owner_env, (__ve)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #endif /* __KERNEL__ */ diff -uprN linux-2.6.24/include/linux/kdev_t.h linux-2.6.24.ovz/include/linux/kdev_t.h --- linux-2.6.24/include/linux/kdev_t.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/kdev_t.h 2008-03-25 
18:53:59.000000000 -0500 @@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de return dev & 0x3ffff; } +#define UNNAMED_MAJOR_COUNT 16 + +#if UNNAMED_MAJOR_COUNT > 1 + +extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; + +static inline dev_t make_unnamed_dev(int idx) +{ + /* + * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the + * unnamed device index into major number. + */ + return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], + idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return MINOR(dev) | (i << 8); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + int i; + for (i = 0; i < UNNAMED_MAJOR_COUNT && + MAJOR(dev) != unnamed_dev_majors[i]; i++); + return i < UNNAMED_MAJOR_COUNT; +} + +#else /* UNNAMED_MAJOR_COUNT */ + +static inline dev_t make_unnamed_dev(int idx) +{ + return MKDEV(0, idx); +} + +static inline int unnamed_dev_idx(dev_t dev) +{ + return MINOR(dev); +} + +static inline int is_unnamed_dev(dev_t dev) +{ + return MAJOR(dev) == 0; +} + +#endif /* UNNAMED_MAJOR_COUNT */ + #else /* __KERNEL__ */ /* diff -uprN linux-2.6.24/include/linux/kernel.h linux-2.6.24.ovz/include/linux/kernel.h --- linux-2.6.24/include/linux/kernel.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/kernel.h 2008-03-25 18:53:59.000000000 -0500 @@ -182,6 +182,12 @@ asmlinkage int printk(const char * fmt, extern int log_buf_get_len(void); extern int log_buf_read(int idx); extern int log_buf_copy(char *dest, int idx, int len); + +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); +asmlinkage int ve_printk(int, const char * fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void prepare_printk(void); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -192,8 +198,22 @@ static inline int __cold printk(const ch static inline int log_buf_get_len(void) { return 0; } static inline int log_buf_read(int idx) { return 0; } static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } + +static inline int ve_printk(int d, const char *s, ...) + __attribute__ ((format (printf, 2, 3))); +static inline int ve_printk(int d, const char *s, ...) 
+{ + return 0; +} +static inline void prepare_printk(void) +{ +} #endif +#define VE0_LOG 1 +#define VE_LOG 2 +#define VE_LOG_BOTH (VE0_LOG | VE_LOG) + unsigned long int_sqrt(unsigned long); extern int printk_ratelimit(void); @@ -201,9 +221,14 @@ extern int __printk_ratelimit(int rateli extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +extern int console_silence_loglevel; + static inline void console_silent(void) { - console_loglevel = 0; + if (console_loglevel > console_silence_loglevel) { + printk(KERN_EMERG "console shuts up ...\n"); + console_loglevel = 0; + } } static inline void console_verbose(void) @@ -217,8 +242,10 @@ extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int decode_call_traces; extern int panic_on_unrecovered_nmi; extern int tainted; +extern int kernel_text_csum_broken; extern const char *print_tainted(void); extern void add_taint(unsigned); diff -uprN linux-2.6.24/include/linux/kobject.h linux-2.6.24.ovz/include/linux/kobject.h --- linux-2.6.24/include/linux/kobject.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/kobject.h 2008-03-25 18:53:59.000000000 -0500 @@ -55,6 +55,8 @@ enum kobject_action { KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, + KOBJ_START, + KOBJ_STOP, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_MAX @@ -202,6 +204,9 @@ extern struct kset kernel_subsys; /* The global /sys/hypervisor/ subsystem */ extern struct kset hypervisor_subsys; +extern struct kset class_obj_subsys; +extern struct kset class_subsys; + /* * Helpers for setting the kset of registered objects. * Often, a registered object belongs to a kset embedded in a diff -uprN linux-2.6.24/include/linux/lockd/lockd.h linux-2.6.24.ovz/include/linux/lockd/lockd.h --- linux-2.6.24/include/linux/lockd/lockd.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/lockd/lockd.h 2008-03-25 18:53:59.000000000 -0500 @@ -61,6 +61,7 @@ struct nlm_host { struct list_head h_granted; /* Locks in GRANTED state */ struct list_head h_reclaim; /* Locks in RECLAIM state */ struct nsm_handle * h_nsmhandle; /* NSM status handle */ + struct ve_struct * owner_env; /* VE owning the host */ }; struct nsm_handle { @@ -151,8 +152,11 @@ extern struct svc_procedure nlmsvc_proce #ifdef CONFIG_LOCKD_V4 extern struct svc_procedure nlmsvc_procedures4[]; #endif -extern int nlmsvc_grace_period; -extern unsigned long nlmsvc_timeout; + +#include +extern int _nlmsvc_grace_period; +extern unsigned long _nlmsvc_timeout; + extern int nsm_use_hostnames; /* diff -uprN linux-2.6.24/include/linux/major.h linux-2.6.24.ovz/include/linux/major.h --- linux-2.6.24/include/linux/major.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/major.h 2008-03-25 18:53:59.000000000 -0500 @@ -170,4 +170,7 @@ #define VIOTAPE_MAJOR 230 +#define UNNAMED_EXTRA_MAJOR 130 +#define UNNAMED_EXTRA_MAJOR_COUNT 120 + #endif diff -uprN linux-2.6.24/include/linux/mm.h linux-2.6.24.ovz/include/linux/mm.h --- linux-2.6.24/include/linux/mm.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/mm.h 2008-03-25 18:53:59.000000000 -0500 @@ -661,15 +661,7 @@ static inline int page_mapped(struct pag extern void show_free_areas(void); -#ifdef CONFIG_SHMEM -int shmem_lock(struct file *file, int lock, struct user_struct *user); -#else -static inline int shmem_lock(struct file *file, int lock, - struct user_struct *user) -{ - return 0; -} 
-#endif +#define shmem_nopage filemap_nopage struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); @@ -710,7 +702,9 @@ void free_pgd_range(struct mmu_gather ** void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, + unsigned long addr, size_t size); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); diff -uprN linux-2.6.24/include/linux/mm_types.h linux-2.6.24.ovz/include/linux/mm_types.h --- linux-2.6.24/include/linux/mm_types.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/mm_types.h 2008-03-25 18:53:59.000000000 -0500 @@ -88,6 +88,13 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_BEANCOUNTERS + union { + struct user_beancounter *page_ub; + struct page_beancounter *page_pb; + struct user_beancounter **slub_ubs; + } bc; +#endif }; /* @@ -212,6 +219,9 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ + unsigned int vps_dumpable:2; + unsigned int oom_killed:1; + /* coredumping support */ int core_waiters; struct completion *core_startup_done, core_done; @@ -219,6 +229,9 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *mm_ub; +#endif }; #endif /* _LINUX_MM_TYPES_H */ diff -uprN linux-2.6.24/include/linux/mman.h linux-2.6.24.ovz/include/linux/mman.h --- linux-2.6.24/include/linux/mman.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/mman.h 2008-03-25 18:53:59.000000000 -0500 @@ -61,6 +61,9 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | +#ifdef MAP_GROWSUP + _calc_vm_trans(flags, MAP_GROWSUP, VM_GROWSUP ) | +#endif _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); diff -uprN linux-2.6.24/include/linux/mnt_namespace.h linux-2.6.24.ovz/include/linux/mnt_namespace.h --- linux-2.6.24/include/linux/mnt_namespace.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/mnt_namespace.h 2008-03-25 18:53:59.000000000 -0500 @@ -16,6 +16,8 @@ struct mnt_namespace { extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); +extern struct rw_semaphore namespace_sem; + extern void __put_mnt_ns(struct mnt_namespace *ns); static inline void put_mnt_ns(struct mnt_namespace *ns) diff -uprN linux-2.6.24/include/linux/mount.h linux-2.6.24.ovz/include/linux/mount.h --- linux-2.6.24/include/linux/mount.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/mount.h 2008-03-25 18:53:59.000000000 -0500 @@ -61,6 +61,7 @@ struct vfsmount { atomic_t mnt_count; int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; + unsigned owner; }; static inline struct vfsmount *mntget(struct vfsmount *mnt) diff -uprN linux-2.6.24/include/linux/msg.h linux-2.6.24.ovz/include/linux/msg.h --- linux-2.6.24/include/linux/msg.h 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/msg.h 2008-03-25 18:53:59.000000000 -0500 @@ -97,6 +97,14 @@ extern long do_msgsnd(int msqid, long mt extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext, size_t msgsz, long msgtyp, int msgflg); +int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); +int sysvipc_setup_msg(key_t key, int msqid, int msgflg); +int sysv_msg_store(struct msg_msg *msg, + int (*store)(void * src, int len, int offset, void * data), + int len, void * data); +struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, + void * data), int len, void * data); + #endif /* __KERNEL__ */ #endif /* _LINUX_MSG_H */ diff -uprN linux-2.6.24/include/linux/namei.h linux-2.6.24.ovz/include/linux/namei.h --- linux-2.6.24/include/linux/namei.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/namei.h 2008-03-25 18:53:59.000000000 -0500 @@ -61,6 +61,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA #define LOOKUP_CREATE (0x0200) #define LOOKUP_ACCESS (0x0400) #define LOOKUP_CHDIR (0x0800) +#define LOOKUP_NOAREACHECK (0x1000) /* no area check on lookup */ +#define LOOKUP_STRICT (0x2000) /* no symlinks or other filesystems */ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *)); diff -uprN linux-2.6.24/include/linux/netdevice.h linux-2.6.24.ovz/include/linux/netdevice.h --- linux-2.6.24/include/linux/netdevice.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netdevice.h 2008-03-25 18:53:59.000000000 -0500 @@ -280,6 +280,11 @@ enum netdev_state_t __LINK_STATE_QDISC_RUNNING, }; +struct netdev_bc { + struct user_beancounter *exec_ub, *owner_ub; +}; + +#define netdev_bc(dev) (&(dev)->dev_bc) /* * This structure holds at boot time configured netdevice settings. They @@ -514,6 +519,10 @@ struct net_device #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) +/* device is venet device */ +#define NETIF_F_VENET (1 << (NETIF_F_GSO_SHIFT - 1)) +/* can be registered inside VE */ +#define NETIF_F_VIRTUAL (1 << (NETIF_F_GSO_SHIFT - 2)) /* List of features with software fallbacks. 
*/ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) @@ -716,6 +725,9 @@ struct net_device /* macvlan */ struct macvlan_port *macvlan_port; + struct ve_struct *owner_env; /* Owner VE of the interface */ + struct netdev_bc dev_bc; + /* class/net/name entry */ struct device dev; /* space for optional statistics and wireless sysfs groups */ @@ -730,6 +742,20 @@ struct net_device }; #define to_net_dev(d) container_of(d, struct net_device, dev) +#define NETDEV_HASHBITS 8 +#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +} + #define NETDEV_ALIGN 32 #define NETDEV_ALIGN_CONST (NETDEV_ALIGN - 1) @@ -1092,6 +1118,9 @@ extern int dev_ethtool(struct net *net, extern unsigned dev_get_flags(const struct net_device *); extern int dev_change_flags(struct net_device *, unsigned); extern int dev_change_name(struct net_device *, char *); +int __dev_change_net_namespace(struct net_device *, struct net *, const char *, + struct ve_struct *src_ve, struct ve_struct *dst_ve, + struct user_beancounter *exec_ub); extern int dev_change_net_namespace(struct net_device *, struct net *, const char *); extern int dev_set_mtu(struct net_device *, int); @@ -1453,6 +1482,18 @@ extern void linkwatch_run_queue(void); extern int netdev_compute_features(unsigned long all, unsigned long one); +#if defined(CONFIG_VE) && defined(CONFIG_NET) +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return !(dev->features & NETIF_F_VIRTUAL); +} +#else +static inline int ve_is_dev_movable(struct net_device *dev) +{ + return 0; +} +#endif + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; diff -uprN linux-2.6.24/include/linux/netfilter/x_tables.h linux-2.6.24.ovz/include/linux/netfilter/x_tables.h --- linux-2.6.24/include/linux/netfilter/x_tables.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netfilter/x_tables.h 2008-03-25 18:53:59.000000000 -0500 @@ -259,6 +259,7 @@ struct xt_table_info { /* Size per table */ unsigned int size; + unsigned int alloc_size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. 
Needed for module usage count */ @@ -293,6 +294,10 @@ extern int xt_register_table(struct xt_t struct xt_table_info *bootstrap, struct xt_table_info *newinfo); extern void *xt_unregister_table(struct xt_table *table); +extern struct xt_table *virt_xt_register_table(struct xt_table *table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo); +extern void *virt_xt_unregister_table(struct xt_table *table); extern struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff -uprN linux-2.6.24/include/linux/netfilter/xt_hashlimit.h linux-2.6.24.ovz/include/linux/netfilter/xt_hashlimit.h --- linux-2.6.24/include/linux/netfilter/xt_hashlimit.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netfilter/xt_hashlimit.h 2008-03-25 18:53:59.000000000 -0500 @@ -37,4 +37,10 @@ struct xt_hashlimit_info { struct xt_hashlimit_info *master; } u; }; + +struct ve_xt_hashlimit { + struct hlist_head hashlimit_htables; + struct proc_dir_entry *hashlimit_procdir4; + struct proc_dir_entry *hashlimit_procdir6; +}; #endif /*_XT_HASHLIMIT_H*/ diff -uprN linux-2.6.24/include/linux/netfilter.h linux-2.6.24.ovz/include/linux/netfilter.h --- linux-2.6.24/include/linux/netfilter.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netfilter.h 2008-03-25 18:53:59.000000000 -0500 @@ -126,7 +126,13 @@ extern struct ctl_table nf_net_netfilter extern struct ctl_table nf_net_ipv4_netfilter_sysctl_path[]; #endif /* CONFIG_SYSCTL */ +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_hooks \ + ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) +#else extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +#define ve_nf_hooks nf_hooks +#endif /* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will * disappear once iptables is replaced with pkttables. 
Please DO NOT use them @@ -204,7 +210,7 @@ static inline int nf_hook_thresh(int pf, if (!cond) return 1; #ifndef CONFIG_NETFILTER_DEBUG - if (list_empty(&nf_hooks[pf][hook])) + if (list_empty(&ve_nf_hooks[pf][hook])) return 1; #endif return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); diff -uprN linux-2.6.24/include/linux/netfilter_ipv4/ip_tables.h linux-2.6.24.ovz/include/linux/netfilter_ipv4/ip_tables.h --- linux-2.6.24/include/linux/netfilter_ipv4/ip_tables.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netfilter_ipv4/ip_tables.h 2008-03-25 18:53:59.000000000 -0500 @@ -292,7 +292,7 @@ ipt_get_target(struct ipt_entry *e) #include extern void ipt_init(void) __init; -extern int ipt_register_table(struct xt_table *table, +extern struct xt_table *ipt_register_table(struct xt_table *table, const struct ipt_replace *repl); extern void ipt_unregister_table(struct xt_table *table); diff -uprN linux-2.6.24/include/linux/netfilter_ipv4/ipt_recent.h linux-2.6.24.ovz/include/linux/netfilter_ipv4/ipt_recent.h --- linux-2.6.24/include/linux/netfilter_ipv4/ipt_recent.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netfilter_ipv4/ipt_recent.h 2008-03-25 18:53:59.000000000 -0500 @@ -24,4 +24,10 @@ struct ipt_recent_info { u_int8_t side; }; +struct ve_ipt_recent { + struct list_head tables; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_dir; +#endif +}; #endif /*_IPT_RECENT_H*/ diff -uprN linux-2.6.24/include/linux/netfilter_ipv6/ip6_tables.h linux-2.6.24.ovz/include/linux/netfilter_ipv6/ip6_tables.h --- linux-2.6.24/include/linux/netfilter_ipv6/ip6_tables.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/netfilter_ipv6/ip6_tables.h 2008-03-25 18:53:59.000000000 -0500 @@ -333,7 +333,7 @@ ip6t_get_target(struct ip6t_entry *e) #include extern void ip6t_init(void) __init; -extern int ip6t_register_table(struct xt_table *table, +extern struct xt_table *ip6t_register_table(struct xt_table *table, const struct ip6t_replace *repl); extern void ip6t_unregister_table(struct xt_table *table); extern unsigned int ip6t_do_table(struct sk_buff *skb, diff -uprN linux-2.6.24/include/linux/nfcalls.h linux-2.6.24.ovz/include/linux/nfcalls.h --- linux-2.6.24/include/linux/nfcalls.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/nfcalls.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,188 @@ +/* + * include/linux/nfcalls.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_NFCALLS_H +#define _LINUX_NFCALLS_H + +#include + +#ifdef CONFIG_MODULES +extern struct module no_module; + +#define DECL_KSYM_MODULE(name) \ + extern struct module *vz_mod_##name + +#define INIT_KSYM_MODULE(name) \ + struct module *vz_mod_##name = &no_module; \ + EXPORT_SYMBOL(vz_mod_##name) + +static inline void __vzksym_modresolve(struct module **modp, struct module *mod) +{ + /* + * we want to be sure, that pointer updates are visible first: + * 1. wmb() is here only for piece of sure + * (note, no rmb() in KSYMSAFECALL) + * 2. 
synchronize_sched() guarantees that updates are visible
+ *    on all cpus and allows us to remove rmb() in KSYMSAFECALL
+ */
+	wmb(); synchronize_sched();
+	*modp = mod;
+	/* just to be sure, our changes are visible as soon as possible */
+	wmb(); synchronize_sched();
+}
+
+static inline void __vzksym_modunresolve(struct module **modp)
+{
+	/*
+	 * try_module_get() in KSYMSAFECALL should fail at this moment since
+	 * THIS_MODULE is in unloading state (we should be called from fini),
+	 * no need to synchronize pointers/ve_module updates.
+	 */
+	*modp = &no_module;
+	/*
+	 * synchronize_sched() guarantees here that we see
+	 * updated module pointer before the module really gets away
+	 */
+	synchronize_sched();
+}
+
+static inline int __vzksym_module_get(struct module *mod)
+{
+	/*
+	 * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE
+	 * and smp_read_barrier_depends() here...
+	 */
+	smp_read_barrier_depends(); /* for module loading */
+	if (!try_module_get(mod))
+		return -EBUSY;
+
+	return 0;
+}
+
+static inline void __vzksym_module_put(struct module *mod)
+{
+	module_put(mod);
+}
+#else
+#define DECL_KSYM_MODULE(name)
+#define INIT_KSYM_MODULE(name)
+#define __vzksym_modresolve(modp, mod)
+#define __vzksym_modunresolve(modp)
+#define __vzksym_module_get(mod) 0
+#define __vzksym_module_put(mod)
+#endif
+
+#define __KSYMERRCALL(err, type, mod, name, args) \
+({ \
+	type ret = (type)err; \
+	if (!__vzksym_module_get(vz_mod_##mod)) { \
+		if (vz_##name) \
+			ret = ((*vz_##name)args); \
+		__vzksym_module_put(vz_mod_##mod); \
+	} \
+	ret; \
+})
+
+#define __KSYMSAFECALL_VOID(mod, name, args) \
+	do { \
+		if (!__vzksym_module_get(vz_mod_##mod)) { \
+			if (vz_##name) \
+				((*vz_##name)args); \
+			__vzksym_module_put(vz_mod_##mod); \
+		} \
+	} while (0)
+
+#define DECL_KSYM_CALL(type, name, args) \
+	extern type (*vz_##name) args
+#define INIT_KSYM_CALL(type, name, args) \
+	type (*vz_##name) args; \
+EXPORT_SYMBOL(vz_##name)
+
+#define KSYMERRCALL(err, mod, name, args) \
+	__KSYMERRCALL(err, int, mod, name, args)
+#define KSYMSAFECALL(type, mod, name, args) \
+	__KSYMERRCALL(0, type, mod, name, args)
+#define KSYMSAFECALL_VOID(mod, name, args) \
+	__KSYMSAFECALL_VOID(mod, name, args)
+#define KSYMREF(name) vz_##name
+
+/* should be called _after_ KSYMRESOLVE's */
+#define KSYMMODRESOLVE(name) \
+	__vzksym_modresolve(&vz_mod_##name, THIS_MODULE)
+#define KSYMMODUNRESOLVE(name) \
+	__vzksym_modunresolve(&vz_mod_##name)
+
+#define KSYMRESOLVE(name) \
+	vz_##name = &name
+#define KSYMUNRESOLVE(name) \
+	vz_##name = NULL
+
+#if defined(CONFIG_VE)
+DECL_KSYM_MODULE(ip_tables);
+DECL_KSYM_MODULE(ip6_tables);
+DECL_KSYM_MODULE(iptable_filter);
+DECL_KSYM_MODULE(ip6table_filter);
+DECL_KSYM_MODULE(iptable_mangle);
+DECL_KSYM_MODULE(ip6table_mangle);
+DECL_KSYM_MODULE(ip_conntrack);
+DECL_KSYM_MODULE(nf_conntrack);
+DECL_KSYM_MODULE(nf_conntrack_ipv4);
+DECL_KSYM_MODULE(nf_conntrack_ipv6);
+DECL_KSYM_MODULE(xt_conntrack);
+DECL_KSYM_MODULE(ip_nat);
+DECL_KSYM_MODULE(nf_nat);
+DECL_KSYM_MODULE(iptable_nat);
+
+struct sk_buff;
+
+DECL_KSYM_CALL(int, init_netfilter, (void));
+DECL_KSYM_CALL(int, init_iptables, (void));
+DECL_KSYM_CALL(int, init_ip6tables, (void));
+DECL_KSYM_CALL(int, init_iptable_filter, (void));
+DECL_KSYM_CALL(int, init_ip6table_filter, (void));
+DECL_KSYM_CALL(int, init_iptable_mangle, (void));
+DECL_KSYM_CALL(int, init_ip6table_mangle, (void));
+DECL_KSYM_CALL(int, init_iptable_conntrack, (void));
+DECL_KSYM_CALL(int, nf_conntrack_init_ve, (void));
+DECL_KSYM_CALL(int,
init_nf_ct_l3proto_ipv4, (void)); +DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void)); +DECL_KSYM_CALL(int, nf_nat_init, (void)); +DECL_KSYM_CALL(int, init_iptable_nat, (void)); +DECL_KSYM_CALL(int, init_nftable_nat, (void)); +DECL_KSYM_CALL(int, nf_nat_init, (void)); +DECL_KSYM_CALL(void, fini_iptable_nat, (void)); +DECL_KSYM_CALL(void, fini_nftable_nat, (void)); +DECL_KSYM_CALL(void, nf_nat_cleanup, (void)); +DECL_KSYM_CALL(void, fini_iptable_conntrack, (void)); +DECL_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void)); +DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void)); +DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void)); +DECL_KSYM_CALL(void, fini_iptable_filter, (void)); +DECL_KSYM_CALL(void, fini_ip6table_filter, (void)); +DECL_KSYM_CALL(void, fini_iptable_mangle, (void)); +DECL_KSYM_CALL(void, fini_ip6table_mangle, (void)); +DECL_KSYM_CALL(void, fini_iptables, (void)); +DECL_KSYM_CALL(void, fini_ip6tables, (void)); +DECL_KSYM_CALL(void, fini_netfilter, (void)); + +#include +#endif + +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) +DECL_KSYM_MODULE(vzethdev); +DECL_KSYM_CALL(int, veth_open, (struct net_device *dev)); +#endif + +#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) +DECL_KSYM_MODULE(vzmon); +DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); +#endif + +#endif /* _LINUX_NFCALLS_H */ diff -uprN linux-2.6.24/include/linux/nfs_fs_sb.h linux-2.6.24.ovz/include/linux/nfs_fs_sb.h --- linux-2.6.24/include/linux/nfs_fs_sb.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/nfs_fs_sb.h 2008-03-25 18:53:59.000000000 -0500 @@ -65,6 +65,7 @@ struct nfs_client { char cl_ipaddr[16]; unsigned char cl_id_uniquifier; #endif + struct ve_struct *owner_env; }; /* diff -uprN linux-2.6.24/include/linux/notifier.h linux-2.6.24.ovz/include/linux/notifier.h --- linux-2.6.24/include/linux/notifier.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/notifier.h 2008-03-25 18:53:59.000000000 -0500 @@ -149,8 +149,9 @@ extern int __srcu_notifier_call_chain(st #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ +#define NOTIFY_FAIL 0x0002 /* Reject */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ -#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) +#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. 
diff -uprN linux-2.6.24/include/linux/nsproxy.h linux-2.6.24.ovz/include/linux/nsproxy.h --- linux-2.6.24/include/linux/nsproxy.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/nsproxy.h 2008-03-25 18:53:59.000000000 -0500 @@ -66,6 +66,7 @@ int copy_namespaces(unsigned long flags, void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); +struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); @@ -76,9 +77,10 @@ static inline void put_nsproxy(struct ns } } -static inline void get_nsproxy(struct nsproxy *ns) +static inline struct nsproxy *get_nsproxy(struct nsproxy *ns) { atomic_inc(&ns->count); + return ns; } #ifdef CONFIG_CGROUP_NS diff -uprN linux-2.6.24/include/linux/page-flags.h linux-2.6.24.ovz/include/linux/page-flags.h --- linux-2.6.24/include/linux/page-flags.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/page-flags.h 2008-03-25 18:53:59.000000000 -0500 @@ -93,6 +93,8 @@ /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ #define PG_readahead PG_reclaim /* Reminder to do async read-ahead */ +#define PG_checkpointed 21 /* Page transferred */ + /* PG_owner_priv_1 users should have descriptive aliases */ #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ #define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */ @@ -260,6 +262,8 @@ static inline void __ClearPageTail(struc #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define ClearPageCheckpointed(page) clear_bit(PG_checkpointed, &(page)->flags) + struct page; /* forward declaration */ extern void cancel_dirty_page(struct page *page, unsigned int account_size); diff -uprN linux-2.6.24/include/linux/percpu.h linux-2.6.24.ovz/include/linux/percpu.h --- linux-2.6.24/include/linux/percpu.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/percpu.h 2008-03-25 18:53:59.000000000 -0500 @@ -49,6 +49,13 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +#define static_percpu_ptr(sptr, sptrs) ({ \ + int i; \ + for (i = 0; i < NR_CPUS; i++) \ + (sptr)->ptrs[i] = &(sptrs)[i]; \ + (void *)__percpu_disguise(sptr); \ + }) + extern void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu); extern void percpu_depopulate(void *__pdata, int cpu); extern int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, @@ -60,6 +67,7 @@ extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ #define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define static_percpu_ptr(sptr, sptrs) (&sptrs[0]) static inline void percpu_depopulate(void *__pdata, int cpu) { diff -uprN linux-2.6.24/include/linux/pid.h linux-2.6.24.ovz/include/linux/pid.h --- linux-2.6.24/include/linux/pid.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/pid.h 2008-03-25 18:53:59.000000000 -0500 @@ -59,6 +59,9 @@ struct pid atomic_t count; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; +#endif struct rcu_head rcu; int level; struct upid numbers[1]; @@ -119,9 +122,12 @@ extern struct pid *find_pid(int nr); extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); -extern struct pid *alloc_pid(struct 
pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid); extern void FASTCALL(free_pid(struct pid *pid)); +extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *); +extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); +pid_t pid_to_vpid(pid_t nr); /* * the helpers to get the pid's id seen from different namespaces diff -uprN linux-2.6.24/include/linux/pid_namespace.h linux-2.6.24.ovz/include/linux/pid_namespace.h --- linux-2.6.24/include/linux/pid_namespace.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/pid_namespace.h 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,14 @@ struct pidmap { #define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) +/* pid namespace flags */ + +/* if set newly created pid ns got PID_NS_HIDE_CHILD flag */ +#define PID_NS_HIDE_CHILD 0x00000001 + +/* if set newly created processes invisible from parent ns*/ +#define PID_NS_HIDDEN 0x00000002 + struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; @@ -22,6 +30,7 @@ struct pid_namespace { struct kmem_cache *pid_cachep; int level; struct pid_namespace *parent; + unsigned flags; #ifdef CONFIG_PROC_FS struct vfsmount *proc_mnt; #endif diff -uprN linux-2.6.24/include/linux/poll.h linux-2.6.24.ovz/include/linux/poll.h --- linux-2.6.24/include/linux/poll.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/poll.h 2008-03-25 18:53:59.000000000 -0500 @@ -117,6 +117,7 @@ void zero_fd_set(unsigned long nr, unsig extern int do_select(int n, fd_set_bits *fds, s64 *timeout); extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, s64 *timeout); +long do_restart_poll(struct restart_block *restart_block); #endif /* KERNEL */ diff -uprN linux-2.6.24/include/linux/proc_fs.h linux-2.6.24.ovz/include/linux/proc_fs.h --- linux-2.6.24/include/linux/proc_fs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/proc_fs.h 2008-03-25 18:53:59.000000000 -0500 @@ -5,6 +5,7 @@ #include #include #include +#include #include struct net; @@ -100,6 +101,8 @@ struct vmcore { extern struct proc_dir_entry proc_root; extern struct proc_dir_entry *proc_root_fs; +extern struct file_system_type proc_fs_type; + extern struct proc_dir_entry *proc_bus; extern struct proc_dir_entry *proc_root_driver; extern struct proc_dir_entry *proc_root_kcore; @@ -124,7 +127,17 @@ void de_put(struct proc_dir_entry *de); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent); +extern struct proc_dir_entry *create_proc_glob_entry(const char *name, + mode_t mode, + struct proc_dir_entry *parent); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); +extern void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent); + +#ifdef CONFIG_VE +#define proc_mnt(ve) (ve->proc_mnt) +#else +#define proc_mnt(ve) (proc_mnt) +#endif extern struct vfsmount *proc_mnt; struct pid_namespace; @@ -202,12 +215,22 @@ extern struct proc_dir_entry *proc_net_f const char *name, mode_t mode, const struct file_operations *fops); extern void proc_net_remove(struct net *net, const char *name); +static inline struct proc_dir_entry *proc_glob_fops_create(const char *name, + mode_t mode, const struct file_operations *fops) +{ + struct proc_dir_entry *res = create_proc_glob_entry(name, mode, NULL); + if (res) + res->proc_fops = fops; + return res; +} + 
 #else
 #define proc_root_driver NULL
 #define proc_bus NULL
 #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
+#define proc_glob_fops_create(name, mode, fops) ({ (void)(mode), NULL; })
 static inline void proc_net_remove(struct net *net, const char *name) {}
 static inline void proc_flush_task(struct task_struct *task)
@@ -216,6 +239,8 @@ static inline void proc_flush_task(struc
 static inline struct proc_dir_entry *create_proc_entry(const char *name,
 	mode_t mode, struct proc_dir_entry *parent) { return NULL; }
+static inline struct proc_dir_entry *create_proc_glob_entry(const char *name,
+	mode_t mode, struct proc_dir_entry *parent) { return NULL; }
 #define remove_proc_entry(name, parent) do {} while (0)
@@ -248,6 +273,48 @@ static inline void pid_ns_release_proc(s
 #endif /* CONFIG_PROC_FS */
+static inline struct proc_dir_entry *create_proc_entry_mod(const char *name,
+			mode_t mode,
+			struct proc_dir_entry *parent,
+			struct module *owner)
+{
+	struct proc_dir_entry *ent;
+
+	/*
+	 * lock_kernel() here protects against proc_lookup()
+	 * which can find this freshly created entry w/o owner being set.
+	 * this can lead to the module being put more times than it was got.
+	 */
+	lock_kernel();
+	ent = create_proc_entry(name, mode, parent);
+	if (ent)
+		ent->owner = owner;
+	unlock_kernel();
+
+	return ent;
+}
+
+static inline struct proc_dir_entry *create_proc_glob_entry_mod(const char *name,
+			mode_t mode,
+			struct proc_dir_entry *parent,
+			struct module *owner)
+{
+	struct proc_dir_entry *ent;
+
+	/*
+	 * lock_kernel() here protects against proc_lookup()
+	 * which can find this freshly created entry w/o owner being set.
+	 * this can lead to the module being put more times than it was got.
+	 */
+	lock_kernel();
+	ent = create_proc_glob_entry(name, mode, parent);
+	if (ent)
+		ent->owner = owner;
+	unlock_kernel();
+
+	return ent;
+}
+
 #if !defined(CONFIG_PROC_KCORE)
 static inline void kclist_add(struct kcore_list *new, void *addr, size_t size)
 {
@@ -294,4 +361,11 @@ struct proc_maps_private {
 #endif
 };
+#define LPDE(inode) (PROC_I((inode))->pde)
+#ifdef CONFIG_VE
+#define GPDE(inode) (*(struct proc_dir_entry **)(&(inode)->i_pipe))
+#endif
+
+int proc_match(int len, const char *name, struct proc_dir_entry *de);
+
 #endif /* _LINUX_PROC_FS_H */
diff -uprN linux-2.6.24/include/linux/quota.h linux-2.6.24.ovz/include/linux/quota.h
--- linux-2.6.24/include/linux/quota.h	2008-01-24 17:58:37.000000000 -0500
+++ linux-2.6.24.ovz/include/linux/quota.h	2008-03-25 18:53:59.000000000 -0500
@@ -164,6 +164,10 @@ enum {
 #include
 #include
+#include
+
+extern spinlock_t dq_data_lock;
+
 #include
 #include
 #include
@@ -274,6 +278,8 @@ struct quota_format_ops {
 	int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */
 };
+struct inode;
+struct iattr;
 /* Operations working with dquots */
 struct dquot_operations {
 	int (*initialize) (struct inode *, int);
@@ -288,9 +294,11 @@ struct dquot_operations {
 	int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */
 	int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */
 	int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */
+	int (*rename) (struct inode *, struct inode *, struct inode *);
 };
 /* Operations handling requests from userspace */
+struct v2_disk_dqblk;
 struct quotactl_ops {
 	int (*quota_on)(struct super_block *, int, int, char *);
 	int (*quota_off)(struct super_block *, int);
@@ -303,6 +311,10 @@ struct quotactl_ops {
 	int (*set_xstate)(struct super_block *,
unsigned int, int); int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); +#ifdef CONFIG_QUOTA_COMPAT + int (*get_quoti)(struct super_block *, int, unsigned int, + struct v2_disk_dqblk __user *); +#endif }; struct quota_format_type { @@ -323,6 +335,10 @@ struct quota_info { struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ +#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) + struct vz_quota_master *vzdq_master; + int vzdq_count; +#endif }; /* Inline would be better but we need to dereference super_block which is not defined yet */ diff -uprN linux-2.6.24/include/linux/quotaops.h linux-2.6.24.ovz/include/linux/quotaops.h --- linux-2.6.24/include/linux/quotaops.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/quotaops.h 2008-03-25 18:53:59.000000000 -0500 @@ -170,6 +170,19 @@ static __inline__ int DQUOT_TRANSFER(str return 0; } +static __inline__ int DQUOT_RENAME(struct inode *inode, + struct inode *old_dir, struct inode *new_dir) +{ + struct dquot_operations *q_op; + + q_op = inode->i_sb->dq_op; + if (q_op && q_op->rename) { + if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) + return 1; + } + return 0; +} + /* The following two functions cannot be called inside a transaction */ #define DQUOT_SYNC(sb) sync_dquots(sb, -1) @@ -196,6 +209,7 @@ static __inline__ int DQUOT_OFF(struct s #define DQUOT_SYNC(sb) do { } while(0) #define DQUOT_OFF(sb) do { } while(0) #define DQUOT_TRANSFER(inode, iattr) (0) +#define DQUOT_RENAME(inode, old_dir, new_dir) (0) static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); diff -uprN linux-2.6.24/include/linux/rmap.h linux-2.6.24.ovz/include/linux/rmap.h --- linux-2.6.24/include/linux/rmap.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/rmap.h 2008-03-25 18:53:59.000000000 -0500 @@ -73,6 +73,8 @@ void page_add_anon_rmap(struct page *, s void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *, struct vm_area_struct *); +struct anon_vma *page_lock_anon_vma(struct page *page); +void page_unlock_anon_vma(struct anon_vma *anon_vma); #ifdef CONFIG_DEBUG_VM void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address); diff -uprN linux-2.6.24/include/linux/sched.h linux-2.6.24.ovz/include/linux/sched.h --- linux-2.6.24/include/linux/sched.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/sched.h 2008-03-25 18:53:59.000000000 -0500 @@ -28,6 +28,9 @@ #define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ +/* mask of clones which are disabled in OpenVZ VEs */ +#define CLONE_NAMESPACES_MASK (CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET) + /* * Scheduling policies */ @@ -91,6 +94,8 @@ struct sched_param { #include +#include + struct exec_domain; struct futex_pi_state; struct bio; @@ -125,15 +130,38 @@ extern unsigned long avenrun[]; /* Load load += n*(FIXED_1-exp); \ load >>= FSHIFT; +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + extern unsigned long total_forks; extern int nr_threads; DECLARE_PER_CPU(unsigned long, 
process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
+extern unsigned long nr_sleeping(void);
+extern unsigned long nr_stopped(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long weighted_cpuload(const int cpu);
+extern atomic_t nr_dead;
+extern unsigned long nr_zombie;
+
+#ifdef CONFIG_VE
+struct ve_struct;
+extern unsigned long nr_running_ve(struct ve_struct *);
+extern unsigned long nr_iowait_ve(struct ve_struct *);
+extern unsigned long nr_uninterruptible_ve(struct ve_struct *);
+extern cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu);
+extern cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu);
+void ve_sched_attach(struct ve_struct *envid);
+#else
+#define nr_running_ve(ve) 0
+#define nr_iowait_ve(ve) 0
+#define nr_uninterruptible_ve(ve) 0
+#define ve_sched_get_idle_time(ve, cpu) 0
+#define ve_sched_get_iowait_time(ve, cpu) 0
+#endif
 struct seq_file;
 struct cfs_rq;
@@ -241,6 +269,7 @@ static inline void show_state(void)
 }
 extern void show_regs(struct pt_regs *);
+extern void smp_show_regs(struct pt_regs *, void *);
 /*
  * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
@@ -388,6 +417,9 @@ struct pacct_struct {
 	unsigned long ac_minflt, ac_majflt;
 };
+#include
+#include
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -980,6 +1012,7 @@ struct task_struct {
 /* ??? */
 	unsigned int personality;
 	unsigned did_exec:1;
+	unsigned did_ve_enter:1;
 	pid_t pid;
 	pid_t tgid;
@@ -1168,6 +1201,14 @@ struct task_struct {
 	struct rcu_head rcu;
 	/*
+	 * state tracking for suspend
+	 * FIXME - ptrace is completely rewritten in this kernel
+	 * so set_pn_state() is not set in many places correctly
+	 */
+	__u8 pn_state;
+	__u8 stopped_state:1;
+
+	/*
 	 * cache last used pipe for splice
 	 */
 	struct pipe_inode_info *splice_pipe;
@@ -1178,6 +1219,19 @@ struct task_struct {
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+#ifdef CONFIG_BEANCOUNTERS
+	struct task_beancounter task_bc;
+#endif
+#ifdef CONFIG_VE
+	struct ve_task_info ve_task_info;
+#endif
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+	unsigned long magic;
+	struct inode *ino;
+#endif
+#ifdef CONFIG_VZ_FAIRSCHED
+	struct fairsched_node *fsched_node;
+#endif
 };
 /*
@@ -1353,6 +1407,43 @@ static inline void put_task_struct(struc
 	__put_task_struct(t);
 }
+#ifndef CONFIG_VE
+#define set_pn_state(tsk, state) do { } while(0)
+#define clear_pn_state(tsk) do { } while(0)
+#define set_stop_state(tsk) do { } while(0)
+#define clear_stop_state(tsk) do { } while(0)
+#else
+#define PN_STOP_TF 1 /* was not in 2.6.8 */
+#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */
+#define PN_STOP_ENTRY 3
+#define PN_STOP_FORK 4
+#define PN_STOP_VFORK 5
+#define PN_STOP_SIGNAL 6
+#define PN_STOP_EXIT 7
+#define PN_STOP_EXEC 8
+#define PN_STOP_LEAVE 9
+
+static inline void set_pn_state(struct task_struct *tsk, int state)
+{
+	tsk->pn_state = state;
+}
+
+static inline void clear_pn_state(struct task_struct *tsk)
+{
+	tsk->pn_state = 0;
+}
+
+static inline void set_stop_state(struct task_struct *tsk)
+{
+	tsk->stopped_state = 1;
+}
+
+static inline void clear_stop_state(struct task_struct *tsk)
+{
+	tsk->stopped_state = 0;
+}
+#endif
+
 /*
  * Per process flags
  */
@@ -1369,6 +1460,7 @@ static inline void put_task_struct(struc
 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
 #define PF_FLUSHER 0x00001000 /*
responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_EXIT_RESTART 0x00004000 /* do_exit() restarted, see do_exit() */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ @@ -1431,6 +1523,21 @@ extern unsigned long long cpu_clock(int extern unsigned long long task_sched_runtime(struct task_struct *task); +static inline unsigned long cycles_to_clocks(cycles_t cycles) +{ + extern unsigned long cycles_per_clock; + do_div(cycles, cycles_per_clock); + return cycles; +} + +static inline u64 cycles_to_jiffies(cycles_t cycles) +{ + extern unsigned long cycles_per_jiffy; + do_div(cycles, cycles_per_jiffy); + return cycles; +} + + /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); @@ -1542,8 +1649,6 @@ extern struct task_struct *find_task_by_ extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -extern void __set_special_pids(pid_t session, pid_t pgrp); - /* per-UID process charging. */ extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); static inline struct user_struct *get_uid(struct user_struct *u) @@ -1554,6 +1659,9 @@ static inline struct user_struct *get_ui extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); +extern int set_user(uid_t uid, int dumpclear); +extern void set_special_pids(struct pid *pid); +extern void __set_special_pids(struct pid *pid); #include @@ -1685,6 +1793,13 @@ extern int disallow_signal(int); extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + long pid0); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); @@ -1699,19 +1814,19 @@ extern void wait_task_inactive(struct ta #define remove_parent(p) list_del_init(&(p)->sibling) #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task_all(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +#define for_each_process_all(p) \ + for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. 
*/ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do +#define do_each_thread_all(g, t) \ + for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do -#define while_each_thread(g, t) \ +#define while_each_thread_all(g, t) \ while ((t = next_thread(t)) != g) /* de_thread depends on thread_group_leader not being a pid based check */ @@ -1736,8 +1851,15 @@ int same_thread_group(struct task_struct static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), + struct task_struct *tsk; + + tsk = list_entry(rcu_dereference(p->thread_group.next), struct task_struct, thread_group); +#ifdef CONFIG_VE + /* all threads should belong to ONE ve! */ + BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); +#endif + return tsk; } static inline int thread_group_empty(struct task_struct *p) @@ -1777,6 +1899,98 @@ static inline void unlock_task_sighand(s spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } +#ifndef CONFIG_VE + +#define for_each_process_ve(p) for_each_process_all(p) +#define do_each_thread_ve(g, t) do_each_thread_all(g, t) +#define while_each_thread_ve(g, t) while_each_thread_all(g, t) +#define first_task_ve() next_task_ve(&init_task) +#define __first_task_ve(owner) next_task_ve(&init_task) +#define __next_task_ve(owner, p) next_task_ve(p) +#define next_task_ve(p) \ + (next_task_all(p) != &init_task ? next_task_all(p) : NULL) + +#define ve_is_super(env) 1 +#define ve_accessible(target, owner) 1 +#define ve_accessible_strict(target, owner) 1 +#define ve_accessible_veid(target, owner) 1 +#define ve_accessible_strict_veid(target, owner) 1 + +#define VEID(ve) 0 + +#else /* CONFIG_VE */ + +#include + +#define ve_is_super(env) ((env) == get_ve0()) + +#define ve_accessible_strict(target, owner) ((target) == (owner)) +static inline int ve_accessible(struct ve_struct *target, + struct ve_struct *owner) +{ + return ve_is_super(owner) || ve_accessible_strict(target, owner); +} + +#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) +static inline int ve_accessible_veid(envid_t target, envid_t owner) +{ + return get_ve0()->veid == owner || + ve_accessible_strict_veid(target, owner); +} + +#define VEID(ve) (ve->veid) + +static inline struct task_struct *ve_lh2task(struct ve_struct *ve, + struct list_head *lh) +{ + return lh == &ve->vetask_lh ? 
NULL : + list_entry(lh, struct task_struct, ve_task_info.vetask_list); +} + +static inline struct task_struct *__first_task_ve(struct ve_struct *ve) +{ + struct task_struct *tsk; + + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(&init_task); + if (tsk == &init_task) + tsk = NULL; + } else { + tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next)); + } + return tsk; +} + +static inline struct task_struct *__next_task_ve(struct ve_struct *ve, + struct task_struct *tsk) +{ + if (unlikely(ve_is_super(ve))) { + tsk = next_task_all(tsk); + if (tsk == &init_task) + tsk = NULL; + } else { + BUG_ON(tsk->ve_task_info.owner_env != ve); + tsk = ve_lh2task(ve, rcu_dereference(tsk-> + ve_task_info.vetask_list.next)); + } + return tsk; +} + +#define first_task_ve() __first_task_ve(get_exec_env()) +#define next_task_ve(p) __next_task_ve(get_exec_env(), p) +/* no one uses prev_task_ve(), copy next_task_ve() if needed */ + +#define for_each_process_ve(p) \ + for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) + +#define do_each_thread_ve(g, t) \ + for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do + +#define while_each_thread_ve(g, t) \ + while ((t = next_thread(t)) != g) + +#endif /* CONFIG_VE */ + #ifndef __HAVE_THREAD_FUNCTIONS #define task_thread_info(task) ((struct thread_info *)(task)->stack) diff -uprN linux-2.6.24/include/linux/security.h linux-2.6.24.ovz/include/linux/security.h --- linux-2.6.24/include/linux/security.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/security.h 2008-03-25 18:53:59.000000000 -0500 @@ -34,11 +34,6 @@ #include #include -/* - * Bounding set - */ -extern kernel_cap_t cap_bset; - extern unsigned securebits; struct ctl_table; diff -uprN linux-2.6.24/include/linux/sem.h linux-2.6.24.ovz/include/linux/sem.h --- linux-2.6.24/include/linux/sem.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/sem.h 2008-03-25 18:53:59.000000000 -0500 @@ -154,6 +154,9 @@ static inline void exit_sem(struct task_ } #endif +int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); + #endif /* __KERNEL__ */ #endif /* _LINUX_SEM_H */ diff -uprN linux-2.6.24/include/linux/shm.h linux-2.6.24.ovz/include/linux/shm.h --- linux-2.6.24/include/linux/shm.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/shm.h 2008-03-25 18:53:59.000000000 -0500 @@ -110,6 +110,9 @@ static inline int is_file_shm_hugepages( } #endif +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); + #endif /* __KERNEL__ */ #endif /* _LINUX_SHM_H_ */ diff -uprN linux-2.6.24/include/linux/shmem_fs.h linux-2.6.24.ovz/include/linux/shmem_fs.h --- linux-2.6.24/include/linux/shmem_fs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/shmem_fs.h 2008-03-25 18:53:59.000000000 -0500 @@ -23,6 +23,9 @@ struct shmem_inode_info { struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *shmi_ub; +#endif }; struct shmem_sb_info { @@ -60,4 +63,6 @@ static inline void shmem_acl_destroy_ino } #endif /* CONFIG_TMPFS_POSIX_ACL */ +extern struct file_system_type tmpfs_fs_type; + #endif diff -uprN linux-2.6.24/include/linux/signal.h linux-2.6.24.ovz/include/linux/signal.h --- linux-2.6.24/include/linux/signal.h 2008-01-24 17:58:37.000000000 -0500 +++ 
linux-2.6.24.ovz/include/linux/signal.h 2008-03-25 18:53:59.000000000 -0500 @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include +#include +#include /* * Real Time signals may be queued. @@ -16,6 +18,9 @@ struct sigqueue { int flags; siginfo_t info; struct user_struct *user; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *sig_ub; +#endif }; /* flags values. */ @@ -371,6 +376,8 @@ int unhandled_signal(struct task_struct (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) +extern struct kmem_cache *sigqueue_cachep; + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ diff -uprN linux-2.6.24/include/linux/skbuff.h linux-2.6.24.ovz/include/linux/skbuff.h --- linux-2.6.24/include/linux/skbuff.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/skbuff.h 2008-03-25 18:53:59.000000000 -0500 @@ -247,6 +247,8 @@ typedef unsigned char *sk_buff_data_t; * @secmark: security marking */ +#include + struct sk_buff { /* These two members must be first. */ struct sk_buff *next; @@ -289,7 +291,13 @@ struct sk_buff { ipvs_property:1, nf_trace:1; __be16 protocol; - +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) + __u8 brmark; +#endif +#ifdef CONFIG_VE + unsigned int accounted:1; + unsigned int redirected:1; +#endif void (*destructor)(struct sk_buff *skb); #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack *nfct; @@ -330,6 +338,8 @@ struct sk_buff { *data; unsigned int truesize; atomic_t users; + struct skb_beancounter skb_bc; + struct ve_struct *owner_env; }; #ifdef __KERNEL__ @@ -337,6 +347,7 @@ struct sk_buff { * Handling routines are only of interest to the kernel */ #include +#include #include @@ -1258,6 +1269,8 @@ static inline void pskb_trim_unique(stru */ static inline void skb_orphan(struct sk_buff *skb) { + ub_skb_uncharge(skb); + if (skb->destructor) skb->destructor(skb); skb->destructor = NULL; @@ -1764,6 +1777,26 @@ static inline void skb_init_secmark(stru { } #endif +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) +{ + to->brmark = from->brmark; +} + +static inline void skb_init_brmark(struct sk_buff *skb) +{ + skb->brmark = 0; +} +#else +static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) +{ +} + +static inline void skb_init_brmark(struct sk_buff *skb) +{ +} +#endif + static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { #ifdef CONFIG_NETDEVICES_MULTIQUEUE diff -uprN linux-2.6.24/include/linux/slab.h linux-2.6.24.ovz/include/linux/slab.h --- linux-2.6.24/include/linux/slab.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/slab.h 2008-03-25 18:53:59.000000000 -0500 @@ -46,6 +46,26 @@ (unsigned long)ZERO_SIZE_PTR) /* + * allocation rules: __GFP_UBC 0 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * cache (SLAB_UBC) charge charge + * (usual caches: mm, vma, task_struct, ...) + * + * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- + * (ub_kmalloc) (kmalloc) + * + * cache (no UB flags) BUG() --- + * (nonub caches, mempools) + * + * pages charge --- + * (ub_vmalloc, (vmalloc, + * poll, fdsets, ...) non-ub allocs) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#define SLAB_UBC 0x10000000UL /* alloc space for ubs ... */ +#define SLAB_NO_CHARGE 0x20000000UL /* ... 
but don't charge */ + +/* * struct kmem_cache related prototypes */ void __init kmem_cache_init(void); @@ -60,7 +80,19 @@ void kmem_cache_free(struct kmem_cache * unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); - +extern void show_slab_info(void); +int kmem_cache_objuse(struct kmem_cache *cachep); +int kmem_obj_objuse(void *obj); +unsigned long ub_cache_growth(struct kmem_cache *cachep); + +#ifdef CONFIG_BEANCOUNTERS +void kmem_mark_nocharge(struct kmem_cache *cachep); +struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj); +struct user_beancounter *slab_ub(void *obj); +#else +static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { } +static inline struct user_beancounter *slab_ub(void *obj) { return NULL; } +#endif /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. diff -uprN linux-2.6.24/include/linux/slab_def.h linux-2.6.24.ovz/include/linux/slab_def.h --- linux-2.6.24/include/linux/slab_def.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/slab_def.h 2008-03-25 18:53:59.000000000 -0500 @@ -15,6 +15,111 @@ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define SLAB_DEBUG 1 +#define SLAB_STATS 1 +#define SLAB_FORCED_DEBUG 1 +#else +#define SLAB_DEBUG 0 +#define SLAB_STATS 0 +#define SLAB_FORCED_DEBUG 0 +#endif + +/* + * struct kmem_cache + * + * manages a cache. + */ + +struct kmem_cache { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + + unsigned int buffer_size; + u32 reciprocal_buffer_size; +/* 3) touched by every alloc & free from the backend */ + + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + gfp_t gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + struct kmem_cache *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor) (struct kmem_cache *, void *); + +/* 5) cache creation/removal */ + const char *name; + struct list_head next; + +/* 6) statistics */ + unsigned long grown; + unsigned long reaped; + unsigned long shrunk; +#if SLAB_STATS + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + unsigned long node_overflow; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; +#endif +#if SLAB_DEBUG + /* + * If debugging is enabled, then the allocator can add additional + * fields and/or padding to every object. 
buffer_size contains the total + * object size including these internal fields, the following two + * variables contain the offset to the user object and its size. + */ + int obj_offset; + int obj_size; +#endif +#ifdef CONFIG_BEANCOUNTERS + int objuse; +#endif + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. + */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ +}; + /* Size description struct for general caches. */ struct cache_sizes { size_t cs_size; @@ -24,6 +129,7 @@ struct cache_sizes { #endif }; extern struct cache_sizes malloc_sizes[]; +extern int malloc_cache_num; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); @@ -48,6 +154,8 @@ static inline void *kmalloc(size_t size, __you_cannot_kmalloc_that_much(); } found: + if (flags & __GFP_UBC) + i += malloc_cache_num; #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, diff -uprN linux-2.6.24/include/linux/slub_def.h linux-2.6.24.ovz/include/linux/slub_def.h --- linux-2.6.24/include/linux/slub_def.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/slub_def.h 2008-03-25 18:53:59.000000000 -0500 @@ -58,6 +58,10 @@ struct kmem_cache { struct kobject kobj; /* For sysfs */ #endif +#ifdef CONFIG_BEANCOUNTERS + atomic_t grown; + int objuse; +#endif #ifdef CONFIG_NUMA int defrag_ratio; struct kmem_cache_node *node[MAX_NUMNODES]; @@ -86,6 +90,19 @@ struct kmem_cache { */ extern struct kmem_cache kmalloc_caches[PAGE_SHIFT]; +#ifdef CONFIG_BEANCOUNTERS +extern struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx) +{ + return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx]; +} +#else +static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx) +{ + return &kmalloc_caches[idx]; +} +#endif + /* * Sorry that the following has to be that ugly but some versions of GCC * have trouble with constant propagation and loops. @@ -142,14 +159,14 @@ static __always_inline int kmalloc_index * This ought to end up with a global pointer to the right cache * in kmalloc_caches. 
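Illustrative sketch, not part of the patch: with the flag-aware cache selection in these hunks, the same size class is served either from the regular kmalloc caches or from the parallel beancounter-charged ones, depending on __GFP_UBC:

        static void ubc_alloc_demo(void)
        {
                void *charged, *plain;

                charged = kmalloc(256, GFP_KERNEL | __GFP_UBC); /* ub_kmalloc_caches */
                plain   = kmalloc(256, GFP_KERNEL);             /* kmalloc_caches */

                kfree(charged);
                kfree(plain);
        }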
*/ -static __always_inline struct kmem_cache *kmalloc_slab(size_t size) +static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { int index = kmalloc_index(size); if (index == 0) return NULL; - return &kmalloc_caches[index]; + return __kmalloc_cache(flags, index); } #ifdef CONFIG_ZONE_DMA @@ -170,7 +187,7 @@ static __always_inline void *kmalloc(siz get_order(size)); if (!(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + struct kmem_cache *s = kmalloc_slab(size, flags); if (!s) return ZERO_SIZE_PTR; @@ -189,7 +206,7 @@ static __always_inline void *kmalloc_nod { if (__builtin_constant_p(size) && size <= PAGE_SIZE / 2 && !(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + struct kmem_cache *s = kmalloc_slab(size, flags); if (!s) return ZERO_SIZE_PTR; diff -uprN linux-2.6.24/include/linux/smp.h linux-2.6.24.ovz/include/linux/smp.h --- linux-2.6.24/include/linux/smp.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/smp.h 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,9 @@ extern void cpu_idle(void); +struct pt_regs; +typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); + #ifdef CONFIG_SMP #include @@ -49,6 +52,8 @@ extern int __cpu_up(unsigned int cpunum) */ extern void smp_cpus_done(unsigned int max_cpus); +extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); + /* * Call a function on all other processors */ @@ -111,6 +116,12 @@ static inline void smp_send_reschedule(i #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) +static inline int smp_nmi_call_function(smp_nmi_function func, + void *info, int wait) +{ + return 0; +} + #endif /* !SMP */ /* diff -uprN linux-2.6.24/include/linux/socket.h linux-2.6.24.ovz/include/linux/socket.h --- linux-2.6.24/include/linux/socket.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/socket.h 2008-03-25 18:53:59.000000000 -0500 @@ -297,6 +297,16 @@ struct ucred { #define IPX_TYPE 1 #ifdef __KERNEL__ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). 
+ */ + extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, int len); @@ -310,6 +320,8 @@ extern int memcpy_toiovec(struct iovec * extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen); extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int vz_security_family_check(int family); +extern int vz_security_protocol_check(int protocol); #endif #endif /* not kernel and not glibc */ diff -uprN linux-2.6.24/include/linux/sunrpc/clnt.h linux-2.6.24.ovz/include/linux/sunrpc/clnt.h --- linux-2.6.24/include/linux/sunrpc/clnt.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/sunrpc/clnt.h 2008-03-25 18:53:59.000000000 -0500 @@ -44,6 +44,7 @@ struct rpc_clnt { cl_intr : 1,/* interruptible */ cl_discrtry : 1,/* disconnect before retry */ cl_autobind : 1;/* use getport() */ + unsigned int cl_broken : 1;/* no response for too long */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ @@ -55,6 +56,7 @@ struct rpc_clnt { struct rpc_clnt * cl_parent; /* Points to parent of clones */ struct rpc_rtt cl_rtt_default; struct rpc_program * cl_program; + unsigned long cl_pr_time; char cl_inline_name[32]; }; diff -uprN linux-2.6.24/include/linux/sunrpc/xprt.h linux-2.6.24.ovz/include/linux/sunrpc/xprt.h --- linux-2.6.24/include/linux/sunrpc/xprt.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/sunrpc/xprt.h 2008-03-25 18:53:59.000000000 -0500 @@ -24,6 +24,14 @@ #define RPC_MAX_SLOT_TABLE (128U) /* + * Grand abort timeout (stop the client if it occurs) + */ +extern int xprt_abort_timeout; + +#define RPC_MIN_ABORT_TIMEOUT 300 +#define RPC_MAX_ABORT_TIMEOUT INT_MAX + +/* + * This describes a timeout strategy */ struct rpc_timeout { @@ -119,6 +127,7 @@ struct rpc_xprt_ops { struct rpc_xprt { struct kref kref; /* Reference count */ struct rpc_xprt_ops * ops; /* transport methods */ + struct ve_struct * owner_env; /* VE owner of mount */ struct rpc_timeout timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ diff -uprN linux-2.6.24/include/linux/swap.h linux-2.6.24.ovz/include/linux/swap.h --- linux-2.6.24/include/linux/swap.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/swap.h 2008-03-25 18:53:59.000000000 -0500 @@ -17,6 +17,7 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 +#define SWAP_FLAG_READONLY 0x40000000 /* set if swap is read-only */ static inline int current_is_kswapd(void) { @@ -92,6 +93,7 @@ struct address_space; struct sysinfo; struct writeback_control; struct zone; +struct user_beancounter; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of @@ -121,6 +123,7 @@ enum { SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ + SWP_READONLY = (1 << 2), }; #define SWAP_CLUSTER_MAX 32 @@ -131,6 +134,7 @@ enum { /* * The in-memory structure used to track swap areas.
*/ +struct user_beancounter; struct swap_info_struct { unsigned int flags; int prio; /* swap priority */ @@ -148,6 +152,9 @@ struct swap_info_struct { unsigned int max; unsigned int inuse_pages; int next; /* next entry on swap list */ +#ifdef CONFIG_BC_SWAP_ACCOUNTING + struct user_beancounter **swap_ubs; +#endif }; struct swap_list_t { @@ -155,9 +162,19 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; +extern struct swap_list_t swap_list; +extern struct swap_info_struct swap_info[MAX_SWAPFILES]; + /* Swap 50% full? Release swapcache more aggressively.. */ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) +/* linux/mm/oom_kill.c */ +extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); +extern int register_oom_notifier(struct notifier_block *nb); +extern int unregister_oom_notifier(struct notifier_block *nb); +extern int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, const char *message); +extern struct task_struct *oom_select_bad_process(struct user_beancounter *ub); + /* linux/mm/memory.c */ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); @@ -224,6 +241,9 @@ extern struct address_space swapper_spac extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, gfp_t); extern void __delete_from_swap_cache(struct page *); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); +extern int __add_to_swap_cache(struct page *page, + swp_entry_t entry, gfp_t gfp_mask); extern void delete_from_swap_cache(struct page *); extern int move_to_swap_cache(struct page *, swp_entry_t); extern int move_from_swap_cache(struct page *, unsigned long, @@ -237,7 +257,7 @@ extern struct page * read_swap_cache_asy extern long total_swap_pages; extern unsigned int nr_swapfiles; extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct user_beancounter *); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); @@ -250,6 +270,7 @@ extern sector_t swapdev_block(int, pgoff extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); +extern int try_to_remove_exclusive_swap_page(struct page *); struct backing_dev_info; extern spinlock_t swap_lock; @@ -350,7 +371,7 @@ static inline int remove_exclusive_swap_ return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct user_beancounter *ub) { swp_entry_t entry; entry.val = 0; diff -uprN linux-2.6.24/include/linux/sysctl.h linux-2.6.24.ovz/include/linux/sysctl.h --- linux-2.6.24/include/linux/sysctl.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/sysctl.h 2008-03-25 18:53:59.000000000 -0500 @@ -159,6 +159,9 @@ enum KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ +#ifdef CONFIG_GRKERNSEC_SYSCTL + KERN_GRSECURITY=98, /* grsecurity */ +#endif KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, @@ -948,6 +951,7 @@ struct ctl_table; extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); extern void sysctl_head_finish(struct 
ctl_table_header *prev); extern int sysctl_perm(struct ctl_table *table, int op); +extern int ve_allow_kthreads; typedef struct ctl_table ctl_table; @@ -1034,6 +1038,8 @@ extern ctl_handler sysctl_ms_jiffies; */ /* A sysctl table is an array of struct ctl_table: */ +struct ve_struct; + struct ctl_table { int ctl_name; /* Binary ID */ @@ -1047,6 +1053,8 @@ struct ctl_table ctl_handler *strategy; /* Callback function for all r/w */ void *extra1; void *extra2; + struct ve_struct *owner_env; + int virt_handler; }; /* struct ctl_table_header is used to maintain dynamic lists of @@ -1060,10 +1068,14 @@ struct ctl_table_header }; struct ctl_table_header *register_sysctl_table(struct ctl_table * table); +struct ctl_table_header *register_glob_sysctl_table(struct ctl_table * table); void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct ctl_table *table); +ctl_table *clone_sysctl_template(ctl_table *tmpl); +void free_sysctl_clone(ctl_table *clone); + #else /* __KERNEL__ */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.24/include/linux/sysfs.h linux-2.6.24.ovz/include/linux/sysfs.h --- linux-2.6.24/include/linux/sysfs.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/sysfs.h 2008-03-25 18:53:59.000000000 -0500 @@ -19,6 +19,7 @@ struct kobject; struct module; +struct sysfs_open_dirent; /* FIXME * The *owner field is no longer used, but leave around @@ -76,6 +77,66 @@ struct sysfs_ops { ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t); }; +/* type-specific structures for sysfs_dirent->s_* union members */ +struct sysfs_elem_dir { + struct kobject *kobj; + /* children list starts here and goes through sd->s_sibling */ + struct sysfs_dirent *children; +}; + +struct sysfs_elem_symlink { + struct sysfs_dirent *target_sd; +}; + +struct sysfs_elem_attr { + struct attribute *attr; + struct sysfs_open_dirent *open; +}; + +struct sysfs_elem_bin_attr { + struct bin_attribute *bin_attr; +}; + +/* + * sysfs_dirent - the building block of sysfs hierarchy. Each and + * every sysfs node is represented by single sysfs_dirent. + * + * As long as s_count reference is held, the sysfs_dirent itself is + * accessible. Dereferencing s_elem or any other outer entity + * requires s_active reference. 
+ */ +struct sysfs_dirent { + atomic_t s_count; + atomic_t s_active; + struct sysfs_dirent *s_parent; + struct sysfs_dirent *s_sibling; + const char *s_name; + + union { + struct sysfs_elem_dir s_dir; + struct sysfs_elem_symlink s_symlink; + struct sysfs_elem_attr s_attr; + struct sysfs_elem_bin_attr s_bin_attr; + }; + + unsigned int s_flags; + ino_t s_ino; + umode_t s_mode; + struct iattr *s_iattr; +}; + +#define SD_DEACTIVATED_BIAS INT_MIN + +#define SYSFS_TYPE_MASK 0x00ff +#define SYSFS_DIR 0x0001 +#define SYSFS_KOBJ_ATTR 0x0002 +#define SYSFS_KOBJ_BIN_ATTR 0x0004 +#define SYSFS_KOBJ_LINK 0x0008 +#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) + +#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +#define SYSFS_FLAG_REMOVED 0x0200 + #ifdef CONFIG_SYSFS int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), @@ -114,6 +175,8 @@ void sysfs_notify(struct kobject *kobj, extern int __must_check sysfs_init(void); +extern struct file_system_type sysfs_fs_type; + #else /* CONFIG_SYSFS */ static inline int sysfs_schedule_callback(struct kobject *kobj, diff -uprN linux-2.6.24/include/linux/task_io_accounting_ops.h linux-2.6.24.ovz/include/linux/task_io_accounting_ops.h --- linux-2.6.24/include/linux/task_io_accounting_ops.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/task_io_accounting_ops.h 2008-03-25 18:53:59.000000000 -0500 @@ -5,10 +5,12 @@ #define __TASK_IO_ACCOUNTING_OPS_INCLUDED #include +#include #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { + ub_io_account_read(bytes); current->ioac.read_bytes += bytes; } @@ -21,8 +23,14 @@ static inline unsigned long task_io_get_ return p->ioac.read_bytes >> 9; } -static inline void task_io_account_write(size_t bytes) +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) { + if (sync) + ub_io_account_write(bytes); + else + ub_io_account_dirty(page, bytes); + current->ioac.write_bytes += bytes; } @@ -37,6 +45,7 @@ static inline unsigned long task_io_get_ static inline void task_io_account_cancelled_write(size_t bytes) { + ub_io_account_write_cancelled(bytes); current->ioac.cancelled_write_bytes += bytes; } @@ -56,7 +65,8 @@ static inline unsigned long task_io_get_ return 0; } -static inline void task_io_account_write(size_t bytes) +static inline void task_io_account_write(struct page *page, size_t bytes, + int sync) { } diff -uprN linux-2.6.24/include/linux/tty.h linux-2.6.24.ovz/include/linux/tty.h --- linux-2.6.24/include/linux/tty.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/tty.h 2008-03-25 18:53:59.000000000 -0500 @@ -241,6 +241,7 @@ struct tty_struct { spinlock_t read_lock; /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; + struct ve_struct *owner_env; }; /* tty magic number */ @@ -270,6 +271,7 @@ struct tty_struct { #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_FLUSHING 19 /* Flushing to ldisc in progress */ #define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */ +#define TTY_CHARGED 21 /* Charged as ub resource */ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) diff -uprN linux-2.6.24/include/linux/tty_driver.h linux-2.6.24.ovz/include/linux/tty_driver.h --- linux-2.6.24/include/linux/tty_driver.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/tty_driver.h 2008-03-25 18:53:59.000000000 -0500 @@ -222,8 +222,19 @@ struct tty_driver { unsigned int set, unsigned int clear); struct list_head tty_drivers; + struct ve_struct 
*owner_env; }; +#ifdef CONFIG_UNIX98_PTYS +extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ +extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ +#endif + +#ifdef CONFIG_LEGACY_PTYS +extern struct tty_driver *pty_driver; +extern struct tty_driver *pty_slave_driver; +#endif + extern struct list_head tty_drivers; struct tty_driver *alloc_tty_driver(int lines); @@ -231,6 +242,9 @@ void put_tty_driver(struct tty_driver *d void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op); +struct class *init_ve_tty_class(void); +void fini_ve_tty_class(struct class *ve_tty_class); + /* tty driver magic number */ #define TTY_DRIVER_MAGIC 0x5402 diff -uprN linux-2.6.24/include/linux/types.h linux-2.6.24.ovz/include/linux/types.h --- linux-2.6.24/include/linux/types.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/types.h 2008-03-25 18:53:59.000000000 -0500 @@ -29,6 +29,11 @@ typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; typedef __kernel_mqd_t mqd_t; +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + #ifdef __KERNEL__ typedef _Bool bool; diff -uprN linux-2.6.24/include/linux/utsname.h linux-2.6.24.ovz/include/linux/utsname.h --- linux-2.6.24/include/linux/utsname.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/utsname.h 2008-03-25 18:53:59.000000000 -0500 @@ -42,6 +42,7 @@ struct uts_namespace { struct new_utsname name; }; extern struct uts_namespace init_uts_ns; +extern struct new_utsname virt_utsname; static inline void get_uts_ns(struct uts_namespace *ns) { diff -uprN linux-2.6.24/include/linux/ve.h linux-2.6.24.ovz/include/linux/ve.h --- linux-2.6.24/include/linux/ve.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/ve.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,409 @@ +/* + * include/linux/ve.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VE_H +#define _LINUX_VE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef VZMON_DEBUG +# define VZTRACE(fmt,args...) \ + printk(KERN_DEBUG fmt, ##args) +#else +# define VZTRACE(fmt,args...) 
+#endif /* VZMON_DEBUG */ + +struct tty_driver; +struct devpts_config; +struct task_struct; +struct new_utsname; +struct file_system_type; +struct icmp_mib; +struct ip_mib; +struct tcp_mib; +struct udp_mib; +struct linux_mib; +struct fib_info; +struct fib_rule; +struct veip_struct; +struct ve_monitor; +struct nsproxy; +struct ve_sit_tunnels; + +#if defined(CONFIG_VE) && defined(CONFIG_INET) +struct fib_table; +#ifdef CONFIG_VE_IPTABLES +struct xt_table; +struct nf_conn; + +#define FRAG6Q_HASHSZ 64 + +struct ve_nf_conntrack { + struct hlist_head *_bysource; + struct nf_nat_protocol **_nf_nat_protos; + int _nf_nat_vmalloced; + struct xt_table *_nf_nat_table; + struct nf_conntrack_l3proto *_nf_nat_l3proto; + atomic_t _nf_conntrack_count; + int _nf_conntrack_max; + struct hlist_head *_nf_conntrack_hash; + int _nf_conntrack_checksum; + int _nf_conntrack_vmalloc; + struct hlist_head _unconfirmed; + struct hlist_head *_nf_ct_expect_hash; + unsigned int _nf_ct_expect_vmalloc; + unsigned int _nf_ct_expect_count; + unsigned int _nf_ct_expect_max; + struct hlist_head *_nf_ct_helper_hash; + unsigned int _nf_ct_helper_vmalloc; + struct inet_frags _nf_frags6; + struct inet_frags_ctl _nf_frags6_ctl; +#ifdef CONFIG_SYSCTL + /* l4 stuff: */ + unsigned long _nf_ct_icmp_timeout; + unsigned long _nf_ct_icmpv6_timeout; + unsigned int _nf_ct_udp_timeout; + unsigned int _nf_ct_udp_timeout_stream; + unsigned int _nf_ct_generic_timeout; + unsigned int _nf_ct_log_invalid; + unsigned int _nf_ct_tcp_timeout_max_retrans; + int _nf_ct_tcp_be_liberal; + int _nf_ct_tcp_loose; + int _nf_ct_tcp_max_retrans; + unsigned int _nf_ct_tcp_timeouts[10]; + struct ctl_table_header *_icmp_sysctl_header; + unsigned int _tcp_sysctl_table_users; + struct ctl_table_header *_tcp_sysctl_header; + unsigned int _udp_sysctl_table_users; + struct ctl_table_header *_udp_sysctl_header; + struct ctl_table_header *_icmpv6_sysctl_header; + struct ctl_table_header *_generic_sysctl_header; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + struct ctl_table_header *_icmp_compat_sysctl_header; + struct ctl_table_header *_tcp_compat_sysctl_header; + struct ctl_table_header *_udp_compat_sysctl_header; + struct ctl_table_header *_generic_compat_sysctl_header; +#endif + /* l4 protocols sysctl tables: */ + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmp; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp4; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmpv6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp4; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp6; + struct nf_conntrack_l4proto *_nf_conntrack_l4proto_generic; + struct nf_conntrack_l4proto **_nf_ct_protos[PF_MAX]; + /* l3 protocols sysctl tables: */ + struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv4; + struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv6; + struct nf_conntrack_l3proto *_nf_ct_l3protos[AF_MAX]; + /* sysctl standalone stuff: */ + struct ctl_table_header *_nf_ct_sysctl_header; + ctl_table *_nf_ct_sysctl_table; + ctl_table *_nf_ct_netfilter_table; + ctl_table *_nf_ct_net_table; + ctl_table *_ip_ct_net_table; + ctl_table *_ip_ct_netfilter_table; + struct ctl_table_header *_ip_ct_sysctl_header; + int _nf_ct_log_invalid_proto_min; + int _nf_ct_log_invalid_proto_max; +#endif /* CONFIG_SYSCTL */ +}; +#endif +#endif + +struct ve_cpu_stats { + cycles_t idle_time; + cycles_t iowait_time; + cycles_t strt_idle_time; + cycles_t used_time; + seqcount_t stat_lock; + int nr_running; + int nr_unint; + 
int nr_iowait; + cputime64_t user; + cputime64_t nice; + cputime64_t system; +} ____cacheline_aligned; + +struct ve_ipt_recent; +struct ve_xt_hashlimit; + +struct ve_struct { + struct list_head ve_list; + + envid_t veid; + struct list_head vetask_lh; + /* capability bounding set */ + kernel_cap_t ve_cap_bset; + atomic_t pcounter; + /* ref counter to ve from ipc */ + atomic_t counter; + unsigned int class_id; + struct rw_semaphore op_sem; + int is_running; + int is_locked; + atomic_t suspend; + /* see vzcalluser.h for VE_FEATURE_XXX definitions */ + __u64 features; + +/* VE's root */ + struct vfsmount *fs_rootmnt; + struct dentry *fs_root; + +/* sysctl */ + struct list_head sysctl_lh; + struct ctl_table_header *uts_header; + struct file_system_type *proc_fstype; + struct vfsmount *proc_mnt; + struct proc_dir_entry *proc_root; + struct proc_dir_entry *proc_sys_root; + struct proc_dir_entry *_proc_net; + struct proc_dir_entry *_proc_net_stat; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct proc_dir_entry *_proc_net_devsnmp6; +#endif + +/* BSD pty's */ +#ifdef CONFIG_LEGACY_PTYS + struct tty_driver *pty_driver; + struct tty_driver *pty_slave_driver; +#endif +#ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; + struct tty_driver *pts_driver; + struct idr *allocated_ptys; + struct file_system_type *devpts_fstype; + struct vfsmount *devpts_mnt; + struct dentry *devpts_root; + struct devpts_config *devpts_config; +#endif + + struct ve_nfs_context *nfs_context; + + struct file_system_type *shmem_fstype; + struct vfsmount *shmem_mnt; +#ifdef CONFIG_SYSFS + struct file_system_type *sysfs_fstype; + struct vfsmount *sysfs_mnt; + struct super_block *sysfs_sb; + struct sysfs_dirent *_sysfs_root; +#endif +#ifndef CONFIG_SYSFS_DEPRECATED + struct kobject *_virtual_dir; +#endif + struct kset *class_subsys; + struct kset *class_obj_subsys; + struct kset *devices_subsys; + struct class *tty_class; + struct class *mem_class; + +#ifdef CONFIG_NET + struct class *net_class; +#ifdef CONFIG_INET + struct ipv4_devconf *_ipv4_devconf; + struct ipv4_devconf *_ipv4_devconf_dflt; + struct ctl_table_header *forward_header; + struct ctl_table *forward_table; + unsigned long rt_flush_required; + struct neigh_table *ve_arp_tbl; + struct ve_sit_tunnels *_sit_tunnels; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipv6_devconf *_ipv6_devconf; + struct ipv6_devconf *_ipv6_devconf_dflt; + struct neigh_table *ve_nd_tbl; +#endif +#endif +#endif +#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE) + struct veip_struct *veip; + struct net_device *_venet_dev; +#endif + +/* per VE CPU stats*/ + struct timespec start_timespec; + u64 start_jiffies; /* Deprecated */ + cycles_t start_cycles; + unsigned long avenrun[3]; /* loadavg data */ + + cycles_t cpu_used_ve; + struct kstat_lat_pcpu_struct sched_lat_ve; + +#ifdef CONFIG_INET + struct hlist_head *_fib_info_hash; + struct hlist_head *_fib_info_laddrhash; + int _fib_hash_size; + int _fib_info_cnt; + + struct fib_rule *_local_rule; + struct list_head _fib_rules; + /* XXX: why a magic constant? 
*/ +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct hlist_head _fib_table_hash[256]; +#else + struct hlist_head _fib_table_hash[1]; + struct fib_table *_main_table; + struct fib_table *_local_table; +#endif + struct icmp_mib *_icmp_statistics[2]; + struct icmpmsg_mib *_icmpmsg_statistics[2]; + struct ipstats_mib *_ip_statistics[2]; + struct tcp_mib *_tcp_statistics[2]; + struct udp_mib *_udp_statistics[2]; + struct udp_mib *_udplite_statistics[2]; + struct linux_mib *_net_statistics[2]; + struct venet_stat *stat; +#ifdef CONFIG_VE_IPTABLES +/* core/netfilter.c virtualization */ + void *_nf_hooks; + struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ + struct xt_table *_ve_ip6t_filter_pf; + struct xt_table *_ipt_mangle_table; + struct xt_table *_ip6t_mangle_table; + struct list_head _xt_tables[NPROTO]; + + __u64 _iptables_modules; + struct ve_nf_conntrack *_nf_conntrack; + struct ve_ipt_recent *_ipt_recent; + struct ve_xt_hashlimit *_xt_hashlimit; +#endif /* CONFIG_VE_IPTABLES */ + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + struct hlist_head _fib6_table_hash[256]; + struct fib6_table *_fib6_local_table; +#else + struct hlist_head _fib6_table_hash[1]; +#endif + struct fib6_table *_fib6_table; + struct ipstats_mib *_ipv6_statistics[2]; + struct icmpv6_mib *_icmpv6_statistics[2]; + struct icmpv6msg_mib *_icmpv6msg_statistics[2]; + struct udp_mib *_udp_stats_in6[2]; + struct udp_mib *_udplite_stats_in6[2]; +#endif +#endif + wait_queue_head_t *_log_wait; + unsigned long *_log_start; + unsigned long *_log_end; + unsigned long *_logged_chars; + char *log_buf; +#define VE_DEFAULT_LOG_BUF_LEN 4096 + + struct ve_cpu_stats *cpu_stats; + unsigned long down_at; + struct list_head cleanup_list; +#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) + struct list_head _fuse_conn_list; + struct super_block *_fuse_control_sb; + + struct file_system_type *fuse_fs_type; + struct file_system_type *fuse_ctl_fs_type; +#endif +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + struct proc_dir_entry *_proc_vlan_dir; + struct proc_dir_entry *_proc_vlan_conf; +#endif + unsigned long jiffies_fixup; + unsigned char disable_net; + struct ve_monitor *monitor; + struct proc_dir_entry *monitor_proc; + unsigned long meminfo_val; + +#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) \ + || defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) + unsigned int _nlmsvc_users; + pid_t _nlmsvc_pid; + int _nlmsvc_grace_period; + unsigned long _nlmsvc_timeout; +#endif + + struct nsproxy *ve_ns; +#ifdef CONFIG_GRKERNSEC + struct { + int lock; +#ifdef CONFIG_GRKERNSEC_TPE + int enable_tpe; + int tpe_gid; +#ifdef CONFIG_GRKERNSEC_TPE_ALL + int enable_tpe_all; +#endif +#endif /*CONFIG_GRKERNSEC_TPE */ + } grsec; +#endif /* CONFIG_GRKERNSEC */ +}; + +#define VE_CPU_STATS(ve, cpu) (per_cpu_ptr((ve)->cpu_stats, cpu)) + +extern int nr_ve; + +#ifdef CONFIG_VE + +void do_update_load_avg_ve(void); +void do_env_free(struct ve_struct *ptr); + +static inline struct ve_struct *get_ve(struct ve_struct *ptr) +{ + if (ptr != NULL) + atomic_inc(&ptr->counter); + return ptr; +} + +static inline void put_ve(struct ve_struct *ptr) +{ + if (ptr && atomic_dec_and_test(&ptr->counter)) { + if (atomic_read(&ptr->pcounter) > 0) + BUG(); + if (ptr->is_running) + BUG(); + do_env_free(ptr); + } +} + +static inline void pget_ve(struct ve_struct *ptr) +{ + atomic_inc(&ptr->pcounter); +} + +void ve_cleanup_schedule(struct ve_struct *); +static inline void pput_ve(struct 
ve_struct *ptr) +{ + if (unlikely(atomic_dec_and_test(&ptr->pcounter))) + ve_cleanup_schedule(ptr); +} + +extern spinlock_t ve_cleanup_lock; +extern struct list_head ve_cleanup_list; +extern struct task_struct *ve_cleanup_thread; + +extern unsigned long long ve_relative_clock(struct timespec * ts); + +#ifdef CONFIG_FAIRSCHED +#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) +#else +#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) +#endif +#else /* CONFIG_VE */ +#define ve_utsname system_utsname +#define get_ve(ve) (NULL) +#define put_ve(ve) do { } while (0) +#define pget_ve(ve) do { } while (0) +#define pput_ve(ve) do { } while (0) +#endif /* CONFIG_VE */ + +#endif /* _LINUX_VE_H */ diff -uprN linux-2.6.24/include/linux/ve_nfs.h linux-2.6.24.ovz/include/linux/ve_nfs.h --- linux-2.6.24/include/linux/ve_nfs.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/ve_nfs.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,29 @@ +/* + * linux/include/ve_nfs.h + * + * VE context for NFS + * + * Copyright (C) 2007 SWsoft + */ + +#ifndef __VE_NFS_H__ +#define __VE_NFS_H__ + +#ifdef CONFIG_VE + +#include + +#define NFS_CTX_FIELD(arg) (get_exec_env()->_##arg) + +#define nlmsvc_grace_period NFS_CTX_FIELD(nlmsvc_grace_period) +#define nlmsvc_timeout NFS_CTX_FIELD(nlmsvc_timeout) +#define nlmsvc_users NFS_CTX_FIELD(nlmsvc_users) +#define nlmsvc_pid NFS_CTX_FIELD(nlmsvc_pid) +#else +#define nlmsvc_grace_period _nlmsvc_grace_period +#define nlmsvc_timeout _nlmsvc_timeout +#define nlmsvc_users _nlmsvc_users +#define nlmsvc_pid _nlmsvc_pid +#endif + +#endif diff -uprN linux-2.6.24/include/linux/ve_proto.h linux-2.6.24.ovz/include/linux/ve_proto.h --- linux-2.6.24/include/linux/ve_proto.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/ve_proto.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,89 @@ +/* + * include/linux/ve_proto.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
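Sketch for illustration only: ve_struct (above) carries two reference counts. 'counter' pins the structure itself via get_ve()/put_ve() and the last drop ends in do_env_free(), while 'pcounter' tracks tasks inside the VE via pget_ve()/pput_ve() and the last drop schedules cleanup:

        static void ve_ref_demo(struct ve_struct *ve)
        {
                struct ve_struct *held;

                held = get_ve(ve);      /* pin the structure */
                pget_ve(held);          /* account one more task inside the VE */

                /* ... work on behalf of the VE ... */

                pput_ve(held);          /* last pput_ve() schedules ve_cleanup */
                put_ve(held);           /* last put_ve() ends in do_env_free() */
        }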
+ * + */ + +#ifndef __VE_H__ +#define __VE_H__ + +#ifdef CONFIG_VE + +struct ve_struct; + +#ifdef CONFIG_INET +void ip_fragment_cleanup(struct ve_struct *envid); +void tcp_v4_kill_ve_sockets(struct ve_struct *envid); +#ifdef CONFIG_VE_NETDEV +int venet_init(void); +#endif +#else +static inline void ip_fragment_cleanup(struct ve_struct *ve) { ; } +#endif + +extern struct list_head ve_list_head; +#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list) +extern rwlock_t ve_list_lock; +extern struct ve_struct *get_ve_by_id(envid_t); +extern struct ve_struct *__find_ve_by_id(envid_t); + +struct env_create_param3; +extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, + struct env_create_param3 *data, int datalen); +extern void ve_move_task(struct task_struct *, struct ve_struct *); + +int set_device_perms_ve(envid_t veid, unsigned type, dev_t dev, unsigned mask); +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); +void clean_device_perms_ve(envid_t veid); +extern struct file_operations proc_devperms_ops; + +enum { + VE_SS_CHAIN, + + VE_MAX_CHAINS +}; + +typedef int ve_hook_init_fn(void *data); +typedef void ve_hook_fini_fn(void *data); + +struct ve_hook +{ + ve_hook_init_fn *init; + ve_hook_fini_fn *fini; + struct module *owner; + + /* Functions are called in ascending priority */ + int priority; + + /* Private part */ + struct list_head list; +}; + +enum { + HOOK_PRIO_DEFAULT = 0, + + HOOK_PRIO_FS = HOOK_PRIO_DEFAULT, + + HOOK_PRIO_NET_PRE, + HOOK_PRIO_NET, + HOOK_PRIO_NET_POST, + + HOOK_PRIO_AFTERALL = INT_MAX +}; + +extern int ve_hook_iterate_init(int chain, void *data); +extern void ve_hook_iterate_fini(int chain, void *data); + +extern void ve_hook_register(int chain, struct ve_hook *vh); +extern void ve_hook_unregister(struct ve_hook *vh); +#else /* CONFIG_VE */ +#define ve_hook_register(ch, vh) do { } while (0) +#define ve_hook_unregister(ve) do { } while (0) + +#define get_device_perms_ve(t, d, a) (0) +#endif /* CONFIG_VE */ +#endif diff -uprN linux-2.6.24/include/linux/ve_task.h linux-2.6.24.ovz/include/linux/ve_task.h --- linux-2.6.24/include/linux/ve_task.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/ve_task.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,68 @@ +/* + * include/linux/ve_task.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
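Illustrative sketch of registering on the start/stop chain declared above; it assumes the chain passes the affected VE as the data argument, which these headers do not spell out:

        static int demo_ve_start(void *data)
        {
                struct ve_struct *ve = data;    /* assumption: VE being started */

                printk(KERN_DEBUG "VE %u starting\n", VEID(ve));
                return 0;
        }

        static void demo_ve_stop(void *data)
        {
        }

        static struct ve_hook demo_hook = {
                .init     = demo_ve_start,
                .fini     = demo_ve_stop,
                .owner    = THIS_MODULE,
                .priority = HOOK_PRIO_DEFAULT,
        };

        static int __init demo_init(void)
        {
                ve_hook_register(VE_SS_CHAIN, &demo_hook);
                return 0;
        }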
+ * + */ + +#ifndef __VE_TASK_H__ +#define __VE_TASK_H__ + +#include +#include + +struct ve_task_info { +/* virtualization */ + struct ve_struct *owner_env; + struct ve_struct *exec_env; + struct ve_struct *saved_env; + struct list_head vetask_list; + struct dentry *glob_proc_dentry; +/* statistics: scheduling latency */ + cycles_t sleep_time; + cycles_t sched_time; + cycles_t sleep_stamp; + cycles_t wakeup_stamp; + seqcount_t wakeup_lock; +}; + +#define VE_TASK_INFO(task) (&(task)->ve_task_info) +#define VE_TASK_LIST_2_TASK(lh) \ + list_entry(lh, struct task_struct, ve_task_info.vetask_list) + +#ifdef CONFIG_VE +extern struct ve_struct ve0; +#define get_ve0() (&ve0) + +#define ve_save_context(t) do { \ + t->ve_task_info.saved_env = \ + t->ve_task_info.exec_env; \ + t->ve_task_info.exec_env = get_ve0(); \ + } while (0) +#define ve_restore_context(t) do { \ + t->ve_task_info.exec_env = \ + t->ve_task_info.saved_env; \ + } while (0) + +#define get_exec_env() (current->ve_task_info.exec_env) +#define set_exec_env(ve) ({ \ + struct ve_task_info *vi; \ + struct ve_struct *old; \ + \ + vi = &current->ve_task_info; \ + old = vi->exec_env; \ + vi->exec_env = ve; \ + old; \ + }) +#else +#define get_ve0() (NULL) +#define get_exec_env() (NULL) +#define set_exec_env(new_env) (NULL) +#define ve_save_context(t) do { } while (0) +#define ve_restore_context(t) do { } while (0) +#endif + +#endif /* __VE_TASK_H__ */ diff -uprN linux-2.6.24/include/linux/veip.h linux-2.6.24.ovz/include/linux/veip.h --- linux-2.6.24/include/linux/veip.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/veip.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,15 @@ +#ifndef __VE_IP_H_ +#define __VE_IP_H_ + +struct ve_addr_struct { + int family; + __u32 key[4]; +}; + +struct sockaddr; + +extern void veaddr_print(char *, int, struct ve_addr_struct *); +extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, + struct ve_addr_struct *veaddr); + +#endif diff -uprN linux-2.6.24/include/linux/venet.h linux-2.6.24.ovz/include/linux/venet.h --- linux-2.6.24/include/linux/venet.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/venet.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,72 @@ +/* + * include/linux/venet.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file.
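A minimal usage sketch (not from the patch) of the context-switch macros above; set_exec_env() hands back the previous context so the caller can restore it:

        static void run_in_ve_context(struct ve_struct *ve)
        {
                struct ve_struct *old_env;

                old_env = set_exec_env(ve);
                /* code here observes 've' through get_exec_env() */
                (void)set_exec_env(old_env);
        }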
+ * + */ + +#ifndef _VENET_H +#define _VENET_H + +#include +#include +#include +#include + +#define VEIP_HASH_SZ 512 + +struct ve_struct; +struct venet_stat; +struct ip_entry_struct +{ + struct ve_addr_struct addr; + struct ve_struct *active_env; + struct venet_stat *stat; + struct veip_struct *veip; + struct list_head ip_hash; + struct list_head ve_list; +}; + +struct veip_struct +{ + struct list_head src_lh; + struct list_head dst_lh; + struct list_head ip_lh; + struct list_head list; + envid_t veid; +}; + +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); +/* veip_hash_lock should be taken for write by caller */ +void ip_entry_unhash(struct ip_entry_struct *entry); +/* veip_hash_lock should be taken for read by caller */ +struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *); + +/* veip_hash_lock should be taken for read by caller */ +struct veip_struct *veip_find(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +struct veip_struct *veip_findcreate(envid_t veid); +/* veip_hash_lock should be taken for write by caller */ +void veip_put(struct veip_struct *veip); + +extern struct list_head veip_lh; + +int veip_start(struct ve_struct *ve); +void veip_stop(struct ve_struct *ve); +__exit void veip_cleanup(void); +int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr); +int veip_entry_del(envid_t veid, struct ve_addr_struct *addr); +int venet_change_skb_owner(struct sk_buff *skb); + +extern struct list_head ip_entry_hash_table[]; +extern rwlock_t veip_hash_lock; + +#ifdef CONFIG_PROC_FS +int veip_seq_show(struct seq_file *m, void *v); +#endif + +#endif diff -uprN linux-2.6.24/include/linux/veprintk.h linux-2.6.24.ovz/include/linux/veprintk.h --- linux-2.6.24/include/linux/veprintk.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/veprintk.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,38 @@ +/* + * include/linux/veprintk.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VE_PRINTK_H__ +#define __VE_PRINTK_H__ + +#ifdef CONFIG_VE + +#define ve_log_wait (*(get_exec_env()->_log_wait)) +#define ve_log_start (*(get_exec_env()->_log_start)) +#define ve_log_end (*(get_exec_env()->_log_end)) +#define ve_logged_chars (*(get_exec_env()->_logged_chars)) +#define ve_log_buf (get_exec_env()->log_buf) +#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ + log_buf_len : VE_DEFAULT_LOG_BUF_LEN) +#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) +#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) + +#else + +#define ve_log_wait log_wait +#define ve_log_start log_start +#define ve_log_end log_end +#define ve_logged_chars logged_chars +#define ve_log_buf log_buf +#define ve_log_buf_len log_buf_len +#define VE_LOG_BUF_MASK LOG_BUF_MASK +#define VE_LOG_BUF(idx) LOG_BUF(idx) + +#endif /* CONFIG_VE */ +#endif /* __VE_PRINTK_H__ */ diff -uprN linux-2.6.24/include/linux/virtinfo.h linux-2.6.24.ovz/include/linux/virtinfo.h --- linux-2.6.24/include/linux/virtinfo.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/virtinfo.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,99 @@ +/* + * include/linux/virtinfo.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
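Sketch only, following the locking comments above: a reader takes veip_hash_lock for read around the lookup helper:

        static struct venet_stat *veip_stat_lookup(struct ve_addr_struct *addr)
        {
                struct ip_entry_struct *entry;
                struct venet_stat *stat = NULL;

                read_lock(&veip_hash_lock);
                entry = venet_entry_lookup(addr);
                if (entry != NULL)
                        stat = entry->stat;
                read_unlock(&veip_hash_lock);

                return stat;
        }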
+ * + */ + +#ifndef __LINUX_VIRTINFO_H +#define __LINUX_VIRTINFO_H + +#include +#include +#include + +struct vnotifier_block +{ + int (*notifier_call)(struct vnotifier_block *self, + unsigned long, void *, int); + struct vnotifier_block *next; + int priority; +}; + +extern struct semaphore virtinfo_sem; +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb); +void virtinfo_notifier_register(int type, struct vnotifier_block *nb); +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); +int virtinfo_notifier_call(int type, unsigned long n, void *data); + +struct page_info { + unsigned long nr_file_dirty; + unsigned long nr_writeback; + unsigned long nr_anon_pages; + unsigned long nr_file_mapped; + unsigned long nr_slab_rec; + unsigned long nr_slab_unrec; + unsigned long nr_pagetable; + unsigned long nr_unstable_nfs; + unsigned long nr_bounce; +}; + +struct meminfo { + struct sysinfo si; + struct page_info pi; + unsigned long active, inactive; + unsigned long cache, swapcache; + unsigned long committed_space; + unsigned long allowed; + unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; +}; + +#define VIRTINFO_MEMINFO 0 +#define VIRTINFO_ENOUGHMEM 1 +#define VIRTINFO_DOFORK 2 +#define VIRTINFO_DOEXIT 3 +#define VIRTINFO_DOEXECVE 4 +#define VIRTINFO_DOFORKRET 5 +#define VIRTINFO_DOFORKPOST 6 +#define VIRTINFO_EXIT 7 +#define VIRTINFO_EXITMMAP 8 +#define VIRTINFO_EXECMMAP 9 +#define VIRTINFO_OUTOFMEM 10 +#define VIRTINFO_PAGEIN 11 +#define VIRTINFO_SYSINFO 12 +#define VIRTINFO_NEWUBC 13 +#define VIRTINFO_VMSTAT 14 + +enum virt_info_types { + VITYPE_GENERAL, + VITYPE_FAUDIT, + VITYPE_QUOTA, + VITYPE_SCP, + + VIRT_TYPES +}; + +#ifdef CONFIG_VZ_GENCALLS + +static inline int virtinfo_gencall(unsigned long n, void *data) +{ + int r; + + r = virtinfo_notifier_call(VITYPE_GENERAL, n, data); + if (r & NOTIFY_FAIL) + return -ENOBUFS; + if (r & NOTIFY_OK) + return -ERESTARTNOINTR; + return 0; +} + +#else + +#define virtinfo_gencall(n, data) 0 + +#endif + +#endif /* __LINUX_VIRTINFO_H */ diff -uprN linux-2.6.24/include/linux/virtinfoscp.h linux-2.6.24.ovz/include/linux/virtinfoscp.h --- linux-2.6.24/include/linux/virtinfoscp.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/virtinfoscp.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,21 @@ +#ifndef __VIRTINFO_SCP_H__ +#define __VIRTINFO_SCP_H__ + +/* + * Dump and restore operations are non-symmetric. + * With respect to finish/fail hooks, 2 dump hooks are called from + * different proc operations, but restore hooks are called from a single one. 
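Illustrative sketch of a subscriber on the VITYPE_GENERAL chain above; it assumes the final int argument and the return value carry the accumulated notifier result that virtinfo_gencall() checks against NOTIFY_OK/NOTIFY_FAIL, and that VIRTINFO_MEMINFO passes a struct meminfo pointer as data:

        static int demo_virtinfo_call(struct vnotifier_block *self,
                        unsigned long event, void *data, int old_ret)
        {
                if (event == VIRTINFO_MEMINFO) {
                        /* data is assumed to be the struct meminfo being filled */
                }
                return old_ret;
        }

        static struct vnotifier_block demo_virtinfo_nb = {
                .notifier_call = demo_virtinfo_call,
                .priority = 0,
        };

        static int __init demo_virtinfo_init(void)
        {
                virtinfo_notifier_register(VITYPE_GENERAL, &demo_virtinfo_nb);
                return 0;
        }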
+ */ +#define VIRTINFO_SCP_COLLECT 0x10 +#define VIRTINFO_SCP_DUMP 0x11 +#define VIRTINFO_SCP_DMPFIN 0x12 +#define VIRTINFO_SCP_RSTCHECK 0x13 +#define VIRTINFO_SCP_RESTORE 0x14 +#define VIRTINFO_SCP_RSTFAIL 0x15 + +#define VIRTINFO_SCP_RSTTSK 0x20 +#define VIRTINFO_SCP_RSTMM 0x21 + +#define VIRTNOTIFY_CHANGE 0x100 + +#endif /* __VIRTINFO_SCP_H__ */ diff -uprN linux-2.6.24/include/linux/vmalloc.h linux-2.6.24.ovz/include/linux/vmalloc.h --- linux-2.6.24/include/linux/vmalloc.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vmalloc.h 2008-03-25 18:53:59.000000000 -0500 @@ -22,6 +22,10 @@ struct vm_area_struct; #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ #endif +/* align size to 2^n page boundary */ +#define POWER2_PAGE_ALIGN(size) \ + ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) + struct vm_struct { /* keep next,addr,size together to speedup lookups */ struct vm_struct *next; @@ -37,12 +41,16 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); +extern void *ub_vmalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); +extern void *ub_vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *vmalloc_best(unsigned long size); +extern void *ub_vmalloc_best(unsigned long size); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); extern void vfree(void *addr); @@ -68,6 +76,9 @@ static inline size_t get_vm_area_size(co extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct * get_vm_area_best(unsigned long size, + unsigned long flags); +extern void vprintstat(void); extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask); diff -uprN linux-2.6.24/include/linux/vmstat.h linux-2.6.24.ovz/include/linux/vmstat.h --- linux-2.6.24/include/linux/vmstat.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vmstat.h 2008-03-25 18:53:59.000000000 -0500 @@ -79,6 +79,7 @@ static inline void count_vm_events(enum put_cpu(); } +extern unsigned long vm_events(enum vm_event_item i); extern void all_vm_events(unsigned long *); #ifdef CONFIG_HOTPLUG extern void vm_events_fold_cpu(int cpu); diff -uprN linux-2.6.24/include/linux/vzcalluser.h linux-2.6.24.ovz/include/linux/vzcalluser.h --- linux-2.6.24/include/linux/vzcalluser.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzcalluser.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,242 @@ +/* + * include/linux/vzcalluser.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
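Worked example, not part of the patch: with 4 KiB pages, POWER2_PAGE_ALIGN(20000) above evaluates to 1UL << (PAGE_SHIFT + get_order(20000)) = 1UL << 15 = 32768, i.e. the size is rounded up to a whole power-of-two number of pages; ub_vmalloc() is the allocation variant charged to the caller's beancounter:

        static void *demo_ub_buffer(void)
        {
                /* charged to the current beancounter, unlike plain vmalloc() */
                return ub_vmalloc(POWER2_PAGE_ALIGN(20000));    /* 32768 bytes on 4 KiB pages */
        }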
+ * + */ + +#ifndef _LINUX_VZCALLUSER_H +#define _LINUX_VZCALLUSER_H + +#include +#include + +#define KERN_VZ_PRIV_RANGE 51 + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +#ifndef __KERNEL__ +#define __user +#endif + +/* + * VE management ioctls + */ + +struct vzctl_old_env_create { + envid_t veid; + unsigned flags; +#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ +#define VE_EXCLUSIVE 2 /* Fail if exists */ +#define VE_ENTER 4 /* Enter existing VE */ +#define VE_TEST 8 /* Test if VE exists */ +#define VE_LOCK 16 /* Do not allow entering created VE */ +#define VE_SKIPLOCK 32 /* Allow entering embrion VE */ + __u32 addr; +}; + +struct vzctl_mark_env_to_down { + envid_t veid; +}; + +struct vzctl_setdevperms { + envid_t veid; + unsigned type; +#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ +#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ +#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ + unsigned dev; + unsigned mask; +}; + +struct vzctl_ve_netdev { + envid_t veid; + int op; +#define VE_NETDEV_ADD 1 +#define VE_NETDEV_DEL 2 + char __user *dev_name; +}; + +struct vzctl_ve_meminfo { + envid_t veid; + unsigned long val; +}; + +/* these masks represent modules */ +#define VE_IP_IPTABLES_MOD (1U<<0) +#define VE_IP_FILTER_MOD (1U<<1) +#define VE_IP_MANGLE_MOD (1U<<2) +#define VE_IP_CONNTRACK_MOD (1U<<14) +#define VE_IP_CONNTRACK_FTP_MOD (1U<<15) +#define VE_IP_CONNTRACK_IRC_MOD (1U<<16) +#define VE_IP_NAT_MOD (1U<<20) +#define VE_IP_NAT_FTP_MOD (1U<<21) +#define VE_IP_NAT_IRC_MOD (1U<<22) +#define VE_IP_IPTABLES6_MOD (1U<<26) +#define VE_IP_FILTER6_MOD (1U<<27) +#define VE_IP_MANGLE6_MOD (1U<<28) +#define VE_IP_IPTABLE_NAT_MOD (1U<<29) +#define VE_NF_CONNTRACK_MOD (1U<<30) + +/* these masks represent modules with their dependences */ +#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD) +#define VE_IP_FILTER (VE_IP_FILTER_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_IPTABLES6 (VE_IP_IPTABLES6_MOD) +#define VE_IP_FILTER6 (VE_IP_FILTER6_MOD | VE_IP_IPTABLES6) +#define VE_IP_MANGLE6 (VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6) +#define VE_NF_CONNTRACK (VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES) +#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \ + | VE_IP_IPTABLES) +#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_NAT (VE_IP_NAT_MOD \ + | VE_IP_CONNTRACK) +#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \ + | VE_IP_NAT | VE_IP_CONNTRACK_FTP) +#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \ + | VE_IP_NAT | VE_IP_CONNTRACK_IRC) +#define VE_IP_IPTABLE_NAT (VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK) + +/* safe iptables mask to be used by default */ +#define VE_IP_DEFAULT \ + (VE_IP_IPTABLES | \ + VE_IP_FILTER | VE_IP_MANGLE) + +#define VE_IPT_CMP(x,y) (((x) & (y)) == (y)) + +struct vzctl_env_create_cid { + envid_t veid; + unsigned flags; + __u32 class_id; +}; + +struct vzctl_env_create { + envid_t veid; + unsigned flags; + __u32 class_id; +}; + +struct env_create_param { + __u64 iptables_mask; +}; + +#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param) + +struct env_create_param2 { + __u64 iptables_mask; + __u64 feature_mask; + __u32 total_vcpus; /* 0 - don't care, same as in host */ +}; + +struct env_create_param3 { + __u64 iptables_mask; + __u64 feature_mask; + __u32 total_vcpus; + __u32 pad; + __u64 known_features; +}; + +#define 
VE_FEATURE_SYSFS (1ULL << 0) +#define VE_FEATURE_NFS (1ULL << 1) +#define VE_FEATURE_DEF_PERMS (1ULL << 2) + +#define VE_FEATURES_OLD (VE_FEATURE_SYSFS) +#define VE_FEATURES_DEF (VE_FEATURE_SYSFS | \ + VE_FEATURE_DEF_PERMS) + +typedef struct env_create_param3 env_create_param_t; +#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(env_create_param_t) + +struct vzctl_env_create_data { + envid_t veid; + unsigned flags; + __u32 class_id; + env_create_param_t __user *data; + int datalen; +}; + +struct vz_load_avg { + int val_int; + int val_frac; +}; + +struct vz_cpu_stat { + unsigned long user_jif; + unsigned long nice_jif; + unsigned long system_jif; + unsigned long uptime_jif; + __u64 idle_clk; + __u64 strv_clk; + __u64 uptime_clk; + struct vz_load_avg avenrun[3]; /* loadavg data */ +}; + +struct vzctl_cpustatctl { + envid_t veid; + struct vz_cpu_stat __user *cpustat; +}; + +#define VZCTLTYPE '.' +#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ + struct vzctl_old_env_create) +#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ + struct vzctl_mark_env_to_down) +#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ + struct vzctl_setdevperms) +#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ + struct vzctl_env_create_cid) +#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ + struct vzctl_env_create) +#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ + struct vzctl_cpustatctl) +#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct vzctl_env_create_data) +#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct vzctl_ve_netdev) +#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct vzctl_ve_meminfo) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +#include + +struct compat_vzctl_ve_netdev { + envid_t veid; + int op; + compat_uptr_t dev_name; +}; + +struct compat_vzctl_ve_meminfo { + envid_t veid; + compat_ulong_t val; +}; + +struct compat_vzctl_env_create_data { + envid_t veid; + unsigned flags; + __u32 class_id; + compat_uptr_t data; + int datalen; +}; + +#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ + struct compat_vzctl_env_create_data) +#define VZCTL_COMPAT_VE_NETDEV _IOW(VZCTLTYPE, 11, \ + struct compat_vzctl_ve_netdev) +#define VZCTL_COMPAT_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ + struct compat_vzctl_ve_meminfo) +#endif +#endif + +#endif diff -uprN linux-2.6.24/include/linux/vzctl.h linux-2.6.24.ovz/include/linux/vzctl.h --- linux-2.6.24/include/linux/vzctl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzctl.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,30 @@ +/* + * include/linux/vzctl.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _LINUX_VZCTL_H +#define _LINUX_VZCTL_H + +#include + +struct module; +struct inode; +struct file; +struct vzioctlinfo { + unsigned type; + int (*ioctl)(struct file *, unsigned int, unsigned long); + int (*compat_ioctl)(struct file *, unsigned int, unsigned long); + struct module *owner; + struct list_head list; +}; + +extern void vzioctl_register(struct vzioctlinfo *inf); +extern void vzioctl_unregister(struct vzioctlinfo *inf); + +#endif diff -uprN linux-2.6.24/include/linux/vzctl_quota.h linux-2.6.24.ovz/include/linux/vzctl_quota.h --- linux-2.6.24/include/linux/vzctl_quota.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzctl_quota.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,74 @@ +/* + * include/linux/vzctl_quota.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. 
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __LINUX_VZCTL_QUOTA_H__ +#define __LINUX_VZCTL_QUOTA_H__ + +#include + +#ifndef __KERNEL__ +#define __user +#endif + +/* + * Quota management ioctl + */ + +struct vz_quota_stat; +struct vzctl_quotactl { + int cmd; + unsigned int quota_id; + struct vz_quota_stat __user *qstat; + char __user *ve_root; +}; + +struct vzctl_quotaugidctl { + int cmd; /* subcommand */ + unsigned int quota_id; /* quota id where it applies to */ + unsigned int ugid_index;/* for reading statistic. index of first + uid/gid record to read */ + unsigned int ugid_size; /* size of ugid_buf array */ + void *addr; /* user-level buffer */ +}; + +#define VZDQCTLTYPE '+' +#define VZCTL_QUOTA_DEPR_CTL _IOWR(VZDQCTLTYPE, 1, \ + struct vzctl_quotactl) +#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \ + struct vzctl_quotactl) +#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ + struct vzctl_quotaugidctl) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +struct compat_vzctl_quotactl { + int cmd; + unsigned int quota_id; + compat_uptr_t qstat; + compat_uptr_t ve_root; +}; + +struct compat_vzctl_quotaugidctl { + int cmd; /* subcommand */ + unsigned int quota_id; /* quota id where it applies to */ + unsigned int ugid_index;/* for reading statistic. index of first + uid/gid record to read */ + unsigned int ugid_size; /* size of ugid_buf array */ + compat_uptr_t addr; /* user-level buffer */ +}; + +#define VZCTL_COMPAT_QUOTA_CTL _IOWR(VZDQCTLTYPE, 2, \ + struct compat_vzctl_quotactl) +#define VZCTL_COMPAT_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ + struct compat_vzctl_quotaugidctl) +#endif +#endif + +#endif /* __LINUX_VZCTL_QUOTA_H__ */ diff -uprN linux-2.6.24/include/linux/vzctl_venet.h linux-2.6.24.ovz/include/linux/vzctl_venet.h --- linux-2.6.24/include/linux/vzctl_venet.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzctl_venet.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,51 @@ +/* + * include/linux/vzctl_venet.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZCTL_VENET_H +#define _VZCTL_VENET_H + +#include +#include +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +struct vzctl_ve_ip_map { + envid_t veid; + int op; +#define VE_IP_ADD 1 +#define VE_IP_DEL 2 + struct sockaddr *addr; + int addrlen; +}; + +#define VENETCTLTYPE '(' + +#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ + struct vzctl_ve_ip_map) + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +struct compat_vzctl_ve_ip_map { + envid_t veid; + int op; + compat_uptr_t addr; + int addrlen; +}; + +#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ + struct compat_vzctl_ve_ip_map) +#endif +#endif + +#endif diff -uprN linux-2.6.24/include/linux/vzctl_veth.h linux-2.6.24.ovz/include/linux/vzctl_veth.h --- linux-2.6.24/include/linux/vzctl_veth.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzctl_veth.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,42 @@ +/* + * include/linux/vzctl_veth.h + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#ifndef _VZCTL_VETH_H +#define _VZCTL_VETH_H + +#include +#include + +#ifndef __ENVID_T_DEFINED__ +typedef unsigned envid_t; +#define __ENVID_T_DEFINED__ +#endif + +struct vzctl_ve_hwaddr { + envid_t veid; + int op; +#define VE_ETH_ADD 1 +#define VE_ETH_DEL 2 +#define VE_ETH_ALLOW_MAC_CHANGE 3 +#define VE_ETH_DENY_MAC_CHANGE 4 + unsigned char dev_addr[6]; + int addrlen; + char dev_name[16]; + unsigned char dev_addr_ve[6]; + int addrlen_ve; + char dev_name_ve[16]; +}; + +#define VETHCTLTYPE '[' + +#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \ + struct vzctl_ve_hwaddr) + +#endif diff -uprN linux-2.6.24/include/linux/vzdq_tree.h linux-2.6.24.ovz/include/linux/vzdq_tree.h --- linux-2.6.24/include/linux/vzdq_tree.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzdq_tree.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,99 @@ +/* + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * This file contains Virtuozzo disk quota tree definition + */ + +#ifndef _VZDQ_TREE_H +#define _VZDQ_TREE_H + +#include +#include + +typedef unsigned int quotaid_t; +#define QUOTAID_BITS 32 +#define QUOTAID_BBITS 4 +#define QUOTAID_EBITS 8 + +#if QUOTAID_EBITS % QUOTAID_BBITS +#error Quota bit assumption failure +#endif + +#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) +#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) +#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ + / QUOTAID_BBITS) +#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ + / QUOTAID_EBITS) +#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) + +/* + * Depth of keeping unused node (not inclusive). + * 0 means release all nodes including root, + * QUOTATREE_DEPTH means never release nodes. + * Current value: release all nodes strictly after QUOTATREE_EDEPTH + * (measured in external shift units). + */ +#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ + - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ + + 1) + +/* + * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. + * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), + * and each node contains 2^QUOTAID_BBITS pointers. + * Level 0 is a (single) tree root node. + * + * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. + * Nodes of lower levels contain pointers to nodes. + * + * Double pointer in array of i-level node, pointing to a (i+1)-level node + * (such as inside quotatree_find_state) are marked by level (i+1), not i. + * Level 0 double pointer is a pointer to root inside tree struct. + * + * The tree is permanent, i.e. all index blocks allocated are keeped alive to + * preserve the blocks numbers in the quota file tree to keep its changes + * locally. 
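+ *
+ * Lookup sketch derived only from the macros above (not the actual
+ * implementation; intermediate nodes are assumed non-NULL):
+ *
+ *	struct quotatree_node *node = tree->root;
+ *	int lvl;
+ *
+ *	for (lvl = 0; lvl < QUOTATREE_DEPTH - 1; lvl++)
+ *		node = node->blocks[(id >> QUOTATREE_BSHIFT(lvl))
+ *						& QUOTATREE_BMASK];
+ *	data = node->blocks[id & QUOTATREE_BMASK];
+ *
+ * With QUOTAID_BBITS = 4 the 32-bit id is thus consumed one nibble per
+ * level: the top nibble indexes the root, and the lowest nibble indexes a
+ * level (QUOTATREE_DEPTH - 1) node whose blocks[] entries hold the data
+ * pointers.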
+ */ +struct quotatree_node { + struct list_head list; + quotaid_t num; + void *blocks[QUOTATREE_BSIZE]; +}; + +struct quotatree_level { + struct list_head usedlh, freelh; + quotaid_t freenum; +}; + +struct quotatree_tree { + struct quotatree_level levels[QUOTATREE_DEPTH]; + struct quotatree_node *root; + unsigned int leaf_num; +}; + +struct quotatree_find_state { + void **block; + int level; +}; + +/* number of leafs (objects) and leaf level of the tree */ +#define QTREE_LEAFNUM(tree) ((tree)->leaf_num) +#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1]) + +struct quotatree_tree *quotatree_alloc(void); +void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st); +int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, + struct quotatree_find_state *st, void *data); +void quotatree_remove(struct quotatree_tree *tree, quotaid_t id); +void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)); +void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id); +void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index); + +#endif /* _VZDQ_TREE_H */ + diff -uprN linux-2.6.24/include/linux/vzevent.h linux-2.6.24.ovz/include/linux/vzevent.h --- linux-2.6.24/include/linux/vzevent.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzevent.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,13 @@ +#ifndef __LINUX_VZ_EVENT_H__ +#define __LINUX_VZ_EVENT_H__ + +#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE) +extern int vzevent_send(int msg, const char *attrs_fmt, ...); +#else +static inline int vzevent_send(int msg, const char *attrs_fmt, ...) +{ + return 0; +} +#endif + +#endif /* __LINUX_VZ_EVENT_H__ */ diff -uprN linux-2.6.24/include/linux/vzquota.h linux-2.6.24.ovz/include/linux/vzquota.h --- linux-2.6.24/include/linux/vzquota.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzquota.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,380 @@ +/* + * + * Copyright (C) 2001-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * This file contains Virtuozzo disk quota implementation + */ + +#ifndef _VZDQUOTA_H +#define _VZDQUOTA_H + +#include +#include + +/* vzquotactl syscall commands */ +#define VZ_DQ_CREATE 5 /* create quota master block */ +#define VZ_DQ_DESTROY 6 /* destroy qmblk */ +#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */ +#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */ +#define VZ_DQ_SETLIMIT 9 /* set new limits */ +#define VZ_DQ_GETSTAT 10 /* get usage statistic */ +#define VZ_DQ_OFF_FORCED 11 /* forced off */ +/* set of syscalls to maintain UGID quotas */ +#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */ +#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */ +#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */ +#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */ +#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */ +#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */ +#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */ +#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */ + +/* common structure for vz and ugid quota */ +struct dq_stat { + /* blocks limits */ + __u64 bhardlimit; /* absolute limit in bytes */ + __u64 bsoftlimit; /* preferred limit in bytes */ + time_t btime; /* time limit for excessive disk use */ + __u64 bcurrent; /* current bytes count */ + /* inodes limits */ + __u32 ihardlimit; /* absolute limit on allocated inodes */ + __u32 isoftlimit; /* preferred inode limit */ + time_t itime; /* time limit for excessive inode use */ + __u32 icurrent; /* current # allocated inodes */ +}; + +/* One second resolution for grace times */ +#define CURRENT_TIME_SECONDS (get_seconds()) + +/* Values for dq_info->flags */ +#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ +#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ + +struct dq_info { + time_t bexpire; /* expire timeout for excessive disk use */ + time_t iexpire; /* expire timeout for excessive inode use */ + unsigned flags; /* see previos defines */ +}; + +struct vz_quota_stat { + struct dq_stat dq_stat; + struct dq_info dq_info; +}; + +/* UID/GID interface record - for user-kernel level exchange */ +struct vz_quota_iface { + unsigned int qi_id; /* UID/GID this applies to */ + unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ + struct dq_stat qi_stat; /* limits, options, usage stats */ +}; + +#ifdef CONFIG_COMPAT +#include +struct compat_dq_stat { + /* blocks limits */ + __u64 bhardlimit; /* absolute limit in bytes */ + __u64 bsoftlimit; /* preferred limit in bytes */ + compat_time_t btime; /* time limit for excessive disk use */ + __u64 bcurrent; /* current bytes count */ + /* inodes limits */ + __u32 ihardlimit; /* absolute limit on allocated inodes */ + __u32 isoftlimit; /* preferred inode limit */ + compat_time_t itime; /* time limit for excessive inode use */ + __u32 icurrent; /* current # allocated inodes */ +}; + +struct compat_dq_info { + compat_time_t bexpire; /* expire timeout for excessive disk use */ + compat_time_t iexpire; /* expire timeout for excessive inode use */ + unsigned flags; /* see previos defines */ +}; + +struct compat_vz_quota_stat { + struct compat_dq_stat dq_stat; + struct compat_dq_info dq_info; +}; + +struct compat_vz_quota_iface { + unsigned int qi_id; /* UID/GID this applies to */ + unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ + struct compat_dq_stat qi_stat; /* limits, options, usage stats */ +}; + +static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs, + 
struct dq_stat *dqs) +{ + dqs->bhardlimit = odqs->bhardlimit; + dqs->bsoftlimit = odqs->bsoftlimit; + dqs->bcurrent = odqs->bcurrent; + dqs->btime = odqs->btime; + + dqs->ihardlimit = odqs->ihardlimit; + dqs->isoftlimit = odqs->isoftlimit; + dqs->icurrent = odqs->icurrent; + dqs->itime = odqs->itime; +} + +static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi, + struct dq_info *dqi) +{ + dqi->bexpire = odqi->bexpire; + dqi->iexpire = odqi->iexpire; + dqi->flags = odqi->flags; +} + +static inline void dqstat2compat_dqstat(struct dq_stat *dqs, + struct compat_dq_stat *odqs) +{ + odqs->bhardlimit = dqs->bhardlimit; + odqs->bsoftlimit = dqs->bsoftlimit; + odqs->bcurrent = dqs->bcurrent; + odqs->btime = (compat_time_t)dqs->btime; + + odqs->ihardlimit = dqs->ihardlimit; + odqs->isoftlimit = dqs->isoftlimit; + odqs->icurrent = dqs->icurrent; + odqs->itime = (compat_time_t)dqs->itime; +} + +static inline void dqinfo2compat_dqinfo(struct dq_info *dqi, + struct compat_dq_info *odqi) +{ + odqi->bexpire = (compat_time_t)dqi->bexpire; + odqi->iexpire = (compat_time_t)dqi->iexpire; + odqi->flags = dqi->flags; +} +#endif + +/* values for flags and dq_flags */ +/* this flag is set if the userspace has been unable to provide usage + * information about all ugids + * if the flag is set, we don't allocate new UG quota blocks (their + * current usage is unknown) or free existing UG quota blocks (not to + * lose information that this block is ok) */ +#define VZDQUG_FIXED_SET 0x01 +/* permit to use ugid quota */ +#define VZDQUG_ON 0x02 +#define VZDQ_USRQUOTA 0x10 +#define VZDQ_GRPQUOTA 0x20 +#define VZDQ_NOACT 0x1000 /* not actual */ +#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ + +struct vz_quota_ugid_stat { + unsigned int limit; /* max amount of ugid records */ + unsigned int count; /* amount of ugid records */ + unsigned int flags; +}; + +struct vz_quota_ugid_setlimit { + unsigned int type; /* quota type (USR/GRP) */ + unsigned int id; /* ugid */ + struct if_dqblk dqb; /* limits info */ +}; + +struct vz_quota_ugid_setinfo { + unsigned int type; /* quota type (USR/GRP) */ + struct if_dqinfo dqi; /* grace info */ +}; + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include + +/* Values for dq_info flags */ +#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ +#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ + +/* values for dq_state */ +#define VZDQ_STARTING 0 /* created, not turned on yet */ +#define VZDQ_WORKING 1 /* quota created, turned on */ +#define VZDQ_STOPING 2 /* created, turned on and off */ + +/* master quota record - one per veid */ +struct vz_quota_master { + struct list_head dq_hash; /* next quota in hash list */ + atomic_t dq_count; /* inode reference count */ + unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ + unsigned int dq_state; /* see values above */ + unsigned int dq_id; /* VEID this applies to */ + struct dq_stat dq_stat; /* limits, grace, usage stats */ + struct dq_info dq_info; /* grace times and flags */ + spinlock_t dq_data_lock; /* for dq_stat */ + + struct semaphore dq_sem; /* semaphore to protect + ugid tree */ + + struct list_head dq_ilink_list; /* list of vz_quota_ilink */ + struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ + struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ + unsigned int dq_ugid_count; /* amount of ugid records */ + unsigned int dq_ugid_max; /* max amount of ugid records */ + struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ + + 
struct dentry *dq_root_dentry;/* dentry of fs tree */ + struct vfsmount *dq_root_mnt; /* vfsmnt of this dentry */ + struct super_block *dq_sb; /* superblock of our quota root */ +}; + +/* UID/GID quota record - one per pair (quota_master, uid or gid) */ +struct vz_quota_ugid { + unsigned int qugid_id; /* UID/GID this applies to */ + struct dq_stat qugid_stat; /* limits, options, usage stats */ + int qugid_type; /* USRQUOTA|GRPQUOTA */ + atomic_t qugid_count; /* reference count */ +}; + +#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) + +struct vz_quota_datast { + struct vz_quota_ilink qlnk; +}; + +#define VIRTINFO_QUOTA_GETSTAT 0 +#define VIRTINFO_QUOTA_ON 1 +#define VIRTINFO_QUOTA_OFF 2 +#define VIRTINFO_QUOTA_DISABLE 3 + +struct virt_info_quota { + struct super_block *super; + struct dq_stat *qstat; +}; + +/* + * Interface to VZ quota core + */ +#define INODE_QLNK(inode) (&(inode)->i_qlnk) +#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) + +#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) + +#define VZ_QUOTAO_SETE 1 +#define VZ_QUOTAO_INIT 2 +#define VZ_QUOTAO_DESTR 3 +#define VZ_QUOTAO_SWAP 4 +#define VZ_QUOTAO_INICAL 5 +#define VZ_QUOTAO_DRCAL 6 +#define VZ_QUOTAO_QSET 7 +#define VZ_QUOTAO_TRANS 8 +#define VZ_QUOTAO_ACT 9 +#define VZ_QUOTAO_DTREE 10 +#define VZ_QUOTAO_DET 11 +#define VZ_QUOTAO_ON 12 +#define VZ_QUOTAO_RE_LOCK 13 + +#define DQUOT_CMD_ALLOC 0 +#define DQUOT_CMD_PREALLOC 1 +#define DQUOT_CMD_CHECK 12 +#define DQUOT_CMD_FORCE 13 + +extern struct semaphore vz_quota_sem; +void inode_qmblk_lock(struct super_block *sb); +void inode_qmblk_unlock(struct super_block *sb); +void qmblk_data_read_lock(struct vz_quota_master *qmblk); +void qmblk_data_read_unlock(struct vz_quota_master *qmblk); +void qmblk_data_write_lock(struct vz_quota_master *qmblk); +void qmblk_data_write_unlock(struct vz_quota_master *qmblk); + +/* for quota operations */ +void vzquota_inode_init_call(struct inode *inode); +void vzquota_inode_drop_call(struct inode *inode); +int vzquota_inode_transfer_call(struct inode *, struct iattr *); +struct vz_quota_master *vzquota_inode_data(struct inode *inode, + struct vz_quota_datast *); +void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); +int vzquota_rename_check(struct inode *inode, + struct inode *old_dir, struct inode *new_dir); +struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); +/* for second-level quota */ +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); +/* for management operations */ +struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, + struct vz_quota_stat *qstat); +void vzquota_free_master(struct vz_quota_master *); +struct vz_quota_master *vzquota_find_master(unsigned int quota_id); +int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, + struct vz_quota_master *qmblk, char __user *buf); +int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, + char __user *buf, int force); +int vzquota_get_super(struct super_block *sb); +void vzquota_put_super(struct super_block *sb); + +static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) +{ + if (!atomic_read(&qmblk->dq_count)) + BUG(); + atomic_inc(&qmblk->dq_count); + return qmblk; +} + +static inline void __qmblk_put(struct vz_quota_master *qmblk) +{ + atomic_dec(&qmblk->dq_count); +} + +static inline void qmblk_put(struct vz_quota_master *qmblk) +{ + if (!atomic_dec_and_test(&qmblk->dq_count)) + return; + vzquota_free_master(qmblk); +} + +extern 
struct list_head vzquota_hash_table[]; +extern int vzquota_hash_size; + +/* + * Interface to VZ UGID quota + */ +extern struct quotactl_ops vz_quotactl_operations; +extern struct dquot_operations vz_quota_operations2; +extern struct quota_format_type vz_quota_empty_v2_format; + +#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \ + qmblk->dq_uid_tree : \ + qmblk->dq_gid_tree) + +#define VZDQUG_FIND_DONT_ALLOC 1 +#define VZDQUG_FIND_FAKE 2 +struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, + unsigned int quota_id, int type, int flags); +struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); +void vzquota_put_ugid(struct vz_quota_master *qmblk, + struct vz_quota_ugid *qugid); +void vzquota_kill_ugid(struct vz_quota_master *qmblk); +int vzquota_ugid_init(void); +void vzquota_ugid_release(void); +int vzquota_transfer_usage(struct inode *inode, int mask, + struct vz_quota_ilink *qlnk); +void vzquota_inode_off(struct inode *inode); + +long do_vzquotaugidctl(int cmd, unsigned int quota_id, + unsigned int ugid_index, unsigned int ugid_size, + void *addr, int compat); + +/* + * Other VZ quota parts + */ +extern struct dquot_operations vz_quota_operations; + +long do_vzquotactl(int cmd, unsigned int quota_id, + struct vz_quota_stat __user *qstat, const char __user *ve_root, + int compat); +int vzquota_proc_init(void); +void vzquota_proc_release(void); +struct vz_quota_master *vzquota_find_qmblk(struct super_block *); +extern struct semaphore vz_quota_sem; + +void vzaquota_init(void); +void vzaquota_fini(void); + +#endif /* __KERNEL__ */ + +#endif /* _VZDQUOTA_H */ diff -uprN linux-2.6.24/include/linux/vzquota_qlnk.h linux-2.6.24.ovz/include/linux/vzquota_qlnk.h --- linux-2.6.24/include/linux/vzquota_qlnk.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzquota_qlnk.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,25 @@ +/* + * include/linux/vzquota_qlnk.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef _VZDQUOTA_QLNK_H +#define _VZDQUOTA_QLNK_H + +struct vz_quota_master; +struct vz_quota_ugid; + +/* inode link, used to track inodes using quota via dq_ilink_list */ +struct vz_quota_ilink { + struct vz_quota_master *qmblk; + struct vz_quota_ugid *qugid[MAXQUOTAS]; + struct list_head list; + unsigned char origin[2]; +}; + +#endif /* _VZDQUOTA_QLNK_H */ diff -uprN linux-2.6.24/include/linux/vzratelimit.h linux-2.6.24.ovz/include/linux/vzratelimit.h --- linux-2.6.24/include/linux/vzratelimit.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzratelimit.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,28 @@ +/* + * include/linux/vzratelimit.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VZ_RATELIMIT_H__ +#define __VZ_RATELIMIT_H__ + +/* + * Generic ratelimiting stuff. + */ + +struct vz_rate_info { + int burst; + int interval; /* jiffy_t per event */ + int bucket; /* kind of leaky bucket */ + unsigned long last; /* last event */ +}; + +/* Return true if rate limit permits. 
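+ *
+ * Illustrative usage sketch (the burst and interval values are assumptions):
+ *
+ *	static struct vz_rate_info ri = { .burst = 10, .interval = 5 * HZ };
+ *
+ *	if (vz_ratelimit(&ri))
+ *		printk("some rate-limited message\n");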
*/ +int vz_ratelimit(struct vz_rate_info *p); + +#endif /* __VZ_RATELIMIT_H__ */ diff -uprN linux-2.6.24/include/linux/vzstat.h linux-2.6.24.ovz/include/linux/vzstat.h --- linux-2.6.24/include/linux/vzstat.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/vzstat.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,182 @@ +/* + * include/linux/vzstat.h + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#ifndef __VZSTAT_H__ +#define __VZSTAT_H__ + +struct swap_cache_info_struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; + unsigned long remove_race; +}; + +struct kstat_lat_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; +}; +struct kstat_lat_pcpu_snap_struct { + cycles_t maxlat, totlat; + unsigned long count; + seqcount_t lock; +} ____cacheline_aligned_in_smp; + +struct kstat_lat_struct { + struct kstat_lat_snap_struct cur, last; + cycles_t avg[3]; +}; +struct kstat_lat_pcpu_struct { + struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; + cycles_t max_snap; + struct kstat_lat_snap_struct last; + cycles_t avg[3]; +}; + +struct kstat_perf_snap_struct { + cycles_t wall_tottime, cpu_tottime; + cycles_t wall_maxdur, cpu_maxdur; + unsigned long count; +}; +struct kstat_perf_struct { + struct kstat_perf_snap_struct cur, last; +}; + +struct kstat_zone_avg { + unsigned long free_pages_avg[3], + nr_active_avg[3], + nr_inactive_avg[3]; +}; + +#define KSTAT_ALLOCSTAT_NR 5 + +struct kernel_stat_glob { + unsigned long nr_unint_avg[3]; + + unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; + struct kstat_lat_pcpu_struct sched_lat; + struct kstat_lat_struct swap_in; + + struct kstat_perf_struct ttfp, cache_reap, + refill_inact, shrink_icache, shrink_dcache; + + struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ +} ____cacheline_aligned; + +extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; +extern spinlock_t kstat_glb_lock; + +#ifdef CONFIG_VE +#define KSTAT_PERF_ENTER(name) \ + unsigned long flags; \ + cycles_t start, sleep_time; \ + \ + start = get_cycles(); \ + sleep_time = VE_TASK_INFO(current)->sleep_time; \ + +#define KSTAT_PERF_LEAVE(name) \ + spin_lock_irqsave(&kstat_glb_lock, flags); \ + kstat_glob.name.cur.count++; \ + start = get_cycles() - start; \ + if (kstat_glob.name.cur.wall_maxdur < start) \ + kstat_glob.name.cur.wall_maxdur = start;\ + kstat_glob.name.cur.wall_tottime += start; \ + start -= VE_TASK_INFO(current)->sleep_time - \ + sleep_time; \ + if (kstat_glob.name.cur.cpu_maxdur < start) \ + kstat_glob.name.cur.cpu_maxdur = start; \ + kstat_glob.name.cur.cpu_tottime += start; \ + spin_unlock_irqrestore(&kstat_glb_lock, flags); \ + +#else +#define KSTAT_PERF_ENTER(name) +#define KSTAT_PERF_LEAVE(name) +#endif + +/* + * Add another statistics reading. + * Serialization is the caller's due. 
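+ *
+ * Illustrative usage sketch (variable names assumed): measure the duration
+ * in cycles and serialize the update yourself, e.g. under kstat_glb_lock as
+ * the KSTAT_PERF_* macros above do:
+ *
+ *	cycles_t start = get_cycles();
+ *	... the operation being measured ...
+ *	spin_lock_irqsave(&kstat_glb_lock, flags);
+ *	KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
+ *	spin_unlock_irqrestore(&kstat_glb_lock, flags);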
+ */ +static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, + cycles_t dur) +{ + p->cur.count++; + if (p->cur.maxlat < dur) + p->cur.maxlat = dur; + p->cur.totlat += dur; +} + +static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, + cycles_t dur) +{ + struct kstat_lat_pcpu_snap_struct *cur; + + cur = &p->cur[cpu]; + write_seqcount_begin(&cur->lock); + cur->count++; + if (cur->maxlat < dur) + cur->maxlat = dur; + cur->totlat += dur; + write_seqcount_end(&cur->lock); +} + +/* + * Move current statistics to last, clear last. + * Serialization is the caller's due. + */ +static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) +{ + cycles_t m; + memcpy(&p->last, &p->cur, sizeof(p->last)); + p->cur.maxlat = 0; + m = p->last.maxlat; + CALC_LOAD(p->avg[0], EXP_1, m) + CALC_LOAD(p->avg[1], EXP_5, m) + CALC_LOAD(p->avg[2], EXP_15, m) +} + +static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) +{ + unsigned i, cpu; + struct kstat_lat_pcpu_snap_struct snap, *cur; + cycles_t m; + + memset(&p->last, 0, sizeof(p->last)); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cur = &p->cur[cpu]; + do { + i = read_seqcount_begin(&cur->lock); + memcpy(&snap, cur, sizeof(snap)); + } while (read_seqcount_retry(&cur->lock, i)); + /* + * read above and this update of maxlat is not atomic, + * but this is OK, since it happens rarely and losing + * a couple of peaks is not essential. xemul + */ + cur->maxlat = 0; + + p->last.count += snap.count; + p->last.totlat += snap.totlat; + if (p->last.maxlat < snap.maxlat) + p->last.maxlat = snap.maxlat; + } + + m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap); + CALC_LOAD(p->avg[0], EXP_1, m); + CALC_LOAD(p->avg[1], EXP_5, m); + CALC_LOAD(p->avg[2], EXP_15, m); + /* reset max_snap to calculate it correctly next time */ + p->max_snap = 0; +} + +#endif /* __VZSTAT_H__ */ diff -uprN linux-2.6.24/include/linux/wait.h linux-2.6.24.ovz/include/linux/wait.h --- linux-2.6.24/include/linux/wait.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/linux/wait.h 2008-03-25 18:53:59.000000000 -0500 @@ -161,6 +161,22 @@ wait_queue_head_t *FASTCALL(bit_waitqueu #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE) #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* + * macro to avoid include hell + */ +#define wake_up_nested(x, s) \ +do { \ + unsigned long flags; \ + \ + spin_lock_irqsave_nested(&(x)->lock, flags, (s)); \ + wake_up_locked(x); \ + spin_unlock_irqrestore(&(x)->lock, flags); \ +} while (0) +#else +#define wake_up_nested(x, s) wake_up(x) +#endif + #define __wait_event(wq, condition) \ do { \ DEFINE_WAIT(__wait); \ diff -uprN linux-2.6.24/include/net/addrconf.h linux-2.6.24.ovz/include/net/addrconf.h --- linux-2.6.24/include/net/addrconf.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/addrconf.h 2008-03-25 18:53:59.000000000 -0500 @@ -246,5 +246,19 @@ extern int if6_proc_init(void); extern void if6_proc_exit(void); #endif +int addrconf_ifdown(struct net_device *dev, int how); +int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, + __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +int addrconf_sysctl_init(struct ve_struct *ve); +void addrconf_sysctl_fini(struct ve_struct *ve); +void addrconf_sysctl_free(struct ve_struct *ve); +#else +#define addrconf_sysctl_init(ve) (0) +#define addrconf_sysctl_fini(ve) 
do { } while (0) +#define addrconf_sysctl_free(ve) do { } while (0) +#endif + #endif #endif diff -uprN linux-2.6.24/include/net/af_unix.h linux-2.6.24.ovz/include/net/af_unix.h --- linux-2.6.24/include/net/af_unix.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/af_unix.h 2008-03-25 18:53:59.000000000 -0500 @@ -9,6 +9,7 @@ extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); +extern void unix_destruct_fds(struct sk_buff *skb); #define UNIX_HASH_SIZE 256 diff -uprN linux-2.6.24/include/net/arp.h linux-2.6.24.ovz/include/net/arp.h --- linux-2.6.24/include/net/arp.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/arp.h 2008-03-25 18:53:59.000000000 -0500 @@ -7,7 +7,12 @@ #define HAVE_ARP_CREATE -extern struct neigh_table arp_tbl; +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define arp_tbl (*(get_exec_env()->ve_arp_tbl)) +#else +extern struct neigh_table global_arp_tbl; +#define arp_tbl global_arp_tbl +#endif extern void arp_init(void); extern int arp_find(unsigned char *haddr, struct sk_buff *skb); diff -uprN linux-2.6.24/include/net/flow.h linux-2.6.24.ovz/include/net/flow.h --- linux-2.6.24/include/net/flow.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/flow.h 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include +struct ve_struct; struct flowi { int oif; int iif; @@ -77,6 +78,9 @@ struct flowi { #define fl_icmp_code uli_u.icmpt.code #define fl_ipsec_spi uli_u.spi #define fl_mh_type uli_u.mht.type +#ifdef CONFIG_VE + struct ve_struct *owner_env; +#endif __u32 secid; /* used by xfrm; see secid.txt */ } __attribute__((__aligned__(BITS_PER_LONG/8))); diff -uprN linux-2.6.24/include/net/icmp.h linux-2.6.24.ovz/include/net/icmp.h --- linux-2.6.24/include/net/icmp.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/icmp.h 2008-03-25 18:53:59.000000000 -0500 @@ -31,15 +31,24 @@ struct icmp_err { extern struct icmp_err icmp_err_convert[]; DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); DECLARE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics); -#define ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field) -#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) -#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) -#define ICMPMSGOUT_INC_STATS(field) SNMP_INC_STATS(icmpmsg_statistics, field+256) -#define ICMPMSGOUT_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmpmsg_statistics, field+256) -#define ICMPMSGOUT_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpmsg_statistics, field+256) -#define ICMPMSGIN_INC_STATS(field) SNMP_INC_STATS(icmpmsg_statistics, field) -#define ICMPMSGIN_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmpmsg_statistics, field) -#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpmsg_statistics, field) + +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_icmp_statistics (get_exec_env()->_icmp_statistics) +#define ve_icmpmsg_statistics (get_exec_env()->_icmpmsg_statistics) +#else +#define ve_icmp_statistics icmp_statistics +#define ve_icmpmsg_statistics icmpmsg_statistics +#endif + +#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field) +#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field) +#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field) +#define ICMPMSGOUT_INC_STATS(field) SNMP_INC_STATS(ve_icmpmsg_statistics, field+256) +#define ICMPMSGOUT_INC_STATS_BH(field) 
SNMP_INC_STATS_BH(ve_icmpmsg_statistics, field+256) +#define ICMPMSGOUT_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmpmsg_statistics, field+256) +#define ICMPMSGIN_INC_STATS(field) SNMP_INC_STATS(ve_icmpmsg_statistics, field) +#define ICMPMSGIN_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmpmsg_statistics, field) +#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmpmsg_statistics, field) struct dst_entry; struct net_proto_family; diff -uprN linux-2.6.24/include/net/if_inet6.h linux-2.6.24.ovz/include/net/if_inet6.h --- linux-2.6.24/include/net/if_inet6.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/if_inet6.h 2008-03-25 18:53:59.000000000 -0500 @@ -194,7 +194,14 @@ struct inet6_dev struct rcu_head rcu; }; -extern struct ipv6_devconf ipv6_devconf; +extern struct ipv6_devconf global_ipv6_devconf; +extern struct ipv6_devconf global_ipv6_devconf_dflt; + +#ifdef CONFIG_VE +#define ve_ipv6_devconf (*(get_exec_env()->_ipv6_devconf)) +#else +#define ve_ipv6_devconf global_ipv6_devconf +#endif static inline void ipv6_eth_mc_map(struct in6_addr *addr, char *buf) { diff -uprN linux-2.6.24/include/net/inet6_hashtables.h linux-2.6.24.ovz/include/net/inet6_hashtables.h --- linux-2.6.24/include/net/inet6_hashtables.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/inet6_hashtables.h 2008-03-25 18:53:59.000000000 -0500 @@ -29,9 +29,10 @@ struct inet_hashinfo; /* I have no idea if this is a good hash for v6 or not. -DaveM */ static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const __be16 fport) + const struct in6_addr *faddr, const __be16 fport, + const envid_t veid) { - u32 ports = (lport ^ (__force u16)fport); + u32 ports = (lport ^ (__force u16)fport) ^ (veid ^ (veid >> 16)); return jhash_3words((__force u32)laddr->s6_addr32[3], (__force u32)faddr->s6_addr32[3], @@ -46,7 +47,7 @@ static inline int inet6_sk_ehashfn(const const struct in6_addr *faddr = &np->daddr; const __u16 lport = inet->num; const __be16 fport = inet->dport; - return inet6_ehashfn(laddr, lport, faddr, fport); + return inet6_ehashfn(laddr, lport, faddr, fport, VEID(sk->owner_env)); } extern void __inet6_hash(struct inet_hashinfo *hashinfo, struct sock *sk); diff -uprN linux-2.6.24/include/net/inet_frag.h linux-2.6.24.ovz/include/net/inet_frag.h --- linux-2.6.24/include/net/inet_frag.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/inet_frag.h 2008-03-25 18:53:59.000000000 -0500 @@ -16,6 +16,10 @@ struct inet_frag_queue { #define COMPLETE 4 #define FIRST_IN 2 #define LAST_IN 1 + +#ifdef CONFIG_VE + struct ve_struct *owner_ve; +#endif }; #define INETFRAGS_HASHSZ 64 diff -uprN linux-2.6.24/include/net/inet_hashtables.h linux-2.6.24.ovz/include/net/inet_hashtables.h --- linux-2.6.24/include/net/inet_hashtables.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/inet_hashtables.h 2008-03-25 18:53:59.000000000 -0500 @@ -74,6 +74,7 @@ struct inet_ehash_bucket { * ports are created in O(1) time? I thought so. 
;-) -DaveM */ struct inet_bind_bucket { + struct ve_struct *owner_env; unsigned short port; signed short fastreuse; struct hlist_node node; @@ -195,37 +196,43 @@ static inline void inet_ehash_locks_free extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, + struct ve_struct *env); extern void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); -static inline int inet_bhashfn(const __u16 lport, const int bhash_size) +static inline int inet_bhashfn(const __u16 lport, const int bhash_size, + unsigned veid) { - return lport & (bhash_size - 1); + return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1)); } extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum); /* These can have wildcards, don't try too hard. */ -static inline int inet_lhashfn(const unsigned short num) +static inline int inet_lhashfn(const unsigned short num, unsigned veid) { - return num & (INET_LHTABLE_SIZE - 1); + return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1)); } static inline int inet_sk_listen_hashfn(const struct sock *sk) { - return inet_lhashfn(inet_sk(sk)->num); + return inet_lhashfn(inet_sk(sk)->num, VEID(sk->owner_env)); } /* Caller must disable local BH processing. */ static inline void __inet_inherit_port(struct inet_hashinfo *table, struct sock *sk, struct sock *child) { - const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); - struct inet_bind_hashbucket *head = &table->bhash[bhash]; + int bhash; + struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; + bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size, + VEID(child->owner_env)); + head = &table->bhash[bhash]; + spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; sk_add_bind_node(child, &tb->owners); @@ -365,25 +372,25 @@ typedef __u64 __bitwise __addrpair; (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))); #endif /* __BIG_ENDIAN */ -#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ +#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) && \ ((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ +#define INET_TW_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) && \ ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #else /* 32-bit arch */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) -#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ +#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ (((__sk)->sk_hash == (__hash)) && \ (inet_sk(__sk)->daddr == (__saddr)) && \ (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ +#define INET_TW_MATCH_ALLVE(__sk, __hash,__cookie, __saddr, __daddr, __ports, 
__dif) \ (((__sk)->sk_hash == (__hash)) && \ (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ @@ -391,6 +398,18 @@ typedef __u64 __bitwise __addrpair; (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #endif /* 64-bit arch */ +#define INET_MATCH(__sk, __hash, __cookie, __saddr, \ + __daddr, __ports, __dif, __ve) \ + (INET_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ + (__daddr), (__ports), (__dif)) \ + && ve_accessible_strict((__sk)->owner_env, (__ve))) + +#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, \ + __daddr, __ports, __dif, __ve) \ + (INET_TW_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ + (__daddr), (__ports), (__dif)) \ + && ve_accessible_strict(inet_twsk(__sk)->tw_owner_env, VEID(__ve))) + /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM @@ -410,20 +429,22 @@ static inline struct sock * /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); + struct ve_struct *ve = get_exec_env(); + unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(ve)); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); prefetch(head->chain.first); read_lock(lock); sk_for_each(sk, node, &head->chain) { - if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk, hash, acookie, saddr, daddr, + ports, dif, ve)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ sk_for_each(sk, node, &head->twchain) { - if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif, ve)) goto hit; } sk = NULL; diff -uprN linux-2.6.24/include/net/inet_sock.h linux-2.6.24.ovz/include/net/inet_sock.h --- linux-2.6.24/include/net/inet_sock.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/inet_sock.h 2008-03-25 18:53:59.000000000 -0500 @@ -173,9 +173,10 @@ extern u32 inet_ehash_secret; extern void build_ehash_secret(void); static inline unsigned int inet_ehashfn(const __be32 laddr, const __u16 lport, - const __be32 faddr, const __be16 fport) + const __be32 faddr, const __be16 fport, + const envid_t veid) { - return jhash_2words((__force __u32) laddr ^ (__force __u32) faddr, + return jhash_2words((__force __u32) laddr ^ (__force __u32) faddr ^ (veid ^ (veid >> 16)), ((__u32) lport) << 16 | (__force __u32)fport, inet_ehash_secret); } @@ -187,8 +188,9 @@ static inline int inet_sk_ehashfn(const const __u16 lport = inet->num; const __be32 faddr = inet->daddr; const __be16 fport = inet->dport; + envid_t veid = VEID(sk->owner_env); - return inet_ehashfn(laddr, lport, faddr, fport); + return inet_ehashfn(laddr, lport, faddr, fport, veid); } diff -uprN linux-2.6.24/include/net/inet_timewait_sock.h linux-2.6.24.ovz/include/net/inet_timewait_sock.h --- linux-2.6.24/include/net/inet_timewait_sock.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/inet_timewait_sock.h 2008-03-25 18:53:59.000000000 -0500 @@ -81,6 +81,7 @@ struct inet_timewait_death_row { struct inet_hashinfo *hashinfo; int sysctl_tw_recycle; int sysctl_max_tw_buckets; + int ub_managed; }; extern void inet_twdr_hangman(unsigned long data); @@ -134,6 +135,7 @@ struct inet_timewait_sock { unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; 
struct hlist_node tw_death_node; + envid_t tw_owner_env; }; static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, diff -uprN linux-2.6.24/include/net/ip.h linux-2.6.24.ovz/include/net/ip.h --- linux-2.6.24/include/net/ip.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/ip.h 2008-03-25 18:53:59.000000000 -0500 @@ -157,16 +157,28 @@ struct ipv4_config extern struct ipv4_config ipv4_config; DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics); -#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field) -#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field) -#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field) -#define IP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ip_statistics, field, val) + +#ifdef CONFIG_VE +#define ve_ip_statistics (get_exec_env()->_ip_statistics) +#else +#define ve_ip_statistics ip_statistics +#endif +#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field) +#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field) +#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ip_statistics, field) +#define IP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_ip_statistics, field, val) + DECLARE_SNMP_STAT(struct linux_mib, net_statistics); -#define NET_INC_STATS(field) SNMP_INC_STATS(net_statistics, field) -#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field) -#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field) -#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd) -#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd) +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_net_statistics (get_exec_env()->_net_statistics) +#else +#define ve_net_statistics net_statistics +#endif +#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field) +#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field) +#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field) +#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd) +#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd) extern unsigned long snmp_fold_field(void *mib[], int offt); extern int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign); @@ -393,4 +405,11 @@ extern int ip_misc_proc_init(void); extern struct ctl_table ipv4_table[]; +#ifdef CONFIG_SYSCTL +extern int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, + int nlen, void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); +#endif #endif /* _IP_H */ diff -uprN linux-2.6.24/include/net/ip6_fib.h linux-2.6.24.ovz/include/net/ip6_fib.h --- linux-2.6.24/include/net/ip6_fib.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/ip6_fib.h 2008-03-25 18:53:59.000000000 -0500 @@ -116,6 +116,8 @@ static inline struct inet6_dev *ip6_dst_ return ((struct rt6_info *)dst)->rt6i_idev; } +extern struct list_head fib6_table_list; + struct fib6_walker_t { struct fib6_walker_t *prev, *next; @@ -164,6 +166,7 @@ struct fib6_table { u32 tb6_id; rwlock_t tb6_lock; struct fib6_node tb6_root; + struct ve_struct *owner_env; }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC @@ -220,6 +223,8 @@ extern void fib6_run_gc(unsigned long extern void fib6_gc_cleanup(void); extern 
void fib6_init(void); +extern void fib6_tables_init(void); +extern void fib6_tables_cleanup(void); extern void fib6_rules_init(void); extern void fib6_rules_cleanup(void); diff -uprN linux-2.6.24/include/net/ip6_route.h linux-2.6.24.ovz/include/net/ip6_route.h --- linux-2.6.24/include/net/ip6_route.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/ip6_route.h 2008-03-25 18:53:59.000000000 -0500 @@ -162,5 +162,13 @@ static inline int ipv6_unicast_destinati return rt->rt6i_flags & RTF_LOCAL; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +int init_ve_route6(struct ve_struct *ve); +void fini_ve_route6(struct ve_struct *ve); +#else +#define init_ve_route6(ve) (0) +#define fini_ve_route6(ve) do { } while (0) +#endif + #endif #endif diff -uprN linux-2.6.24/include/net/ip_fib.h linux-2.6.24.ovz/include/net/ip_fib.h --- linux-2.6.24/include/net/ip_fib.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/ip_fib.h 2008-03-25 18:53:59.000000000 -0500 @@ -153,10 +153,23 @@ struct fib_table { unsigned char tb_data[0]; }; +struct fn_zone; +struct fn_hash { + struct fn_zone *fn_zones[33]; + struct fn_zone *fn_zone_list; +}; + #ifndef CONFIG_IP_MULTIPLE_TABLES -extern struct fib_table *ip_fib_local_table; -extern struct fib_table *ip_fib_main_table; +#ifdef CONFIG_VE +#define ip_fib_local_table get_exec_env()->_local_table +#define ip_fib_main_table get_exec_env()->_main_table +#else +extern struct fib_table *__ip_fib_local_table; +extern struct fib_table *__ip_fib_main_table; +#define ip_fib_local_table __ip_fib_local_table +#define ip_fib_main_table __ip_fib_main_table +#endif static inline struct fib_table *fib_get_table(u32 id) { @@ -200,6 +213,10 @@ extern struct fib_table *fib_new_table(u extern struct fib_table *fib_get_table(u32 id); extern void fib_select_default(const struct flowi *flp, struct fib_result *res); +extern int fib_rules_create(void); +extern void fib_rules_destroy(void); +extern int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb); + #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ @@ -220,6 +237,15 @@ extern __be32 __fib_res_prefsrc(struct /* Exported by fib_hash.c */ extern struct fib_table *fib_hash_init(u32 id); +#if defined(CONFIG_VE) && defined(CONFIG_INET) +struct ve_struct; +extern int init_ve_route(struct ve_struct *ve); +extern void fini_ve_route(struct ve_struct *ve); +#else +#define init_ve_route(ve) 0 +#define fini_ve_route(ve) do { } while (0) +#endif + static inline void fib_combine_itag(u32 *itag, struct fib_result *res) { #ifdef CONFIG_NET_CLS_ROUTE diff -uprN linux-2.6.24/include/net/ipv6.h linux-2.6.24.ovz/include/net/ipv6.h --- linux-2.6.24/include/net/ipv6.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/ipv6.h 2008-03-25 18:53:59.000000000 -0500 @@ -117,7 +117,7 @@ extern int sysctl_mld_max_msf; struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS##modifier((_idev)->stats.statname, (field)); \ - SNMP_INC_STATS##modifier(statname##_statistics, (field)); \ + SNMP_INC_STATS##modifier(ve_##statname##_statistics, (field)); \ }) #define _DEVADD(statname, modifier, idev, field, val) \ @@ -125,9 +125,22 @@ extern int sysctl_mld_max_msf; struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_ADD_STATS##modifier((_idev)->stats.statname, (field), (val)); \ - SNMP_ADD_STATS##modifier(statname##_statistics, (field), (val));\ + SNMP_ADD_STATS##modifier(ve_##statname##_statistics, (field), (val));\ }) +#ifdef CONFIG_VE 
+#define ve_ipv6_statistics (get_exec_env()->_ipv6_statistics) +#define ve_icmpv6_statistics (get_exec_env()->_icmpv6_statistics) +#define ve_icmpv6msg_statistics (get_exec_env()->_icmpv6msg_statistics) +#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6) +#define ve_udplite_stats_in6 (get_exec_env()->_udplite_stats_in6) +#else +#define ve_ipv6_statistics ipv6_statistics +#define ve_icmpv6_statistics icmpv6_statistics +#define ve_icmpv6msg_statistics icmpv6msg_statistics +#define ve_udplite_stats_in6 udplite_stats_in6 +#endif + /* MIBs */ DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); @@ -167,11 +180,29 @@ DECLARE_SNMP_STAT(struct icmpv6msg_mib, DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6); #define UDP6_INC_STATS_BH(field, is_udplite) do { \ - if (is_udplite) SNMP_INC_STATS_BH(udplite_stats_in6, field); \ - else SNMP_INC_STATS_BH(udp_stats_in6, field); } while(0) + if (is_udplite) SNMP_INC_STATS_BH(ve_udplite_stats_in6, field); \ + else SNMP_INC_STATS_BH(ve_udp_stats_in6, field); } while(0) #define UDP6_INC_STATS_USER(field, is_udplite) do { \ - if (is_udplite) SNMP_INC_STATS_USER(udplite_stats_in6, field); \ - else SNMP_INC_STATS_USER(udp_stats_in6, field); } while(0) + if (is_udplite) SNMP_INC_STATS_USER(ve_udplite_stats_in6, field); \ + else SNMP_INC_STATS_USER(ve_udp_stats_in6, field); } while(0) + +int snmp6_register_dev(struct inet6_dev *idev); +int snmp6_unregister_dev(struct inet6_dev *idev); +int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign); +void snmp6_mib_free(void *ptr[2]); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +int ve_snmp_proc_init(struct ve_struct *ve); +void ve_snmp_proc_fini(struct ve_struct *ve); +#else +static inline int ve_snmp_proc_init(struct ve_struct *ve) +{ + return 0; +} +static inline void ve_snmp_proc_fini(struct ve_struct *ve) +{ +} +#endif struct ip6_ra_chain { diff -uprN linux-2.6.24/include/net/ndisc.h linux-2.6.24.ovz/include/net/ndisc.h --- linux-2.6.24/include/net/ndisc.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/ndisc.h 2008-03-25 18:53:59.000000000 -0500 @@ -52,7 +52,12 @@ struct net_device; struct net_proto_family; struct sk_buff; -extern struct neigh_table nd_tbl; +#ifdef CONFIG_VE +#define nd_tbl (*(get_exec_env()->ve_nd_tbl)) +#else +#define nd_tbl global_nd_tbl +extern struct neigh_table global_nd_tbl; +#endif struct nd_msg { struct icmp6hdr icmph; @@ -130,6 +135,7 @@ extern int ndisc_ifinfo_sysctl_change extern void inet6_ifinfo_notify(int event, struct inet6_dev *idev); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline struct neighbour * ndisc_get_neigh(struct net_device *dev, struct in6_addr *addr) { @@ -138,6 +144,7 @@ static inline struct neighbour * ndisc_g return NULL; } +#endif #endif /* __KERNEL__ */ diff -uprN linux-2.6.24/include/net/neighbour.h linux-2.6.24.ovz/include/net/neighbour.h --- linux-2.6.24/include/net/neighbour.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/neighbour.h 2008-03-25 18:53:59.000000000 -0500 @@ -161,6 +161,8 @@ struct neigh_table atomic_t entries; rwlock_t lock; unsigned long last_rand; + struct ve_struct *owner_env; + struct user_beancounter *owner_ub; struct kmem_cache *kmem_cachep; struct neigh_statistics *stats; struct neighbour **hash_buckets; @@ -180,8 +182,8 @@ struct neigh_table #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 -extern void neigh_table_init(struct neigh_table *tbl); -extern 
void neigh_table_init_no_netlink(struct neigh_table *tbl); +extern int neigh_table_init(struct neigh_table *tbl); +extern int neigh_table_init_no_netlink(struct neigh_table *tbl); extern int neigh_table_clear(struct neigh_table *tbl); extern struct neighbour * neigh_lookup(struct neigh_table *tbl, const void *pkey, diff -uprN linux-2.6.24/include/net/net_namespace.h linux-2.6.24.ovz/include/net/net_namespace.h --- linux-2.6.24/include/net/net_namespace.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/net_namespace.h 2008-03-25 18:53:59.000000000 -0500 @@ -29,6 +29,13 @@ struct net { struct list_head dev_base_head; struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + + int ifindex; + +#ifdef CONFIG_VE + struct completion *sysfs_completion; + struct ve_struct *owner_ve; +#endif }; #ifdef CONFIG_NET diff -uprN linux-2.6.24/include/net/netfilter/ipv4/nf_conntrack_ipv4.h linux-2.6.24.ovz/include/net/netfilter/ipv4/nf_conntrack_ipv4.h --- linux-2.6.24/include/net/netfilter/ipv4/nf_conntrack_ipv4.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/ipv4/nf_conntrack_ipv4.h 2008-03-25 18:53:59.000000000 -0500 @@ -18,8 +18,18 @@ extern struct nf_conntrack_l4proto nf_co extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; +#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) extern int nf_conntrack_ipv4_compat_init(void); extern void nf_conntrack_ipv4_compat_fini(void); +#else +static inline int nf_conntrack_ipv4_compat_init(void) +{ + return 0; +} +static inline void nf_conntrack_ipv4_compat_fini(void) +{ +} +#endif extern void need_ipv4_conntrack(void); diff -uprN linux-2.6.24/include/net/netfilter/nf_conntrack.h linux-2.6.24.ovz/include/net/netfilter/nf_conntrack.h --- linux-2.6.24/include/net/netfilter/nf_conntrack.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_conntrack.h 2008-03-25 18:53:59.000000000 -0500 @@ -27,6 +27,10 @@ #include +#ifdef CONFIG_VE_IPTABLES +#include +#endif + /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ @@ -92,6 +96,10 @@ struct nf_conn_help { #include #include +#ifdef CONFIG_VE_IPTABLES +#include +#endif + struct nf_conn { /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, @@ -129,6 +137,10 @@ struct nf_conn /* Extensions */ struct nf_ct_ext *ext; + +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *ct_owner_env; +#endif }; static inline struct nf_conn * @@ -183,6 +195,11 @@ extern void nf_conntrack_hash_insert(str extern void nf_conntrack_flush(void); +struct nf_conntrack_helper * nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple); +void nf_ct_helper_put(struct nf_conntrack_helper *helper); + +struct nf_conntrack_helper * __nf_conntrack_helper_find_byname(const char *name); + extern int nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, @@ -231,7 +248,8 @@ nf_ct_iterate_cleanup(int (*iter)(struct extern void nf_conntrack_free(struct nf_conn *ct); extern struct nf_conn * nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_tuple *repl); + const struct nf_conntrack_tuple *repl, + struct user_beancounter *); /* It's confirmed if it is, or has been in the hash table. 
*/ static inline int nf_ct_is_confirmed(struct nf_conn *ct) @@ -254,6 +272,8 @@ extern unsigned int nf_conntrack_htable_ extern int nf_conntrack_checksum; extern atomic_t nf_conntrack_count; extern int nf_conntrack_max; +extern int nf_conntrack_disable_ve0; +extern int ip_conntrack_disable_ve0; DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) diff -uprN linux-2.6.24/include/net/netfilter/nf_conntrack_core.h linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_core.h --- linux-2.6.24/include/net/netfilter/nf_conntrack_core.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_core.h 2008-03-25 18:53:59.000000000 -0500 @@ -62,6 +62,47 @@ nf_conntrack_find_get(const struct nf_co extern int __nf_conntrack_confirm(struct sk_buff *skb); +#if defined(CONFIG_VE_IPTABLES) +#include +#define ve_nf_conntrack_hash (get_exec_env()->_nf_conntrack->_nf_conntrack_hash) +#define ve_nf_conntrack_vmalloc (get_exec_env()->_nf_conntrack->_nf_conntrack_vmalloc) +#define ve_unconfirmed (get_exec_env()->_nf_conntrack->_unconfirmed) +#else +#define ve_nf_conntrack_hash nf_conntrack_hash +#define ve_nf_conntrack_vmalloc nf_conntrack_vmalloc +#define ve_unconfirmed unconfirmed +#endif /* CONFIG_VE_IPTABLES */ + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +#define ve_nf_ct_sysctl_header \ + (get_exec_env()->_nf_conntrack->_nf_ct_sysctl_header) +#define ve_nf_ct_sysctl_table \ + (get_exec_env()->_nf_conntrack->_nf_ct_sysctl_table) +#define ve_nf_ct_netfilter_table \ + (get_exec_env()->_nf_conntrack->_nf_ct_netfilter_table) +#define ve_nf_ct_net_table \ + (get_exec_env()->_nf_conntrack->_nf_ct_net_table) +extern void nf_ct_proto_generic_sysctl_cleanup(void); +extern int nf_ct_proto_generic_sysctl_init(void); +#else +#define ve_nf_ct_sysctl_header nf_ct_sysctl_header +#define ve_nf_ct_sysctl_table nf_ct_sysctl_table +#define ve_nf_ct_netfilter_table nf_ct_netfilter_table +#define ve_nf_ct_net_table nf_ct_net_table +static inline int nf_ct_proto_generic_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_generic_sysctl_cleanup(void) +{ +} +#endif /* CONFIG_VE_IPTABLES */ + +#if defined(CONFIG_VE_IPTABLES) +extern int nf_conntrack_init_ve(void); +extern void nf_conntrack_cleanup_ve(void); +#endif /* CONFIG_VE_IPTABLES */ + /* Confirm a connection: returns NF_DROP if packet must be dropped. 
*/ static inline int nf_conntrack_confirm(struct sk_buff *skb) { diff -uprN linux-2.6.24/include/net/netfilter/nf_conntrack_ecache.h linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_ecache.h --- linux-2.6.24/include/net/netfilter/nf_conntrack_ecache.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_ecache.h 2008-03-25 18:53:59.000000000 -0500 @@ -34,6 +34,9 @@ nf_conntrack_event_cache(enum ip_conntra struct nf_conn *ct = (struct nf_conn *)skb->nfct; struct nf_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + local_bh_disable(); ecache = &__get_cpu_var(nf_conntrack_ecache); if (ct != ecache->ct) @@ -45,7 +48,7 @@ nf_conntrack_event_cache(enum ip_conntra static inline void nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && ve_is_super(get_exec_env())) atomic_notifier_call_chain(&nf_conntrack_chain, event, ct); } @@ -57,7 +60,8 @@ static inline void nf_ct_expect_event(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp) { - atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp); + if (ve_is_super(get_exec_env())) + atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp); } #else /* CONFIG_NF_CONNTRACK_EVENTS */ diff -uprN linux-2.6.24/include/net/netfilter/nf_conntrack_expect.h linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_expect.h --- linux-2.6.24/include/net/netfilter/nf_conntrack_expect.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_expect.h 2008-03-25 18:53:59.000000000 -0500 @@ -9,6 +9,14 @@ extern struct hlist_head *nf_ct_expect_hash; extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_nf_ct_expect_hash (get_exec_env()->_nf_conntrack->_nf_ct_expect_hash) +#define ve_nf_ct_expect_max (get_exec_env()->_nf_conntrack->_nf_ct_expect_max) +#else +#define ve_nf_ct_expect_hash nf_ct_expect_hash +#define ve_nf_ct_expect_max nf_ct_expect_max +#endif struct nf_conntrack_expect { diff -uprN linux-2.6.24/include/net/netfilter/nf_conntrack_l3proto.h linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_l3proto.h --- linux-2.6.24/include/net/netfilter/nf_conntrack_l3proto.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_l3proto.h 2008-03-25 18:53:59.000000000 -0500 @@ -56,6 +56,9 @@ struct nf_conntrack_l3proto */ int (*new)(struct nf_conn *conntrack, const struct sk_buff *skb); + /* Called when a conntrack entry is destroyed */ + void (*destroy)(struct nf_conn *conntrack); + /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) 
in skb @@ -81,6 +84,39 @@ struct nf_conntrack_l3proto struct module *me; }; +/* virtualization of l3 protocol's sysctl tables: */ +#if defined(CONFIG_VE_IPTABLES) +#include +#define ve_nf_ct3 (get_exec_env()->_nf_conntrack) +#endif + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +#define ve_nf_ct_l3protos ve_nf_ct3->_nf_ct_l3protos +#define ve_nf_conntrack_l3proto_ipv4 (ve_nf_ct3->_nf_conntrack_l3proto_ipv4) +#define ve_nf_conntrack_l3proto_ipv6 (ve_nf_ct3->_nf_conntrack_l3proto_ipv6) +#define ve_nf_conntrack_max (ve_nf_ct3->_nf_conntrack_max) +#define ve_nf_conntrack_count (ve_nf_ct3->_nf_conntrack_count) +#define ve_nf_conntrack_checksum (ve_nf_ct3->_nf_conntrack_checksum) +#define ve_nf_ct_frag6_timeout (ve_nf_ct3->_nf_frags6_ctl.timeout) +#define ve_nf_ct_frag6_low_thresh (ve_nf_ct3->_nf_frags6_ctl.low_thresh) +#define ve_nf_ct_frag6_high_thresh (ve_nf_ct3->_nf_frags6_ctl.high_thresh) +#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */ +#define ve_nf_ct_l3protos nf_ct_l3protos +#define ve_nf_conntrack_l3proto_ipv4 &nf_conntrack_l3proto_ipv4 +#define ve_nf_conntrack_l3proto_ipv6 &nf_conntrack_l3proto_ipv6 +#define ve_nf_conntrack_max nf_conntrack_max +#define ve_nf_conntrack_count nf_conntrack_count +#define ve_nf_conntrack_checksum nf_conntrack_checksum +#define ve_nf_ct_frag6_timeout nf_ct_frag6_timeout +#define ve_nf_ct_frag6_low_thresh nf_ct_frag6_low_thresh +#define ve_nf_ct_frag6_high_thresh nf_ct_frag6_high_thresh +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ + +extern int init_nf_ct_l3proto_ipv4(void); +extern void fini_nf_ct_l3proto_ipv4(void); +extern int init_nf_ct_l3proto_ipv6(void); +extern void fini_nf_ct_l3proto_ipv6(void); + extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; /* Protocol registration. */ @@ -97,7 +133,11 @@ __nf_ct_l3proto_find(u_int16_t l3proto) { if (unlikely(l3proto >= AF_MAX)) return &nf_conntrack_l3proto_generic; - return rcu_dereference(nf_ct_l3protos[l3proto]); +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_nf_conntrack) + return &nf_conntrack_l3proto_generic; +#endif + return rcu_dereference(ve_nf_ct_l3protos[l3proto]); } #endif /*_NF_CONNTRACK_L3PROTO_H*/ diff -uprN linux-2.6.24/include/net/netfilter/nf_conntrack_l4proto.h linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_l4proto.h --- linux-2.6.24/include/net/netfilter/nf_conntrack_l4proto.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_conntrack_l4proto.h 2008-03-25 18:53:59.000000000 -0500 @@ -99,6 +99,7 @@ extern struct nf_conntrack_l4proto nf_co extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO 256 +extern struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX]; extern struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); @@ -119,16 +120,142 @@ extern int nf_ct_port_nlattr_to_tuple(st struct nf_conntrack_tuple *t); extern const struct nla_policy nf_ct_port_nla_policy[]; +#ifdef CONFIG_SYSCTL /* Log invalid packets */ extern unsigned int nf_ct_log_invalid; +#endif + +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_nf_ct4 (get_exec_env()->_nf_conntrack) +#endif + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) + +#define ve_nf_ct_protos (ve_nf_ct4->_nf_ct_protos) +#define ve_nf_conntrack_l4proto_icmp (ve_nf_ct4->_nf_conntrack_l4proto_icmp) +#define ve_nf_conntrack_l4proto_icmpv6 \ + (ve_nf_ct4->_nf_conntrack_l4proto_icmpv6) +#define ve_nf_conntrack_l4proto_tcp4 (ve_nf_ct4->_nf_conntrack_l4proto_tcp4) +#define ve_nf_conntrack_l4proto_tcp6 
(ve_nf_ct4->_nf_conntrack_l4proto_tcp6) +#define ve_nf_conntrack_l4proto_udp4 (ve_nf_ct4->_nf_conntrack_l4proto_udp4) +#define ve_nf_conntrack_l4proto_udp6 (ve_nf_ct4->_nf_conntrack_l4proto_udp6) +#define ve_nf_conntrack_l4proto_generic \ + (ve_nf_ct4->_nf_conntrack_l4proto_generic) +#define ve_nf_ct_log_invalid (ve_nf_ct4->_nf_ct_log_invalid) +/* TCP: */ +#define ve_nf_ct_tcp_timeouts (ve_nf_ct4->_nf_ct_tcp_timeouts) +#define ve_nf_ct_tcp_timeout_max_retrans \ + (ve_nf_ct4->_nf_ct_tcp_timeout_max_retrans) +#define ve_nf_ct_tcp_max_retrans (ve_nf_ct4->_nf_ct_tcp_max_retrans) +#define ve_nf_ct_tcp_loose (ve_nf_ct4->_nf_ct_tcp_loose) +#define ve_nf_ct_tcp_be_liberal (ve_nf_ct4->_nf_ct_tcp_be_liberal) +#define ve_tcp_sysctl_table_users (ve_nf_ct4->_tcp_sysctl_table_users) +#define ve_tcp_sysctl_header (ve_nf_ct4->_tcp_sysctl_header) +#define ve_tcp_compat_sysctl_header (ve_nf_ct4->_tcp_compat_sysctl_header) +/* UDP: */ +#define ve_nf_ct_udp_timeout (ve_nf_ct4->_nf_ct_udp_timeout) +#define ve_nf_ct_udp_timeout_stream (ve_nf_ct4->_nf_ct_udp_timeout_stream) +#define ve_udp_sysctl_table_users (ve_nf_ct4->_udp_sysctl_table_users) +#define ve_udp_sysctl_header (ve_nf_ct4->_udp_sysctl_header) +#define ve_udp_compat_sysctl_header (ve_nf_ct4->_udp_compat_sysctl_header) +/* ICMP: */ +#define ve_nf_ct_icmp_timeout (ve_nf_ct4->_nf_ct_icmp_timeout) +#define ve_icmp_sysctl_header (ve_nf_ct4->_icmp_sysctl_header) +#define ve_icmp_compat_sysctl_header (ve_nf_ct4->_icmp_compat_sysctl_header) +/* ICMPV6: */ +#define ve_nf_ct_icmpv6_timeout (ve_nf_ct4->_nf_ct_icmpv6_timeout) +#define ve_icmpv6_sysctl_header (ve_nf_ct4->_icmpv6_sysctl_header) +/* GENERIC: */ +#define ve_nf_ct_generic_timeout (ve_nf_ct4->_nf_ct_generic_timeout) +#define ve_generic_sysctl_header (ve_nf_ct4->_generic_sysctl_header) +#define ve_generic_compat_sysctl_header (ve_nf_ct4->_generic_compat_sysctl_header) + +extern void nf_ct_proto_icmp_sysctl_cleanup(void); +extern int nf_ct_proto_icmp_sysctl_init(void); +extern void nf_ct_proto_icmpv6_sysctl_cleanup(void); +extern int nf_ct_proto_icmpv6_sysctl_init(void); +extern void nf_ct_proto_tcp_sysctl_cleanup(void); +extern int nf_ct_proto_tcp_sysctl_init(void); +extern void nf_ct_proto_udp_sysctl_cleanup(void); +extern int nf_ct_proto_udp_sysctl_init(void); + +#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */ + +#define ve_nf_ct_protos nf_ct_protos +#define ve_nf_conntrack_l4proto_icmp &nf_conntrack_l4proto_icmp +#define ve_nf_conntrack_l4proto_icmpv6 &nf_conntrack_l4proto_icmpv6 +#define ve_nf_conntrack_l4proto_tcp4 &nf_conntrack_l4proto_tcp4 +#define ve_nf_conntrack_l4proto_tcp6 &nf_conntrack_l4proto_tcp6 +#define ve_nf_conntrack_l4proto_udp4 &nf_conntrack_l4proto_udp4 +#define ve_nf_conntrack_l4proto_udp6 &nf_conntrack_l4proto_udp6 +#define ve_nf_conntrack_l4proto_generic &nf_conntrack_l4proto_generic + +#if defined(CONFIG_SYSCTL) + +#define ve_nf_ct_log_invalid nf_ct_log_invalid +/* TCP: */ +#define ve_nf_ct_tcp_timeouts *tcp_timeouts +#define ve_nf_ct_tcp_timeout_max_retrans \ + nf_ct_tcp_timeout_max_retrans +#define ve_nf_ct_tcp_max_retrans nf_ct_tcp_max_retrans +#define ve_nf_ct_tcp_loose nf_ct_tcp_loose +#define ve_nf_ct_tcp_be_liberal nf_ct_tcp_be_liberal +#define ve_tcp_sysctl_table_users tcp_sysctl_table_users +#define ve_tcp_sysctl_header tcp_sysctl_header +/* UDP:*/ +#define ve_nf_ct_udp_timeout nf_ct_udp_timeout +#define ve_nf_ct_udp_timeout_stream nf_ct_udp_timeout_stream +#define ve_udp_sysctl_table_users udp_sysctl_table_users +#define ve_udp_sysctl_header udp_sysctl_header +/* 
ICMP: */ +#define ve_nf_ct_icmp_timeout nf_ct_icmp_timeout +#define ve_icmp_sysctl_header icmp_sysctl_header +/* ICMPV6: */ +#define ve_nf_ct_icmpv6_timeout nf_ct_icmpv6_timeout +#define ve_icmpv6_sysctl_header icmpv6_sysctl_header +/* GENERIC: */ +#define ve_nf_ct_generic_timeout nf_ct_generic_timeout +#define ve_generic_sysctl_header generic_sysctl_header +#endif /* CONFIG_SYSCTL */ + +static inline int nf_ct_proto_icmp_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_icmp_sysctl_cleanup(void) +{ +} +static inline int nf_ct_proto_tcp_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_tcp_sysctl_cleanup(void) +{ +} +static inline int nf_ct_proto_udp_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_udp_sysctl_cleanup(void) +{ +} +static inline int nf_ct_proto_icmpv6_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_icmpv6_sysctl_cleanup(void) +{ +} +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ - (nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) + (ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) #else #define LOG_INVALID(proto) \ - ((nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) \ + ((ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) \ && net_ratelimit()) #endif #else diff -uprN linux-2.6.24/include/net/netfilter/nf_nat.h linux-2.6.24.ovz/include/net/netfilter/nf_nat.h --- linux-2.6.24/include/net/netfilter/nf_nat.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_nat.h 2008-03-25 18:53:59.000000000 -0500 @@ -84,6 +84,7 @@ extern unsigned int nf_nat_setup_info(st /* Is this tuple already taken? 
(not by us)*/ extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); +extern void ip_nat_hash_conntrack(struct nf_conn *ct); static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { diff -uprN linux-2.6.24/include/net/netfilter/nf_nat_rule.h linux-2.6.24.ovz/include/net/netfilter/nf_nat_rule.h --- linux-2.6.24/include/net/netfilter/nf_nat_rule.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netfilter/nf_nat_rule.h 2008-03-25 18:53:59.000000000 -0500 @@ -4,7 +4,7 @@ #include #include -extern int nf_nat_rule_init(void) __init; +extern int nf_nat_rule_init(void); extern void nf_nat_rule_cleanup(void); extern int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, diff -uprN linux-2.6.24/include/net/netlink_sock.h linux-2.6.24.ovz/include/net/netlink_sock.h --- linux-2.6.24/include/net/netlink_sock.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/include/net/netlink_sock.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,23 @@ +#ifndef __NET_NETLINK_SOCK_H +#define __NET_NETLINK_SOCK_H + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + u32 dst_pid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + struct module *module; +}; + +#endif /* __NET_NETLINK_SOCK_H */ diff -uprN linux-2.6.24/include/net/route.h linux-2.6.24.ovz/include/net/route.h --- linux-2.6.24/include/net/route.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/route.h 2008-03-25 18:53:59.000000000 -0500 @@ -135,6 +135,7 @@ static inline void ip_rt_put(struct rtab #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) extern const __u8 ip_tos2prio[16]; +extern int ip_rt_src_check; static inline char rt_tos2priority(u8 tos) { @@ -201,4 +202,14 @@ static inline struct inet_peer *rt_get_p extern ctl_table ipv4_route_table[]; +#ifdef CONFIG_SYSCTL +extern int ipv4_flush_delay; +extern int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); +extern int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + int __user *name, int nlen, void __user *oldval, + size_t __user *oldlenp, void __user *newval, + size_t newlen); +#endif #endif /* _ROUTE_H */ diff -uprN linux-2.6.24/include/net/sock.h linux-2.6.24.ovz/include/net/sock.h --- linux-2.6.24/include/net/sock.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/sock.h 2008-03-25 18:53:59.000000000 -0500 @@ -58,6 +58,8 @@ #include #include +#include + /* * This structure really needs to be cleaned up. 
* Most of it is for TCP, and not used by any of @@ -263,6 +265,8 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + struct sock_beancounter sk_bc; + struct ve_struct *owner_env; }; /* @@ -498,6 +502,8 @@ static inline void sk_add_backlog(struct }) extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount); extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); extern void sk_stream_wait_close(struct sock *sk, long timeo_p); extern int sk_stream_error(struct sock *sk, int flags, int err); @@ -768,8 +774,11 @@ static inline void sk_stream_mem_reclaim static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) { - return (int)skb->truesize <= sk->sk_forward_alloc || - sk_stream_mem_schedule(sk, skb->truesize, 1); + if ((int)skb->truesize > sk->sk_forward_alloc && + !sk_stream_mem_schedule(sk, skb->truesize, 1)) + /* The situation is bad according to mainstream. Den */ + return 0; + return ub_tcprcvbuf_charge(sk, skb) == 0; } static inline int sk_stream_wmem_schedule(struct sock *sk, int size) @@ -855,6 +864,11 @@ extern struct sk_buff *sock_alloc_send unsigned long size, int noblock, int *errcode); +extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, + unsigned long size, + unsigned long size2, + int noblock, + int *errcode); extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); @@ -1153,6 +1167,7 @@ static inline int skb_copy_to_page(struc static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + WARN_ON(skb->destructor); sock_hold(sk); skb->sk = sk; skb->destructor = sock_wfree; @@ -1161,6 +1176,7 @@ static inline void skb_set_owner_w(struc static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { + WARN_ON(skb->destructor); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); diff -uprN linux-2.6.24/include/net/tcp.h linux-2.6.24.ovz/include/net/tcp.h --- linux-2.6.24/include/net/tcp.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/tcp.h 2008-03-25 18:53:59.000000000 -0500 @@ -42,6 +42,13 @@ #include #include +#include + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +#define TW_WSCALE_MASK 0x0f +#define TW_WSCALE_SPEC 0x10 extern struct inet_hashinfo tcp_hashinfo; @@ -218,7 +225,9 @@ extern int sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; +#ifndef sysctl_tcp_adv_win_scale extern int sysctl_tcp_adv_win_scale; +#endif extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_frto_response; @@ -233,6 +242,10 @@ extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_max_ssthresh; +extern int sysctl_tcp_use_sg; +extern int sysctl_tcp_max_tw_kmem_fraction; +extern int sysctl_tcp_max_tw_buckets_ub; + extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -265,12 +278,17 @@ static inline int tcp_too_many_orphans(s extern struct proto tcp_prot; DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); -#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) -#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) -#define TCP_INC_STATS_USER(field) 
SNMP_INC_STATS_USER(tcp_statistics, field) -#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field) -#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) -#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) +#if defined(CONFIG_VE) && defined(CONFIG_INET) +#define ve_tcp_statistics (get_exec_env()->_tcp_statistics) +#else +#define ve_tcp_statistics tcp_statistics +#endif +#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field) +#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field) +#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field) +#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field) +#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val) +#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val) extern void tcp_v4_err(struct sk_buff *skb, u32); @@ -533,7 +551,11 @@ extern u32 __tcp_select_window(struct so * to use only the low 32-bits of jiffies and hide the ugly * casts with the following macro. */ +#ifdef CONFIG_VE +#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) +#else #define tcp_time_stamp ((__u32)(jiffies)) +#endif /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission diff -uprN linux-2.6.24/include/net/udp.h linux-2.6.24.ovz/include/net/udp.h --- linux-2.6.24/include/net/udp.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/include/net/udp.h 2008-03-25 18:53:59.000000000 -0500 @@ -138,16 +138,29 @@ extern int udp_lib_setsockopt(struct so char __user *optval, int optlen, int (*push_pending_frames)(struct sock *)); +static inline int udp_hashfn(u16 num, unsigned veid) +{ + return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1)); +} + DECLARE_SNMP_STAT(struct udp_mib, udp_statistics); /* * SNMP statistics for UDP and UDP-Lite */ +#ifdef CONFIG_VE +#define ve_udp_statistics (get_exec_env()->_udp_statistics) +#define ve_udplite_statistics (get_exec_env()->_udplite_statistics) +#else +#define ve_udp_statistics udp_statistics +#define ve_udplite_statistics udplite_statistics +#endif + #define UDP_INC_STATS_USER(field, is_udplite) do { \ - if (is_udplite) SNMP_INC_STATS_USER(udplite_statistics, field); \ - else SNMP_INC_STATS_USER(udp_statistics, field); } while(0) + if (is_udplite) SNMP_INC_STATS_USER(ve_udplite_statistics, field); \ + else SNMP_INC_STATS_USER(ve_udp_statistics, field); } while(0) #define UDP_INC_STATS_BH(field, is_udplite) do { \ - if (is_udplite) SNMP_INC_STATS_BH(udplite_statistics, field); \ - else SNMP_INC_STATS_BH(udp_statistics, field); } while(0) + if (is_udplite) SNMP_INC_STATS_BH(ve_udplite_statistics, field); \ + else SNMP_INC_STATS_BH(ve_udp_statistics, field); } while(0) /* /proc */ struct udp_seq_afinfo { diff -uprN linux-2.6.24/init/Kconfig linux-2.6.24.ovz/init/Kconfig --- linux-2.6.24/init/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/init/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -199,7 +199,7 @@ config TASK_XACCT config TASK_IO_ACCOUNTING bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" - depends on TASK_XACCT + depends on TASK_XACCT && BEANCOUNTERS help Collect information on the number of bytes of storage I/O which this task has caused. 
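The udp_hashfn() helper added in the include/net/udp.h hunk above spreads sockets from different containers across the UDP hash table by folding the VE id into the bucket index, so the same port number hashes to different buckets in different VEs. A minimal user-space sketch of that arithmetic follows; UDP_HTABLE_SIZE is assumed to be 128, its value in the stock 2.6.24 headers.

#include <stdio.h>

#define UDP_HTABLE_SIZE 128	/* assumed, matches include/net/sock.h in 2.6.24 */

/* Same expression as the patched udp_hashfn() in include/net/udp.h. */
static int udp_hashfn(unsigned short num, unsigned int veid)
{
	return (num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1);
}

int main(void)
{
	/* Port 53 bound in the host (VE0) and in two containers. */
	printf("ve0:   bucket %d\n", udp_hashfn(53, 0));
	printf("ve101: bucket %d\n", udp_hashfn(53, 101));
	printf("ve102: bucket %d\n", udp_hashfn(53, 102));
	return 0;
}

Without the veid term every container binding a popular port would land in the same bucket; with it, lookups still need only the (port, veid) pair the caller already has.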
@@ -338,6 +338,7 @@ choice config FAIR_USER_SCHED bool "user id" + depends on !VE help This option will choose userid as the basis for grouping tasks, thus providing equal CPU bandwidth to each user. @@ -352,6 +353,12 @@ config FAIR_CGROUP_SCHED Refer to Documentation/cgroups.txt for more information on "cgroup" pseudo filesystem. +config VZ_FAIRSCHED + bool "OpenVZ groups" + help + This option add customizable task groups with OpenVZ compatible + syscall and procfs interface. + endchoice config CGROUP_CPUACCT diff -uprN linux-2.6.24/init/calibrate.c linux-2.6.24.ovz/init/calibrate.c --- linux-2.6.24/init/calibrate.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/init/calibrate.c 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -105,6 +106,60 @@ static unsigned long __devinit calibrate static unsigned long __devinit calibrate_delay_direct(void) {return 0;} #endif +unsigned long cycles_per_jiffy, cycles_per_clock; + +static __devinit void calibrate_cycles(void) +{ + unsigned long ticks; + cycles_t time; + + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + time = get_cycles(); + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + + time = get_cycles() - time; + cycles_per_jiffy = time; + if ((time >> 32) != 0) { + printk("CPU too fast! timings are incorrect\n"); + cycles_per_jiffy = -1; + } +} + +EXPORT_SYMBOL(cycles_per_jiffy); +EXPORT_SYMBOL(cycles_per_clock); + +static __devinit void calc_cycles_per_jiffy(void) +{ +#if 0 + extern unsigned long fast_gettimeoffset_quotient; + unsigned long low, high; + + if (fast_gettimeoffset_quotient != 0) { + __asm__("divl %2" + :"=a" (low), "=d" (high) + :"r" (fast_gettimeoffset_quotient), + "0" (0), "1" (1000000/HZ)); + + cycles_per_jiffy = low; + } +#endif + if (cycles_per_jiffy == 0) + calibrate_cycles(); + + if (cycles_per_jiffy == 0) { + printk(KERN_WARNING "Cycles are stuck! " + "Some statistics will not be available."); + /* to prevent division by zero in cycles_to_(clocks|jiffies) */ + cycles_per_jiffy = 1; + cycles_per_clock = 1; + } else + cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); +} + /* * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. 
This (like the original) is a little @@ -170,4 +225,5 @@ void __devinit calibrate_delay(void) loops_per_jiffy); } + calc_cycles_per_jiffy(); } diff -uprN linux-2.6.24/init/main.c linux-2.6.24.ovz/init/main.c --- linux-2.6.24/init/main.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/init/main.c 2008-03-25 18:53:59.000000000 -0500 @@ -57,6 +57,9 @@ #include #include #include +#include + +#include #include #include @@ -101,10 +104,25 @@ static inline void mark_rodata_ro(void) #ifdef CONFIG_TC extern void tc_init(void); #endif +extern void grsecurity_init(void); enum system_states system_state; EXPORT_SYMBOL(system_state); +#ifdef CONFIG_VE +extern void init_ve_system(void); +extern void init_ve0(void); +extern void prepare_ve0_process(struct task_struct *tsk); +extern void prepare_ve0_proc_root(void); +extern void prepare_ve0_sysctl(void); +#else +#define init_ve_system() do { } while (0) +#define init_ve0() do { } while (0) +#define prepare_ve0_process(tsk) do { } while (0) +#define prepare_ve0_proc_root() do { } while (0) +#define prepare_ve0_sysctl() do { } while (0) +#endif + /* * Boot command-line arguments */ @@ -511,6 +529,9 @@ asmlinkage void __init start_kernel(void smp_setup_processor_id(); + prepare_ve0_process(&init_task); + init_ve0(); + /* * Need to run as early as possible, to initialize the * lockdep hash: @@ -528,6 +549,7 @@ asmlinkage void __init start_kernel(void * enable them */ lock_kernel(); + ub_init_early(); tick_init(); boot_cpu_init(); page_address_init(); @@ -623,6 +645,7 @@ asmlinkage void __init start_kernel(void #endif fork_init(num_physpages); proc_caches_init(); + ub_init_late(); buffer_init(); unnamed_dev_init(); key_init(); @@ -633,6 +656,8 @@ asmlinkage void __init start_kernel(void /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS + prepare_ve0_proc_root(); + prepare_ve0_sysctl(); proc_root_init(); #endif cgroup_init(); @@ -644,6 +669,10 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ +#ifdef CONFIG_BC_RSS_ACCOUNTING + ub_init_pbc(); +#endif + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -728,6 +757,8 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { + init_ve_system(); + /* drivers will send hotplug events */ init_workqueues(); usermodehelper_init(); @@ -820,7 +851,7 @@ static int __init kernel_init(void * unu */ init_pid_ns.child_reaper = current; - __set_special_pids(1, 1); + __set_special_pids(&init_struct_pid); cad_pid = task_pid(current); smp_prepare_cpus(max_cpus); @@ -828,6 +859,7 @@ static int __init kernel_init(void * unu do_pre_smp_initcalls(); smp_init(); + fairsched_init_late(); sched_init_smp(); cpuset_init_smp(); @@ -847,6 +879,8 @@ static int __init kernel_init(void * unu prepare_namespace(); } + grsecurity_init(); + /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the diff -uprN linux-2.6.24/init/version.c linux-2.6.24.ovz/init/version.c --- linux-2.6.24/init/version.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/init/version.c 2008-03-25 18:53:59.000000000 -0500 @@ -33,6 +33,12 @@ struct uts_namespace init_uts_ns = { }; EXPORT_SYMBOL_GPL(init_uts_ns); +struct new_utsname virt_utsname = { + /* we need only this field */ + .release = UTS_RELEASE, +}; +EXPORT_SYMBOL(virt_utsname); + /* FIXED STRINGS! Don't touch! 
*/ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" diff -uprN linux-2.6.24/ipc/ipc_sysctl.c linux-2.6.24.ovz/ipc/ipc_sysctl.c --- linux-2.6.24/ipc/ipc_sysctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/ipc_sysctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -102,6 +102,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_doulongvec_minmax, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, { .ctl_name = KERN_SHMALL, @@ -111,6 +112,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_doulongvec_minmax, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, { .ctl_name = KERN_SHMMNI, @@ -120,6 +122,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_dointvec, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, { .ctl_name = KERN_MSGMAX, @@ -129,6 +132,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_dointvec, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, { .ctl_name = KERN_MSGMNI, @@ -138,6 +142,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_dointvec, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, { .ctl_name = KERN_MSGMNB, @@ -147,6 +152,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_dointvec, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, { .ctl_name = KERN_SEM, @@ -156,6 +162,7 @@ static struct ctl_table ipc_kern_table[] .mode = 0644, .proc_handler = proc_ipc_dointvec, .strategy = sysctl_ipc_data, + .virt_handler = 1, }, {} }; @@ -172,7 +179,7 @@ static struct ctl_table ipc_root_table[] static int __init ipc_sysctl_init(void) { - register_sysctl_table(ipc_root_table); + register_glob_sysctl_table(ipc_root_table); return 0; } diff -uprN linux-2.6.24/ipc/msg.c linux-2.6.24.ovz/ipc/msg.c --- linux-2.6.24/ipc/msg.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/msg.c 2008-03-25 18:53:59.000000000 -0500 @@ -184,6 +184,7 @@ static int newque(struct ipc_namespace * int id, retval; key_t key = params->key; int msgflg = params->flg; + int msqid = params->id; msq = ipc_rcu_alloc(sizeof(*msq)); if (!msq) @@ -202,7 +203,7 @@ static int newque(struct ipc_namespace * /* * ipc_addid() locks msq */ - id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); + id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid); if (id < 0) { security_msg_queue_free(msq); ipc_rcu_putref(msq); @@ -324,6 +325,7 @@ asmlinkage long sys_msgget(key_t key, in msg_params.key = key; msg_params.flg = msgflg; + msg_params.id = -1; return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } @@ -552,7 +554,7 @@ asmlinkage long sys_msgctl(int msqid, in err = -EPERM; if (current->euid != ipcp->cuid && - current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) + current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) /* We _could_ check for CAP_CHOWN above, but we don't */ goto out_unlock_up; @@ -968,3 +970,55 @@ static int sysvipc_msg_proc_show(struct msq->q_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_msg(key_t key, int msqid, int msgflg) +{ + struct ipc_namespace *ns; + struct ipc_ops msg_ops; + struct ipc_params msg_params; + + ns = current->nsproxy->ipc_ns; + + msg_ops.getnew = newque; + msg_ops.associate = msg_security; + msg_ops.more_checks = NULL; + + msg_params.key = key; + msg_params.flg = msgflg | IPC_CREAT; + msg_params.id = msqid; + + return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); +} 
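sysvipc_setup_msg() above differs from sys_msgget() only in that it passes a specific id (and forces IPC_CREAT), which is what lets a restored container get back the queue identifiers it had when it was checkpointed. A hedged sketch of how a restore path might call it; the msq_image structure and restore_one_msq() helper are hypothetical illustrations, not code from this patch, and the extern declaration is written out here only to keep the sketch self-contained.

#include <linux/errno.h>
#include <linux/ipc.h>
#include <linux/types.h>

/* Provided by the patch; declared here for self-containment. */
extern int sysvipc_setup_msg(key_t key, int msqid, int msgflg);

struct msq_image {		/* hypothetical checkpoint record */
	key_t	key;
	int	msqid;		/* identifier the queue had before migration */
	int	mode;		/* permission bits saved at checkpoint time */
};

static int restore_one_msq(const struct msq_image *img)
{
	int id;

	/* IPC_CREAT is OR-ed in by sysvipc_setup_msg() itself. */
	id = sysvipc_setup_msg(img->key, img->msqid, img->mode);
	if (id < 0)
		return id;

	/* ipc_addid() honours the requested slot, so the old id should come back. */
	return id == img->msqid ? 0 : -EBUSY;
}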
+EXPORT_SYMBOL_GPL(sysvipc_setup_msg); + +int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) +{ + int err = 0; + struct msg_queue * msq; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&msg_ids(ns).rw_mutex); + in_use = msg_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + msq = idr_find(&msg_ids(ns).ipcs_idr, next_id); + if (msq == NULL) + continue; + ipc_lock_by_ptr(&msq->q_perm); + err = func(msg_buildid(next_id, msq->q_perm.seq), msq, arg); + msg_unlock(msq); + if (err) + break; + total++; + } + up_write(&msg_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_msg); +#endif diff -uprN linux-2.6.24/ipc/msgutil.c linux-2.6.24.ovz/ipc/msgutil.c --- linux-2.6.24/ipc/msgutil.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/msgutil.c 2008-03-25 18:53:59.000000000 -0500 @@ -8,6 +8,7 @@ * See the file COPYING for more details. */ +#include #include #include #include @@ -17,6 +18,8 @@ #include "util.h" +#include + struct msg_msgseg { struct msg_msgseg* next; /* the next part of the message follows immediately */ @@ -25,52 +28,53 @@ struct msg_msgseg { #define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) -struct msg_msg *load_msg(const void __user *src, int len) +struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, + void * data), int len, void * data) { struct msg_msg *msg; struct msg_msgseg **pseg; int err; int alen; + int offset = 0; alen = len; if (alen > DATALEN_MSG) alen = DATALEN_MSG; - msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC); if (msg == NULL) return ERR_PTR(-ENOMEM); msg->next = NULL; msg->security = NULL; - if (copy_from_user(msg + 1, src, alen)) { + if (load(msg + 1, alen, offset, data)) { err = -EFAULT; goto out_err; } len -= alen; - src = ((char __user *)src) + alen; + offset += alen; pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - seg = kmalloc(sizeof(*seg) + alen, - GFP_KERNEL); + seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC); if (seg == NULL) { err = -ENOMEM; goto out_err; } *pseg = seg; seg->next = NULL; - if (copy_from_user(seg + 1, src, alen)) { + if (load(seg + 1, alen, offset, data)) { err = -EFAULT; goto out_err; } pseg = &seg->next; len -= alen; - src = ((char __user *)src) + alen; + offset += alen; } err = security_msg_msg_alloc(msg); @@ -83,33 +87,58 @@ out_err: free_msg(msg); return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(sysv_msg_load); -int store_msg(void __user *dest, struct msg_msg *msg, int len) +static int do_load_msg(void * dst, int len, int offset, void * data) +{ + return copy_from_user(dst, data + offset, len); +} + +struct msg_msg *load_msg(const void __user *src, int len) +{ + return sysv_msg_load(do_load_msg, len, (void*)src); +} + +int sysv_msg_store(struct msg_msg *msg, + int (*store)(void * src, int len, int offset, void * data), + int len, void * data) { int alen; + int offset = 0; struct msg_msgseg *seg; - + alen = len; if (alen > DATALEN_MSG) alen = DATALEN_MSG; - if (copy_to_user(dest, msg + 1, alen)) + if (store(msg + 1, alen, offset, data)) return -1; len -= alen; - dest = ((char __user *)dest) + alen; + offset += alen; seg = msg->next; while (len > 0) { alen = len; if (alen > DATALEN_SEG) alen = DATALEN_SEG; - if (copy_to_user(dest, seg + 1, alen)) + if (store(seg + 1, alen, offset, data)) 
return -1; len -= alen; - dest = ((char __user *)dest) + alen; + offset += alen; seg = seg->next; } return 0; } +EXPORT_SYMBOL_GPL(sysv_msg_store); + +static int do_store_msg(void * src, int len, int offset, void * data) +{ + return copy_to_user(data + offset, src, len); +} + +int store_msg(void __user *dest, struct msg_msg *msg, int len) +{ + return sysv_msg_store(msg, do_store_msg, len, dest); +} void free_msg(struct msg_msg *msg) { diff -uprN linux-2.6.24/ipc/sem.c linux-2.6.24.ovz/ipc/sem.c --- linux-2.6.24/ipc/sem.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/sem.c 2008-03-25 18:53:59.000000000 -0500 @@ -86,6 +86,8 @@ #include #include "util.h" +#include + #define sem_ids(ns) (*((ns)->ids[IPC_SEM_IDS])) #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) @@ -259,6 +261,7 @@ static int newary(struct ipc_namespace * key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; + int semid = params->id; if (!nsems) return -EINVAL; @@ -282,7 +285,7 @@ static int newary(struct ipc_namespace * return retval; } - id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); + id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid); if (id < 0) { security_sem_free(sma); ipc_rcu_putref(sma); @@ -347,6 +350,7 @@ asmlinkage long sys_semget(key_t key, in sem_params.key = key; sem_params.flg = semflg; sem_params.u.nsems = nsems; + sem_params.id = -1; return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } @@ -925,7 +929,7 @@ static int semctl_down(struct ipc_namesp goto out_unlock; } if (current->euid != ipcp->cuid && - current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { + current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) { err=-EPERM; goto out_unlock; } @@ -1016,7 +1020,7 @@ static inline int get_undo_list(struct s undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1074,7 +1078,8 @@ static struct sem_undo *find_undo(struct ipc_rcu_getref(sma); sem_unlock(sma); - new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, + GFP_KERNEL_UBC); if (!new) { ipc_lock_by_ptr(&sma->sem_perm); ipc_rcu_putref(sma); @@ -1134,7 +1139,7 @@ asmlinkage long sys_semtimedop(int semid if (nsops > ns->sc_semopm) return -E2BIG; if(nsops > SEMOPM_FAST) { - sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); + sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC); if(sops==NULL) return -ENOMEM; } @@ -1415,3 +1420,57 @@ static int sysvipc_sem_proc_show(struct sma->sem_ctime); } #endif + +#ifdef CONFIG_VE +#include + +int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) +{ + struct ipc_namespace *ns; + struct ipc_ops sem_ops; + struct ipc_params sem_params; + + ns = current->nsproxy->ipc_ns; + + sem_ops.getnew = newary; + sem_ops.associate = sem_security; + sem_ops.more_checks = sem_more_checks; + + sem_params.key = key; + sem_params.flg = semflg | IPC_CREAT; + sem_params.u.nsems = size; + sem_params.id = semid; + + return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); +} +EXPORT_SYMBOL_GPL(sysvipc_setup_sem); + +int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) +{ + int err = 0; + struct sem_array *sma; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&sem_ids(ns).rw_mutex); + in_use = 
sem_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + sma = idr_find(&sem_ids(ns).ipcs_idr, next_id); + if (sma == NULL) + continue; + ipc_lock_by_ptr(&sma->sem_perm); + err = func(sem_buildid(next_id, sma->sem_perm.seq), sma, arg); + sem_unlock(sma); + if (err) + break; + total++; + } + up_write(&sem_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_sem); +EXPORT_SYMBOL_GPL(exit_sem); +#endif diff -uprN linux-2.6.24/ipc/shm.c linux-2.6.24.ovz/ipc/shm.c --- linux-2.6.24/ipc/shm.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/shm.c 2008-03-25 18:53:59.000000000 -0500 @@ -38,9 +38,13 @@ #include #include #include +#include #include +#include +#include + #include "util.h" struct shm_file_data { @@ -185,9 +189,10 @@ static inline void shm_rmid(struct ipc_n ipc_rmid(&shm_ids(ns), &s->shm_perm); } -static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp) +static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp, + int reqid) { - return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); + return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, reqid); } @@ -207,6 +212,48 @@ static void shm_open(struct vm_area_stru shm_unlock(shp); } +static int shmem_lock(struct shmid_kernel *shp, int lock, + struct user_struct *user) +{ + struct file *file = shp->shm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long size; + + size = shp->shm_segsz + PAGE_SIZE - 1; + +#ifdef CONFIG_SHMEM + spin_lock(&info->lock); + if (lock && !(info->flags & VM_LOCKED)) { + if (ub_lockedshm_charge(info, size) < 0) + goto out_ch; + + if (!user_shm_lock(inode->i_size, user)) + goto out_user; + info->flags |= VM_LOCKED; + } + if (!lock && (info->flags & VM_LOCKED) && user) { + ub_lockedshm_uncharge(info, size); + user_shm_unlock(inode->i_size, user); + info->flags &= ~VM_LOCKED; + } + spin_unlock(&info->lock); + return 0; + +out_user: + ub_lockedshm_uncharge(info, size); +out_ch: + spin_unlock(&info->lock); + return -ENOMEM; +#else + if (lock && ub_lockedshm_charge(info, size)) + return -ENOMEM; + if (!lock) + ub_lockedshm_uncharge(info, size); + return 0; +#endif +} + /* * shm_destroy - free the struct shmid_kernel * @@ -222,7 +269,7 @@ static void shm_destroy(struct ipc_names shm_rmid(ns, shp); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0, shp->mlock_user); + shmem_lock(shp, 0, shp->mlock_user); else user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size, shp->mlock_user); @@ -385,11 +432,12 @@ static int newseg(struct ipc_namespace * key_t key = params->key; int shmflg = params->flg; size_t size = params->u.size; + int shmid = params->id; int error; struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; struct file * file; - char name[13]; + char name[64]; int id; if (size < SHMMIN || size > ns->shm_ctlmax) @@ -413,7 +461,7 @@ static int newseg(struct ipc_namespace * return error; } - sprintf (name, "SYSV%08x", key); + snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key); if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup takes care of mlock user accounting */ file = hugetlb_file_setup(name, size); @@ -433,7 +481,7 @@ static int newseg(struct ipc_namespace * if (IS_ERR(file)) goto no_file; - id = shm_addid(ns, shp); + id = shm_addid(ns, shp, shmid); if (id < 0) { error = id; goto no_id; @@ -507,6 +555,7 @@ asmlinkage long sys_shmget (key_t key, s 
shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; + shm_params.id = -1; return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } @@ -785,14 +834,14 @@ asmlinkage long sys_shmctl (int shmid, i if(cmd==SHM_LOCK) { struct user_struct * user = current->user; if (!is_file_hugepages(shp->shm_file)) { - err = shmem_lock(shp->shm_file, 1, user); + err = shmem_lock(shp, 1, user); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } } } else if (!is_file_hugepages(shp->shm_file)) { - shmem_lock(shp->shm_file, 0, shp->mlock_user); + shmem_lock(shp, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; } @@ -824,7 +873,7 @@ asmlinkage long sys_shmctl (int shmid, i if (current->euid != shp->shm_perm.uid && current->euid != shp->shm_perm.cuid && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_VE_SYS_ADMIN)) { err=-EPERM; goto out_unlock_up; } @@ -864,7 +913,7 @@ asmlinkage long sys_shmctl (int shmid, i err=-EPERM; if (current->euid != shp->shm_perm.uid && current->euid != shp->shm_perm.cuid && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_VE_SYS_ADMIN)) { goto out_unlock_up; } @@ -1177,3 +1226,67 @@ static int sysvipc_shm_proc_show(struct shp->shm_ctim); } #endif + +#ifdef CONFIG_VE +#include + +struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) +{ + struct ipc_namespace *ns; + struct ipc_ops shm_ops; + struct ipc_params shm_params; + struct shmid_kernel *shp; + struct file *file; + int rv; + + ns = current->nsproxy->ipc_ns; + + shm_ops.getnew = newseg; + shm_ops.associate = shm_security; + shm_ops.more_checks = shm_more_checks; + + shm_params.key = key; + shm_params.flg = shmflg | IPC_CREAT; + shm_params.u.size = size; + shm_params.id = shmid; + + rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); + if (rv < 0) + return ERR_PTR(rv); + shp = shm_lock(ns, rv); + BUG_ON(IS_ERR(shp)); + file = shp->shm_file; + get_file(file); + shm_unlock(shp); + return file; +} +EXPORT_SYMBOL_GPL(sysvipc_setup_shm); + +int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) +{ + int err = 0; + struct shmid_kernel* shp; + struct ipc_namespace *ns; + int next_id; + int total, in_use; + + ns = current->nsproxy->ipc_ns; + + down_write(&shm_ids(ns).rw_mutex); + in_use = shm_ids(ns).in_use; + for (total = 0, next_id = 0; total < in_use; next_id++) { + shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); + if (shp == NULL) + continue; + ipc_lock_by_ptr(&shp->shm_perm); + err = func(shp, arg); + shm_unlock(shp); + if (err) + break; + total++; + } + up_write(&shm_ids(ns).rw_mutex); + return err; +} +EXPORT_SYMBOL_GPL(sysvipc_walk_shm); +#endif diff -uprN linux-2.6.24/ipc/util.c linux-2.6.24.ovz/ipc/util.c --- linux-2.6.24/ipc/util.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/util.c 2008-03-25 18:53:59.000000000 -0500 @@ -36,6 +36,8 @@ #include +#include + #include "util.h" struct ipc_proc_iface { @@ -258,6 +260,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * @ids: IPC identifier set * @new: new IPC permission set * @size: limit for the number of used ids + * @reqid: if >= 0, get this id exactly. If -1 -- don't care. * * Add an entry 'new' to the IPC ids idr. The permissions object is * initialised and the first free entry is set up and the id assigned @@ -267,10 +270,18 @@ int ipc_get_maxid(struct ipc_ids *ids) * Called with ipc_ids.rw_mutex held as a writer. 
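+ *
+ * When @reqid is >= 0 the entry is installed at idr slot reqid % SEQ_MULTIPLIER
+ * and its sequence number is forced to reqid / SEQ_MULTIPLIER, so the identifier
+ * rebuilt for the object equals @reqid again; checkpoint/restore relies on this
+ * to recreate IPC objects under their original ids.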
*/ -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) +int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) { int id, err; + if (reqid >= 0) { + id = reqid % SEQ_MULTIPLIER; + err = idr_get_new_above(&ids->ipcs_idr, new, id, &id); + if (err || id != (reqid % SEQ_MULTIPLIER)) + return -1; + goto found; + } + if (size > IPCMNI) size = IPCMNI; @@ -280,15 +291,19 @@ int ipc_addid(struct ipc_ids* ids, struc err = idr_get_new(&ids->ipcs_idr, new, &id); if (err) return err; - +found: ids->in_use++; new->cuid = new->uid = current->euid; new->gid = new->cgid = current->egid; - new->seq = ids->seq++; - if(ids->seq > ids->seq_max) - ids->seq = 0; + if (reqid >= 0) { + new->seq = reqid/SEQ_MULTIPLIER; + } else { + new->seq = ids->seq++; + if(ids->seq > ids->seq_max) + ids->seq = 0; + } spin_lock_init(&new->lock); new->deleted = 0; @@ -455,9 +470,9 @@ void* ipc_alloc(int size) { void* out; if(size > PAGE_SIZE) - out = vmalloc(size); + out = ub_vmalloc(size); else - out = kmalloc(size, GFP_KERNEL); + out = kmalloc(size, GFP_KERNEL_UBC); return out; } @@ -540,14 +555,14 @@ void* ipc_rcu_alloc(int size) * workqueue if necessary (for vmalloc). */ if (rcu_use_vmalloc(size)) { - out = vmalloc(HDRLEN_VMALLOC + size); + out = ub_vmalloc(HDRLEN_VMALLOC + size); if (out) { out += HDRLEN_VMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; } } else { - out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL_UBC); if (out) { out += HDRLEN_KMALLOC; container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; diff -uprN linux-2.6.24/ipc/util.h linux-2.6.24.ovz/ipc/util.h --- linux-2.6.24/ipc/util.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/ipc/util.h 2008-03-25 18:53:59.000000000 -0500 @@ -47,6 +47,7 @@ struct ipc_params { size_t size; /* for shared memories */ int nsems; /* for semaphores */ } u; /* holds the getnew() specific param */ + int id; }; /* @@ -82,7 +83,7 @@ void __init ipc_init_proc_interface(cons #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) /* must be called with ids->rw_mutex acquired for writing */ -int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); +int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int); /* must be called with ids->rw_mutex acquired for reading */ int ipc_get_maxid(struct ipc_ids *); diff -uprN linux-2.6.24/kernel/Kconfig.openvz linux-2.6.24.ovz/kernel/Kconfig.openvz --- linux-2.6.24/kernel/Kconfig.openvz 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/Kconfig.openvz 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,84 @@ +# Copyright (C) 2005 SWsoft +# All rights reserved. +# Licensing governed by "linux/COPYING.SWsoft" file. + +menu "OpenVZ" + +config VE + bool "Virtual Environment support" + default y + select PID_NS + select NET_NS + select USER_NS + help + This option adds support of virtual Linux running on the original box + with fully supported virtual network driver, tty subsystem and + configurable access for hardware and other resources. + +config VE_CALLS + tristate "VE calls interface" + depends on VE + select VZ_DEV + default m + help + This option controls how to build vzmon code containing VE calls. 
+ By default it's build in module vzmon.o + +config VZ_GENCALLS + bool + default y + +config VE_NETDEV + tristate "VE network device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build venet device. This is a + common interface for networking in VE. + +config VE_ETHDEV + tristate "Virtual ethernet device" + depends on VE_CALLS && NET + select VZ_DEV + default m + help + This option controls whether to build virtual ethernet device. + +config VZ_DEV + tristate "VE device" + default m + help + This option adds support of vzdev device, which is used by + user-space applications to control Virtual Environments. + +config VE_IPTABLES + bool "VE netfiltering" + depends on VE && VE_NETDEV && INET && NETFILTER + default y + help + This option controls whether to build VE netfiltering code. + +config VZ_WDOG + tristate "VE watchdog module" + depends on VE_CALLS + default m + help + This option controls building of vzwdog module, which dumps + a lot of useful system info on console periodically. + +config VZ_CHECKPOINT + tristate "Checkpointing & restoring Virtual Environments" + depends on VE_CALLS && INET + select PM + select PM_SLEEP + select TUN + select VE_ETHDEV + select VE_NETDEV + default m + help + This option adds two modules, "cpt" and "rst", which allow + to save a running Virtual Environment and restore it + on another host (live migration) or on the same host (checkpointing). + +endmenu diff -uprN linux-2.6.24/kernel/Makefile linux-2.6.24.ovz/kernel/Makefile --- linux-2.6.24/kernel/Makefile 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,10 @@ obj-y = sched.o fork.o exec_domain.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_BEANCOUNTERS) += bc/ +obj-y += ve/ +obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) @@ -57,6 +61,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_VZ_FAIRSCHED) += vzfairsched.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -uprN linux-2.6.24/kernel/audit.c linux-2.6.24.ovz/kernel/audit.c --- linux-2.6.24/kernel/audit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/audit.c 2008-03-25 18:53:59.000000000 -0500 @@ -586,6 +586,9 @@ static int audit_receive_msg(struct sk_b char *ctx; u32 len; + if (!ve_is_super(skb->owner_env)) + return -ECONNREFUSED; + err = audit_netlink_ok(skb, msg_type); if (err) return err; diff -uprN linux-2.6.24/kernel/auditfilter.c linux-2.6.24.ovz/kernel/auditfilter.c --- linux-2.6.24/kernel/auditfilter.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/auditfilter.c 2008-03-25 18:53:59.000000000 -0500 @@ -167,7 +167,7 @@ static struct audit_parent *audit_init_p inotify_init_watch(&parent->wdata); /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, + wd = inotify_add_watch_dget(audit_ih, &parent->wdata, ndp->dentry, ndp->mnt, AUDIT_IN_WATCH); if (wd < 0) { audit_free_parent(&parent->wdata); diff -uprN linux-2.6.24/kernel/bc/Kconfig linux-2.6.24.ovz/kernel/bc/Kconfig --- 
linux-2.6.24/kernel/bc/Kconfig	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.24.ovz/kernel/bc/Kconfig	2008-03-25 18:53:59.000000000 -0500
@@ -0,0 +1,111 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2005 SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+menu "User resources"
+
+config BEANCOUNTERS
+	bool "Enable user resource accounting"
+	default y
+	help
+	  This option provides accounting and allows configuring
+	  limits for user's consumption of exhaustible system resources.
+	  The most important resource controlled by this option is unswappable
+	  memory (either mlock'ed or used by internal kernel structures and
+	  buffers). The main goal is to protect processes
+	  from running short of important resources because of accidental
+	  misbehavior of processes or malicious activity aiming to ``kill''
+	  the system. It is worth mentioning that resource limits configured
+	  by setrlimit(2) do not give an acceptable level of protection
+	  because they cover only a small fraction of resources and work on a
+	  per-process basis. Per-process accounting doesn't prevent malicious
+	  users from spawning a lot of resource-consuming processes.
+
+config BC_RSS_ACCOUNTING
+	bool "Account physical memory usage"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  This allows estimating per-beancounter physical memory usage.
+	  The implemented algorithm accounts shared pages of memory as well,
+	  dividing them by the number of beancounters which use the page.
+
+config BC_IO_ACCOUNTING
+	bool "Account disk IO"
+	default y
+	depends on BC_RSS_ACCOUNTING
+	help
+	  When on, this option allows seeing disk IO activity caused by
+	  tasks from each UB.
+
+config BC_IO_SCHED
+	bool "UBC I/O priority"
+	default y
+	depends on BC_IO_ACCOUNTING && IOSCHED_CFQ
+	help
+	  This option controls whether to build the CFQ I/O scheduler
+	  with support for UBC I/O priority.
+
+config BC_SWAP_ACCOUNTING
+	bool "Account swap usage"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  This allows accounting of swap usage.
+
+config BC_PROC
+	bool "Report resource usage in /proc"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  Allows a system administrator to inspect resource accounts and limits.
+
+config BC_DEBUG
+	bool "User resources debug features"
+	default n
+	depends on BEANCOUNTERS
+	help
+	  Enables debug features for user resource accounting.
+
+config BC_DEBUG_IO
+	bool "Debug IO accounting"
+	default y
+	depends on BC_DEBUG && BC_IO_ACCOUNTING
+	help
+	  Debugging for IO accounting.
+
+config BC_DEBUG_KMEM
+	bool "Debug kmemsize with cache counters"
+	default n
+	depends on BC_DEBUG
+	help
+	  Adds a /proc/user_beancounters_debug entry to get statistics
+	  about cache usage of each beancounter.
+
+config BC_KEEP_UNUSED
+	bool "Keep unused beancounter alive"
+	default y
+	depends on BC_DEBUG
+	help
+	  If on, unused beancounters are kept in the hash and their maxheld
+	  values can still be inspected.
+
+config BC_DEBUG_ITEMS
+	bool "Account resources in items rather than in bytes"
+	default y
+	depends on BC_DEBUG
+	help
+	  When true, some of the resources (e.g. kmemsize) are accounted
+	  in items instead of bytes.
+
+config BC_UNLIMITED
+	bool "Use unlimited ubc settings"
+	default y
+	depends on BC_DEBUG
+	help
+	  When ON, all limits and barriers are set to max values.
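The BC_RSS_ACCOUNTING help above says shared pages are charged to every beancounter that uses them, divided by the number of users. A toy user-space illustration of that rule; the fixed-point shift and the page_share() helper are assumptions of this sketch, not the kernel's actual representation.

#include <stdio.h>

#define PAGE_FRACTION_SHIFT 16	/* assumed: 1 page == 1 << 16 charge units */

/* Equal share of one page for each of 'users' beancounters. */
static unsigned long page_share(unsigned int users)
{
	return (1UL << PAGE_FRACTION_SHIFT) / users;
}

int main(void)
{
	unsigned int users;

	for (users = 1; users <= 4; users++)
		printf("%u beancounter(s): %lu units each of %lu per page\n",
		       users, page_share(users), 1UL << PAGE_FRACTION_SHIFT);
	return 0;
}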
+endmenu diff -uprN linux-2.6.24/kernel/bc/Makefile linux-2.6.24.ovz/kernel/bc/Makefile --- linux-2.6.24/kernel/bc/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,16 @@ +# +# User resources part (UBC) +# +# Copyright (C) 2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-y := sys.o beancounter.o dcache.o kmem.o misc.o \ + vm_pages.o statd.o oom_kill.o + +obj-$(CONFIG_NET) += net.o +obj-$(CONFIG_BC_RSS_ACCOUNTING) += rss_pages.o +obj-$(CONFIG_BC_PROC) += proc.o +obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o +obj-$(CONFIG_BC_IO_SCHED) += io_prio.o diff -uprN linux-2.6.24/kernel/bc/beancounter.c linux-2.6.24.ovz/kernel/bc/beancounter.c --- linux-2.6.24/kernel/bc/beancounter.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/beancounter.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,676 @@ +/* + * linux/kernel/bc/beancounter.c + * + * Copyright (C) 1998 Alan Cox + * 1998-2000 Andrey V. Savochkin + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - more intelligent limit check in mremap(): currently the new size is + * charged and _then_ old size is uncharged + * (almost done: !move_vma case is completely done, + * move_vma in its current implementation requires too many conditions to + * do things right, because it may be not only expansion, but shrinking + * also, plus do_munmap will require an additional parameter...) + * - problem: bad pmd page handling + * - consider /proc redesign + * - TCP/UDP ports + * + consider whether __charge_beancounter_locked should be inline + * + * Changes: + * 1999/08/17 Marcelo Tosatti + * - Set "barrier" and "limit" parts of limits atomically. + * 1999/10/06 Marcelo Tosatti + * - setublimit system call. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct kmem_cache *ub_cachep; +static struct user_beancounter default_beancounter; +struct user_beancounter ub0; +EXPORT_SYMBOL_GPL(ub0); + +const char *ub_rnames[] = { + "kmemsize", /* 0 */ + "lockedpages", + "privvmpages", + "shmpages", + "dummy", + "numproc", /* 5 */ + "physpages", + "vmguarpages", + "oomguarpages", + "numtcpsock", + "numflock", /* 10 */ + "numpty", + "numsiginfo", + "tcpsndbuf", + "tcprcvbuf", + "othersockbuf", /* 15 */ + "dgramrcvbuf", + "numothersock", + "dcachesize", + "numfile", + "dummy", /* 20 */ + "dummy", + "dummy", + "numiptent", + "unused_privvmpages", /* UB_RESOURCES */ + "tmpfs_respages", + "swap_pages", + "held_pages", +}; + +static void init_beancounter_struct(struct user_beancounter *ub); +static void init_beancounter_store(struct user_beancounter *ub); +static void init_beancounter_nolimits(struct user_beancounter *ub); + +int print_ub_uid(struct user_beancounter *ub, char *buf, int size) +{ + if (ub->parent != NULL) + return snprintf(buf, size, "%u.%u", + ub->parent->ub_uid, ub->ub_uid); + else + return snprintf(buf, size, "%u", ub->ub_uid); +} +EXPORT_SYMBOL(print_ub_uid); + +#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) +#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) +struct hlist_head ub_hash[UB_HASH_SIZE]; +DEFINE_SPINLOCK(ub_hash_lock); +LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */ +EXPORT_SYMBOL(ub_hash); +EXPORT_SYMBOL(ub_hash_lock); +EXPORT_SYMBOL(ub_list_head); + +/* + * Per user resource beancounting. 
Resources are tied to their luid. + * The resource structure itself is tagged both to the process and + * the charging resources (a socket doesn't want to have to search for + * things at irq time for example). Reference counters keep things in + * hand. + * + * The case where a user creates resource, kills all his processes and + * then starts new ones is correctly handled this way. The refcounters + * will mean the old entry is still around with resource tied to it. + */ + +static inline void free_ub(struct user_beancounter *ub) +{ + free_percpu(ub->ub_percpu); + kmem_cache_free(ub_cachep, ub); +} + +static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash, + uid_t uid, struct user_beancounter *parent) +{ + struct user_beancounter *ub; + struct hlist_node *ptr; + + hlist_for_each_entry (ub, ptr, hash, ub_hash) + if (ub->ub_uid == uid && ub->parent == parent) + return get_beancounter(ub); + + return NULL; +} + +struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_hash_fun(uid)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, uid, NULL); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) + free_ub(new_ub); + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + /* alloc new ub */ + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub); + memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); + init_beancounter_struct(new_ub); + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) + goto fail_free; + new_ub->ub_uid = uid; + goto retry; + +fail_free: + kmem_cache_free(ub_cachep, new_ub); + return NULL; +} +EXPORT_SYMBOL(get_beancounter_byuid); + +struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, + int id, int create) +{ + struct user_beancounter *new_ub, *ub; + unsigned long flags; + struct hlist_head *hash; + + hash = &ub_hash[ub_subhash_fun(p, id)]; + new_ub = NULL; +retry: + spin_lock_irqsave(&ub_hash_lock, flags); + ub = bc_lookup_hash(hash, id, p); + if (ub != NULL) { + spin_unlock_irqrestore(&ub_hash_lock, flags); + + if (new_ub != NULL) { + put_beancounter(new_ub->parent); + free_ub(new_ub); + } + return ub; + } + + if (!create) { + /* no ub found */ + spin_unlock_irqrestore(&ub_hash_lock, flags); + return NULL; + } + + if (new_ub != NULL) { + list_add_rcu(&new_ub->ub_list, &ub_list_head); + hlist_add_head(&new_ub->ub_hash, hash); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return new_ub; + } + spin_unlock_irqrestore(&ub_hash_lock, flags); + + /* alloc new ub */ + new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, + GFP_KERNEL); + if (new_ub == NULL) + return NULL; + + ub_debug(UBD_ALLOC, "Creating sub %p\n", new_ub); + memset(new_ub, 0, sizeof(*new_ub)); + init_beancounter_nolimits(new_ub); + init_beancounter_store(new_ub); + init_beancounter_struct(new_ub); + new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); + if (new_ub->ub_percpu == NULL) 
+ goto fail_free; + new_ub->ub_uid = id; + new_ub->parent = get_beancounter(p); + goto retry; + +fail_free: + kmem_cache_free(ub_cachep, new_ub); + return NULL; +} +EXPORT_SYMBOL(get_subbeancounter_byid); + +static void put_warn(struct user_beancounter *ub) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "UB: Bad refcount (%d) on put of %s (%p)\n", + atomic_read(&ub->ub_refcount), id, ub); +} + +#ifdef CONFIG_BC_KEEP_UNUSED +#define release_beancounter(ub) do { } while (0) +#else +static int verify_res(struct user_beancounter *ub, int resource, + unsigned long held) +{ + char id[64]; + + if (likely(held == 0)) + return 1; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", + id, held, ub_rnames[resource]); + return 0; +} + +static inline void bc_verify_held(struct user_beancounter *ub) +{ + int i, clean; + + clean = 1; + for (i = 0; i < UB_RESOURCES; i++) + clean &= verify_res(ub, i, ub->ub_parms[i].held); + + clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); + clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); + clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); + clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); + + ub_debug_trace(!clean, 5, 60*HZ); +} + +static void bc_free_rcu(struct rcu_head *rcu) +{ + struct user_beancounter *ub; + + ub = container_of(rcu, struct user_beancounter, rcu); + free_ub(ub); +} + +static void delayed_release_beancounter(struct work_struct *w) +{ + struct user_beancounter *ub, *parent; + unsigned long flags; + + ub = container_of(w, struct user_beancounter, cleanup.work); +again: + local_irq_save(flags); + if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) { + /* raced with get_beancounter_byuid */ + local_irq_restore(flags); + return; + } + + hlist_del(&ub->ub_hash); + list_del_rcu(&ub->ub_list); + spin_unlock_irqrestore(&ub_hash_lock, flags); + + bc_verify_held(ub); + ub_free_counters(ub); + bc_fini_ioprio(&ub->iopriv); + parent = ub->parent; + + call_rcu(&ub->rcu, bc_free_rcu); + if (parent) { + ub = parent; + goto again; + } +} + +static inline void release_beancounter(struct user_beancounter *ub) +{ + struct execute_work *ew; + + ew = &ub->cleanup; + INIT_WORK(&ew->work, delayed_release_beancounter); + schedule_work(&ew->work); +} +#endif + +void __put_beancounter(struct user_beancounter *ub) +{ + unsigned long flags; + + /* equevalent to atomic_dec_and_lock_irqsave() */ + local_irq_save(flags); + if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { + if (unlikely(atomic_read(&ub->ub_refcount) < 0)) + put_warn(ub); + local_irq_restore(flags); + return; + } + + if (unlikely(ub == get_ub0())) { + printk(KERN_ERR "Trying to put ub0\n"); + spin_unlock_irqrestore(&ub_hash_lock, flags); + return; + } + + /* prevent get_beancounter_byuid + put_beancounter() reentrance */ + atomic_inc(&ub->ub_refcount); + spin_unlock_irqrestore(&ub_hash_lock, flags); + + release_beancounter(ub); +} +EXPORT_SYMBOL(__put_beancounter); + +void put_beancounter_safe(struct user_beancounter *ub) +{ + synchronize_rcu(); + __put_beancounter(ub); +} +EXPORT_SYMBOL(put_beancounter_safe); + +/* + * Generic resource charging stuff + */ + +int __charge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + /* + * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and 
only one addition + * at the moment is possible so an overflow is impossible. + */ + ub->ub_parms[resource].held += val; + + switch (strict) { + case UB_HARD: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].barrier) + break; + case UB_SOFT: + if (ub->ub_parms[resource].held > + ub->ub_parms[resource].limit) + break; + case UB_FORCE: + ub_adjust_maxheld(ub, resource); + return 0; + default: + BUG(); + } + + if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) + printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", + ub_rnames[resource], ub->ub_uid); + ub->ub_parms[resource].failcnt++; + ub->ub_parms[resource].held -= val; + return -ENOMEM; +} + +int charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, enum ub_severity strict) +{ + int retval; + struct user_beancounter *p, *q; + unsigned long flags; + + retval = -EINVAL; + if (val > UB_MAXVALUE) + goto out; + + local_irq_save(flags); + for (p = ub; p != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + retval = __charge_beancounter_locked(p, resource, val, strict); + spin_unlock(&p->ub_lock); + if (retval) + goto unroll; + } +out_restore: + local_irq_restore(flags); +out: + return retval; + +unroll: + for (q = ub; q != p; q = q->parent) { + spin_lock(&q->ub_lock); + __uncharge_beancounter_locked(q, resource, val); + spin_unlock(&q->ub_lock); + } + goto out_restore; +} + +EXPORT_SYMBOL(charge_beancounter); + +void __charge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __charge_beancounter_locked(p, resource, val, UB_FORCE); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__charge_beancounter_notop); + +void uncharge_warn(struct user_beancounter *ub, int resource, + unsigned long val, unsigned long held) +{ + char id[64]; + + print_ub_uid(ub, id, sizeof(id)); + printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", + val, held, ub_rnames[resource], id); + ub_debug_trace(1, 10, 10*HZ); +} + +void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val) +{ + ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_parms[resource].held); + if (ub->ub_parms[resource].held < val) { + uncharge_warn(ub, resource, + val, ub->ub_parms[resource].held); + val = ub->ub_parms[resource].held; + } + ub->ub_parms[resource].held -= val; +} + +void uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) +{ + unsigned long flags; + struct user_beancounter *p; + + for (p = ub; p != NULL; p = p->parent) { + spin_lock_irqsave(&p->ub_lock, flags); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock_irqrestore(&p->ub_lock, flags); + } +} + +EXPORT_SYMBOL(uncharge_beancounter); + +void __uncharge_beancounter_notop(struct user_beancounter *ub, + int resource, unsigned long val) +{ + struct user_beancounter *p; + unsigned long flags; + + local_irq_save(flags); + for (p = ub; p->parent != NULL; p = p->parent) { + spin_lock(&p->ub_lock); + __uncharge_beancounter_locked(p, resource, val); + spin_unlock(&p->ub_lock); + } + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__uncharge_beancounter_notop); + + +/* + * Rate limiting stuff. 
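 *
 * [Editor's note, not in the original patch: ub_ratelimit() below is a small
 *  token-bucket limiter.  Calls arriving less than "interval" jiffies after
 *  the previous allowed call consume a token and are allowed until "burst"
 *  tokens are in use; once the gap reaches "interval", the bucket is drained
 *  by one token per elapsed interval.  With the values set in
 *  init_beancounter_nolimits()/init_beancounter_syslimits() (burst = 4,
 *  interval = 300*HZ) this lets through roughly four "Fatal resource
 *  shortage" messages per five minutes per beancounter.]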
+ */ +int ub_ratelimit(struct ub_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} +EXPORT_SYMBOL(ub_ratelimit); + + +/* + * Initialization + * + * struct user_beancounter contains + * - limits and other configuration settings, + * with a copy stored for accounting purposes, + * - structural fields: lists, spinlocks and so on. + * + * Before these parts are initialized, the structure should be memset + * to 0 or copied from a known clean structure. That takes care of a lot + * of fields not initialized explicitly. + */ + +static void init_beancounter_struct(struct user_beancounter *ub) +{ + ub->ub_magic = UB_MAGIC; + atomic_set(&ub->ub_refcount, 1); + spin_lock_init(&ub->ub_lock); + INIT_LIST_HEAD(&ub->ub_tcp_sk_list); + INIT_LIST_HEAD(&ub->ub_other_sk_list); +#ifdef CONFIG_BC_DEBUG_KMEM + INIT_LIST_HEAD(&ub->ub_cclist); +#endif + bc_init_ioprio(&ub->iopriv); +} + +static void init_beancounter_store(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + memcpy(&ub->ub_store[k], &ub->ub_parms[k], + sizeof(struct ubparm)); + } +} + +static void init_beancounter_nolimits(struct user_beancounter *ub) +{ + int k; + + for (k = 0; k < UB_RESOURCES; k++) { + ub->ub_parms[k].limit = UB_MAXVALUE; + /* FIXME: whether this is right for physpages and guarantees? */ + ub->ub_parms[k].barrier = UB_MAXVALUE; + } + + /* FIXME: set unlimited rate? */ + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +static void init_beancounter_syslimits(struct user_beancounter *ub) +{ + unsigned long mp; + extern int max_threads; + int k; + + mp = num_physpages; + ub->ub_parms[UB_KMEMSIZE].limit = + mp > (192*1024*1024 >> PAGE_SHIFT) ? 
+ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; + ub->ub_parms[UB_LOCKEDPAGES].limit = 8; + ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; + ub->ub_parms[UB_SHMPAGES].limit = 64; + ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; + ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; + ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ + ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ + ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; + ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ + ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ + ub->ub_parms[UB_NUMFLOCK].limit = 1024; + ub->ub_parms[UB_NUMPTY].limit = 16; + ub->ub_parms[UB_NUMSIGINFO].limit = 1024; + ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; + ub->ub_parms[UB_NUMFILE].limit = 1024; + + for (k = 0; k < UB_RESOURCES; k++) + ub->ub_parms[k].barrier = ub->ub_parms[k].limit; + + ub->ub_limit_rl.burst = 4; + ub->ub_limit_rl.interval = 300*HZ; +} + +#ifdef CONFIG_SMP +static struct percpu_data ub0_percpu; +#endif +static struct ub_percpu_struct ub0_percpu_data[NR_CPUS]; + +void __init ub_init_early(void) +{ + struct user_beancounter *ub; + + init_cache_counters(); + ub = get_ub0(); + memset(ub, 0, sizeof(*ub)); + ub->ub_uid = 0; + init_beancounter_nolimits(ub); + init_beancounter_store(ub); + init_beancounter_struct(ub); + ub->ub_percpu = static_percpu_ptr(&ub0_percpu, ub0_percpu_data); + + memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); + (void)set_exec_ub(ub); + current->task_bc.task_ub = get_beancounter(ub); + __charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE); + current->task_bc.fork_sub = get_beancounter(ub); + ub_init_task_bc(¤t->task_bc); + init_mm.mm_ub = get_beancounter(ub); + + hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]); + list_add(&ub->ub_list, &ub_list_head); +} + +void __init ub_init_late(void) +{ + ub_cachep = kmem_cache_create("user_beancounters", + sizeof(struct user_beancounter), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + memset(&default_beancounter, 0, sizeof(default_beancounter)); +#ifdef CONFIG_BC_UNLIMITED + init_beancounter_nolimits(&default_beancounter); +#else + init_beancounter_syslimits(&default_beancounter); +#endif + init_beancounter_store(&default_beancounter); + init_beancounter_struct(&default_beancounter); +} diff -uprN linux-2.6.24/kernel/bc/dcache.c linux-2.6.24.ovz/kernel/bc/dcache.c --- linux-2.6.24/kernel/bc/dcache.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/dcache.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,399 @@ +/* + * kernel/bc/dcache.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Locking + * traverse dcache_lock d_lock + * ub_dentry_charge + - + + * ub_dentry_uncharge + + - + * ub_dentry_charge_nofail + + - + * + * d_inuse changes are atomic, with special handling of "not in use" <-> + * "in use" (-1 <-> 0) transitions. We have two sources of non-atomicity + * here: (1) in many operations we need to change d_inuse of both dentry and + * its parent, and (2) on state transitions we need to adjust the account. 
+ * + * Regarding (1): we do not have (and do not want) a single lock covering all + * operations, so in general it's impossible to get a consistent view of + * a tree with respect to d_inuse counters (except by swsuspend). It also + * means if a dentry with d_inuse of 0 gets one new in-use child and loses + * one, it's d_inuse counter will go either 0 -> 1 -> 0 path or 0 -> -1 -> 0, + * and we can't say which way. + * Note that path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since + * uncharge can be done only after return from charge (with d_genocide being + * the only apparent exception). + * Regarding (2): there is a similar uncertainty with the dcache account. + * If the account is equal to the limit, one more dentry is started to be + * used and one is put, the account will either hit the limit (and an error + * will be returned), or decrement will happen before increment. + * + * These races do not really matter. + * The only things we want are: + * - if a system is suspenede with no in-use dentries, all d_inuse counters + * should be correct (-1); + * - d_inuse counters should always be >= -1. + * This holds if ->parent references are accessed and maintained properly. + * In subtle moments (like d_move) dentries exchanging their parents should + * both be in-use. At d_genocide time, lookups and charges are assumed to be + * impossible. + */ + +/* + * Hierarchical accounting + * UB argument must NOT be NULL + */ + +static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, + enum ub_severity sv) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv)) + goto out_mem; + if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv)) + goto out_dcache; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return 0; + +out_dcache: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); +out_mem: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +static void do_uncharge_dcache(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); + __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static int charge_dcache(struct user_beancounter *ub, unsigned long size, + enum ub_severity sv) +{ + struct user_beancounter *p, *q; + + for (p = ub; p != NULL; p = p->parent) { + if (do_charge_dcache(p, size, sv)) + goto unroll; + } + return 0; + +unroll: + for (q = ub; q != p; q = q->parent) + do_uncharge_dcache(q, size); + return -ENOMEM; +} + +void uncharge_dcache(struct user_beancounter *ub, unsigned long size) +{ + for (; ub != NULL; ub = ub->parent) + do_uncharge_dcache(ub, size); +} + +/* + * Simple helpers to do maintain account and d_ub field. 
+ */ + +static inline int d_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) { + put_beancounter(ub); + return -1; + } + d_bc->d_ub = ub; + return 0; +} + +static inline void d_forced_charge(struct dentry_beancounter *d_bc) +{ + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + charge_dcache(ub, d_bc->d_ubsize, UB_FORCE); + d_bc->d_ub = ub; +} + +/* + * Minor helpers + */ + +extern struct kmem_cache *dentry_cache; +extern struct kmem_cache *inode_cachep; +static struct rw_semaphore ub_dentry_alloc_sem; + +static inline unsigned long d_charge_size(struct dentry *dentry) +{ + /* dentry's d_name is already set to appropriate value (see d_alloc) */ + return kmem_cache_objuse(inode_cachep) + kmem_cache_objuse(dentry_cache) + + (dname_external(dentry) ? + kmem_obj_objuse((void *)dentry->d_name.name) : 0); +} + +/* + * Entry points from dcache.c + */ + +/* + * Set initial d_inuse on d_alloc. + * Called with no locks, preemption disabled. + */ +int __ub_dentry_alloc(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + + d_bc = &dentry->dentry_bc; + d_bc->d_ub = get_beancounter(get_exec_ub()); + atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in dcache.h */ + d_bc->d_ubsize = d_charge_size(dentry); + + if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) + goto failure; + return 0; + +failure: + put_beancounter(d_bc->d_ub); + d_bc->d_ub = NULL; + return -ENOMEM; +} +void __ub_dentry_alloc_start(void) +{ + down_read(&ub_dentry_alloc_sem); + current->task_bc.dentry_alloc = 1; +} + +void __ub_dentry_alloc_end(void) +{ + current->task_bc.dentry_alloc = 0; + up_read(&ub_dentry_alloc_sem); +} + +/* + * It is assumed that parent is already in use, so traverse upwards is + * limited to one ancestor only. + * Called under d_lock and rcu_read_lock. + */ +int __ub_dentry_charge(struct dentry *dentry) +{ + struct dentry_beancounter *d_bc; + struct dentry *parent; + int ret; + + if (ub_dget_testone(dentry)) { + d_bc = &dentry->dentry_bc; + /* state transition -1 => 0 */ + if (d_charge(d_bc)) + goto failure; + + if (dentry != dentry->d_parent) { + parent = dentry->d_parent; + if (ub_dget_testone(parent)) + BUG(); + } + } + return 0; + +failure: + /* + * Here we would like to fail the lookup. + * It is not easy: if d_lookup fails, callers expect that a dentry + * with the given name doesn't exist, and create a new one. + * So, first we forcedly charge for this dentry. + * Then try to remove it from cache safely. If it turns out to be + * possible, we can return error. + */ + d_forced_charge(d_bc); + + if (dentry != dentry->d_parent) { + parent = dentry->d_parent; + if (ub_dget_testone(parent)) + BUG(); + } + + ret = 0; + if (spin_trylock(&dcache_lock)) { + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + rcu_read_unlock(); + shrink_dcache_parent(dentry); + rcu_read_lock(); + spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + } + if (atomic_read(&dentry->d_count) == 1) { + __d_drop(dentry); + ret = -1; + } + spin_unlock(&dcache_lock); + } + + return ret; +} + +/* + * Go up in the tree decreasing d_inuse. + * Called under dcache_lock. 
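 *
 * [Editor's note, not in the original patch: the ub_dget_testone() and
 *  ub_dput_testzero() helpers used in this file are defined outside this
 *  hunk, presumably in the UBC dcache header.  Judging by the "state
 *  transition" comments here, ub_dget_testone() atomically increments
 *  d_inuse and returns true only on the -1 -> 0 transition (first user of
 *  the dentry), while ub_dput_testzero() decrements it and returns true
 *  only on the 0 -> -1 transition (last user gone); that is, they behave
 *  like atomic_inc_and_test() and atomic_add_negative(-1, ...) under the
 *  "d_inuse >= -1" invariant stated at the top of the file.]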
+ */ +void __ub_dentry_uncharge(struct dentry *dentry) +{ + struct dentry *parent; + struct user_beancounter *ub; + unsigned long size; + + /* go up until state doesn't change or and root is reached */ + size = dentry->dentry_bc.d_ubsize; + ub = dentry->dentry_bc.d_ub; + while (ub_dput_testzero(dentry)) { + /* state transition 0 => -1 */ + uncharge_dcache(ub, size); + put_beancounter(ub); + + parent = dentry->d_parent; + if (dentry == parent) + break; + + dentry = parent; + size = dentry->dentry_bc.d_ubsize; + ub = dentry->dentry_bc.d_ub; + } +} + +/* + * Forced charge for __dget_locked, where API doesn't allow to return error. + * Called under dcache_lock. + */ +void __ub_dentry_charge_nofail(struct dentry *dentry) +{ + struct dentry *parent; + + while (ub_dget_testone(dentry)) { + /* state transition -1 => 0 */ + d_forced_charge(&dentry->dentry_bc); + + parent = dentry->d_parent; + if (dentry == parent) + break; + dentry = parent; + } +} + +/* + * Adaptive accounting + */ + +int ub_dentry_on = 1; +int ub_dentry_alloc_barrier; +EXPORT_SYMBOL(ub_dentry_on); + +static unsigned long checklowat = 0; +static unsigned long checkhiwat = ULONG_MAX; + +static int sysctl_ub_dentry_chk = 10; +#define sysctl_ub_lowat sysctl_ub_watermark[0] +#define sysctl_ub_hiwat sysctl_ub_watermark[1] +static DECLARE_RWSEM(ub_dentry_alloc_sem); +/* 1024th of lowmem size */ +static unsigned int sysctl_ub_watermark[2] = {0, 100}; + +static void ub_dentry_set_limits(unsigned long pages, unsigned long cap) +{ + down_write(&ub_dentry_alloc_sem); + preempt_disable(); + checklowat = (pages >> 10) * sysctl_ub_lowat; + checkhiwat = (pages >> 10) * sysctl_ub_hiwat; + if (checkhiwat > cap) { + checkhiwat = cap; + checklowat = cap / sysctl_ub_hiwat * sysctl_ub_lowat; + } + preempt_enable(); + up_write(&ub_dentry_alloc_sem); +} + +static int ub_dentry_proc_handler(ctl_table *ctl, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + + r = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + if (!r && write) + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + ULONG_MAX); + return r; +} + +static ctl_table ub_dentry_sysctl_table[] = { + { + .procname = "dentry_check", + .data = &sysctl_ub_dentry_chk, + .maxlen = sizeof(sysctl_ub_dentry_chk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "dentry_watermark", + .data = &sysctl_ub_lowat, + .maxlen = sizeof(sysctl_ub_lowat) * 2, + .mode = 0644, + .proc_handler = ub_dentry_proc_handler, + }, + { .ctl_name = 0 } +}; +static ctl_table ub_dentry_sysctl_root[] = { + { + .procname = "ubc", + .mode = 0555, + .child = ub_dentry_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static int __init ub_dentry_init(void) +{ + /* + * Initial watermarks are limited, to limit walk time. + * 384MB translates into 0.8 sec on PIII 866MHz. + */ + ub_dentry_set_limits(totalram_pages - totalhigh_pages, + 384 * 1024 * 1024 / PAGE_SIZE); + if (register_sysctl_table(ub_dentry_sysctl_root) == NULL) + return -ENOMEM; + return 0; +} +__initcall(ub_dentry_init); diff -uprN linux-2.6.24/kernel/bc/io_acct.c linux-2.6.24.ovz/kernel/bc/io_acct.c --- linux-2.6.24/kernel/bc/io_acct.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/io_acct.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,514 @@ +/* + * kernel/bc/io_acct.c + * + * Copyright (C) 2006 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * Pavel Emelianov + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct mempool_s *pb_pool; + +#define PB_MIN_IO (1024) + +static inline struct page_beancounter *io_pb_alloc(void) +{ + return mempool_alloc(pb_pool, GFP_ATOMIC); +} + +static inline void io_pb_free(struct page_beancounter *pb) +{ + mempool_free(pb, pb_pool); +} + +struct page_beancounter **page_pblist(struct page *page) +{ + struct page_beancounter **pb, *iopb; + + pb = &page_pbc(page); + iopb = iopb_to_pb(*pb); + + return iopb == NULL ? pb : &iopb->page_pb_list; +} + +/* + * We save the context page was set dirty to use it later + * when the real write starts. If the page is mapped then + * IO pb is stores like this: + * + * Before saving: + * + * +- page -------+ + * | ... | + * | page_pb +---+ + * +--------------+ | +-----+ +-----+ +-----+ + * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+ + * +-----+ +-----+ +-----+ | + * ^ | + * +---------------------------------+ + * + * After saving: + * + * +- page -------+ +- io pb ------+ + * | ... | | ... | + * | page_pb +----> | page_pb_list +-+ + * +--------------+ +--------------+ | + * | + * +-------------------+ + * | + * | +-----+ +-----+ +-----+ + * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+ + * +-----+ +-----+ +-----+ | + * ^ | + * +---------------------------------+ + * + * And the page_pblist(...) function returns pointer to the place that + * points to this pbX ring. + */ + +#ifdef CONFIG_BC_DEBUG_IO +static LIST_HEAD(pb_io_list); +static unsigned long anon_pages, not_released; + +static inline void io_debug_save(struct page_beancounter *pb, + struct page_beancounter *mpb) +{ + pb->io_debug = (mpb == NULL); + list_add(&pb->io_list, &pb_io_list); +} + +static inline void io_debug_release(struct page_beancounter *pb) +{ + list_del(&pb->io_list); +} + +void ub_io_release_debug(struct page *page) +{ + struct page_beancounter *pb; + static int once = 0; + + pb = page_pbc(page); + if (likely(iopb_to_pb(pb) == NULL)) + return; + + if (!once) { + printk("BUG: Page has an IO bc but is not expectd to\n"); + dump_stack(); + once = 1; + } + + spin_lock(&pb_lock); + not_released++; + pb = iopb_to_pb(pb); + page_pbc(page) = NULL; + io_debug_release(pb); + pb->ub->io_pb_held--; + spin_unlock(&pb_lock); + + put_beancounter(pb->ub); + io_pb_free(pb); +} + +static inline int io_debug_precheck_save(struct page *page) +{ + if (unlikely(PageAnon(page))) { + anon_pages++; + return 1; + } + + return 0; +} + +static inline int io_debug_precheck_release(struct page *page) +{ + return 0; +} +#else +#define io_debug_save(pb, mpb) do { } while (0) +#define io_debug_release(pb) do { } while (0) +#define io_debug_precheck_save(page) (0) +#define io_debug_precheck_release(p) (0) +#endif + +static inline void set_page_io(struct page *page, struct page_beancounter *pb, + struct page_beancounter *mapped_pb) +{ + unsigned long val; + + val = (unsigned long)pb | PAGE_IO_MARK; + pb->page = page; + + page_pbc(page) = (struct page_beancounter *)val; + io_debug_save(pb, mapped_pb); + pb->ub->io_pb_held++; +} + +static inline void put_page_io(struct page *page, struct page_beancounter *pb) +{ + pb->ub->io_pb_held--; + io_debug_release(pb); + page_pbc(page) = pb->page_pb_list; +} + +void ub_io_save_context(struct page *page, size_t bytes_dirtied) +{ + struct user_beancounter *ub; + struct page_beancounter *pb, *mapped_pb, *io_pb; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + /* + * FIXME - this can happen 
from atomic context and + * it's probably not that good to loose some requests + */ + + pb = io_pb_alloc(); + io_pb = NULL; + + spin_lock(&pb_lock); + if (io_debug_precheck_save(page)) + goto out_unlock; + + mapped_pb = page_pbc(page); + io_pb = iopb_to_pb(mapped_pb); + if (io_pb != NULL) { + /* + * this page has an IO - release it and force a new one + * We could also race with page cleaning - see below + */ + mapped_pb = io_pb->page_pb_list; + put_page_io(page, io_pb); + } + + /* + * If the page is mapped we must save the context + * it maps to. If the page isn't mapped we use current + * context as this is a regular write. + */ + + if (mapped_pb != NULL) + ub = top_beancounter(mapped_pb->ub); + else + ub = get_io_ub(); + + if (!PageDirty(page)) { + /* + * race with clear_page_dirty(_for_io) - account + * writes for ub_io_release_context() + */ + if (io_pb != NULL) + io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE; + if (pb != NULL) + io_pb_free(pb); + goto out_unlock; + } + + if (pb == NULL) { + ub->bytes_dirty_missed += bytes_dirtied; + goto out_unlock; + } + + /* + * the page may become clean here, but the context will be seen + * in ub_io_release_context() + */ + + pb->ub = get_beancounter(ub); + pb->page_pb_list = mapped_pb; + ub->bytes_dirtied += bytes_dirtied; + + set_page_io(page, pb, mapped_pb); + +out_unlock: + spin_unlock(&pb_lock); + + if (io_pb != NULL) { + put_beancounter(io_pb->ub); + io_pb_free(io_pb); + } +} + +void ub_io_release_context(struct page *page, size_t wrote) +{ + struct page_beancounter *pb; + + if (io_debug_precheck_release(page)) + return; + + if (unlikely(in_interrupt())) { + WARN_ON_ONCE(1); + return; + } + + spin_lock(&pb_lock); + pb = iopb_to_pb(page_pbc(page)); + if (unlikely(pb == NULL)) + /* + * this may happen if we failed to allocate + * context in ub_io_save_context or raced with it + */ + goto out_unlock; + + if (wrote) + pb->ub->bytes_wrote += wrote; + + put_page_io(page, pb); +out_unlock: + spin_unlock(&pb_lock); + + if (pb != NULL) { + put_beancounter(pb->ub); + io_pb_free(pb); + } +} + +void __init ub_init_io(struct kmem_cache *pb_cachep) +{ + pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep); + if (pb_pool == NULL) + panic("Can't create pb_pool"); +} + +#ifdef CONFIG_PROC_FS +#define in_flight(var) (var > var##_done ? 
var - var##_done : 0) + +static int bc_ioacct_show(struct seq_file *f, void *v) +{ + int i; + unsigned long long read, write, cancel; + unsigned long sync, sync_done; + unsigned long fsync, fsync_done; + unsigned long fdsync, fdsync_done; + unsigned long frsync, frsync_done; + unsigned long reads, writes; + unsigned long long rchar, wchar; + struct user_beancounter *ub; + + ub = seq_beancounter(f); + + read = write = cancel = 0; + sync = sync_done = fsync = fsync_done = + fdsync = fdsync_done = frsync = frsync_done = 0; + reads = writes = 0; + rchar = wchar = 0; + for_each_online_cpu(i) { + struct ub_percpu_struct *ub_percpu; + ub_percpu = per_cpu_ptr(ub->ub_percpu, i); + + read += ub_percpu->bytes_read; + write += ub_percpu->bytes_wrote; + cancel += ub_percpu->bytes_cancelled; + + sync += ub_percpu->sync; + fsync += ub_percpu->fsync; + fdsync += ub_percpu->fdsync; + frsync += ub_percpu->frsync; + sync_done += ub_percpu->sync_done; + fsync_done += ub_percpu->fsync_done; + fdsync_done += ub_percpu->fdsync_done; + frsync_done += ub_percpu->frsync_done; + + reads += ub_percpu->read; + writes += ub_percpu->write; + rchar += ub_percpu->rchar; + wchar += ub_percpu->wchar; + } + + seq_printf(f, bc_proc_llu_fmt, "read", read); + seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write); + seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied); + seq_printf(f, bc_proc_llu_fmt, "cancel", cancel); + seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync); + + seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync)); + seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync)); + seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fsync)); + seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync)); + + seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads); + seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar); + seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes); + seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar); + + seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held); + return 0; +} + +static struct bc_proc_entry bc_ioacct_entry = { + .name = "ioacct", + .u.show = bc_ioacct_show, +}; + +#ifdef CONFIG_BC_DEBUG_IO +#define PTR_SIZE (int)(sizeof(void *) * 2) +#define INT_SIZE (int)(sizeof(int) * 2) + +static int bc_io_show(struct seq_file *f, void *v) +{ + struct list_head *lh; + struct page_beancounter *pb; + struct page *pg; + + lh = (struct list_head *)v; + if (lh == &pb_io_list) { + seq_printf(f, "Races: anon %lu missed %lu\n", + anon_pages, not_released); + + seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s " + "%-*s %-*s %-1s %-*s %-*s\n", + PTR_SIZE, "pb", "", + PTR_SIZE, "page", "flg", + INT_SIZE, "cnt", INT_SIZE, "mcnt", + PTR_SIZE, "pb_list", + PTR_SIZE, "page_pb", "", + PTR_SIZE, "mapping", + INT_SIZE, "ub"); + return 0; + } + + pb = list_entry(lh, struct page_beancounter, io_list); + pg = pb->page; + seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n", + pb, pb->io_debug ? 'e' : 'm', pg, + PageDirty(pg) ? 'D' : 'd', + PageAnon(pg) ? 'A' : 'a', + PageWriteback(pg) ? 'W' : 'w', + PageLocked(pg) ? 'L' : 'l', + INT_SIZE, page_count(pg), + INT_SIZE, page_mapcount(pg), + pb->page_pb_list, page_pbc(pg), + iopb_to_pb(page_pbc(pg)) == pb ? 
' ' : '!', + pg->mapping, pb->ub->ub_uid); + return 0; +} + +static void *bc_io_start(struct seq_file *f, loff_t *ppos) +{ + loff_t pos; + struct list_head *lh; + + pos = *ppos; + spin_lock(&pb_lock); + if (pos == 0) + return &pb_io_list; + + list_for_each (lh, &pb_io_list) + if (pos-- == 1) + return lh; + return NULL; +} + +static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct list_head *lh; + + (*ppos)++; + lh = (struct list_head *)v; + return lh->next == &pb_io_list ? NULL : lh->next; +} + +static void bc_io_stop(struct seq_file *f, void *v) +{ + spin_unlock(&pb_lock); +} + +static struct seq_operations bc_io_seq_ops = { + .start = bc_io_start, + .next = bc_io_next, + .stop = bc_io_stop, + .show = bc_io_show, +}; + +static int bc_io_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &bc_io_seq_ops); +} +static struct file_operations bc_io_debug_ops = { + .open = bc_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_ioacct_debug_entry = { + .name = "ioacct_debug", + .u.fops = &bc_io_debug_ops, +}; +#endif + +static int bc_ioacct_notify(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + unsigned long *vm_events; + unsigned long long bin, bout; + int i; + + if (event != VIRTINFO_VMSTAT) + return old_ret; + + ub = top_beancounter(get_exec_ub()); + if (ub == get_ub0()) + return old_ret; + + /* Think over: do we need to account here bytes_dirty_missed? */ + bout = ub->bytes_wrote; + bin = 0; + for_each_online_cpu(i) { + bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote; + bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read; + } + + /* convert to Kbytes */ + bout >>= 10; + bin >>= 10; + + vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS; + vm_events[PGPGOUT] = (unsigned long)bout; + vm_events[PGPGIN] = (unsigned long)bin; + return NOTIFY_OK; +} + +static struct vnotifier_block bc_ioacct_nb = { + .notifier_call = bc_ioacct_notify, +}; + +static int __init bc_ioacct_init(void) +{ +#ifdef CONFIG_BC_DEBUG_IO + bc_register_proc_root_entry(&bc_ioacct_debug_entry); +#endif + bc_register_proc_entry(&bc_ioacct_entry); + + virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb); + return 0; +} + +late_initcall(bc_ioacct_init); +#endif diff -uprN linux-2.6.24/kernel/bc/io_prio.c linux-2.6.24.ovz/kernel/bc/io_prio.c --- linux-2.6.24/kernel/bc/io_prio.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/io_prio.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,288 @@ +/* + * kernel/bc/io_prio.c + * + * Copyright (C) 2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + * Vasily Tarasov + * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct cfq_bc_data *__find_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + + list_for_each_entry(cfq_bc, &iopriv->cfq_bc_head, cfq_bc_list) + if (cfq_bc->cfqd == cfqd) + return cfq_bc; + + return NULL; +} + +struct cfq_bc_data *bc_find_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + unsigned long flags; + + read_lock_irqsave(&iopriv->cfq_bc_list_lock, flags); + cfq_bc = __find_cfq_bc(iopriv, cfqd); + read_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags); + return cfq_bc; +} +struct cfq_bc_data *bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, + struct cfq_data *cfqd, gfp_t gfp_mask) +{ + struct cfq_bc_data *cfq_bc_new; + struct cfq_bc_data *cfq_bc; + unsigned long flags; + + cfq_bc = bc_find_cfq_bc(iopriv, cfqd); + if (cfq_bc) + return cfq_bc; + + cfq_bc_new = kzalloc(sizeof(*cfq_bc_new), gfp_mask); + if (!cfq_bc_new) + return NULL; + + cfq_init_cfq_bc(cfq_bc_new); + cfq_bc_new->cfqd = cfqd; + cfq_bc_new->ub_iopriv = iopriv; + + write_lock_irqsave(&iopriv->cfq_bc_list_lock, flags); + cfq_bc = __find_cfq_bc(iopriv, cfqd); + if (cfq_bc) + kfree(cfq_bc_new); + else { + list_add_tail(&cfq_bc_new->cfq_bc_list, + &iopriv->cfq_bc_head); + cfq_bc = cfq_bc_new; + } + write_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags); + + return cfq_bc; +} + +void bc_init_ioprio(struct ub_iopriv *iopriv) +{ + INIT_LIST_HEAD(&iopriv->cfq_bc_head); + rwlock_init(&iopriv->cfq_bc_list_lock); + iopriv->ioprio = UB_IOPRIO_BASE; +} + +static void inline bc_cfq_bc_check_empty(struct cfq_bc_data *cfq_bc) +{ + BUG_ON(!RB_EMPTY_ROOT(&cfq_bc->service_tree.rb)); +} + +static void bc_release_cfq_bc(struct cfq_bc_data *cfq_bc) +{ + struct cfq_data *cfqd; + elevator_t *eq; + int i; + + cfqd = cfq_bc->cfqd; + eq = cfqd->queue->elevator; + + for (i = 0; i < CFQ_PRIO_LISTS; i++) { + if (cfq_bc->async_cfqq[0][i]) { + eq->ops->put_queue(cfq_bc->async_cfqq[0][i]); + cfq_bc->async_cfqq[0][i] = NULL; + } + if (cfq_bc->async_cfqq[1][i]) { + eq->ops->put_queue(cfq_bc->async_cfqq[1][i]); + cfq_bc->async_cfqq[1][i] = NULL; + } + } + if (cfq_bc->async_idle_cfqq) { + eq->ops->put_queue(cfq_bc->async_idle_cfqq); + cfq_bc->async_idle_cfqq = NULL; + } + /* + * Note: this cfq_bc is already not in active list, + * but can be still pointed from cfqd as active. + */ + cfqd->active_cfq_bc = NULL; + + bc_cfq_bc_check_empty(cfq_bc); + list_del(&cfq_bc->cfq_bc_list); + kfree(cfq_bc); +} + +void bc_fini_ioprio(struct ub_iopriv *iopriv) +{ + struct cfq_bc_data *cfq_bc; + struct cfq_bc_data *cfq_bc_tmp; + unsigned long flags; + spinlock_t *queue_lock; + + /* + * Don't get cfq_bc_list_lock since ub is already dead, + * but async cfqqs are still in hash list, consequently + * queue_lock should be hold. 
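 *
 * [Editor's note, not in the original patch: cfq_bc_data is the per
 *  (beancounter, cfq_data) scheduling context.  It is created lazily by
 *  bc_findcreate_cfq_bc() above and torn down along one of two paths:
 *  here, when the beancounter itself dies, or in bc_cfq_exit_queue(),
 *  when the queue goes away first; both paths end up in
 *  bc_release_cfq_bc(), which drops the async cfq queues and unlinks the
 *  structure.]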
+ */ + list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, + &iopriv->cfq_bc_head, cfq_bc_list) { + queue_lock = cfq_bc->cfqd->queue->queue_lock; + spin_lock_irqsave(queue_lock, flags); + bc_release_cfq_bc(cfq_bc); + spin_unlock_irqrestore(queue_lock, flags); + } +} + +void bc_cfq_exit_queue(struct cfq_data *cfqd) +{ + struct cfq_bc_data *cfq_bc; + struct user_beancounter *ub; + + local_irq_disable(); + for_each_beancounter(ub) { + write_lock(&ub->iopriv.cfq_bc_list_lock); + cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); + if (!cfq_bc) { + write_unlock(&ub->iopriv.cfq_bc_list_lock); + continue; + } + bc_release_cfq_bc(cfq_bc); + write_unlock(&ub->iopriv.cfq_bc_list_lock); + } + local_irq_enable(); +} + +int bc_expired(struct cfq_data *cfqd) +{ + return time_after(jiffies, cfqd->slice_end) ? 1 : 0; +} + +static inline int bc_empty(struct cfq_bc_data *cfq_bc) +{ + /* + * consider BC as empty only if there is no requests + * in elevator _and_ in driver + */ + if (!cfq_bc->rqnum && !cfq_bc->on_dispatch) + return 1; + + return 0; +} + +static inline unsigned long bc_time_slice_by_ioprio(unsigned int ioprio, + unsigned int base_slice) +{ + return base_slice + + (base_slice * (ioprio - UB_IOPRIO_MIN)) + / (UB_IOPRIO_MAX - UB_IOPRIO_MIN - 1); +} + +static inline void bc_set_active(struct cfq_data *cfqd) +{ + if (list_empty(&cfqd->act_cfq_bc_head)) { + cfqd->active_cfq_bc = NULL; + return; + } + + cfqd->active_cfq_bc = list_first_entry(&cfqd->act_cfq_bc_head, + struct cfq_bc_data, act_cfq_bc_list); + list_move_tail(&cfqd->active_cfq_bc->act_cfq_bc_list, + &cfqd->act_cfq_bc_head); + cfqd->slice_end = jiffies + + bc_time_slice_by_ioprio(cfqd->active_cfq_bc->ub_iopriv->ioprio, + cfqd->cfq_ub_slice); +} + +void bc_schedule_active(struct cfq_data *cfqd) +{ + if (bc_expired(cfqd) || !cfqd->active_cfq_bc || + bc_empty(cfqd->active_cfq_bc)) + bc_set_active(cfqd); +} + +void bc_inc_rqnum(struct cfq_queue *cfqq) +{ + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqq->cfq_bc; + + if (!cfq_bc->rqnum) + list_add_tail(&cfq_bc->act_cfq_bc_list, + &cfqq->cfqd->act_cfq_bc_head); + + cfq_bc->rqnum++; +} + +void bc_dec_rqnum(struct cfq_queue *cfqq) +{ + struct cfq_bc_data *cfq_bc; + + cfq_bc = cfqq->cfq_bc; + + cfq_bc->rqnum--; + + if (!cfq_bc->rqnum) + list_del(&cfq_bc->act_cfq_bc_list); +} + +unsigned long bc_set_ioprio(int ubid, int ioprio) +{ + struct user_beancounter *ub; + + if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX) + return -ERANGE; + + ub = get_beancounter_byuid(ubid, 0); + if (!ub) + return -ESRCH; + + ub->iopriv.ioprio = ioprio; + put_beancounter(ub); + + return 0; +} + +struct user_beancounter *bc_io_switch_context(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *old_ub = NULL; + + pb = page_iopb(page); + pb = iopb_to_pb(pb); + if (pb) { + get_beancounter(pb->ub); + old_ub = set_exec_ub(pb->ub); + } + + return old_ub; +} + +void bc_io_restore_context(struct user_beancounter *ub) +{ + struct user_beancounter *old_ub; + + if (ub) { + old_ub = set_exec_ub(ub); + put_beancounter(old_ub); + } +} + +EXPORT_SYMBOL(bc_io_switch_context); +EXPORT_SYMBOL(bc_io_restore_context); +EXPORT_SYMBOL(__find_cfq_bc); +EXPORT_SYMBOL(bc_fini_ioprio); +EXPORT_SYMBOL(bc_init_ioprio); +EXPORT_SYMBOL(bc_findcreate_cfq_bc); +EXPORT_SYMBOL(bc_cfq_exit_queue); +EXPORT_SYMBOL(bc_expired); +EXPORT_SYMBOL(bc_schedule_active); +EXPORT_SYMBOL(bc_inc_rqnum); +EXPORT_SYMBOL(bc_dec_rqnum); diff -uprN linux-2.6.24/kernel/bc/kmem.c linux-2.6.24.ovz/kernel/bc/kmem.c --- linux-2.6.24/kernel/bc/kmem.c 1969-12-31 
19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/kmem.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,406 @@ +/* + * kernel/bc/kmem.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Initialization + */ + +/* + * Slab accounting + */ + +#ifdef CONFIG_BC_DEBUG_KMEM + +#define CC_HASH_SIZE 1024 +static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; +spinlock_t cc_lock; + +static void __free_cache_counters(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + struct ub_cache_counter *cc, **pprev, *del; + int i; + unsigned long flags; + + del = NULL; + spin_lock_irqsave(&cc_lock, flags); + for (i = 0; i < CC_HASH_SIZE; i++) { + pprev = &cc_hash[i]; + cc = cc_hash[i]; + while (cc != NULL) { + if (cc->ub != ub && cc->cachep != cachep) { + pprev = &cc->next; + cc = cc->next; + continue; + } + + list_del(&cc->ulist); + *pprev = cc->next; + cc->next = del; + del = cc; + cc = *pprev; + } + } + spin_unlock_irqrestore(&cc_lock, flags); + + while (del != NULL) { + cc = del->next; + kfree(del); + del = cc; + } +} + +void ub_free_counters(struct user_beancounter *ub) +{ + __free_cache_counters(ub, NULL); +} + +void ub_kmemcache_free(struct kmem_cache *cachep) +{ + __free_cache_counters(NULL, cachep); +} + +void __init init_cache_counters(void) +{ + memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); + spin_lock_init(&cc_lock); +} + +#define cc_hash_fun(ub, cachep) ( \ + (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ + ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ + ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ + ) & (CC_HASH_SIZE - 1)) + +static int change_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep, long val) +{ + struct ub_cache_counter *cc, *new_cnt, **pprev; + unsigned long flags; + + new_cnt = NULL; +again: + spin_lock_irqsave(&cc_lock, flags); + cc = cc_hash[cc_hash_fun(ub, cachep)]; + while (cc) { + if (cc->ub == ub && cc->cachep == cachep) + goto found; + cc = cc->next; + } + + if (new_cnt != NULL) + goto insert; + + spin_unlock_irqrestore(&cc_lock, flags); + + new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC); + if (new_cnt == NULL) + return -ENOMEM; + + new_cnt->counter = 0; + new_cnt->ub = ub; + new_cnt->cachep = cachep; + goto again; + +insert: + pprev = &cc_hash[cc_hash_fun(ub, cachep)]; + new_cnt->next = *pprev; + *pprev = new_cnt; + list_add(&new_cnt->ulist, &ub->ub_cclist); + cc = new_cnt; + new_cnt = NULL; + +found: + cc->counter += val; + spin_unlock_irqrestore(&cc_lock, flags); + if (new_cnt) + kfree(new_cnt); + return 0; +} + +static inline int inc_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + return change_slab_charged(ub, cachep, 1); +} + +static inline void dec_slab_charged(struct user_beancounter *ub, + struct kmem_cache *cachep) +{ + if (change_slab_charged(ub, cachep, -1) < 0) + BUG(); +} + +#include + +#define inc_pages_charged(ub, order) ub_percpu_add(ub, \ + pages_charged, 1 << order) +#define dec_pages_charged(ub, order) ub_percpu_sub(ub, \ + pages_charged, 1 << order) + +#ifdef CONFIG_PROC_FS +static int bc_kmem_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + struct ub_cache_counter *cc; + long pages, vmpages, pbc; + int i; + + ub = seq_beancounter(f); + + pages = vmpages = pbc = 
0; + for_each_online_cpu(i) { + pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged; + vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged; + pbc += per_cpu_ptr(ub->ub_percpu, i)->pbcs; + } + if (pages < 0) + pages = 0; + if (vmpages < 0) + vmpages = 0; + + seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE); + seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", pbc, + sizeof(struct page_beancounter)); + + spin_lock_irq(&cc_lock); + list_for_each_entry (cc, &ub->ub_cclist, ulist) { + struct kmem_cache *cachep; + + cachep = cc->cachep; + seq_printf(f, bc_proc_lu_lu_fmt, + kmem_cache_name(cachep), + cc->counter, + kmem_cache_objuse(cachep)); + } + spin_unlock_irq(&cc_lock); + return 0; +} + +static struct bc_proc_entry bc_kmem_debug_entry = { + .name = "kmem_debug", + .u.show = bc_kmem_debug_show, +}; + +static int __init bc_kmem_debug_init(void) +{ + bc_register_proc_entry(&bc_kmem_debug_entry); + return 0; +} + +late_initcall(bc_kmem_debug_init); +#endif + +#else +#define inc_slab_charged(ub, cache) (0) +#define dec_slab_charged(ub, cache) do { } while (0) +#define inc_pages_charged(ub, cache) do { } while (0) +#define dec_pages_charged(ub, cache) do { } while (0) +#endif + +#define UB_KMEM_QUANT (PAGE_SIZE * 4) + +/* called with IRQ disabled */ +int ub_kmemsize_charge(struct user_beancounter *ub, + unsigned long size, + enum ub_severity strict) +{ + struct task_beancounter *tbc; + + tbc = ¤t->task_bc; + if (ub != tbc->task_ub || size > UB_KMEM_QUANT) + goto just_charge; + if (tbc->kmem_precharged >= size) { + tbc->kmem_precharged -= size; + return 0; + } + + if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) { + tbc->kmem_precharged += UB_KMEM_QUANT - size; + return 0; + } + +just_charge: + return charge_beancounter(ub, UB_KMEMSIZE, size, strict); +} + +/* called with IRQ disabled */ +void ub_kmemsize_uncharge(struct user_beancounter *ub, + unsigned long size) +{ + struct task_beancounter *tbc; + + if (size > UB_MAXVALUE) { + printk("ub_kmemsize_uncharge: size %lu\n", size); + dump_stack(); + } + + tbc = ¤t->task_bc; + if (ub != tbc->task_ub) + goto just_uncharge; + + tbc->kmem_precharged += size; + if (tbc->kmem_precharged < UB_KMEM_QUANT * 2) + return; + size = tbc->kmem_precharged - UB_KMEM_QUANT; + tbc->kmem_precharged -= size; + +just_uncharge: + uncharge_beancounter(ub, UB_KMEMSIZE, size); +} + +/* called with IRQ disabled */ +int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags) +{ + unsigned int size; + struct user_beancounter *ub; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + return 0; + + size = CHARGE_SIZE(kmem_cache_objuse(cachep)); + if (ub_kmemsize_charge(ub, size, + (flags & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) + goto out_err; + + if (inc_slab_charged(ub, cachep) < 0) { + ub_kmemsize_uncharge(ub, size); + goto out_err; + } + *ub_slab_ptr(cachep, objp) = ub; + return 0; + +out_err: + put_beancounter(ub); + return -ENOMEM; +} + +/* called with IRQ disabled */ +void ub_slab_uncharge(struct kmem_cache *cachep, void *objp) +{ + unsigned int size; + struct user_beancounter **ub_ref; + + ub_ref = ub_slab_ptr(cachep, objp); + if (*ub_ref == NULL) + return; + + dec_slab_charged(*ub_ref, cachep); + size = CHARGE_SIZE(kmem_cache_objuse(cachep)); + ub_kmemsize_uncharge(*ub_ref, size); + put_beancounter(*ub_ref); + *ub_ref = NULL; +} + +/* + * Pages accounting + */ + +int ub_page_charge(struct page *page, int order, gfp_t mask) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = NULL; + if (!(mask & __GFP_UBC)) + goto out; + + ub = get_beancounter(get_exec_ub()); + if (ub == NULL) + goto out; + + local_irq_save(flags); + if (ub_kmemsize_charge(ub, CHARGE_ORDER(order), + (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) + goto err; + + inc_pages_charged(ub, order); + local_irq_restore(flags); +out: + BUG_ON(page_ub(page) != NULL); + page_ub(page) = ub; + return 0; + +err: + local_irq_restore(flags); + BUG_ON(page_ub(page) != NULL); + put_beancounter(ub); + return -ENOMEM; +} + +void ub_page_uncharge(struct page *page, int order) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = page_ub(page); + if (ub == NULL) + return; + + BUG_ON(ub->ub_magic != UB_MAGIC); + dec_pages_charged(ub, order); + local_irq_save(flags); + ub_kmemsize_uncharge(ub, CHARGE_ORDER(order)); + local_irq_restore(flags); + put_beancounter(ub); + page_ub(page) = NULL; +} + +/* + * takes init_mm.page_table_lock + * some outer lock to protect pages from vmalloced area must be held + */ +struct user_beancounter *vmalloc_ub(void *obj) +{ + struct page *pg; + + pg = vmalloc_to_page(obj); + if (pg == NULL) + return NULL; + + return page_ub(pg); +} + +EXPORT_SYMBOL(vmalloc_ub); + +struct user_beancounter *mem_ub(void *obj) +{ + struct user_beancounter *ub; + + if ((unsigned long)obj >= VMALLOC_START && + (unsigned long)obj < VMALLOC_END) + ub = vmalloc_ub(obj); + else + ub = slab_ub(obj); + + return ub; +} + +EXPORT_SYMBOL(mem_ub); diff -uprN linux-2.6.24/kernel/bc/misc.c linux-2.6.24.ovz/kernel/bc/misc.c --- linux-2.6.24/kernel/bc/misc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/misc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,455 @@ +/* + * kernel/bc/misc.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_FILE_MINQUANT 3 +#define UB_FILE_MAXQUANT 10 +#define UB_FILE_INIQUANT 4 + +static unsigned long ub_file_precharge(struct task_beancounter *task_bc, + struct user_beancounter *ub, unsigned long *kmemsize); + +extern struct kmem_cache *filp_cachep; + +static inline unsigned long ub_file_kmemsize(unsigned long nr) +{ + return CHARGE_SIZE(kmem_cache_objuse(filp_cachep)) * nr; +} + +/* + * Task staff + */ + +static void init_task_sub(struct task_struct *parent, + struct task_struct *tsk, + struct task_beancounter *old_bc) +{ + struct task_beancounter *new_bc; + struct user_beancounter *sub; + + new_bc = &tsk->task_bc; + sub = old_bc->fork_sub; + new_bc->fork_sub = get_beancounter(sub); + new_bc->task_fnode = NULL; + new_bc->task_freserv = old_bc->task_freserv; + old_bc->task_freserv = NULL; + memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); + new_bc->pgfault_handle = 0; + new_bc->pgfault_allot = 0; +} + +void ub_init_task_bc(struct task_beancounter *tbc) +{ + tbc->file_precharged = 0; + tbc->file_quant = UB_FILE_INIQUANT; + tbc->file_count = 0; + + tbc->kmem_precharged = 0; + tbc->dentry_alloc = 0; +} + +int ub_task_charge(struct task_struct *parent, struct task_struct *task) +{ + struct task_beancounter *old_bc; + struct task_beancounter *new_bc; + struct user_beancounter *ub, *pub; + unsigned long file_nr, kmemsize; + unsigned long flags; + + old_bc = &parent->task_bc; + ub = old_bc->fork_sub; + new_bc = &task->task_bc; + new_bc->task_ub = get_beancounter(ub); + new_bc->exec_ub = get_beancounter(ub); + + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC, + 1, UB_HARD) < 0)) + goto out_numproc; + + ub_init_task_bc(new_bc); + file_nr = ub_file_precharge(new_bc, pub, &kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + charge_beancounter_notop(ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + charge_beancounter_notop(ub, UB_NUMFILE, file_nr); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize); + } + + init_task_sub(parent, task, old_bc); + return 0; + +out_numproc: + spin_unlock_irqrestore(&pub->ub_lock, flags); + __put_beancounter_batch(ub, 2); + return -ENOMEM; +} + +extern atomic_t dbgpre; + +void ub_task_uncharge(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long file_nr, file_kmemsize; + unsigned long flags; + + task_bc = &task->task_bc; + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + __uncharge_beancounter_locked(pub, UB_NUMPROC, 1); + file_nr = task_bc->file_precharged; + if (likely(file_nr)) + __uncharge_beancounter_locked(pub, + UB_NUMFILE, file_nr); + + /* see comment in ub_file_charge */ + task_bc->file_precharged = 0; + file_kmemsize = ub_file_kmemsize(file_nr); + if (likely(file_kmemsize)) + __uncharge_beancounter_locked(pub, + UB_KMEMSIZE, file_kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + + uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1); + if (likely(file_nr)) { + uncharge_beancounter_notop(task_bc->task_ub, + UB_NUMFILE, file_nr); + __put_beancounter_batch(task_bc->task_ub, file_nr); + } + if (likely(file_kmemsize)) + uncharge_beancounter_notop(task_bc->task_ub, + UB_KMEMSIZE, file_kmemsize); +} + +void ub_task_put(struct task_struct *task) +{ + struct task_beancounter *task_bc; + struct user_beancounter *pub; + unsigned long 
kmemsize, flags; + + task_bc = &task->task_bc; + + pub = top_beancounter(task_bc->task_ub); + spin_lock_irqsave(&pub->ub_lock, flags); + kmemsize = task_bc->kmem_precharged; + task_bc->kmem_precharged = 0; + if (likely(kmemsize)) + __uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize); + spin_unlock_irqrestore(&pub->ub_lock, flags); + if (likely(kmemsize)) + uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize); + + put_beancounter(task_bc->exec_ub); + put_beancounter(task_bc->task_ub); + put_beancounter(task_bc->fork_sub); + /* can't be freed elsewhere, failures possible in the middle of fork */ + if (task_bc->task_freserv != NULL) + kfree(task_bc->task_freserv); + + task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; + task_bc->task_ub = (struct user_beancounter *)0xdead100c; + BUG_ON(task_bc->kmem_precharged != 0); +} + +/* + * Files and file locks. + */ +/* + * For NUMFILE, we do not take a lock and call charge function + * for every file. We try to charge in batches, keeping local reserve on + * task. For experimental purposes, batch size is adaptive and depends + * on numfile barrier, number of processes, and the history of successes and + * failures of batch charges. + * + * Per-task fields have the following meaning + * file_precharged number of files charged to beancounter in advance, + * file_quant logarithm of batch size + * file_count counter of charge successes, to reduce batch size + * fluctuations. + */ +static unsigned long ub_file_precharge(struct task_beancounter *task_bc, + struct user_beancounter *ub, unsigned long *kmemsize) +{ + unsigned long n, kmem; + + n = 1UL << task_bc->file_quant; + if (ub->ub_parms[UB_NUMPROC].held > + (ub->ub_parms[UB_NUMFILE].barrier >> + task_bc->file_quant)) + goto nopre; + if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD))) + goto nopre; + kmem = ub_file_kmemsize(n); + if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE, + kmem, UB_HARD))) + goto nopre_kmem; + + task_bc->file_precharged += n; + get_beancounter_batch(task_bc->task_ub, n); + task_bc->file_count++; + if (task_bc->file_quant < UB_FILE_MAXQUANT && + task_bc->file_count >= task_bc->file_quant) { + task_bc->file_quant++; + task_bc->file_count = 0; + } + *kmemsize = kmem; + return n; + +nopre_kmem: + __uncharge_beancounter_locked(ub, UB_NUMFILE, n); +nopre: + if (task_bc->file_quant > UB_FILE_MINQUANT) + task_bc->file_quant--; + task_bc->file_count = 0; + return 0; +} + +int ub_file_charge(struct file *f) +{ + struct user_beancounter *ub, *pub; + struct task_beancounter *task_bc; + unsigned long file_nr, kmem; + unsigned long flags; + int err; + + task_bc = ¤t->task_bc; + ub = get_exec_ub(); + if (unlikely(ub != task_bc->task_ub)) + goto just_charge; + + if (likely(task_bc->file_precharged > 0)) { + /* + * files are put via RCU in 2.6.16 so during + * this decrement an IRQ can happen and called + * ub_files_uncharge() will mess file_precharged + * + * ub_task_uncharge() is called via RCU also so no + * protection is needed there + * + * Xemul + */ + + local_irq_save(flags); + task_bc->file_precharged--; + local_irq_restore(flags); + + f->f_ub = ub; + return 0; + } + + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); + file_nr = ub_file_precharge(task_bc, pub, &kmem); + if (unlikely(!file_nr)) + goto last_try; + spin_unlock(&pub->ub_lock); + task_bc->file_precharged--; + local_irq_restore(flags); + + charge_beancounter_notop(ub, UB_NUMFILE, file_nr); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmem); + f->f_ub = 
ub; + return 0; + +just_charge: + pub = top_beancounter(ub); + spin_lock_irqsave(&pub->ub_lock, flags); +last_try: + kmem = ub_file_kmemsize(1); + err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD); + if (likely(!err)) { + err = __charge_beancounter_locked(pub, UB_KMEMSIZE, + kmem, UB_HARD); + if (unlikely(err)) + __uncharge_beancounter_locked(pub, UB_NUMFILE, 1); + } + spin_unlock_irqrestore(&pub->ub_lock, flags); + if (likely(!err)) { + charge_beancounter_notop(ub, UB_NUMFILE, 1); + charge_beancounter_notop(ub, UB_KMEMSIZE, kmem); + f->f_ub = get_beancounter(ub); + } + return err; +} + +void ub_file_uncharge(struct file *f) +{ + struct user_beancounter *ub, *pub; + struct task_beancounter *task_bc; + unsigned long nr; + + ub = f->f_ub; + task_bc = ¤t->task_bc; + if (likely(ub == task_bc->task_ub)) { + task_bc->file_precharged++; + pub = top_beancounter(ub); + if (ub_barrier_farnr(pub, UB_NUMFILE) && + ub_barrier_farsz(pub, UB_KMEMSIZE)) + return; + if (task_bc->file_precharged < (1UL << task_bc->file_quant)) + return; + nr = task_bc->file_precharged + - (1UL << (task_bc->file_quant - 1)); + task_bc->file_precharged -= nr; + __put_beancounter_batch(ub, nr); + uncharge_beancounter(ub, UB_NUMFILE, nr); + uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(nr)); + } else { + uncharge_beancounter(ub, UB_NUMFILE, 1); + uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1)); + put_beancounter(ub); + } +} + +int ub_flock_charge(struct file_lock *fl, int hard) +{ + struct user_beancounter *ub; + int err; + + /* No need to get_beancounter here since it's already got in slab */ + ub = slab_ub(fl); + if (ub == NULL) + return 0; + + err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT); + if (!err) + fl->fl_charged = 1; + return err; +} + +void ub_flock_uncharge(struct file_lock *fl) +{ + struct user_beancounter *ub; + + /* Ub will be put in slab */ + ub = slab_ub(fl); + if (ub == NULL || !fl->fl_charged) + return; + + uncharge_beancounter(ub, UB_NUMFLOCK, 1); + fl->fl_charged = 0; +} + +/* + * Signal handling + */ + +static int do_ub_siginfo_charge(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) + goto out_kmem; + + if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) + goto out_num; + + spin_unlock_irqrestore(&ub->ub_lock, flags); + return 0; + +out_num: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); +out_kmem: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return -ENOMEM; +} + +static void do_ub_siginfo_uncharge(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) +{ + unsigned long size; + struct user_beancounter *p, *q; + + size = CHARGE_SIZE(kmem_obj_objuse(sq)); + for (p = ub; p != NULL; p = p->parent) { + if (do_ub_siginfo_charge(p, size)) + goto unroll; + } + + sq->sig_ub = get_beancounter(ub); + return 0; + +unroll: + for (q = ub; q != p; q = q->parent) + do_ub_siginfo_uncharge(q, size); + return -ENOMEM; +} +EXPORT_SYMBOL(ub_siginfo_charge); + +void ub_siginfo_uncharge(struct sigqueue *sq) +{ + unsigned long size; + struct user_beancounter *ub, *p; + + p = ub = sq->sig_ub; + sq->sig_ub = NULL; + 
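/*
 * A minimal illustrative sketch (not part of the patch itself): the charge
 * path in ub_siginfo_charge() above walks from the given beancounter up to
 * the root, charging every level, and on failure unrolls only the levels
 * that were already charged.  The toy_* names below are hypothetical
 * stand-ins for do_ub_siginfo_charge()/do_ub_siginfo_uncharge(), using a
 * simplified counter that has nothing but a parent pointer and one limit.
 */
#include <errno.h>
#include <stddef.h>

struct toy_bc {
	struct toy_bc *parent;
	unsigned long held;
	unsigned long limit;
};

static int toy_charge(struct toy_bc *bc, unsigned long size)
{
	if (bc->held + size > bc->limit)
		return -ENOMEM;	/* this level is full, caller must unroll */
	bc->held += size;
	return 0;
}

static void toy_uncharge(struct toy_bc *bc, unsigned long size)
{
	bc->held -= size;
}

static int toy_charge_hierarchy(struct toy_bc *bc, unsigned long size)
{
	struct toy_bc *p, *q;

	/* charge every level from bc up to the root */
	for (p = bc; p != NULL; p = p->parent)
		if (toy_charge(p, size))
			goto unroll;
	return 0;

unroll:
	/* uncharge only the levels below the one that refused the charge */
	for (q = bc; q != p; q = q->parent)
		toy_uncharge(q, size);
	return -ENOMEM;
}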
size = CHARGE_SIZE(kmem_obj_objuse(sq)); + for (; ub != NULL; ub = ub->parent) + do_ub_siginfo_uncharge(ub, size); + put_beancounter(p); +} + +/* + * PTYs + */ + +int ub_pty_charge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + int retval; + + ub = slab_ub(tty); + retval = 0; + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + !test_bit(TTY_CHARGED, &tty->flags)) { + retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); + if (!retval) + set_bit(TTY_CHARGED, &tty->flags); + } + return retval; +} + +void ub_pty_uncharge(struct tty_struct *tty) +{ + struct user_beancounter *ub; + + ub = slab_ub(tty); + if (ub && tty->driver->subtype == PTY_TYPE_MASTER && + test_bit(TTY_CHARGED, &tty->flags)) { + uncharge_beancounter(ub, UB_NUMPTY, 1); + clear_bit(TTY_CHARGED, &tty->flags); + } +} diff -uprN linux-2.6.24/kernel/bc/net.c linux-2.6.24.ovz/kernel/bc/net.c --- linux-2.6.24/kernel/bc/net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/net.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1150 @@ +/* + * linux/kernel/bc/net.c + * + * Copyright (C) 1998-2004 Andrey V. Savochkin + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * TODO: + * - sizeof(struct inode) charge + * = tcp_mem_schedule() feedback based on ub limits + * + measures so that one socket won't exhaust all send buffers, + * see bug in bugzilla + * = sk->socket check for NULL in snd_wakeups + * (tcp_write_space checks for NULL itself) + * + in tcp_close(), orphaned socket abortion should be based on ubc + * resources (same in tcp_out_of_resources) + * Beancounter should also have separate orphaned socket counter... + * + for rcv, in-order segment should be accepted + * if only barrier is exceeded + * = tcp_rmem_schedule() feedback based on ub limits + * - repair forward_alloc mechanism for receive buffers + * It's idea is that some buffer space is pre-charged so that receive fast + * path doesn't need to take spinlocks and do other heavy stuff + * + tcp_prune_queue actions based on ub limits + * + window adjustments depending on available buffers for receive + * - window adjustments depending on available buffers for send + * + race around usewreserv + * + avoid allocating new page for each tiny-gram, see letter from ANK + * + rename ub_sock_lock + * + sk->sleep wait queue probably can be used for all wakeups, and + * sk->ub_wait is unnecessary + * + for UNIX sockets, the current algorithm will lead to + * UB_UNIX_MINBUF-sized messages only for non-blocking case + * - charge for af_packet sockets + * + all datagram sockets should be charged to NUMUNIXSOCK + * - we do not charge for skb copies and clones staying in device queues + * + live-lock if number of sockets is big and buffer limits are small + * [diff-ubc-dbllim3] + * - check that multiple readers/writers on the same socket won't cause fatal + * consequences + * - check allocation/charge orders + * + There is potential problem with callback_lock. In *snd_wakeup we take + * beancounter first, in sock_def_error_report - callback_lock first. + * then beancounter. This is not a problem if callback_lock taken + * readonly, but anyway... 
+ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator + * General kernel problems: + * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC + * notification won't get signals + * - datagram_poll looks racy + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* by some reason it is not used currently */ +#define UB_SOCK_MAINTAIN_WMEMPRESSURE 0 + + +/* Skb truesize definition. Bad place. Den */ + +static inline int skb_chargesize_head(struct sk_buff *skb) +{ + return skb_charge_size(skb_end_pointer(skb) - skb->head + + sizeof(struct skb_shared_info)); +} + +int skb_charge_fullsize(struct sk_buff *skb) +{ + int chargesize; + struct sk_buff *skbfrag; + + chargesize = skb_chargesize_head(skb) + + PAGE_SIZE * skb_shinfo(skb)->nr_frags; + if (likely(skb_shinfo(skb)->frag_list == NULL)) + return chargesize; + for (skbfrag = skb_shinfo(skb)->frag_list; + skbfrag != NULL; + skbfrag = skbfrag->next) { + chargesize += skb_charge_fullsize(skbfrag); + } + return chargesize; +} +EXPORT_SYMBOL(skb_charge_fullsize); + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size); + +int __ub_too_many_orphans(struct sock *sk, int count) +{ + struct user_beancounter *ub; + + if (sock_has_ubc(sk)) { + ub = top_beancounter(sock_bc(sk)->ub); + if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) + return 1; + } + return 0; +} + +/* + * Queueing + */ + +static void ub_sock_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_other_sk_list)) { + p = ub->ub_other_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * See comments in ub_tcp_snd_wakeup. + * Locking note: both unix_write_space and + * sock_def_write_space take callback_lock themselves. + * We take it here just to be on the safe side and to + * act the same way as ub_tcp_snd_wakeup does. 
+ */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, + UB_OTHERSOCKBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +static void ub_tcp_snd_wakeup(struct user_beancounter *ub) +{ + struct list_head *p; + struct sock *sk; + struct sock_beancounter *skbc; + struct socket *sock; + unsigned long added; + + while (!list_empty(&ub->ub_tcp_sk_list)) { + p = ub->ub_tcp_sk_list.next; + skbc = list_entry(p, struct sock_beancounter, ub_sock_list); + sk = skbc_sock(skbc); + + added = 0; + sock = sk->sk_socket; + if (sock == NULL) { + /* sk being destroyed */ + list_del_init(&skbc->ub_sock_list); + continue; + } + + ub_debug(UBD_NET_SLEEP, + "Checking queue, waiting %lu, reserv %lu\n", + skbc->ub_waitspc, skbc->poll_reserv); + added = -skbc->poll_reserv; + if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, + skbc->ub_waitspc)) + break; + added += skbc->poll_reserv; + + list_del_init(&skbc->ub_sock_list); + + /* + * Send async notifications and wake up. + * Locking note: we get callback_lock here because + * tcp_write_space is over-optimistic about calling context + * (socket lock is presumed). So we get the lock here although + * it belongs to the callback. + */ + sock_hold(sk); + read_lock(&sk->sk_callback_lock); + spin_unlock(&ub->ub_lock); + + sk->sk_write_space(sk); + read_unlock(&sk->sk_callback_lock); + + if (skbc->ub != ub && added) + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); + sock_put(sk); + + spin_lock(&ub->ub_lock); + } +} + +void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) +{ + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long added_reserv; + + if (!sock_has_ubc(sk)) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); + added_reserv = -skbc->poll_reserv; + if (!ub_sock_makewreserv_locked(sk, res, size)) { + /* + * It looks a bit hackish, but it is compatible with both + * wait_for_xx_ubspace and poll. + * This __set_current_state is equivalent to a wakeup event + * right after spin_unlock_irqrestore. 
+ */ + __set_current_state(TASK_RUNNING); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (added_reserv) + charge_beancounter_notop(skbc->ub, res, added_reserv); + return; + } + + ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); + skbc->ub_waitspc = size; + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "re-adding socket to beancounter %p.\n", ub); + goto out; + } + + switch (res) { + case UB_TCPSNDBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_tcp_sk_list); + break; + case UB_OTHERSOCKBUF: + list_add_tail(&skbc->ub_sock_list, + &ub->ub_other_sk_list); + break; + default: + BUG(); + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +EXPORT_SYMBOL(ub_sock_snd_queue_add); + +long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + set_current_state(TASK_INTERRUPTIBLE); + if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) + break; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); + timeo = schedule_timeout(timeo); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return timeo; +} + +void ub_sock_sndqueuedel(struct sock *sk) +{ + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long flags; + + if (!sock_has_ubc(sk)) + return; + skbc = sock_bc(sk); + + /* race with write_space callback of other socket */ + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + list_del_init(&skbc->ub_sock_list); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +/* + * Helpers + */ + +static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + skb_bc(skb)->ub = sock_bc(sk)->ub; + skb_bc(skb)->charged = size; + skb_bc(skb)->resource = resource; +} + +void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, + unsigned long size, int resource) +{ + if (!sock_has_ubc(sk)) + return; + + if (sock_bc(sk)->ub == NULL) + BUG(); + + __ub_skb_set_charge(skb, sk, size, resource); + + /* Ugly. Ugly. Skb in sk writequeue can live without ref to sk */ + if (skb->sk == NULL) + skb->sk = sk; +} + +EXPORT_SYMBOL(ub_skb_set_charge); + +static inline void ub_skb_set_uncharge(struct sk_buff *skb) +{ + skb_bc(skb)->ub = NULL; + skb_bc(skb)->charged = 0; + skb_bc(skb)->resource = 0; +} + +static void ub_update_rmem_thres(struct sock_beancounter *skub) +{ + struct user_beancounter *ub; + + if (skub && skub->ub) { + ub = top_beancounter(skub->ub); + ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / + (ub->ub_parms[UB_NUMTCPSOCK].held + 1); + } +} + +static inline void ub_sock_wcharge_dec(struct sock *sk, + unsigned long chargesize) +{ + /* The check sk->sk_family != PF_NETLINK is made as the skb is + * queued to the kernel end of socket while changed to the user one. 
+ * Den */ + if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) { + if (sock_bc(sk)->ub_wcharged > chargesize) + sock_bc(sk)->ub_wcharged -= chargesize; + else + sock_bc(sk)->ub_wcharged = 0; + } +} + +/* + * Charge socket number + */ + +static inline void sk_alloc_beancounter(struct sock *sk) +{ + struct sock_beancounter *skbc; + + skbc = sock_bc(sk); + memset(skbc, 0, sizeof(struct sock_beancounter)); +} + +static inline void sk_free_beancounter(struct sock *sk) +{ +} + +static int __sock_charge(struct sock *sk, int res) +{ + struct sock_beancounter *skbc; + struct user_beancounter *cub, *ub; + unsigned long added_reserv, added_forw; + unsigned long flags; + + cub = get_exec_ub(); + if (unlikely(cub == NULL)) + return 0; + + sk_alloc_beancounter(sk); + skbc = sock_bc(sk); + INIT_LIST_HEAD(&skbc->ub_sock_list); + + ub = top_beancounter(cub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0)) + goto out_limit; + + added_reserv = 0; + added_forw = 0; + if (res == UB_NUMTCPSOCK) { + added_reserv = skb_charge_size(MAX_TCP_HEADER + + 1500 - sizeof(struct iphdr) - + sizeof(struct tcphdr)); + added_reserv *= 4; + ub->ub_parms[UB_TCPSNDBUF].held += added_reserv; + if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) { + ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv; + added_reserv = 0; + } + skbc->poll_reserv = added_reserv; + + added_forw = SK_STREAM_MEM_QUANTUM * 4; + ub->ub_parms[UB_TCPRCVBUF].held += added_forw; + if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + ub->ub_parms[UB_TCPRCVBUF].held -= added_forw; + added_forw = 0; + } + skbc->forw_space = added_forw; + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + charge_beancounter_notop(cub, res, 1); + if (added_reserv) + charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv); + if (added_forw) + charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw); + + skbc->ub = get_beancounter(cub); + return 0; + +out_limit: + spin_unlock_irqrestore(&ub->ub_lock, flags); + sk_free_beancounter(sk); + return -ENOMEM; +} + +int ub_tcp_sock_charge(struct sock *sk) +{ + int ret; + + ret = __sock_charge(sk, UB_NUMTCPSOCK); + ub_update_rmem_thres(sock_bc(sk)); + + return ret; +} + +int ub_other_sock_charge(struct sock *sk) +{ + return __sock_charge(sk, UB_NUMOTHERSOCK); +} + +EXPORT_SYMBOL(ub_other_sock_charge); + +int ub_sock_charge(struct sock *sk, int family, int type) +{ + return (IS_TCP_SOCK(family, type) ? + ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); +} + +EXPORT_SYMBOL(ub_sock_charge); + +/* + * Uncharge socket number + */ + +void ub_sock_uncharge(struct sock *sk) +{ + int is_tcp_sock; + unsigned long flags; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long reserv, forw; + + if (unlikely(!sock_has_ubc(sk))) + return; + + is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); + skbc = sock_bc(sk); + ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); + + ub = top_beancounter(skbc->ub); + + spin_lock_irqsave(&ub->ub_lock, flags); + if (!list_empty(&skbc->ub_sock_list)) { + ub_debug(UBD_NET_SOCKET, + "ub_sock_uncharge: removing from ub(%p) queue.\n", + skbc); + list_del_init(&skbc->ub_sock_list); + } + + reserv = skbc->poll_reserv; + forw = skbc->forw_space; + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + __uncharge_beancounter_locked(ub, + (is_tcp_sock ? 
UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + ub_sock_wcharge_dec(sk, reserv); + if (unlikely(skbc->ub_wcharged)) + printk(KERN_WARNING + "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", + skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); + skbc->poll_reserv = 0; + skbc->forw_space = 0; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), + reserv); + if (forw) + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF), + forw); + uncharge_beancounter_notop(skbc->ub, + (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); + + put_beancounter(skbc->ub); + sk_free_beancounter(sk); +} + +/* + * Special case for netlink_dump - (un)charges precalculated size + */ + +int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) +{ + int ret; + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + chargesize = skb_charge_fullsize(skb); + ret = charge_beancounter(sock_bc(sk)->ub, + UB_DGRAMRCVBUF, chargesize, UB_HARD); + if (ret < 0) + return ret; + ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); + return ret; +} + +/* + * Poll reserve accounting + * + * This is the core of socket buffer management (along with queueing/wakeup + * functions. The rest of buffer accounting either call these functions, or + * repeat parts of their logic for some simpler cases. + */ + +static int ub_sock_makewreserv_locked(struct sock *sk, + int bufid, unsigned long size) +{ + unsigned long wcharge_added; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + skbc = sock_bc(sk); + if (skbc->poll_reserv >= size) /* no work to be done */ + goto out; + + ub = top_beancounter(skbc->ub); + ub->ub_parms[bufid].held += size - skbc->poll_reserv; + + wcharge_added = 0; + /* + * Logic: + * 1) when used memory hits barrier, we set wmem_pressure; + * wmem_pressure is reset under barrier/2; + * between barrier/2 and barrier we limit per-socket buffer growth; + * 2) each socket is guaranteed to get (limit-barrier)/maxsockets + * calculated on the base of memory eaten after the barrier is hit + */ + skbc = sock_bc(sk); +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub_hfbarrier_hit(ub, bufid)) { + if (ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 0; + } +#endif + if (ub_barrier_hit(ub, bufid)) { +#if UB_SOCK_MAINTAIN_WMEMPRESSURE + if (!ub->ub_wmem_pressure) + ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, + ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_wmem_pressure = 1; +#endif + if (sk->sk_family == PF_NETLINK) + goto unroll; + wcharge_added = size - skbc->poll_reserv; + skbc->ub_wcharged += wcharge_added; + if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit + + ub->ub_parms[bufid].barrier > + ub->ub_parms[bufid].limit) + goto unroll_wch; + } + if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) + goto unroll; + + ub_adjust_maxheld(ub, bufid); + skbc->poll_reserv = size; +out: + return 0; + +unroll_wch: + skbc->ub_wcharged -= wcharge_added; +unroll: + ub_debug(UBD_NET_SEND, + "makewres: deny " + "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", + sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, + skbc->ub_wcharged, sk->sk_sndbuf); + ub->ub_parms[bufid].failcnt++; + 
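/*
 * A small illustrative helper (not in the patch): between the barrier and
 * the limit, ub_sock_makewreserv_locked() above lets each socket keep
 * roughly (limit - barrier) / max_sockets bytes of additionally charged
 * space.  The in-kernel test "ub_wcharged * numsock_limit + barrier > limit"
 * expresses that without a division; the hypothetical helper below spells
 * out the same comparison.
 */
static int toy_over_fair_share(unsigned long wcharged, unsigned long barrier,
			       unsigned long limit, unsigned long max_sockets)
{
	/*
	 * equivalent (up to integer rounding) to:
	 *	wcharged > (limit - barrier) / max_sockets
	 */
	return wcharged * max_sockets + barrier > limit;
}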
ub->ub_parms[bufid].held -= size - skbc->poll_reserv; + + if (sk->sk_socket != NULL) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + return -ENOMEM; +} + +int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + skbc = sock_bc(sk); + + /* + * This function provides that there is sufficient reserve upon return + * only if sk has only one user. We can check poll_reserv without + * serialization and avoid locking if the reserve already exists. + */ + if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size)) + return 0; + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, bufid, size); + added_reserv += skbc->poll_reserv; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, bufid, added_reserv); + + return err; +} + +EXPORT_SYMBOL(ub_sock_make_wreserv); + +int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) +{ + struct sock_beancounter *skbc; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* optimize for the case if socket has sufficient reserve */ + ub_sock_make_wreserv(sk, bufid, size); + skbc = sock_bc(sk); + if (likely(skbc->poll_reserv >= size)) { + skbc->poll_reserv -= size; + return 0; + } + return -ENOMEM; +} + +EXPORT_SYMBOL(ub_sock_get_wreserv); + +static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long extra; + unsigned long flags; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + + extra = 0; + spin_lock_irqsave(&ub->ub_lock, flags); + skbc->poll_reserv += size; + if (skbc->poll_reserv > ressize) { + extra = skbc->poll_reserv - ressize; + ub_sock_wcharge_dec(sk, extra); + skbc->poll_reserv = ressize; + + __uncharge_beancounter_locked(ub, bufid, extra); + if (bufid == UB_TCPSNDBUF) + ub_tcp_snd_wakeup(ub); + else + ub_sock_snd_wakeup(ub); + } + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (extra) + uncharge_beancounter_notop(skbc->ub, bufid, extra); +} + +void ub_sock_ret_wreserv(struct sock *sk, int bufid, + unsigned long size, unsigned long ressize) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + + if (unlikely(!sock_has_ubc(sk))) + return; + + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + /* check if the reserve can be kept */ + if (ub_barrier_farsz(ub, bufid)) { + skbc->poll_reserv += size; + return; + } + ub_sock_do_ret_wreserv(sk, bufid, size, ressize); +} + +/* + * UB_DGRAMRCVBUF + */ + +int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) +{ + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + chargesize = skb_charge_fullsize(skb); + if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, + chargesize, UB_HARD)) + return -ENOMEM; + + ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); + return 0; +} + +EXPORT_SYMBOL(ub_sockrcvbuf_charge); + +static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) +{ + uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + +/* + * UB_TCPRCVBUF + */ + +int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int retval; + unsigned 
long flags; + struct user_beancounter *ub; + struct sock_beancounter *skbc; + unsigned long chargesize; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + skbc = sock_bc(sk); + + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->forw_space >= chargesize)) { + skbc->forw_space -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + return 0; + } + + /* + * Memory pressure reactions: + * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) + * 2) set UB_RMEM_SHRINK and tcp_clamp_window() + * tcp_collapse_queues() if rmem_alloc > rcvbuf + * 3) drop OFO, tcp_purge_ofo() + * 4) drop all. + * Currently, we do #2 and #3 at once (which means that current + * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, + * for example...) + * On memory pressure we jump from #0 to #3, and when the pressure + * subsides, to #1. + */ + retval = 0; + ub = top_beancounter(sock_bc(sk)->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[UB_TCPRCVBUF].held += chargesize; + if (ub->ub_parms[UB_TCPRCVBUF].held > + ub->ub_parms[UB_TCPRCVBUF].barrier && + strict != UB_FORCE) + goto excess; + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + +out: + if (retval == 0) { + charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, + chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); + } + return retval; + +excess: + ub->ub_rmem_pressure = UB_RMEM_SHRINK; + if (strict == UB_HARD) + retval = -ENOMEM; + if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) + retval = -ENOMEM; + /* + * We try to leave numsock*maxadvmss as a reserve for sockets not + * queueing any data yet (if the difference between the barrier and the + * limit is enough for this reserve). + */ + if (ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss + > ub->ub_parms[UB_TCPRCVBUF].limit && + atomic_read(&sk->sk_rmem_alloc)) + retval = -ENOMEM; + if (retval) { + ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; + ub->ub_parms[UB_TCPRCVBUF].failcnt++; + } + ub_adjust_maxheld(ub, UB_TCPRCVBUF); + spin_unlock_irqrestore(&ub->ub_lock, flags); + goto out; +} +EXPORT_SYMBOL(ub_sock_tcp_chargerecv); + +static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + unsigned long held, bar; + int prev_pres; + struct user_beancounter *ub; + + ub = top_beancounter(skb_bc(skb)->ub); + if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) { + sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged; + ub_skb_set_uncharge(skb); + return; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { + printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", + skb_bc(skb)->charged, + ub, ub->ub_parms[UB_TCPRCVBUF].held); + /* ass-saving bung */ + skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; + } + ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; + held = ub->ub_parms[UB_TCPRCVBUF].held; + bar = ub->ub_parms[UB_TCPRCVBUF].barrier; + prev_pres = ub->ub_rmem_pressure; + if (held <= bar - (bar >> 2)) + ub->ub_rmem_pressure = UB_RMEM_EXPAND; + else if (held <= bar) + ub->ub_rmem_pressure = UB_RMEM_KEEP; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, + skb_bc(skb)->charged); + ub_skb_set_uncharge(skb); +} + + +/* + * UB_OTHERSOCKBUF and UB_TCPSNDBUF + */ + +static void ub_socksndbuf_uncharge(struct sk_buff *skb) +{ + unsigned long flags; + struct user_beancounter *ub, *cub; + unsigned long 
chargesize; + + cub = skb_bc(skb)->ub; + ub = top_beancounter(cub); + chargesize = skb_bc(skb)->charged; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize); + if (skb->sk != NULL && sock_has_ubc(skb->sk)) + ub_sock_wcharge_dec(skb->sk, chargesize); + ub_sock_snd_wakeup(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + + uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize); + ub_skb_set_uncharge(skb); +} + +/* expected to be called under socket lock */ +static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) +{ + /* + * ub_sock_ret_wreserv call is abused here, we just want to uncharge + * skb size. However, to reduce duplication of the code doing + * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call + * a function that already does all of this. 2006/04/27 SAW + */ + ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged, + sock_bc(skb->sk)->poll_reserv); + ub_skb_set_uncharge(skb); +} + +void ub_skb_uncharge(struct sk_buff *skb) +{ + switch (skb_bc(skb)->resource) { + case UB_TCPSNDBUF: + ub_tcpsndbuf_uncharge(skb); + break; + case UB_TCPRCVBUF: + ub_tcprcvbuf_uncharge(skb); + break; + case UB_DGRAMRCVBUF: + ub_sockrcvbuf_uncharge(skb); + break; + case UB_OTHERSOCKBUF: + ub_socksndbuf_uncharge(skb); + break; + } +} + +EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ + +/* + * Other sock reserve managment + */ + +int ub_sock_getwres_other(struct sock *sk, unsigned long size) +{ + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + unsigned long added_reserv; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + /* + * Nothing except beancounter lock protects skbc->poll_reserv. + * So, take the lock and do the job. + * Dances with added_reserv repeat ub_sock_make_wreserv. + */ + skbc = sock_bc(sk); + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + added_reserv = -skbc->poll_reserv; + err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size); + added_reserv += skbc->poll_reserv; + if (!err) + skbc->poll_reserv -= size; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + if (added_reserv) + charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv); + + return err; +} +EXPORT_SYMBOL(ub_sock_getwres_other); + +void ub_sock_retwres_other(struct sock *sk, + unsigned long size, unsigned long ressize) +{ + if (unlikely(!sock_has_ubc(sk))) + return; + + ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); +} + +/* + * TCP send buffers accouting. Paged part + */ + +int ub_sock_tcp_chargepage(struct sock *sk) +{ + struct sock_beancounter *skbc; + unsigned long extra; + int err; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE); + if (likely(skbc->poll_reserv >= PAGE_SIZE)) { + skbc->poll_reserv -= PAGE_SIZE; + return 0; + } + + /* + * Ok, full page is not available. + * However, this function must succeed if poll previously indicated + * that write is possible. We better make a forced charge here + * than reserve a whole page in poll. 
+ */ + err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE); + if (unlikely(err < 0)) + goto out; + if (skbc->poll_reserv < PAGE_SIZE) { + extra = PAGE_SIZE - skbc->poll_reserv; + err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra, + UB_FORCE); + if (err < 0) + goto out; + skbc->poll_reserv += extra; + } + skbc->poll_reserv -= PAGE_SIZE; + return 0; + +out: + return err; +} + +void ub_sock_tcp_detachpage(struct sock *sk) +{ + struct sk_buff *skb; + + if (unlikely(!sock_has_ubc(sk))) + return; + + /* The page is just detached from socket. The last skb in queue + with paged part holds referrence to it */ + skb = skb_peek_tail(&sk->sk_write_queue); + if (skb == NULL) { + /* If the queue is empty - all data is sent and page is about + to be freed */ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE, + sock_bc(sk)->poll_reserv); + } else { + /* Last skb is a good aproximation for a last skb with + paged part */ + skb_bc(skb)->charged += PAGE_SIZE; + } +} + +/* + * TCPSNDBUF charge functions below are called in the following cases: + * - sending of SYN, SYN-ACK, FIN, the latter charge is forced by + * some technical reasons in TCP code; + * - fragmentation of TCP packets. + * These functions are allowed but not required to use poll_reserv. + * Originally, these functions didn't do that, since it didn't make + * any sense. Now, since poll_reserv now has a function of general reserve, + * they use it. + */ +int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb, + enum ub_severity strict) +{ + int ret; + unsigned long chargesize; + struct sock_beancounter *skbc; + struct user_beancounter *ub; + unsigned long flags; + + if (unlikely(!sock_has_ubc(sk))) + return 0; + + skbc = sock_bc(sk); + chargesize = skb_charge_fullsize(skb); + if (likely(skbc->poll_reserv >= chargesize)) { + skbc->poll_reserv -= chargesize; + __ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + /* XXX hack, see ub_skb_set_charge */ + skb->sk = sk; + return 0; + } + + ub = top_beancounter(skbc->ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF, + chargesize, strict); + /* + * Note: this check is not equivalent of the corresponding check + * in makewreserv. It's similar in spirit, but an equivalent check + * would be too long and complicated here. 
+ */ + if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF)) + skbc->ub_wcharged += chargesize; + spin_unlock_irqrestore(&ub->ub_lock, flags); + if (likely(!ret)) { + charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize); + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + } + return ret; +} +EXPORT_SYMBOL(ub_sock_tcp_chargesend); + +void ub_sock_tcp_unchargesend(struct sock *sk, unsigned long size) +{ + if (unlikely(!sock_has_ubc(sk))) + return; + /* see ub_tcpsndbuf_uncharge */ + ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, sock_bc(sk)->poll_reserv); +} + +/* + * Initialization + */ + +int __init skbc_cache_init(void) +{ + return 0; +} diff -uprN linux-2.6.24/kernel/bc/oom_kill.c linux-2.6.24.ovz/kernel/bc/oom_kill.c --- linux-2.6.24/kernel/bc/oom_kill.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/oom_kill.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,200 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define UB_OOM_TIMEOUT (5 * HZ) + +int oom_generation; +int oom_kill_counter; +static DEFINE_SPINLOCK(oom_lock); +static DECLARE_WAIT_QUEUE_HEAD(oom_wq); + +static inline int ub_oom_completed(struct task_struct *tsk) +{ + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + /* we were oom killed - just die */ + return 1; + if (tsk->task_bc.oom_generation != oom_generation) + /* some task was succesfully killed */ + return 1; + return 0; +} + +static void ub_clear_oom(void) +{ + struct user_beancounter *ub; + + rcu_read_lock(); + for_each_beancounter(ub) + ub->ub_oom_noproc = 0; + rcu_read_unlock(); +} + +/* Called with cpuset_lock held */ +int ub_oom_lock(void) +{ + int timeout; + DEFINE_WAIT(oom_w); + struct task_struct *tsk; + + tsk = current; + + spin_lock(&oom_lock); + if (!oom_kill_counter) + goto out_do_oom; + + timeout = UB_OOM_TIMEOUT; + while (1) { + if (ub_oom_completed(tsk)) { + spin_unlock(&oom_lock); + return -EINVAL; + } + + if (timeout == 0) + break; + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&oom_wq, &oom_w); + spin_unlock(&oom_lock); + cpuset_unlock(); + + timeout = schedule_timeout(timeout); + + cpuset_lock(); + spin_lock(&oom_lock); + remove_wait_queue(&oom_wq, &oom_w); + } + +out_do_oom: + ub_clear_oom(); + return 0; +} + +static inline long ub_current_overdraft(struct user_beancounter *ub) +{ + return ub->ub_parms[UB_OOMGUARPAGES].held + + ((ub->ub_parms[UB_KMEMSIZE].held + + ub->ub_parms[UB_TCPSNDBUF].held + + ub->ub_parms[UB_TCPRCVBUF].held + + ub->ub_parms[UB_OTHERSOCKBUF].held + + ub->ub_parms[UB_DGRAMRCVBUF].held) + >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier; +} + +int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk) +{ + struct user_beancounter *mm_ub; + + if (ub == NULL) + return 0; + + task_lock(tsk); + if (tsk->mm == NULL) + mm_ub = NULL; + else + mm_ub = tsk->mm->mm_ub; + + while (mm_ub != NULL && mm_ub != ub) + mm_ub = mm_ub->parent; + task_unlock(tsk); + + return mm_ub != ub; +} + +struct user_beancounter *ub_oom_select_worst(void) +{ + struct user_beancounter *ub, *walkp; + long ub_maxover; + + ub_maxover = 0; + ub = NULL; + + rcu_read_lock(); + for_each_beancounter (walkp) { + long ub_overdraft; + + if (walkp->parent != NULL) + continue; + if (walkp->ub_oom_noproc) + continue; + + ub_overdraft = ub_current_overdraft(walkp); + if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) { + put_beancounter(ub); + ub = walkp; + ub_maxover = ub_overdraft; + } + } + + if (ub) + ub->ub_oom_noproc = 1; + rcu_read_unlock(); + + return 
ub; +} + +void ub_oom_mm_killed(struct user_beancounter *ub) +{ + static struct ub_rate_info ri = { 5, 60*HZ }; + + /* increment is serialized with oom_lock */ + ub->ub_parms[UB_OOMGUARPAGES].failcnt++; + + if (ub_ratelimit(&ri)) + show_mem(); +} + +void ub_oom_unlock(void) +{ + spin_unlock(&oom_lock); +} + +void ub_oom_task_dead(struct task_struct *tsk) +{ + spin_lock(&oom_lock); + oom_kill_counter = 0; + oom_generation++; + + printk("OOM killed process %s (pid=%d, ve=%d) exited, " + "free=%lu gen=%d.\n", + tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env), + nr_free_pages(), oom_generation); + /* if there is time to sleep in ub_oom_lock -> sleep will continue */ + wake_up_all(&oom_wq); + spin_unlock(&oom_lock); +} + +void ub_out_of_memory(struct user_beancounter *scope) +{ + struct user_beancounter *ub; + struct task_struct *p; + + cpuset_lock(); + spin_lock(&oom_lock); + ub_clear_oom(); + ub = get_beancounter(scope); + + read_lock(&tasklist_lock); +retry: + p = oom_select_bad_process(ub); + if (p == NULL || PTR_ERR(p) == -1UL) + goto unlock; + + if (oom_kill_process(p, (gfp_t)-1, -1, "UB Out of memory")) + goto retry; + + put_beancounter(ub); + +unlock: + read_unlock(&tasklist_lock); + spin_unlock(&oom_lock); + cpuset_unlock(); +} +EXPORT_SYMBOL(ub_out_of_memory); diff -uprN linux-2.6.24/kernel/bc/proc.c linux-2.6.24.ovz/kernel/bc/proc.c --- linux-2.6.24/kernel/bc/proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/proc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,680 @@ +/* + * kernel/bc/proc.c + * + * Copyright (C) 2006 OpenVZ. SWsoft Inc. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Generic output formats */ +#if BITS_PER_LONG == 32 +const char *bc_proc_lu_fmt = "\t%-20s %10lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n"; +#else +const char *bc_proc_lu_fmt = "\t%-20s %21lu\n"; +const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; +const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; +const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n"; +#endif + +#if BITS_PER_LONG == 32 +static const char *head_fmt = "%10s %-12s %10s %10s %10s %10s %10s\n"; +static const char *res_fmt = "%10s %-12s %10lu %10lu %10lu %10lu %10lu\n"; +#else +static const char *head_fmt = "%10s %-12s %20s %20s %20s %20s %20s\n"; +static const char *res_fmt = "%10s %-12s %20lu %20lu %20lu %20lu %20lu\n"; +#endif + +static void ub_show_res(struct seq_file *f, struct user_beancounter *ub, + int r, int show_uid) +{ + int len; + char ub_uid[64]; + + if (show_uid && r == 0) { + len = print_ub_uid(ub, ub_uid, sizeof(ub_uid) - 2); + ub_uid[len] = ':'; + ub_uid[len + 1] = '\0'; + } else + strcpy(ub_uid, ""); + + seq_printf(f, res_fmt, ub_uid, ub_rnames[r], + ub->ub_parms[r].held, + ub->ub_parms[r].maxheld, + ub->ub_parms[r].barrier, + ub->ub_parms[r].limit, + ub->ub_parms[r].failcnt); +} + +static void __show_resources(struct seq_file *f, struct user_beancounter *ub, + int show_uid) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + if (strcmp(ub_rnames[i], "dummy") != 0) + ub_show_res(f, ub, i, show_uid); + + for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++) + ub_show_res(f, ub, i, show_uid); +} + +static int bc_resources_show(struct seq_file *f, void *v) +{ + __show_resources(f, seq_beancounter(f), 0); + return 0; +} + +static struct bc_proc_entry bc_resources_entry = { + .name = "resources", + 
.u.show = bc_resources_show, +}; + +static int bc_debug_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + char buf[64]; + + ub = seq_beancounter(f); + print_ub_uid(ub, buf, sizeof(buf)); + seq_printf(f, "uid: %s\n", buf); + seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount)); + + seq_printf(f, "bc: %p\n", ub); + seq_printf(f, "par: %p\n", ub->parent); + seq_printf(f, "priv: %p\n", ub->private_data); + return 0; +} + +static struct bc_proc_entry bc_debug_entry = { + .name = "debug", + .u.show = bc_debug_show, +}; + +static int ub_show(struct seq_file *f, void *v) +{ + int i; + + for (i = 0; i < UB_RESOURCES_COMPAT; i++) + ub_show_res(f, (struct user_beancounter *)v, i, 1); + return 0; +} + +static int res_show(struct seq_file *f, void *v) +{ + __show_resources(f, (struct user_beancounter *)v, 1); + return 0; +} + +static int ub_accessible(struct user_beancounter *exec, + struct user_beancounter *target) +{ + struct user_beancounter *p, *q; + + p = top_beancounter(exec); + q = top_beancounter(target); + + return (p == get_ub0() || p == q); +} + +static void ub_show_header(struct seq_file *f) +{ + seq_printf(f, "Version: 2.5\n"); + seq_printf(f, head_fmt, "uid", "resource", + "held", "maxheld", "barrier", "limit", "failcnt"); +} + +static void *ub_start(struct seq_file *f, loff_t *ppos) +{ + struct user_beancounter *ub; + struct user_beancounter *exec_ub; + unsigned long pos; + + pos = *ppos; + if (pos == 0) + ub_show_header(f); + + exec_ub = get_exec_ub(); + + rcu_read_lock(); + for_each_beancounter(ub) { + if (ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + if (pos-- == 0) + return ub; + } + return NULL; +} + +static void *ub_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct user_beancounter *ub; + struct list_head *entry; + struct user_beancounter *exec_ub; + + exec_ub = get_exec_ub(); + ub = (struct user_beancounter *)v; + + entry = &ub->ub_list; + + list_for_each_continue_rcu(entry, &ub_list_head) { + ub = list_entry(entry, struct user_beancounter, ub_list); + if (ub->parent != NULL) + continue; + if (!ub_accessible(exec_ub, ub)) + continue; + + (*ppos)++; + return ub; + } + return NULL; +} + +static void ub_stop(struct seq_file *f, void *v) +{ + rcu_read_unlock(); +} + +static struct seq_operations ub_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = ub_show, +}; + +static int ub_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &ub_seq_ops); +} + +static struct file_operations ub_file_operations = { + .open = ub_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct seq_operations res_seq_ops = { + .start = ub_start, + .next = ub_next, + .stop = ub_stop, + .show = res_show, +}; + +static int res_open(struct inode *inode, struct file *filp) +{ + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EACCES; + + return seq_open(filp, &res_seq_ops); +} + +static struct file_operations resources_operations = { + .open = res_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct bc_proc_entry bc_all_resources_entry = { + .name = "resources", + .u.fops = &resources_operations, +}; + +/* + * Generic showing stuff + */ + +static int cookies, num_entries; +static struct bc_proc_entry *bc_entries __read_mostly; +static struct bc_proc_entry *bc_root_entries __read_mostly; +static 
DEFINE_SPINLOCK(bc_entries_lock); +static struct proc_dir_entry *bc_proc_root; + +void bc_register_proc_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_entries; + bc_entries = e; + num_entries++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_entry); + +void bc_register_proc_root_entry(struct bc_proc_entry *e) +{ + spin_lock(&bc_entries_lock); + e->cookie = ++cookies; + e->next = bc_root_entries; + bc_root_entries = e; + bc_proc_root->nlink++; + spin_unlock(&bc_entries_lock); +} + +EXPORT_SYMBOL(bc_register_proc_root_entry); + +/* + * small helpers + */ + +static inline unsigned long bc_make_ino(struct user_beancounter *ub) +{ + unsigned long ret; + + ret = 0xbc000000; + if (ub->parent) + ret |= ((ub->parent->ub_uid) << 4); + ret |= (ub->ub_uid + 1); + return ret; +} + +static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de) +{ + return 0xbe000000 + de->cookie; +} + +static int bc_d_delete(struct dentry *d) +{ + return 1; +} + +static void bc_d_release(struct dentry *d) +{ + put_beancounter((struct user_beancounter *)d->d_fsdata); +} + +static struct inode_operations bc_entry_iops; +static struct file_operations bc_entry_fops; +static struct dentry_operations bc_dentry_ops = { + .d_delete = bc_d_delete, + .d_release = bc_d_release, +}; + +/* + * common directory operations' helpers + */ + +static int bc_readdir(struct file *file, filldir_t filler, void *data, + struct user_beancounter *parent) +{ + int err = 0; + loff_t pos, filled; + struct user_beancounter *ub, *prev; + struct bc_proc_entry *pde; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + pos = file->f_pos; + if (pos == 0) { + err = (*filler)(data, ".", 1, pos, + file->f_dentry->d_inode->i_ino, DT_DIR); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + if (pos == 1) { + err = (*filler)(data, "..", 2, pos, + parent_ino(file->f_dentry), DT_DIR); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + filled = 2; + for (pde = (parent == NULL ? 
bc_root_entries : bc_entries); + pde != NULL; pde = pde->next) { + if (filled++ < pos) + continue; + + err = (*filler)(data, pde->name, strlen(pde->name), pos, + bc_make_file_ino(pde), DT_REG); + if (err < 0) { + err = 0; + goto out; + } + pos++; + } + + rcu_read_lock(); + prev = NULL; + ub = list_entry(&ub_list_head, struct user_beancounter, ub_list); + while (1) { + int len; + unsigned long ino; + char buf[64]; + + ub = list_entry(rcu_dereference(ub->ub_list.next), + struct user_beancounter, ub_list); + if (&ub->ub_list == &ub_list_head) + break; + + if (ub->parent != parent) + continue; + + if (filled++ < pos) + continue; + + if (!get_beancounter_rcu(ub)) + continue; + + rcu_read_unlock(); + put_beancounter(prev); + + len = print_ub_uid(ub, buf, sizeof(buf)); + ino = bc_make_ino(ub); + + err = (*filler)(data, buf, len, pos, ino, DT_DIR); + if (err < 0) { + err = 0; + put_beancounter(ub); + goto out; + } + + rcu_read_lock(); + prev = ub; + pos++; + } + rcu_read_unlock(); + put_beancounter(prev); +out: + file->f_pos = pos; + return err; +} + +static int bc_looktest(struct inode *ino, void *data) +{ + return ino->i_op == &bc_entry_iops && ino->i_private == data; +} + +static int bc_lookset(struct inode *ino, void *data) +{ + struct user_beancounter *ub; + + ub = (struct user_beancounter *)data; + ino->i_private = data; + ino->i_ino = bc_make_ino(ub); + ino->i_fop = &bc_entry_fops; + ino->i_op = &bc_entry_iops; + ino->i_mode = S_IFDIR | S_IRUSR | S_IXUGO; + /* subbeancounters are not included, but who cares? */ + ino->i_nlink = num_entries + 2; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir, + struct dentry *dentry) +{ + struct inode *ino; + + ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub); + if (ino == NULL) + goto out_put; + + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + dentry->d_fsdata = ub; + d_add(dentry, ino); + return NULL; + +out_put: + put_beancounter(ub); + return ERR_PTR(-ENOENT); +} + +/* + * files (bc_proc_entry) manipulations + */ + +static struct dentry *bc_lookup_file(struct inode *dir, + struct dentry *dentry, struct bc_proc_entry *root, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *)) +{ + struct bc_proc_entry *pde; + struct inode *ino; + + for (pde = root; pde != NULL; pde = pde->next) + if (strcmp(pde->name, dentry->d_name.name) == 0) + break; + + if (pde == NULL) + return ERR_PTR(-ESRCH); + + ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde); + if (ino == NULL) + return ERR_PTR(-ENOENT); + + unlock_new_inode(ino); + dentry->d_op = &bc_dentry_ops; + d_add(dentry, ino); + return NULL; +} + +static int bc_file_open(struct inode *ino, struct file *filp) +{ + struct bc_proc_entry *de; + struct user_beancounter *ub; + + de = (struct bc_proc_entry *)ino->i_private; + ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata; + BUG_ON(ub->ub_magic != UB_MAGIC); + + /* + * ub can't disappear: we hold d_parent, he holds the beancounter + */ + return single_open(filp, de->u.show, ub); +} + +static struct file_operations bc_file_ops = { + .open = bc_file_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int bc_looktest_entry(struct inode *ino, void *data) +{ + return ino->i_fop == &bc_file_ops && ino->i_private == data; +} + +static int bc_lookset_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->i_private = 
data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = &bc_file_ops, + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_entries, + bc_looktest_entry, bc_lookset_entry); +} + +static int bc_looktest_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + return ino->i_fop == de->u.fops && ino->i_private == data; +} + +static int bc_lookset_root_entry(struct inode *ino, void *data) +{ + struct bc_proc_entry *de; + + de = (struct bc_proc_entry *)data; + ino->i_private = data; + ino->i_ino = bc_make_file_ino(de); + ino->i_fop = de->u.fops; + ino->i_mode = S_IFREG | S_IRUSR; + ino->i_nlink = 1; + ino->i_gid = 0; + ino->i_uid = 0; + return 0; +} + +static inline struct dentry *bc_lookup_root_files(struct inode *dir, + struct dentry *de) +{ + return bc_lookup_file(dir, de, bc_root_entries, + bc_looktest_root_entry, bc_lookset_root_entry); +} + +/* + * /proc/bc/.../ directory operations + */ + +static int bc_entry_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, + (struct user_beancounter *)file->f_dentry->d_fsdata); +} + +static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *par, *ub; + struct dentry *de; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '.') + return ERR_PTR(-ENOENT); + + par = (struct user_beancounter *)dir->i_private; + if (par->ub_uid != id) + return ERR_PTR(-ENOENT); + + id = simple_strtol(end + 1, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_subbeancounter_byid(par, id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static struct file_operations bc_entry_fops = { + .read = generic_read_dir, + .readdir = bc_entry_readdir, +}; + +static struct inode_operations bc_entry_iops = { + .lookup = bc_entry_lookup, +}; + +/* + * /proc/bc directory operations + */ + +static int bc_root_readdir(struct file *file, void *data, filldir_t filler) +{ + return bc_readdir(file, filler, data, NULL); +} + +static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int id; + char *end; + struct user_beancounter *ub; + struct dentry *de; + + if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) + return ERR_PTR(-EPERM); + + de = bc_lookup_root_files(dir, dentry); + if (de != ERR_PTR(-ESRCH)) + return de; + + id = simple_strtol(dentry->d_name.name, &end, 10); + if (*end != '\0') + return ERR_PTR(-ENOENT); + + ub = get_beancounter_byuid(id, 0); + if (ub == NULL) + return ERR_PTR(-ENOENT); + + return bc_lookup(ub, dir, dentry); +} + +static struct file_operations bc_root_fops = { + .read = generic_read_dir, + .readdir = bc_root_readdir, +}; + +static struct inode_operations bc_root_iops = { + .lookup = bc_root_lookup, +}; + +static int __init ub_init_proc(void) +{ + struct proc_dir_entry *entry; + + bc_proc_root = create_proc_entry("bc", + S_IFDIR | S_IRUGO | S_IXUGO, NULL); + if (bc_proc_root == NULL) + panic("Can't create /proc/bc entry"); + + bc_proc_root->proc_fops = &bc_root_fops; + 
bc_proc_root->proc_iops = &bc_root_iops; + + bc_register_proc_entry(&bc_resources_entry); +#ifdef CONFIG_UBC_DEBUG + bc_register_proc_entry(&bc_debug_entry); +#endif + bc_register_proc_root_entry(&bc_all_resources_entry); + + entry = create_proc_glob_entry("user_beancounters", S_IRUGO, NULL); + entry->proc_fops = &ub_file_operations; + return 0; +} + +core_initcall(ub_init_proc); diff -uprN linux-2.6.24/kernel/bc/rss_pages.c linux-2.6.24.ovz/kernel/bc/rss_pages.c --- linux-2.6.24/kernel/bc/rss_pages.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/rss_pages.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,437 @@ +/* + * kernel/bc/rss_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static struct kmem_cache *pb_cachep; +spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; +static struct page_beancounter **pb_hash_table; +static unsigned int pb_hash_mask; + +/* + * Auxiliary staff + */ + +static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.next, struct page_beancounter, + page_list); +} + +static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) +{ + return list_entry(p->page_list.prev, struct page_beancounter, + page_list); +} + +/* + * Held pages manipulation + */ +static inline void set_held_pages(struct user_beancounter *bc) +{ + /* all three depend on ub_held_pages */ + __ub_update_physpages(bc); + __ub_update_oomguarpages(bc); + __ub_update_privvm(bc); +} + +static inline void do_dec_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages -= value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void dec_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_dec_held_pages(ub, value); +} + +static inline void do_inc_held_pages(struct user_beancounter *ub, int value) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_held_pages += value; + set_held_pages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +static void inc_held_pages(struct user_beancounter *ub, int value) +{ + for (; ub != NULL; ub = ub->parent) + do_inc_held_pages(ub, value); +} + +/* + * Alloc - free + */ + +inline int pb_alloc(struct page_beancounter **pbc) +{ + *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL); + if (*pbc != NULL) { + (*pbc)->next_hash = NULL; + (*pbc)->pb_magic = PB_MAGIC; + } + return (*pbc == NULL); +} + +inline void pb_free(struct page_beancounter **pb) +{ + if (*pb != NULL) { + kmem_cache_free(pb_cachep, *pb); + *pb = NULL; + } +} + +void pb_free_list(struct page_beancounter **p_pb) +{ + struct page_beancounter *list, *pb; + + list = *p_pb; + if (list == PBC_COPY_SAME) + return; + + while (list) { + pb = list; + list = list->next_hash; + pb_free(&pb); + } + *p_pb = NULL; +} + +/* + * head -> -> -> ... + */ +static int __alloc_list(struct page_beancounter **head, int num) +{ + struct page_beancounter *pb; + + while (num > 0) { + if (pb_alloc(&pb)) + return -1; + pb->next_hash = *head; + *head = pb; + num--; + } + + return num; +} + +/* + * Ensure that the list contains at least num elements. + * p_pb points to an initialized list, may be of the zero length. 
+ * + * mm->page_table_lock should be held + */ +int pb_alloc_list(struct page_beancounter **p_pb, int num) +{ + struct page_beancounter *list; + + for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); + if (!num) + return 0; + + /* + * *p_pb(after) *p_pb (before) + * \ \ + * -...-> -> ... + */ + if (__alloc_list(p_pb, num) < 0) + goto nomem; + return 0; + +nomem: + pb_free_list(p_pb); + return -ENOMEM; +} + +/* + * Allocates a page_beancounter for each + * user_beancounter in a hash + */ +int pb_alloc_all(struct page_beancounter **pbs) +{ + int need_alloc; + struct user_beancounter *ub; + + need_alloc = 0; + rcu_read_lock(); + for_each_beancounter(ub) + need_alloc++; + rcu_read_unlock(); + + if (!__alloc_list(pbs, need_alloc)) + return 0; + + pb_free_list(pbs); + return -ENOMEM; +} + +/* + * Hash routines + */ + +static inline int pb_hash(struct user_beancounter *ub, struct page *page) +{ + return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask; +} + +/* pb_lock should be held */ +static inline void insert_pb(struct page_beancounter *p, struct page *page, + struct user_beancounter *ub, int hash) +{ + p->page = page; + p->ub = get_beancounter(ub); + p->next_hash = pb_hash_table[hash]; + pb_hash_table[hash] = p; + inc_pbc_count(ub); +} + +/* + * Heart + */ + +static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, + int hash) +{ + struct page_beancounter *p; + + for (p = pb_hash_table[hash]; + p != NULL && (p->page != page || p->ub != bc); + p = p->next_hash); + if (p == NULL) + return -1; + + PB_COUNT_INC(p->refcount); + return 0; +} + +static void __pb_add_ref(struct page *page, struct user_beancounter *bc, + struct page_beancounter **ppb, int hash) +{ + struct page_beancounter *head, *p, **hp; + int shift; + + p = *ppb; + *ppb = p->next_hash; + + insert_pb(p, page, bc, hash); + hp = page_pblist(page); + head = *hp; + + if (head != NULL) { + /* + * Move the first element to the end of the list. + * List head (pb_head) is set to the next entry. + * Note that this code works even if head is the only element + * on the list (because it's cyclic). + */ + BUG_ON(head->pb_magic != PB_MAGIC); + *hp = next_page_pb(head); + PB_SHIFT_INC(head->refcount); + shift = PB_SHIFT_GET(head->refcount); + /* + * Update user beancounter, the share of head has been changed. + * Note that the shift counter is taken after increment. 
+ */ + dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); + /* add the new page beancounter to the end of the list */ + head = *hp; + list_add_tail(&p->page_list, &head->page_list); + } else { + *hp = p; + shift = 0; + INIT_LIST_HEAD(&p->page_list); + } + + p->refcount = PB_REFCOUNT_MAKE(shift, 1); + /* update user beancounter for the new page beancounter */ + inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); +} + +void pb_add_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (__pb_dup_ref(page, bc, hash)) + __pb_add_ref(page, bc, p_pb, hash); + spin_unlock(&pb_lock); +} + +void pb_dup_ref(struct page *page, struct mm_struct *mm, + struct page_beancounter **p_pb) +{ + int hash; + struct user_beancounter *bc; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + if (*page_pblist(page) == NULL) + /* + * pages like ZERO_PAGE must not be accounted in pbc + * so on fork we just skip them + */ + goto out_unlock; + + if (unlikely(*p_pb != PBC_COPY_SAME)) + __pb_add_ref(page, bc, p_pb, hash); + else if (unlikely(__pb_dup_ref(page, bc, hash))) + WARN_ON(1); +out_unlock: + spin_unlock(&pb_lock); +} + +void pb_remove_ref(struct page *page, struct mm_struct *mm) +{ + int hash; + struct user_beancounter *bc; + struct page_beancounter *p, **q, *f; + int shift, shiftt; + + bc = mm->mm_ub; + if (bc == NULL) + return; + + if (!PageAnon(page) && is_shmem_mapping(page->mapping)) + return; + + hash = pb_hash(bc, page); + + spin_lock(&pb_lock); + for (q = pb_hash_table + hash, p = *q; + p != NULL && (p->page != page || p->ub != bc); + q = &p->next_hash, p = *q); + if (p == NULL) + goto out_unlock; + + PB_COUNT_DEC(p->refcount); + if (PB_COUNT_GET(p->refcount)) + /* + * More references from the same user beancounter exist. + * Nothing needs to be done. + */ + goto out_unlock; + + /* remove from the hash list */ + f = p; + *q = p->next_hash; + + shift = PB_SHIFT_GET(p->refcount); + + dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); + + q = page_pblist(page); + if (*q == p) { + if (list_empty(&p->page_list)) { + *q = NULL; + goto out_free; + } + + *q = next_page_pb(p); + } + list_del(&p->page_list); + + /* Now balance the list. Move the tail and adjust its shift counter. */ + p = prev_page_pb(*q); + shiftt = PB_SHIFT_GET(p->refcount); + *q = p; + PB_SHIFT_DEC(p->refcount); + + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + + /* + * If the shift counter of the moved beancounter is different from the + * removed one's, repeat the procedure for one more tail beancounter + */ + if (shiftt > shift) { + p = prev_page_pb(*q); + *q = p; + PB_SHIFT_DEC(p->refcount); + inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); + } +out_free: + dec_pbc_count(f->ub); + spin_unlock(&pb_lock); + + put_beancounter(f->ub); + pb_free(&f); + return; + +out_unlock: + spin_unlock(&pb_lock); +} + +struct user_beancounter *pb_grab_page_ub(struct page *page) +{ + struct page_beancounter *pb; + struct user_beancounter *ub; + + spin_lock(&pb_lock); + pb = *page_pblist(page); + ub = (pb == NULL ? 
ERR_PTR(-EINVAL) : + get_beancounter(pb->ub)); + spin_unlock(&pb_lock); + return ub; +} + +void __init ub_init_pbc(void) +{ + unsigned long hash_size; + + pb_cachep = kmem_cache_create("page_beancounter", + sizeof(struct page_beancounter), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + hash_size = num_physpages >> 2; + for (pb_hash_mask = 1; + (hash_size & pb_hash_mask) != hash_size; + pb_hash_mask = (pb_hash_mask << 1) + 1); + hash_size = pb_hash_mask + 1; + printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); + pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); + memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); + + ub_init_io(pb_cachep); +} diff -uprN linux-2.6.24/kernel/bc/statd.c linux-2.6.24.ovz/kernel/bc/statd.c --- linux-2.6.24/kernel/bc/statd.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/statd.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,453 @@ +/* + * kernel/bc/statd.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(ubs_notify_list); +static long ubs_min_interval; +static ubstattime_t ubs_start_time, ubs_end_time; +static struct timer_list ubs_timer; + +static int ubstat_get_list(void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub, *ubp; + long *page, *ptr, *end; + int len; + + page = (long *)__get_free_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + retval = 0; + ubp = NULL; + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + for_each_beancounter(ub) { + if (ub->parent != NULL) + continue; + *ptr++ = ub->ub_uid; + if (ptr != end) + continue; + + get_beancounter(ub); + spin_unlock_irq(&ub_hash_lock); + + put_beancounter(ubp); + ubp = ub; + + len = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (copy_to_user(buf, page, len)) { + retval = -EFAULT; + goto out_put; + } + retval += len; + if (len < PAGE_SIZE) + goto out_put; + buf += len; + size -= len; + + ptr = page; + end = page + PAGE_SIZE / sizeof(*ptr); + + spin_lock_irq(&ub_hash_lock); + } + spin_unlock_irq(&ub_hash_lock); + + put_beancounter(ubp); + size = min_t(long, (ptr - page) * sizeof(*ptr), size); + if (size > 0 && copy_to_user(buf, page, size)) { + retval = -EFAULT; + goto out_put; + } + retval += size; + +out_put: + put_beancounter(ubp); + free_page((unsigned long)page); + return retval; +} + +static int ubstat_gettime(void __user *buf, long size) +{ + ubgettime_t data; + int retval; + + spin_lock(&ubs_notify_lock); + data.start_time = ubs_start_time; + data.end_time = ubs_end_time; + data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; + spin_unlock(&ubs_notify_lock); + + retval = min_t(long, sizeof(data), size); + if (copy_to_user(buf, &data, retval)) + retval = -EFAULT; + return retval; +} + +static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) +{ + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[1]; + } *data; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + + data->param[0].maxheld = ub->ub_store[res].maxheld; + data->param[0].failcnt = ub->ub_store[res].failcnt; + + return sizeof(*data); +} + +static int ubstat_do_read_all(struct 
user_beancounter *ub, void *kbuf, int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparm_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + data->param[resource].maxheld = ub->ub_store[resource].maxheld; + data->param[resource].failcnt = ub->ub_store[resource].failcnt; + wrote += sizeof(data->param[resource]); + } + + return wrote; +} + +static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf, + int size) +{ + int wrote; + struct { + ubstattime_t start_time; + ubstattime_t end_time; + ubstatparmf_t param[UB_RESOURCES]; + } *data; + int resource; + + data = kbuf; + data->start_time = ubs_start_time; + data->end_time = ubs_end_time; + wrote = sizeof(data->start_time) + sizeof(data->end_time); + + for (resource = 0; resource < UB_RESOURCES; resource++) { + if (size < wrote + sizeof(data->param[resource])) + break; + /* The beginning of ubstatparmf_t matches struct ubparm. */ + memcpy(&data->param[resource], &ub->ub_store[resource], + sizeof(ub->ub_store[resource])); + data->param[resource].__unused1 = 0; + data->param[resource].__unused2 = 0; + wrote += sizeof(data->param[resource]); + } + return wrote; +} + +static int ubstat_get_stat(struct user_beancounter *ub, long cmd, + void __user *buf, long size) +{ + void *kbuf; + int retval; + + kbuf = (void *)__get_free_page(GFP_KERNEL); + if (kbuf == NULL) + return -ENOMEM; + + spin_lock(&ubs_notify_lock); + switch (UBSTAT_CMD(cmd)) { + case UBSTAT_READ_ONE: + retval = -EINVAL; + if (UBSTAT_PARMID(cmd) >= UB_RESOURCES) + break; + retval = ubstat_do_read_one(ub, + UBSTAT_PARMID(cmd), kbuf); + break; + case UBSTAT_READ_ALL: + retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE); + break; + case UBSTAT_READ_FULL: + retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE); + break; + default: + retval = -EINVAL; + } + spin_unlock(&ubs_notify_lock); + + if (retval > 0) { + retval = min_t(long, retval, size); + if (copy_to_user(buf, kbuf, retval)) + retval = -EFAULT; + } + + free_page((unsigned long)kbuf); + return retval; +} + +static int ubstat_handle_notifrq(ubnotifrq_t *req) +{ + int retval; + struct ub_stat_notify *new_notify; + struct list_head *entry; + struct task_struct *tsk_to_free; + + new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL); + if (new_notify == NULL) + return -ENOMEM; + + tsk_to_free = NULL; + INIT_LIST_HEAD(&new_notify->list); + + spin_lock(&ubs_notify_lock); + list_for_each(entry, &ubs_notify_list) { + struct ub_stat_notify *notify; + + notify = list_entry(entry, struct ub_stat_notify, list); + if (notify->task == current) { + kfree(new_notify); + new_notify = notify; + break; + } + } + + retval = -EINVAL; + if (req->maxinterval < 1) + goto out_unlock; + if (req->maxinterval > TIME_MAX_SEC) + req->maxinterval = TIME_MAX_SEC; + if (req->maxinterval < ubs_min_interval) { + unsigned long dif; + + ubs_min_interval = req->maxinterval; + dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ; + if (dif > req->maxinterval) + mod_timer(&ubs_timer, + ubs_timer.expires - + (dif - req->maxinterval) * HZ); + } + + if (entry != &ubs_notify_list) { + list_del(&new_notify->list); + tsk_to_free = new_notify->task; + } + if (req->signum) { + new_notify->task = current; + get_task_struct(new_notify->task); + new_notify->signum = 
req->signum; + list_add(&new_notify->list, &ubs_notify_list); + } else + kfree(new_notify); + retval = 0; +out_unlock: + spin_unlock(&ubs_notify_lock); + if (tsk_to_free != NULL) + put_task_struct(tsk_to_free); + return retval; +} + +/* + * former sys_ubstat + */ +long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + int retval; + struct user_beancounter *ub; + + if (func == UBSTAT_UBPARMNUM) + return UB_RESOURCES; + if (func == UBSTAT_UBLIST) + return ubstat_get_list(buf, size); + if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) + return -EPERM; + + if (func == UBSTAT_GETTIME) { + retval = ubstat_gettime(buf, size); + goto notify; + } + + ub = get_exec_ub(); + if (ub != NULL && ub->ub_uid == arg1) + get_beancounter(ub); + else /* FIXME must be if (ve_is_super) */ + ub = get_beancounter_byuid(arg1, 0); + + if (ub == NULL) + return -ESRCH; + + retval = ubstat_get_stat(ub, func, buf, size); + put_beancounter(ub); +notify: + /* Handle request for notification */ + if (retval >= 0) { + ubnotifrq_t notifrq; + int err; + + err = -EFAULT; + if (!copy_from_user(¬ifrq, (void __user *)arg2, + sizeof(notifrq))) + err = ubstat_handle_notifrq(¬ifrq); + if (err) + retval = err; + } + + return retval; +} + +static void ubstat_save_onestat(struct user_beancounter *ub) +{ + int resource; + + /* called with local irq disabled */ + spin_lock(&ub->ub_lock); + for (resource = 0; resource < UB_RESOURCES; resource++) { + memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], + sizeof(struct ubparm)); + ub->ub_parms[resource].minheld = + ub->ub_parms[resource].maxheld = + ub->ub_parms[resource].held; + } + spin_unlock(&ub->ub_lock); +} + +static void ubstat_save_statistics(void) +{ + unsigned long flags; + struct user_beancounter *ub; + + local_irq_save(flags); + for_each_beancounter (ub) + ubstat_save_onestat(ub); + local_irq_restore(flags); +} + +static void ubstatd_timeout(unsigned long __data) +{ + struct task_struct *p; + + p = (struct task_struct *) __data; + wake_up_process(p); +} + +/* + * Safe wrapper for send_sig. It prevents a race with release_task + * for sighand. + * Should be called under tasklist_lock. 
+ */ +static void task_send_sig(struct ub_stat_notify *notify) +{ + if (likely(notify->task->sighand != NULL)) + send_sig(notify->signum, notify->task, 1); +} + +static inline void do_notifies(void) +{ + LIST_HEAD(notif_free_list); + struct ub_stat_notify *notify; + struct ub_stat_notify *tmp; + + spin_lock(&ubs_notify_lock); + ubs_start_time = ubs_end_time; + /* + * the expression below relies on time being unsigned long and + * arithmetic promotion rules + */ + ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ; + mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ); + ubs_min_interval = TIME_MAX_SEC; + /* save statistics accumulated for the interval */ + ubstat_save_statistics(); + /* send signals */ + read_lock(&tasklist_lock); + while (!list_empty(&ubs_notify_list)) { + notify = list_entry(ubs_notify_list.next, + struct ub_stat_notify, list); + task_send_sig(notify); + list_del(¬ify->list); + list_add(¬ify->list, ¬if_free_list); + } + read_unlock(&tasklist_lock); + spin_unlock(&ubs_notify_lock); + + list_for_each_entry_safe(notify, tmp, ¬if_free_list, list) { + put_task_struct(notify->task); + kfree(notify); + } +} + +/* + * Kernel thread + */ +static int ubstatd(void *unused) +{ + /* daemonize call will take care of signals */ + daemonize("ubstatd"); + + ubs_timer.data = (unsigned long)current; + ubs_timer.function = ubstatd_timeout; + add_timer(&ubs_timer); + + while (1) { + set_task_state(current, TASK_INTERRUPTIBLE); + if (time_after(ubs_timer.expires, jiffies)) { + schedule(); + try_to_freeze(); + continue; + } + + __set_task_state(current, TASK_RUNNING); + do_notifies(); + } + return 0; +} + +static int __init ubstatd_init(void) +{ + init_timer(&ubs_timer); + ubs_timer.expires = TIME_MAX_JIF; + ubs_min_interval = TIME_MAX_SEC; + ubs_start_time = ubs_end_time = 0; + + kernel_thread(ubstatd, NULL, 0); + return 0; +} + +module_init(ubstatd_init); diff -uprN linux-2.6.24/kernel/bc/sys.c linux-2.6.24.ovz/kernel/bc/sys.c --- linux-2.6.24/kernel/bc/sys.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/sys.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,173 @@ +/* + * kernel/bc/sys.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include + +#include + +/* + * The (rather boring) getluid syscall + */ +asmlinkage long sys_getluid(void) +{ + struct user_beancounter *ub; + + ub = get_exec_ub(); + if (ub == NULL) + return -EINVAL; + + return ub->ub_uid; +} + +/* + * The setluid syscall + */ +asmlinkage long sys_setluid(uid_t uid) +{ + struct user_beancounter *ub; + struct task_beancounter *task_bc; + int error; + + task_bc = ¤t->task_bc; + + /* You may not disown a setluid */ + error = -EINVAL; + if (uid == (uid_t)-1) + goto out; + + /* You may only set an ub as root */ + error = -EPERM; + if (!capable(CAP_SETUID)) + goto out; + /* + * The ub once set is irrevocable to all + * unless it's set from ve0. 
+ */ + if (!ve_is_super(get_exec_env())) + goto out; + + /* Ok - set up a beancounter entry for this user */ + error = -ENOBUFS; + ub = get_beancounter_byuid(uid, 1); + if (ub == NULL) + goto out; + + ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " + "for %.20s pid %d\n", + ub, atomic_read(&ub->ub_refcount), + current->comm, current->pid); + /* install bc */ + error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub); + if (!(error & NOTIFY_FAIL)) { + put_beancounter(task_bc->exec_ub); + task_bc->exec_ub = ub; + if (!(error & NOTIFY_OK)) { + put_beancounter(task_bc->fork_sub); + task_bc->fork_sub = get_beancounter(ub); + } + error = 0; + } else { + put_beancounter(ub); + error = -ENOBUFS; + } +out: + return error; +} + +long do_setublimit(uid_t uid, unsigned long resource, + unsigned long *new_limits) +{ + int error; + unsigned long flags; + struct user_beancounter *ub; + + error = -EPERM; + if(!capable(CAP_SYS_RESOURCE)) + goto out; + + if (!ve_is_super(get_exec_env())) + goto out; + + error = -EINVAL; + if (resource >= UB_RESOURCES) + goto out; + + error = -EINVAL; + if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) + goto out; + + error = -ENOENT; + ub = get_beancounter_byuid(uid, 0); + if (ub == NULL) { + ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); + goto out; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_parms[resource].barrier = new_limits[0]; + ub->ub_parms[resource].limit = new_limits[1]; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + put_beancounter(ub); + + error = 0; +out: + return error; +} + +/* + * The setbeanlimit syscall + */ +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, + unsigned long __user *limits) +{ + unsigned long new_limits[2]; + + if (copy_from_user(&new_limits, limits, sizeof(new_limits))) + return -EFAULT; + + return do_setublimit(uid, resource, new_limits); +} + +extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size); +asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, + void __user *buf, long size) +{ + if (!ve_is_super(get_exec_env())) + return -EPERM; + + return do_ubstat(func, arg1, arg2, buf, size); +} + +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_setublimit(uid_t uid, int resource, + unsigned int __user *limits) +{ + unsigned int u_new_limits[2]; + unsigned long new_limits[2]; + + if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) + return -EFAULT; + + new_limits[0] = u_new_limits[0]; + new_limits[1] = u_new_limits[1]; + + return do_setublimit(uid, resource, new_limits); +} + +asmlinkage long compat_sys_ubstat(int func, unsigned int arg1, + unsigned int arg2, compat_uptr_t *buf, long size) +{ + return sys_ubstat(func, arg1, arg2, buf, size); +} +#endif diff -uprN linux-2.6.24/kernel/bc/vm_pages.c linux-2.6.24.ovz/kernel/bc/vm_pages.c --- linux-2.6.24/kernel/bc/vm_pages.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/bc/vm_pages.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,549 @@ +/* + * kernel/bc/vm_pages.c + * + * Copyright (C) 2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + do { + if (!pte_none(*pte) && pte_present(*pte)) + (*ret)++; + } while (pte++, addr += PAGE_SIZE, (addr != end)); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + next = pages_in_pte_range(vma, pmd, addr, next, ret); + } while (pmd++, addr = next, (addr != end)); + + return addr; +} + +static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long *ret) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + next = pages_in_pmd_range(vma, pud, addr, next, ret); + } while (pud++, addr = next, (addr != end)); + + return addr; +} + +unsigned long pages_in_vma_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + unsigned long ret; + + ret = 0; + BUG_ON(addr >= end); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = pages_in_pud_range(vma, pgd, addr, next, &ret); + } while (pgd++, addr = next, (addr != end)); + return ret; +} + +void fastcall __ub_update_physpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages + + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); + ub_adjust_maxheld(ub, UB_PHYSPAGES); +} + +void fastcall __ub_update_oomguarpages(struct user_beancounter *ub) +{ + ub->ub_parms[UB_OOMGUARPAGES].held = + ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; + ub_adjust_maxheld(ub, UB_OOMGUARPAGES); +} + +void fastcall __ub_update_privvm(struct user_beancounter *ub) +{ + ub->ub_parms[UB_PRIVVMPAGES].held = + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) + + ub->ub_unused_privvmpages + + ub->ub_parms[UB_SHMPAGES].held; + ub_adjust_maxheld(ub, UB_PRIVVMPAGES); +} + +static inline int __charge_privvm_locked(struct user_beancounter *ub, + unsigned long s, enum ub_severity strict) +{ + if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) + return -ENOMEM; + + ub->ub_unused_privvmpages += s; + return 0; +} + +static void __unused_privvm_dec_locked(struct user_beancounter *ub, + long size) +{ + /* catch possible overflow */ + if (ub->ub_unused_privvmpages < size) { + uncharge_warn(ub, UB_UNUSEDPRIVVM, + size, ub->ub_unused_privvmpages); + size = ub->ub_unused_privvmpages; + } + ub->ub_unused_privvmpages -= size; + __ub_update_privvm(ub); +} + +void __ub_unused_privvm_dec(struct mm_struct *mm, long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_unused_privvm_sub(struct mm_struct *mm, + 
struct vm_area_struct *vma, unsigned long count) +{ + if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + __ub_unused_privvm_dec(mm, count); +} + +void ub_unused_privvm_add(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_unused_privvmpages += size; + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int ub_protected_charge(struct mm_struct *mm, unsigned long size, + unsigned long newflags, struct vm_area_struct *vma) +{ + unsigned long flags; + struct file *file; + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return PRIVVM_NO_CHARGE; + + flags = vma->vm_flags; + if (!((newflags ^ flags) & VM_WRITE)) + return PRIVVM_NO_CHARGE; + + file = vma->vm_file; + if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) + return PRIVVM_NO_CHARGE; + + if (flags & VM_WRITE) + return PRIVVM_TO_SHARED; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) + goto err; + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_TO_PRIVATE; + +err: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return PRIVVM_ERROR; +} + +int ub_memory_charge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file, int sv) +{ + struct user_beancounter *ub, *ubl; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + size >>= PAGE_SHIFT; + if (size > UB_MAXVALUE) + return -EINVAL; + + BUG_ON(sv != UB_SOFT && sv != UB_HARD); + + if (vm_flags & VM_LOCKED) { + if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) + goto out_err; + } + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ubl = top_beancounter(ub); + spin_lock_irqsave(&ubl->ub_lock, flags); + if (__charge_privvm_locked(ubl, size, sv)) + goto out_private; + spin_unlock_irqrestore(&ubl->ub_lock, flags); + } + return 0; + +out_private: + spin_unlock_irqrestore(&ubl->ub_lock, flags); + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); +out_err: + return -ENOMEM; +} + +void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, + unsigned vm_flags, struct file *vm_file) +{ + struct user_beancounter *ub; + unsigned long flags; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + size >>= PAGE_SHIFT; + + if (vm_flags & VM_LOCKED) + uncharge_beancounter(ub, UB_LOCKEDPAGES, size); + if (VM_UB_PRIVATE(vm_flags, vm_file)) { + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __unused_privvm_dec_locked(ub, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); + } +} + +int ub_locked_charge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) +{ + struct user_beancounter *ub; + + ub = mm->mm_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + +int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) +{ + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + return charge_beancounter(ub, UB_LOCKEDPAGES, + size >> PAGE_SHIFT, UB_HARD); +} + +void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + struct 
user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + + +static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_tmpfs_respages++; + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_inc(ub); +} + +static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, + unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + /* catch possible overflow */ + if (ub->ub_tmpfs_respages < size) { + uncharge_warn(ub, UB_TMPFSPAGES, + size, ub->ub_tmpfs_respages); + size = ub->ub_tmpfs_respages; + } + ub->ub_tmpfs_respages -= size; + /* update values what is the most interesting */ + __ub_update_physpages(ub); + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, + unsigned long size) +{ + struct user_beancounter *ub; + + for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) + do_ub_tmpfs_respages_sub(ub, size); +} + +int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) +{ + int ret; + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return 0; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); + if (ret == 0) + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); + return ret; +} + +void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) +{ + unsigned long flags; + struct user_beancounter *ub; + + ub = shi->shmi_ub; + if (ub == NULL) + return; + + ub = top_beancounter(ub); + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); + __ub_update_privvm(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +#ifdef CONFIG_BC_SWAP_ACCOUNTING +static inline void do_ub_swapentry_inc(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_swap_pages++; + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, + struct user_beancounter *ub) +{ + si->swap_ubs[num] = get_beancounter(ub); + for (; ub != NULL; ub = ub->parent) + do_ub_swapentry_inc(ub); +} +EXPORT_SYMBOL(ub_swapentry_inc); + +static inline void do_ub_swapentry_dec(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + if (ub->ub_swap_pages <= 0) + uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); + else + ub->ub_swap_pages--; + __ub_update_oomguarpages(ub); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter *ub, *ubp; + + ub = si->swap_ubs[num]; + si->swap_ubs[num] = NULL; + for (ubp = ub; ubp != NULL; ubp = ubp->parent) + do_ub_swapentry_dec(ubp); + put_beancounter(ub); +} +EXPORT_SYMBOL(ub_swapentry_dec); + +int ub_swap_init(struct swap_info_struct *si, pgoff_t num) +{ + struct user_beancounter **ubs; + + ubs = vmalloc(num * sizeof(struct user_beancounter *)); + if (ubs == NULL) + return 
-ENOMEM; + + memset(ubs, 0, num * sizeof(struct user_beancounter *)); + si->swap_ubs = ubs; + return 0; +} + +void ub_swap_fini(struct swap_info_struct *si) +{ + if (si->swap_ubs) { + vfree(si->swap_ubs); + si->swap_ubs = NULL; + } +} +#endif + +static int vmguar_enough_memory(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + struct user_beancounter *ub; + + if (event != VIRTINFO_ENOUGHMEM) + return old_ret; + /* + * If it's a kernel thread, don't care about it. + * Added in order aufsd to run smoothly over ramfs. + */ + if (!current->mm) + return NOTIFY_DONE; + + ub = top_beancounter(current->mm->mm_ub); + if (ub->ub_parms[UB_PRIVVMPAGES].held > + ub->ub_parms[UB_VMGUARPAGES].barrier) + return old_ret; + + return NOTIFY_OK; +} + +static struct vnotifier_block vmguar_notifier_block = { + .notifier_call = vmguar_enough_memory +}; + +static int __init init_vmguar_notifier(void) +{ + virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); + return 0; +} + +static void __exit fini_vmguar_notifier(void) +{ + virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); +} + +module_init(init_vmguar_notifier); +module_exit(fini_vmguar_notifier); + +#ifdef CONFIG_PROC_FS +static int bc_vmaux_show(struct seq_file *f, void *v) +{ + struct user_beancounter *ub; + unsigned long swap, unmap; + int i; + + ub = seq_beancounter(f); + + swap = unmap = 0; + for_each_online_cpu(i) { + swap += per_cpu_ptr(ub->ub_percpu, i)->swapin; + unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap; + } + + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM], + ub->ub_unused_privvmpages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES], + ub->ub_tmpfs_respages); + seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_SWAPPAGES], + ub->ub_swap_pages); + + seq_printf(f, bc_proc_lu_fmt, "swapin", swap); + seq_printf(f, bc_proc_lu_fmt, "unmap", unmap); + return 0; +} +static struct bc_proc_entry bc_vmaux_entry = { + .name = "vmaux", + .u.show = bc_vmaux_show, +}; + +static int __init bc_vmaux_init(void) +{ + bc_register_proc_entry(&bc_vmaux_entry); + return 0; +} + +late_initcall(bc_vmaux_init); +#endif diff -uprN linux-2.6.24/kernel/capability.c linux-2.6.24.ovz/kernel/capability.c --- linux-2.6.24/kernel/capability.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/capability.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,16 +10,23 @@ #include #include #include +#include #include #include #include #include +#ifndef CONFIG_VE +kernel_cap_t cap_bset = CAP_INIT_EFF_SET; +EXPORT_SYMBOL(cap_bset); +#endif + /* * This lock protects task->cap_* for all tasks including current. * Locking rule: acquire this prior to tasklist_lock. 
*/ -static DEFINE_SPINLOCK(task_capability_lock); +DEFINE_SPINLOCK(task_capability_lock); +EXPORT_SYMBOL(task_capability_lock); /* * For sys_getproccap() and sys_setproccap(), any of the three @@ -99,7 +106,7 @@ static inline int cap_set_pg(int pgrp_nr pgrp = find_vpid(pgrp_nr); do_each_pid_task(pgrp, PIDTYPE_PGID, g) { target = g; - while_each_thread(g, target) { + while_each_thread_ve(g, target) { if (!security_capset_check(target, effective, inheritable, permitted)) { @@ -129,7 +136,7 @@ static inline int cap_set_all(kernel_cap int ret = -EPERM; int found = 0; - do_each_thread(g, target) { + do_each_thread_ve(g, target) { if (target == current || is_container_init(target->group_leader)) continue; found = 1; @@ -138,7 +145,7 @@ static inline int cap_set_all(kernel_cap continue; ret = 0; security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); + } while_each_thread_ve(g, target); if (!found) ret = 0; diff -uprN linux-2.6.24/kernel/cgroup.c linux-2.6.24.ovz/kernel/cgroup.c --- linux-2.6.24/kernel/cgroup.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cgroup.c 2008-03-25 18:53:59.000000000 -0500 @@ -1686,12 +1686,12 @@ void cgroup_iter_start(struct cgroup *cg struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); write_unlock(&css_set_lock); } read_lock(&css_set_lock); @@ -2229,9 +2229,9 @@ static void cgroup_init_subsys(struct cg struct task_struct *g, *p; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { ss->fork(ss, p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } @@ -2577,9 +2577,6 @@ int cgroup_clone(struct task_struct *tsk again: root = subsys->root; if (root == &rootnode) { - printk(KERN_INFO - "Not cloning cgroup for unused subsystem %s\n", - subsys->name); mutex_unlock(&cgroup_mutex); return 0; } diff -uprN linux-2.6.24/kernel/compat.c linux-2.6.24.ovz/kernel/compat.c --- linux-2.6.24/kernel/compat.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/compat.c 2008-03-25 18:53:59.000000000 -0500 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -40,10 +41,37 @@ int put_compat_timespec(const struct tim __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +long compat_nanosleep_restart(struct restart_block *restart) +{ + struct compat_timespec __user *rmtp; + struct timespec rmt; + mm_segment_t oldfs; + long ret; + + rmtp = (struct compat_timespec __user *)(restart->arg1); + restart->arg1 = (unsigned long)&rmt; + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep_restart(restart); + set_fs(oldfs); + + if (ret) { + restart->fn = compat_nanosleep_restart; + restart->arg1 = (unsigned long)rmtp; + + if (rmtp && put_compat_timespec(&rmt, rmtp)) + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(compat_nanosleep_restart); + asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) { struct timespec tu, rmt; + mm_segment_t oldfs; long ret; if (get_compat_timespec(&tu, rqtp)) @@ -52,11 +80,21 @@ asmlinkage long compat_sys_nanosleep(str if (!timespec_valid(&tu)) return -EINVAL; - ret = hrtimer_nanosleep(&tu, rmtp ? 
&rmt : NULL, HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep(&tu, + rmtp ? (struct timespec __user *)&rmt : NULL, + HRTIMER_MODE_REL, CLOCK_MONOTONIC); + set_fs(oldfs); + + if (ret) { + struct restart_block *restart + = ¤t_thread_info()->restart_block; + + restart->fn = compat_nanosleep_restart; + restart->arg1 = (unsigned long)rmtp; - if (ret && rmtp) { - if (put_compat_timespec(&rmt, rmtp)) + if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; } diff -uprN linux-2.6.24/kernel/configs.c linux-2.6.24.ovz/kernel/configs.c --- linux-2.6.24/kernel/configs.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/configs.c 2008-03-25 18:53:59.000000000 -0500 @@ -79,8 +79,7 @@ static int __init ikconfig_init(void) struct proc_dir_entry *entry; /* create the current config file */ - entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, - &proc_root); + entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL); if (!entry) return -ENOMEM; diff -uprN linux-2.6.24/kernel/cpt/Makefile linux-2.6.24.ovz/kernel/cpt/Makefile --- linux-2.6.24/kernel/cpt/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,53 @@ +# +# +# kernel/cpt/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. + +obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o + +vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ + cpt_mm.o cpt_files.o cpt_kernel.o \ + cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ + cpt_conntrack.o cpt_epoll.o + +vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \ + rst_mm.o rst_files.o \ + rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ + rst_conntrack.o rst_epoll.o + +ifeq ($(CONFIG_BEANCOUNTERS), y) +vzcpt-objs += cpt_ubc.o +vzrst-objs += rst_ubc.o +endif + +ifeq ($(CONFIG_INOTIFY_USER), y) +vzcpt-objs += cpt_inotify.o +vzrst-objs += rst_inotify.o +endif + +vzrst-objs += cpt_exports.o + +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_obj.o cpt_kernel.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y) +vzcpt-objs += cpt_iterative.o +vzrst-objs += rst_iterative.o +endif + +ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) +vzcpt-objs += cpt_pagein.o +vzrst-objs += rst_pagein.o +endif + +ifeq ($(CONFIG_X86_64), y) +vzcpt-objs += cpt_x8664.o +ifeq ($(CONFIG_VZ_CHECKPOINT), m) +vzrst-objs += cpt_x8664.o +endif +endif diff -uprN linux-2.6.24/kernel/cpt/cpt_conntrack.c linux-2.6.24.ovz/kernel/cpt/cpt_conntrack.c --- linux-2.6.24/kernel/cpt/cpt_conntrack.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_conntrack.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,365 @@ +/* + * + * kernel/cpt/cpt_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +/* How does it work? + * + * Network is disabled, so new conntrack entries will not appear. 
* However, some of them can disappear because of timeouts. + * + * So, we take read_lock, collect all required information atomically, + * essentially, creating parallel "refcount" structures holding pointers. + * We delete conntrack timers as well, so the structures cannot disappear + * after releasing the lock. Now, after releasing the lock, we can dump everything + * safely. And on exit we restore timers to their original values. + * + * Note, this approach is not going to work in VE0. + */ + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack_tuple_hash *cth; + int index; +}; + +static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple) +{ + v->cpt_dst = tuple->dst.ip; + v->cpt_dstport = tuple->dst.u.all; + v->cpt_protonum = tuple->dst.protonum; + v->cpt_dir = tuple->dst.dir; + + v->cpt_src = tuple->src.ip; + v->cpt_srcport = tuple->src.u.all; +} + +static int dump_one_expect(struct cpt_ip_connexpect_image *v, + struct ip_conntrack_expect *exp, + int sibling, cpt_context_t *ctx) +{ + int err = 0; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + encode_tuple(&v->cpt_tuple, &exp->tuple); + encode_tuple(&v->cpt_mask, &exp->mask); + v->cpt_sibling_conntrack = sibling; + v->cpt_flags = exp->flags; + v->cpt_seq = exp->id; + v->cpt_dir = 0; + v->cpt_manip_proto = 0; +#ifdef CONFIG_IP_NF_NAT_NEEDED + v->cpt_manip_proto = exp->saved_proto.all; + v->cpt_dir = exp->dir; +#endif + v->cpt_timeout = 0; + if (exp->master->helper->timeout) + v->cpt_timeout = exp->timeout.expires - jiffies; + return err; +} + +/* NOTE. We use one page to dump the list of expectations. This may not be enough + * in theory. In practice there is only one expectation per conntrack record. + * Moreover, taking into account that _ALL_ expectations are saved in one + * global list, which is looked up for each incoming/outgoing packet, the system + * would be severely dead when even one conntrack had that many + * expectations. In short, I am not going to repair this. + */ + +static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list, + cpt_context_t *ctx) +{ + int err = 0; + unsigned long pg; + struct cpt_ip_connexpect_image *v; + struct ip_conntrack_expect *exp; + + if (ct->expecting == 0) + return err; + if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE) + return -ENOBUFS; + + pg = __get_free_page(GFP_KERNEL); + if (!pg) + return -ENOMEM; + v = (struct cpt_ip_connexpect_image *)pg; + + read_lock_bh(&ip_conntrack_lock); + list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) { + int sibling; + + if (exp->master != ct) + continue; + + if (ct->helper == NULL) { + eprintk_ctx("conntrack: no helper and non-trivial expectation\n"); + err = -EINVAL; + break; + } + + sibling = 0; +#if 0 + /* That's all? No need to calculate sibling? */ + if (exp->sibling) { + struct ct_holder *c; + for (c = list; c; c = c->next) { + if (tuplehash_to_ctrack(c->cth) == exp->sibling) { + sibling = c->index; + break; + } + } + /* NOTE: exp->sibling may not be "confirmed" and, hence, + * out of the hash table. We should just ignore such a sibling, + * the connection is going to be retried, the packet + * apparently was lost somewhere. + */ + if (sibling == 0) + dprintk_ctx("sibling conntrack is not found\n"); + } +#endif + + /* If the expectation still does not have exp->sibling + * and timer is not running, it is about to die on another + * cpu. Skip it. 
*/ + if (!sibling && + ct->helper->timeout && + !timer_pending(&exp->timeout)) { + dprintk_ctx("conntrack: expectation: no timer\n"); + continue; + } + + err = dump_one_expect(v, exp, sibling, ctx); + if (err) + break; + + v++; + } + read_unlock_bh(&ip_conntrack_lock); + + if (err == 0 && (unsigned long)v != pg) + ctx->write((void*)pg, (unsigned long)v - pg, ctx); + + free_page(pg); + return err; +} + +static int dump_one_ct(struct ct_holder *c, struct ct_holder *list, + cpt_context_t *ctx) +{ + struct ip_conntrack_tuple_hash *h = c->cth; + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + struct cpt_ip_conntrack_image v; + int err = 0; + + if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) { + eprintk_ctx("conntrack module ct->proto version mismatch\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_CONNTRACK; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + read_lock_bh(&ip_conntrack_lock); + v.cpt_status = ct->status; + v.cpt_timeout = ct->timeout.expires - jiffies; + v.cpt_ct_helper = (ct->helper != NULL); + v.cpt_index = c->index; + v.cpt_id = ct->id; + v.cpt_mark = 0; +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + v.cpt_mark = ct->mark; +#endif + encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple); + encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple); + memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data)); + memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data)); + + v.cpt_masq_index = 0; + v.cpt_initialized = 0; + v.cpt_num_manips = 0; + v.cpt_nat_helper = 0; +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + v.cpt_masq_index = ct->nat.masq_index; +#endif + /* "help" data is used by pptp, difficult to support */ + v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos; + v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before; + v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after; + v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos; + v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before; + v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after; +#endif + read_unlock_bh(&ip_conntrack_lock); + + ctx->write(&v, sizeof(v), ctx); + + err = dump_expect_list(ct, list, ctx); + + cpt_close_object(ctx); + return err; +} + +int cpt_dump_ip_conntrack(cpt_context_t * ctx) +{ + struct ct_holder *ct_list = NULL; + struct ct_holder *c, **cp; + int err = 0; + int index = 0; + int idx; + + if (get_exec_env()->_ip_conntrack == NULL) + return 0; + + for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) { + c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); + if (c == NULL) { + err = -ENOMEM; + goto done; + } + memset(c, 0, sizeof(struct ct_holder)); + c->next = ct_list; + ct_list = c; + } + + c = ct_list; + + read_lock_bh(&ip_conntrack_lock); + for (idx = 0; idx < ip_conntrack_htable_size; idx++) { + struct ip_conntrack_tuple_hash *h; + list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) { + /* Skip reply tuples, they are covered by original + * direction. */ + if (DIRECTION(h)) + continue; + + /* Oops, we have not enough of holders... + * It is impossible. 
*/ + if (unlikely(c == NULL)) { + read_unlock_bh(&ip_conntrack_lock); + eprintk_ctx("unexpected conntrack appeared\n"); + err = -ENOMEM; + goto done; + } + + /* If timer is not running, it means that it + * has just been scheduled on another cpu. + * We should skip this conntrack, it is about to be + * destroyed. */ + if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) { + dprintk_ctx("conntrack: no timer\n"); + continue; + } + + /* Timer is deleted. refcnt is _not_ decreased. + * We are going to restore the timer on exit + * from this function. */ + c->cth = h; + c->index = ++index; + c = c->next; + } + } + read_unlock_bh(&ip_conntrack_lock); + + /* No conntracks? Good. */ + if (index == 0) + goto done; + + /* Comb the list a little. */ + cp = &ct_list; + while ((c = *cp) != NULL) { + /* Discard unused entries; they can appear, if some + * entries were timed out since we preallocated the list. + */ + if (c->cth == NULL) { + *cp = c->next; + kfree(c); + continue; + } + + /* Move conntracks attached to expectations to the beginning + * of the list. */ + if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) { + *cp = c->next; + c->next = ct_list; + ct_list = c; + dprintk_ctx("conntrack: %d moved in list\n", c->index); + continue; + } + cp = &c->next; + } + + cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK); + + for (c = ct_list; c; c = c->next) { + err = dump_one_ct(c, ct_list, ctx); + if (err) + goto done; + } + + cpt_close_section(ctx); + +done: + while ((c = ct_list) != NULL) { + ct_list = c->next; + if (c->cth) { + /* Restore timer. refcnt is preserved. */ + add_timer(&tuplehash_to_ctrack(c->cth)->timeout); + } + kfree(c); + } + return err; +} + +#endif diff -uprN linux-2.6.24/kernel/cpt/cpt_context.c linux-2.6.24.ovz/kernel/cpt/cpt_context.c --- linux-2.6.24/kernel/cpt/cpt_context.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_context.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,257 @@ +/* + * + * kernel/cpt/cpt_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + + +static void file_write(const void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->write(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count && !ctx->write_error) + ctx->write_error = err < 0 ? err : -EIO; +} + +static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->write(file, addr, count, &pos); + set_fs(oldfs); + if (err != count && !ctx->write_error) + ctx->write_error = err < 0 ? 
err : -EIO; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +void cpt_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->write = file_write; + ctx->pwrite = file_pwrite; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +int cpt_open_dumpfile(struct cpt_context *ctx) +{ + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + return -ENOMEM; + __cpt_release_buf(ctx); + return 0; +} + +int cpt_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + if (ctx->write_error) + eprintk_ctx("error while writing dump file: %d\n", ctx->write_error); + return ctx->write_error; +} + +int cpt_major_hdr_out(struct cpt_context *ctx) +{ + struct cpt_major_hdr hdr; + + if (ctx->file == NULL) + return 0; + + memset(&hdr, 0, sizeof(hdr)); + hdr.cpt_signature[0] = CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_image_version = CPT_VERSION_20; +#ifdef CONFIG_X86_64 + hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; +#elif defined(CONFIG_X86_32) + hdr.cpt_os_arch = CPT_OS_ARCH_I386; +#elif defined(CONFIG_IA64) + hdr.cpt_os_arch = CPT_OS_ARCH_IA64; +#else +#error Arch is not supported +#endif + hdr.cpt_ve_features = (__u32)ctx->features; + hdr.cpt_ve_features2 = (__u32)(ctx->features>>32); + hdr.cpt_pagesize = (__u16)PAGE_SIZE; + hdr.cpt_hz = HZ; + hdr.cpt_start_jiffies64 = ctx->virt_jiffies64; + hdr.cpt_start_sec = ctx->start_time.tv_sec; + hdr.cpt_start_nsec = ctx->start_time.tv_nsec; + hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags; + hdr.cpt_kernel_config[0] = ctx->kernel_config_flags; + hdr.cpt_iptables_mask = ctx->iptables_mask; + + ctx->write(&hdr, sizeof(hdr), ctx); + return 0; +} + +int cpt_close_section(struct cpt_context *ctx) +{ + if (ctx->file && ctx->current_section >= 0) { + __u64 next = ctx->file->f_pos - ctx->current_section; + ctx->pwrite(&next, 8, ctx, ctx->current_section); + ctx->current_section = -1; + } + return 0; +} +EXPORT_SYMBOL(cpt_close_section); + +int cpt_open_section(struct cpt_context *ctx, __u32 type) +{ + struct cpt_section_hdr hdr; + + if (ctx->file == NULL) + return 0; + + cpt_close_section(ctx); + + ctx->current_section = ctx->file->f_pos; + ctx->sections[type] = ctx->current_section; + + hdr.cpt_next = 0; + hdr.cpt_section = type; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_align = 0; + ctx->write(&hdr, sizeof(hdr), ctx); + + return 0; +} +EXPORT_SYMBOL(cpt_open_section); + + +int cpt_close_object(struct cpt_context *ctx) +{ + if (ctx->file && ctx->current_object >= 0) { + __u64 next = ctx->file->f_pos - ctx->current_object; + ctx->pwrite(&next, 8, ctx, ctx->current_object); + ctx->current_object = -1; + } + return 0; +} +EXPORT_SYMBOL(cpt_close_object); + +int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx) +{ + if (ctx->file == NULL) + return 0; + + cpt_close_object(ctx); + + ctx->current_object = ctx->file->f_pos; + if (obj) + cpt_obj_setpos(obj, 
ctx->current_object, ctx); + + return 0; +} +EXPORT_SYMBOL(cpt_open_object); + +int cpt_push_object(loff_t *saved, struct cpt_context *ctx) +{ + if (ctx->file) { + *saved = ctx->current_object; + ctx->current_object = ctx->file->f_pos; + } + return 0; +} +EXPORT_SYMBOL(cpt_push_object); + +int cpt_pop_object(loff_t *saved, struct cpt_context *ctx) +{ + ctx->current_object = *saved; + return 0; +} +EXPORT_SYMBOL(cpt_pop_object); + +int cpt_dump_tail(struct cpt_context *ctx) +{ + struct cpt_major_tail hdr; + int i; + + if (ctx->file == NULL) + return 0; + + cpt_open_section(ctx, CPT_SECT_TRAILER); + memset(&hdr, 0, sizeof(hdr)); + hdr.cpt_next = sizeof(hdr); + hdr.cpt_object = CPT_OBJ_TRAILER; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = CPT_CONTENT_VOID; + hdr.cpt_lazypages = 0; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + hdr.cpt_lazypages = ctx->lazypages; +#endif + hdr.cpt_64bit = ctx->tasks64; + hdr.cpt_signature[0] = CPT_SIGNATURE0; + hdr.cpt_signature[1] = CPT_SIGNATURE1; + hdr.cpt_signature[2] = CPT_SIGNATURE2; + hdr.cpt_signature[3] = CPT_SIGNATURE3; + hdr.cpt_nsect = CPT_SECT_MAX_INDEX; + for (i = 0; i < CPT_SECT_MAX_INDEX; i++) + hdr.cpt_sections[i] = ctx->sections[i]; + + ctx->write(&hdr, sizeof(hdr), ctx); + cpt_close_section(ctx); + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_context.h linux-2.6.24.ovz/kernel/cpt/cpt_context.h --- linux-2.6.24/kernel/cpt/cpt_context.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_context.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,215 @@ +#include +#include +#include + +#define CPT_CTX_ERROR -1 +#define CPT_CTX_IDLE 0 +#define CPT_CTX_SUSPENDING 1 +#define CPT_CTX_SUSPENDED 2 +#define CPT_CTX_DUMPING 3 +#define CPT_CTX_UNDUMPING 4 +#define CPT_CTX_UNDUMPED 5 + +#define CPT_TID(tsk) task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm +#define CPT_FID "%d,%d(%s)" + + +typedef struct cpt_context +{ + struct list_head ctx_list; + int refcount; + int ctx_state; + int objcount; + int sticky; + struct semaphore main_sem; + + struct file *errorfile; + struct file *statusfile; + struct file *lockfile; + + int errno; + char *error_msg; + loff_t err_offset; + + struct file *file; + char *tmpbuf; + int pagesize; +#ifdef CONFIG_VZ_CHECKPOINT_ITER + int iter_done; + void *iter_dir; + struct user_beancounter *iter_ub; +#endif + loff_t current_section; + loff_t current_object; + + loff_t sections[CPT_SECT_MAX]; + + __u32 errormask; + __u32 write_error; + + struct list_head object_array[CPT_OBJ_MAX]; + + void (*write)(const void *addr, size_t count, struct cpt_context *ctx); + void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); + ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); + void (*align)(struct cpt_context *ctx); + int ve_id; + int contextid; + struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst + * corresponging to start_time */ + __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when + * VE did not migrate. */ + struct timespec start_time; + struct timespec delta_time; + __s64 delta_nsec; + int image_version; + __u16 image_arch; + __u64 iptables_mask; + __u64 features; + +#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9) +#define CPT_ANONVMA_HSIZE (1<ve_id, ##arg) + +#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) +#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) + +#define eprintk(a...) 
cpt_printk(1, "CPT ERR: " a) +#define eprintk_ctx(f, arg...) \ +do { \ + eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ + if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ + ctx->err_offset += snprintf((char*)(ctx->error_msg + \ + ctx->err_offset), \ + PAGE_SIZE - ctx->err_offset, \ + "Error: " f, ##arg); \ +} while(0) + +#define CPT_TMPBUF_FREE 0x789adf12 +#define CPT_TMPBUF_BUSY 0xabcd9876 + +static inline void *cpt_get_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; + return buf; +} + +static inline void __cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_release_buf(cpt_context_t *ctx) +{ + void *buf = ctx->tmpbuf; + + BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); + *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; +} + +static inline void cpt_flush_error(cpt_context_t *ctx) +{ + mm_segment_t oldfs; + + if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { + if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { + oldfs = get_fs(); + set_fs(KERNEL_DS); + ctx->errorfile->f_op->write(ctx->errorfile, + ctx->error_msg, ctx->err_offset, + &ctx->errorfile->f_pos); + set_fs(oldfs); + } + ctx->error_msg[0] = 0; + ctx->err_offset = 0; + } +} diff -uprN linux-2.6.24/kernel/cpt/cpt_dump.c linux-2.6.24.ovz/kernel/cpt/cpt_dump.c --- linux-2.6.24/kernel/cpt/cpt_dump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_dump.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1244 @@ +/* + * + * kernel/cpt/cpt_dump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_net.h" +#include "cpt_socket.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + + +static int vps_child_level(struct task_struct *root, struct task_struct *c) +{ + int level = 0; + int veid = VE_TASK_INFO(c)->owner_env->veid; + + while (VE_TASK_INFO(c)->owner_env->veid == veid) { + if (c->pid != c->tgid) + c = c->group_leader; + if (c == root) + return level; + + c = c->parent; + level++; + } + return -1; +} + +static inline int freezable(struct task_struct * p) +{ + if (p->exit_state) + return 0; + + switch (p->state) { + case EXIT_ZOMBIE: + case EXIT_DEAD: + case TASK_STOPPED: +#if TASK_TRACED != TASK_STOPPED + case TASK_TRACED: +#endif + return 0; + default: + return 1; + } +} + +static void wake_ve(cpt_context_t *ctx) +{ + struct task_struct *p, *g; + + do_each_thread_ve(g, p) { + spin_lock_irq(&p->sighand->siglock); + if (p->flags & PF_FROZEN) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + } + spin_unlock_irq(&p->sighand->siglock); + } while_each_thread_ve(g, p); +} + +/* + * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... + * + * SWSUSP uses PF_FREEZE flag in tsk->flags raising it in context + * of another process. Apparently, it is unacceptable on SMP. + * Let's take freeze_processes() in kernel/power/process.c as an example. 
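(A minimal user-space illustration of the lost-update scenario spelled out in the rest of this comment. The flag values are invented; the point is only that two plain read-modify-write updates of the same flags word can erase each other, which is exactly what a separate atomically-updated bit, the role TIF_FREEZE plays below, or a common lock such as sighand->siglock prevents.)

#include <stdio.h>

#define PF_FREEZE  0x1   /* hypothetical bit values, for illustration only */
#define PF_FROZEN  0x2

int main(void)
{
	unsigned long flags = 0;

	/* CPU 0 reads the flags word, intending to set PF_FREEZE ... */
	unsigned long cpu0 = flags;

	/* ... meanwhile the task itself, on CPU 1, sets PF_FROZEN ... */
	flags |= PF_FROZEN;

	/* ... and CPU 0 completes its read-modify-write, silently
	 * erasing PF_FROZEN: the lost update described in this comment. */
	flags = cpu0 | PF_FREEZE;

	printf("flags = %#lx, PF_FROZEN lost: %s\n",
	       flags, (flags & PF_FROZEN) ? "no" : "yes");
	return 0;
}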
+ * Unserialized modifications tsk->flags easily + * (believe or not, but it happens with probability of almost 100% :-)) + * creates the situation when setting PF_FREEZE in freeze_processes(), + * which quickly spins raising PF_FREEZE of all the processes, + * _clears_ PF_FROZEN just set in refrigerator(), so that suspend deadlocks. + * + * So, to make things clean, we require that those flags may be modified + * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE + * is just a kind of signal. + * + * It is not enough, because we are still not allowed to change tsk->flags + * in context of another process, we can corrupt another flags, when the process + * running on another cpu modifies them. So, we use TIF_FREEZE in thread flags, + * which can be changed atomically. + * + * PF_FROZEN also changes in context of another process, but this happens + * only when the process is already in refrigerator() which does not modify + * tsk->flags. + */ + +static int check_process_external(struct task_struct *p) +{ + if (pid_alive(p)) { + if (p->pids[PIDTYPE_PID].pid->level == 0) + return PIDTYPE_PID; + if (p->pids[PIDTYPE_PGID].pid->level == 0) + return PIDTYPE_PGID; + if (p->pids[PIDTYPE_SID].pid->level == 0) + return PIDTYPE_SID; + } + + return PIDTYPE_MAX; +} + +enum +{ + OBSTACLE_NOGO = -1, + OBSTACLE_TIMEOUT = -2, + OBSTACLE_TRYAGAIN = -3, +}; + +#define SUSPEND_TIMEOUT (10UL*HZ) + +static int vps_stop_tasks(struct cpt_context *ctx) +{ + unsigned long start_time = jiffies; + unsigned long target, timeout; + struct task_struct *p, *g; + int todo; + int round = 0; + + do_gettimespec(&ctx->start_time); + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup; + + read_lock(&tasklist_lock); + + atomic_inc(&get_exec_env()->suspend); + timeout = HZ/5; + target = jiffies + timeout; + + for(;;) { + struct task_struct *root; + todo = 0; + + root = find_task_by_vpid(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + atomic_dec(&get_exec_env()->suspend); + return -ESRCH; + } + + do_each_thread_ve(g, p) { + if (vps_child_level(root, p) >= 0) { + switch (check_process_external(p)) { + case PIDTYPE_PID: + eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", + task_pid_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + case PIDTYPE_PGID: + eprintk_ctx("external process group %d/%d(%s) inside CT " + "(e.g. vzctl enter or vzctl exec).\n", + task_pgrp_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + case PIDTYPE_SID: + eprintk_ctx("external process session %d/%d(%s) inside CT " + "(e.g. vzctl enter or vzctl exec).\n", + task_session_vnr(p), p->pid, p->comm); + todo = OBSTACLE_NOGO; + goto out; + } + if (p->vfork_done) { + /* Task between vfork()...exec() + * cannot be frozen, because parent + * wait in uninterruptible state. + * So, we do nothing, waiting for + * exec(), unless: + */ + if (p->state == TASK_STOPPED || + p->state == TASK_TRACED) { + eprintk_ctx("task " CPT_FID " is stopped while vfork(). " + "Checkpointing is impossible.\n", + CPT_TID(p)); + todo = OBSTACLE_NOGO; + /* It is fatal, _user_ stopped + * vfork()ing task, so that we + * cannot suspend now. 
+ */ + } else { + todo = OBSTACLE_TRYAGAIN; + } + goto out; + } + if (p->signal->group_exit_task && + p->signal->notify_count) { + /* exec() waits for threads' death */ + wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p)); + todo = OBSTACLE_TRYAGAIN; + goto out; + } + if (p->state == TASK_TRACED +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) + && !p->stopped_state +#endif + ) { + int ptrace_id = p->pn_state; + /* Debugger waits for signal. */ + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + case PN_STOP_ENTRY: + case PN_STOP_FORK: + case PN_STOP_VFORK: + case PN_STOP_SIGNAL: + case PN_STOP_EXIT: + case PN_STOP_LEAVE: + break; + default: + eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id); + todo = OBSTACLE_NOGO; + goto out; + } + } +#ifdef CONFIG_UTRACE + if (check_utrace(p, root, ctx)) { + eprintk_ctx("task " CPT_FID " is utraced. Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } +#endif + if (p->flags & PF_NOFREEZE) { + eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p)); + todo = OBSTACLE_NOGO; + goto out; + } + + if (!freezable(p)) + continue; + + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_FROZEN)) { + set_tsk_thread_flag(p, TIF_FREEZE); + signal_wake_up(p, 0); + } + spin_unlock_irq(&p->sighand->siglock); + + if (p->flags & PF_FROZEN) { + if (p->state != TASK_UNINTERRUPTIBLE) + printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p)); + continue; + } + + if (round == 10) + wprintk_ctx(CPT_FID " is running\n", CPT_TID(p)); + + todo++; + } else { + if (p != current) { + eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", + task_pid_vnr(p), task_pid_nr(p), p->comm); + todo = OBSTACLE_NOGO; + goto out; + } + } + } while_each_thread_ve(g, p); + + if (todo > 0) { + /* No visible obstacles, but VE did not freeze + * for timeout. Interrupt suspend, if it is major + * timeout or signal; if it is minor timeout + * we will wake VE and restart suspend. + */ + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) + || signal_pending(current)) + todo = OBSTACLE_TIMEOUT; + else if (time_after(jiffies, target)) + todo = OBSTACLE_TRYAGAIN; + } + +out: + if (todo < 0) { + atomic_dec(&get_exec_env()->suspend); + + wake_ve(ctx); + +#if 0 + /* This is sign of failure of printk(), which is not + * ours. So, no prefixes. */ + printk(">\n"); +#endif + } + + read_unlock(&tasklist_lock); + + if (!todo) { + atomic_dec(&get_exec_env()->suspend); + return 0; + } + + switch (todo) { + case OBSTACLE_NOGO: + eprintk_ctx("suspend is impossible now.\n"); + return -EAGAIN; + + case OBSTACLE_TIMEOUT: + eprintk_ctx("interrupted or timed out.\n"); + return -EINTR; + + case OBSTACLE_TRYAGAIN: + if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) || + signal_pending(current)) { + wprintk_ctx("suspend timed out\n"); + return -EAGAIN; + } + + wprintk_ctx("minor suspend timeout (%lu) expired, " + "trying again\n", timeout); + + /* Try again. VE is awake, give it some time to run. */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); + + /* After a short wait restart suspend + * with longer timeout */ + atomic_inc(&get_exec_env()->suspend); + timeout = min(timeout<<1, SUSPEND_TIMEOUT); + target = jiffies + timeout; + break; + + default: + if (round > 0) { + /* VE is partially frozen, give processes + * a chance to enter to refrigerator(). 
*/ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/20); + } else { + yield(); + } + } + + read_lock(&tasklist_lock); + round++; + } +} + +static int cpt_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int cpt_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + cpt_unlock_sockets(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->flags & PF_FROZEN) { + tsk->flags &= ~PF_FROZEN; + wake_up_process(tsk); + } else if (freezable(tsk)) { + eprintk_ctx("strange, %s not frozen\n", tsk->comm ); + } + spin_unlock_irq(&tsk->sighand->siglock); + put_task_struct(tsk); + } + + cpt_resume_network(ctx); + + cpt_unlock_ve(ctx); + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + return 0; +} + +int cpt_kill(struct cpt_context *ctx) +{ + int err = 0; + struct ve_struct *env; + cpt_object_t *obj; + struct task_struct *root_task = NULL; + long delay; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + /* from here cpt_kill succeeds */ + virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx); + + if (current->ve_task_info.owner_env == env) { + wprintk_ctx("attempt to kill ve from inside, escaping...\n"); + ve_move_task(current, get_ve0()); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) { + wait_for_completion(&ctx->pgin_notify); + put_task_struct(ctx->pgin_task); + ctx->pgin_task = NULL; + } +#endif + + cpt_kill_sockets(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk->exit_state) { + put_task_struct(tsk); + continue; + } + + if (task_pid_vnr(tsk) == 1) { + root_task = tsk; + continue; + } + + tsk->robust_list = NULL; +#ifdef CONFIG_COMPAT + tsk->compat_robust_list = NULL; +#endif + tsk->clear_child_tid = NULL; + + if (tsk->ptrace) { + write_lock_irq(&tasklist_lock); + tsk->ptrace = 0; + if (!list_empty(&tsk->ptrace_list)) { + list_del_init(&tsk->ptrace_list); + remove_parent(tsk); + tsk->parent = tsk->parent; + add_parent(tsk); + } + write_unlock_irq(&tasklist_lock); + } + + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + put_task_struct(tsk); + } + + yield(); + + if (root_task != NULL) { + send_sig(SIGKILL, root_task, 1); + + spin_lock_irq(&root_task->sighand->siglock); + sigfillset(&root_task->blocked); + sigdelsetmask(&root_task->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(root_task, TIF_SIGPENDING); + clear_tsk_thread_flag(root_task, TIF_FREEZE); + if (root_task->flags & PF_FROZEN) + root_task->flags &= ~PF_FROZEN; + spin_unlock_irq(&root_task->sighand->siglock); + + wake_up_process(root_task); + put_task_struct(root_task); + } + + cpt_finish_ubc(ctx); + cpt_object_destroy(ctx); + + delay = 1; + while (atomic_read(&env->counter) != 1) { + if (signal_pending(current)) + break; 
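	/* Illustrative note (assuming HZ == 1000): the sleep below doubles
	 * each pass (2, 4, 8, ... jiffies) and is capped at HZ, so the first
	 * re-checks come a few milliseconds after the kill and the
	 * steady-state cost is one wakeup per second until the container's
	 * reference counter drops to 1. */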
+ current->state = TASK_INTERRUPTIBLE; + delay = (delay < HZ) ? (delay << 1) : HZ; + schedule_timeout(delay); + } + put_ve(env); + + return err; +} + +#ifdef CONFIG_BEANCOUNTERS +static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = &(t->task_bc); + cpt_add_ubc(tbc->exec_ub, ctx); + cpt_add_ubc(tbc->task_ub, ctx); + cpt_add_ubc(tbc->fork_sub, ctx); +} +#else +static void inline collect_task_ubc(struct task_struct *t, + struct cpt_context *ctx) +{ return; } +#endif + +static cpt_object_t * remember_task(struct task_struct * child, + cpt_object_t * head, cpt_context_t * ctx) +{ + cpt_object_t *cobj; + + if (freezable(child) && !(child->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child)); + put_task_struct(child); + return NULL; + } + + if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG(); + if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(child); + return NULL; + } + cobj->o_count = 1; + cpt_obj_setobj(cobj, child, ctx); + insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx); + collect_task_ubc(child, ctx); + return cobj; +} + +static int vps_collect_tasks(struct cpt_context *ctx) +{ + int err = -ESRCH; + cpt_object_t *obj; + struct task_struct *root; + read_lock(&tasklist_lock); + root = find_task_by_vpid(1); + if (root) + get_task_struct(root); + read_unlock(&tasklist_lock); + + if (!root) { + err = -ESRCH; + eprintk_ctx("vps_collect_tasks: cannot find root\n"); + goto out; + } + + if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { + put_task_struct(root); + return -ENOMEM; + } + obj->o_count = 1; + cpt_obj_setobj(obj, root, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + collect_task_ubc(root, ctx); + + /* Collect process subtree recursively */ + for_each_object(obj, CPT_OBJ_TASK) { + cpt_object_t *head = obj; + struct task_struct *tsk = obj->o_obj; + struct task_struct *child; + + if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) { + eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk)); + err = -EINVAL; + goto out; + } + + if (tsk->state == TASK_RUNNING) + printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk)); + + wait_task_inactive(tsk); + + err = check_task_state(tsk, ctx); + if (err) + goto out; + + if (tsk->pid == tsk->tgid) { + child = tsk; + for (;;) { + read_lock(&tasklist_lock); + child = next_thread(child); + if (child != tsk) + get_task_struct(child); + read_unlock(&tasklist_lock); + + if (child == tsk) + break; + + if (child->parent != tsk->parent) { + put_task_struct(child); + eprintk_ctx("illegal thread structure, kernel bug\n"); + err = -EINVAL; + goto out; + } + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + } + } + + /* About locking. VE is frozen. But lists of children + * may change at least for init, when entered task reparents + * to init and when reparented task exits. If we take care + * of this case, we still can unlock while scanning + * tasklists. 
+ */ + read_lock(&tasklist_lock); + list_for_each_entry(child, &tsk->children, sibling) { + if (child->parent != tsk) + continue; + if (child->pid != child->tgid) + continue; + get_task_struct(child); + read_unlock(&tasklist_lock); + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + + read_lock(&tasklist_lock); + } + + list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) { + if (child->parent != tsk) + continue; + if (child->pid != child->tgid) + continue; + get_task_struct(child); + read_unlock(&tasklist_lock); + + if ((head = remember_task(child, head, ctx)) == NULL) { + eprintk_ctx("task obj allocation failure\n"); + err = -ENOMEM; + goto out; + } + + read_lock(&tasklist_lock); + } + read_unlock(&tasklist_lock); + } + + return 0; + +out: + while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) { + struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next; + cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); + struct task_struct *tsk; + + list_del(head); + tsk = obj->o_obj; + put_task_struct(tsk); + free_cpt_object(obj, ctx); + } + return err; +} + +static int cpt_collect(struct cpt_context *ctx) +{ + int err; + + if ((err = cpt_collect_mm(ctx)) != 0) + return err; + + if ((err = cpt_collect_sysv(ctx)) != 0) + return err; + + if ((err = cpt_collect_files(ctx)) != 0) + return err; + + if ((err = cpt_collect_fs(ctx)) != 0) + return err; + + if ((err = cpt_collect_namespace(ctx)) != 0) + return err; + + if ((err = cpt_collect_signals(ctx)) != 0) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL) + return -ECHRNG; + + return 0; +} + +static int cpt_dump_veinfo(cpt_context_t *ctx) +{ + struct cpt_veinfo_image *i = cpt_get_buf(ctx); + struct ve_struct *ve; + struct timespec delta; + struct ipc_namespace *ns; + + cpt_open_section(ctx, CPT_SECT_VEINFO); + cpt_open_object(NULL, ctx); + + memset(i, 0, sizeof(*i)); + + i->cpt_next = CPT_NULL; + i->cpt_object = CPT_OBJ_VEINFO; + i->cpt_hdrlen = sizeof(*i); + i->cpt_content = CPT_CONTENT_VOID; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + if (ns->shm_ctlall > 0xFFFFFFFFU) + i->shm_ctl_all = 0xFFFFFFFFU; + if (ns->shm_ctlmax > 0xFFFFFFFFU) + i->shm_ctl_max = 0xFFFFFFFFU; + i->shm_ctl_mni = ns->shm_ctlmni; + + i->msg_ctl_max = ns->msg_ctlmax; + i->msg_ctl_mni = ns->msg_ctlmni; + i->msg_ctl_mnb = ns->msg_ctlmnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); + i->sem_ctl_arr[0] = ns->sem_ctls[0]; + i->sem_ctl_arr[1] = ns->sem_ctls[1]; + i->sem_ctl_arr[2] = ns->sem_ctls[2]; + i->sem_ctl_arr[3] = ns->sem_ctls[3]; + + do_posix_clock_monotonic_gettime(&delta); + _set_normalized_timespec(&delta, + delta.tv_sec - ve->start_timespec.tv_sec, + delta.tv_nsec - ve->start_timespec.tv_nsec); + i->start_timespec_delta = cpt_timespec_export(&delta); + i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; + + i->last_pid = ve->ve_ns->pid_ns->last_pid; + + ctx->write(i, sizeof(*i), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + return 0; +} + +static int cpt_dump_utsname(cpt_context_t *ctx) +{ + int len; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + + cpt_open_section(ctx, CPT_SECT_UTSNAME); + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + cpt_open_object(NULL, ctx); + len = strlen(ns->name.nodename); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + 
o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(ns->name.nodename, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + + cpt_open_object(NULL, ctx); + len = strlen(ns->name.domainname); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(ns->name.domainname, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + + cpt_close_section(ctx); + return 0; +} + +#ifndef CONFIG_IA64 +static int cpt_dump_vsyscall(cpt_context_t *ctx) +{ + struct cpt_page_block *pgb = cpt_get_buf(ctx); + + cpt_open_section(ctx, CPT_SECT_VSYSCALL); + cpt_open_object(NULL, ctx); + + pgb->cpt_next = CPT_NULL; + pgb->cpt_object = CPT_OBJ_VSYSCALL; + pgb->cpt_hdrlen = sizeof(*pgb); + pgb->cpt_content = CPT_CONTENT_DATA; + pgb->cpt_start = cpt_ptr_export(vsyscall_addr); + pgb->cpt_end = pgb->cpt_start + PAGE_SIZE; + + ctx->write(pgb, sizeof(*pgb), ctx); + cpt_release_buf(ctx); + + ctx->write(vsyscall_addr, PAGE_SIZE, ctx); + + cpt_close_object(ctx); + cpt_close_section(ctx); + return 0; +} +#endif + +int cpt_dump(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err, err2 = 0; + + if (!ctx->ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + + down_read(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + if (!env->is_locked) + goto out_noenv; + err = -EINVAL; + if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) { + printk(KERN_WARNING "CT: checkpointing not supported yet" + " for hidden pid namespaces.\n"); + goto out_noenv; + } + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 2: real checkpointing */ + err = cpt_open_dumpfile(ctx); + if (err) + goto out; + + cpt_major_hdr_out(ctx); + + if (!err) + err = cpt_dump_veinfo(ctx); + if (!err) + err = cpt_dump_ubc(ctx); + if (!err) + err = cpt_dump_files(ctx); + if (!err) + err = cpt_dump_files_struct(ctx); + if (!err) + err = cpt_dump_fs_struct(ctx); + /* netdevices should be dumped after dumping open files + as we need to restore netdevice binding to /dev/net/tun file */ + if (!err) + err = cpt_dump_ifinfo(ctx); + if (!err) + err = cpt_dump_namespace(ctx); + if (!err) + err = cpt_dump_sighand(ctx); + if (!err) + err = cpt_dump_vm(ctx); + if (!err) + err = cpt_dump_sysvsem(ctx); + if (!err) + err = cpt_dump_sysvmsg(ctx); + if (!err) + err = cpt_dump_tasks(ctx); + if (!err) + err = cpt_dump_orphaned_sockets(ctx); +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + if (!err) + err = cpt_dump_ip_conntrack(ctx); +#endif + if (!err) { + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL) + err = -ECHRNG; + } + if (!err) + err = cpt_dump_utsname(ctx); + +#ifndef CONFIG_IA64 + if (!err) + err = cpt_dump_vsyscall(ctx); +#endif + + if (!err) + err = cpt_dump_tail(ctx); + + err2 = cpt_close_dumpfile(ctx); + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); +out_noenv: + up_read(&env->op_sem); + put_ve(env); + return err ? 
: err2; +} + +int cpt_vps_suspend(struct cpt_context *ctx) +{ + struct ve_struct *oldenv, *env; + struct nsproxy *old_ns; + int err = 0; + + ctx->kernel_config_flags = test_kernel_config(); + cpt_object_init(ctx); + + if (!ctx->ve_id) { + env = get_exec_env(); + if (env == get_ve0()) + return -EINVAL; + wprintk("undefined ve_id\n"); + ctx->ve_id = env->veid; + get_ve(env); + } else { + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + } + +#ifdef CONFIG_VE_IPTABLES + ctx->iptables_mask = env->_iptables_modules; +#endif + ctx->features = env->features; + + down_write(&env->op_sem); + err = -ESRCH; + if (!env->is_running) + goto out_noenv; + + err = -EBUSY; + if (env->is_locked) + goto out_noenv; + env->is_locked = 1; + downgrade_write(&env->op_sem); + + oldenv = set_exec_env(env); + old_ns = current->nsproxy; + current->nsproxy = env->ve_ns; + + /* Phase 0: find and stop all the tasks */ + if ((err = vps_stop_tasks(ctx)) != 0) + goto out; + + if ((err = cpt_suspend_network(ctx)) != 0) + goto out_wake; + + /* At the moment all the state is frozen. We do not need to lock + * the state, which can be changed only if the tasks are running. + */ + + /* Phase 1: collect task tree */ + if ((err = vps_collect_tasks(ctx)) != 0) + goto out_wake; + + /* Phase 1': collect all the resources */ + if ((err = cpt_collect(ctx)) != 0) + goto out; + +out: + current->nsproxy = old_ns; + set_exec_env(oldenv); + up_read(&env->op_sem); + put_ve(env); + return err; + +out_noenv: + up_write(&env->op_sem); + put_ve(env); + return err; + +out_wake: + read_lock(&tasklist_lock); + wake_ve(ctx); + read_unlock(&tasklist_lock); + goto out; +} + +static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps) +{ + struct net *net = get_exec_env()->ve_ns->net_ns; + struct net_device *dev; + + read_lock(&dev_base_lock); + for_each_netdev(net, dev) { + if (dev != net->loopback_dev +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + && dev != get_exec_env()->_venet_dev +#endif +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + && dev->open != tun_net_open +#endif + ) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + *caps |= (1<flags & _TIF_IA32)) + *caps |= flags & ((1<mm && p->mm->context.vdso) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + *caps |= flags & (1<mm && p->mm->context.vdso) + *caps |= flags & (1<= 0) { + switch (check_process_external(p)) { + case PIDTYPE_PID: + eprintk_ctx("external process %d/%d(%s) inside CT (e.g. 
vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<pid, p->comm); + *caps |= (1<nsproxy) { + ns = p->nsproxy->mnt_ns; + if (ns) + get_mnt_ns(ns); + } + task_unlock(p); + if (ns) { + if (ns != current->nsproxy->mnt_ns) { + eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<policy != SCHED_NORMAL) { + eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); + *caps |= (1<pid, virt_pid(p), p->comm); + *caps |= (1<list) { + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + + path = __d_path(mnt->mnt_root, mnt, + env->fs_root, env->fs_rootmnt, + path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name); + *caps |= (1<ve_id) + return -EINVAL; + + env = get_ve_by_id(ctx->ve_id); + if (env == NULL) + return -ESRCH; + + *caps = flags & (1<nsproxy; + current->nsproxy = env->ve_ns; + + check_unsupported_netdevices(ctx, caps); + + read_lock(&tasklist_lock); + root = find_task_by_vpid(1); + if (!root) { + read_unlock(&tasklist_lock); + eprintk_ctx("cannot find ve init\n"); + err = -ESRCH; + goto out; + } + get_task_struct(root); + for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) + check_one_process(ctx, caps, flags, env, root, p); + read_unlock(&tasklist_lock); + + task_lock(root); + n = NULL; + if (root->nsproxy) { + n = root->nsproxy->mnt_ns; + if (n) + get_mnt_ns(n); + } + task_unlock(root); + if (n) { + char *path_buf; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) { + put_mnt_ns(n); + err = -ENOMEM; + goto out_root; + } + + check_unsupported_mounts(ctx, caps, env, n, path_buf); + + free_page((unsigned long) path_buf); + put_mnt_ns(n); + } + + err = 0; + +out_root: + put_task_struct(root); +out: + current->nsproxy = old_ns; + set_exec_env(old_env); + put_ve(env); + + return err; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_dump.h linux-2.6.24.ovz/kernel/cpt/cpt_dump.h --- linux-2.6.24/kernel/cpt/cpt_dump.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_dump.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,16 @@ +int cpt_dump(struct cpt_context *cpt); +int rst_undump(struct cpt_context *cpt); +int cpt_suspend(struct cpt_context *cpt); +int cpt_resume(struct cpt_context *cpt); +int cpt_kill(struct cpt_context *cpt); +int rst_clean(struct cpt_context *cpt); +int rst_resume(struct cpt_context *cpt); +int rst_kill(struct cpt_context *cpt); + +int cpt_freeze_one(pid_t pid, int freeze); +int cpt_vps_suspend(struct cpt_context *ctx); +int vps_rst_undump(struct cpt_context *ctx); + +int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps); + +int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx); diff -uprN linux-2.6.24/kernel/cpt/cpt_epoll.c linux-2.6.24.ovz/kernel/cpt/cpt_epoll.c --- linux-2.6.24/kernel/cpt/cpt_epoll.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_epoll.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,115 @@ +/* + * + * kernel/cpt/cpt_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +extern struct file_operations eventpoll_fops; + +int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) +{ + int err = 0; + struct file *file = obj->o_obj; + struct eventpoll *ep; + struct rb_node *rbp; + struct cpt_epoll_image ei; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + /* eventpoll.c does not protect open /proc/N/fd, silly. + * Opener will get an invalid file with uninitialized private_data + */ + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + ei.cpt_next = CPT_NULL; + ei.cpt_object = CPT_OBJ_EPOLL; + ei.cpt_hdrlen = sizeof(ei); + ei.cpt_content = CPT_CONTENT_ARRAY; + ei.cpt_file = obj->o_pos; + + ctx->write(&ei, sizeof(ei), ctx); + + mutex_lock(&epmutex); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + loff_t saved_obj; + cpt_object_t *tobj; + struct cpt_epoll_file_image efi; + struct epitem *epi; + epi = rb_entry(rbp, struct epitem, rbn); + tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); + if (tobj == NULL) { + eprintk_ctx("epoll device refers to an external file\n"); + err = -EBUSY; + break; + } + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + efi.cpt_next = CPT_NULL; + efi.cpt_object = CPT_OBJ_EPOLL_FILE; + efi.cpt_hdrlen = sizeof(efi); + efi.cpt_content = CPT_CONTENT_VOID; + efi.cpt_file = tobj->o_pos; + efi.cpt_fd = epi->ffd.fd; + efi.cpt_events = epi->event.events; + efi.cpt_data = epi->event.data; + efi.cpt_revents = 0; + efi.cpt_ready = 0; + if (!list_empty(&epi->rdllink)) + efi.cpt_ready = 1; + + ctx->write(&efi, sizeof(efi), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&epmutex); + + cpt_close_object(ctx); + + return err; +} + diff -uprN linux-2.6.24/kernel/cpt/cpt_exports.c linux-2.6.24.ovz/kernel/cpt/cpt_exports.c --- linux-2.6.24/kernel/cpt/cpt_exports.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_exports.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,13 @@ +#include +#include + +#include "cpt_obj.h" + +EXPORT_SYMBOL(alloc_cpt_object); +EXPORT_SYMBOL(intern_cpt_object); +EXPORT_SYMBOL(insert_cpt_object); +EXPORT_SYMBOL(__cpt_object_add); +EXPORT_SYMBOL(cpt_object_add); +EXPORT_SYMBOL(cpt_object_get); +EXPORT_SYMBOL(lookup_cpt_object); +EXPORT_SYMBOL(lookup_cpt_obj_bypos); diff -uprN linux-2.6.24/kernel/cpt/cpt_files.c linux-2.6.24.ovz/kernel/cpt/cpt_files.c --- linux-2.6.24/kernel/cpt/cpt_files.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_files.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1614 @@ +/* + * + * kernel/cpt/cpt_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) +{ + char *path; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) + return; + + path = d_path(d, mnt, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) + eprintk("<%s>", path); + free_page(pg); +} + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + cpt_context_t *ctx) +{ + if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) { + struct nameidata nd; + if (path_lookup(path, 0, &nd)) { + eprintk_ctx("d_path cannot be looked up %s\n", path); + return -EINVAL; + } + if (nd.dentry != d || nd.mnt != mnt) { + eprintk_ctx("d_path is invisible %s\n", path); + path_release(&nd); + return -EINVAL; + } + path_release(&nd); + } + return 0; +} + +static int +cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx) +{ + int result = 0; + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) + char *path; + unsigned long pg; + struct dentry * renamed_dentry; + + if (de->d_sb->s_magic != FSMAGIC_VEFS) + return 0; + if (de->d_inode->i_nlink != 0 || + atomic_read(&de->d_inode->i_writecount) > 0) + return 0; + + renamed_dentry = vefs_replaced_dentry(de); + if (renamed_dentry == NULL) + return 0; + + pg = __get_free_page(GFP_KERNEL); + if (!pg) + return 0; + + path = d_path(de, mnt, (char *)pg, PAGE_SIZE); + if (!IS_ERR(path)) { + int len; + struct nameidata nd; + + len = pg + PAGE_SIZE - 1 - (unsigned long)path; + if (len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + + if (path_lookup(path, 0, &nd) == 0) { + if (mnt == nd.mnt && + vefs_is_renamed_dentry(nd.dentry, renamed_dentry)) + result = 1; + path_release(&nd); + } + } + free_page(pg); +#endif + return result; +} + +static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, + int replaced, cpt_context_t *ctx) +{ + int len; + char *path; + char *pg = cpt_get_buf(ctx); + loff_t saved; + + path = d_path(d, mnt, pg, PAGE_SIZE); + len = PTR_ERR(path); + + if (IS_ERR(path)) { + struct cpt_object_hdr o; + char tmp[1]; + + /* VZ changes d_path() to return EINVAL, when path + * is not supposed to be visible inside VE. + * This changes behaviour of d_path() comparing + * to mainstream kernel, f.e. d_path() fails + * on any kind of shared memory. Maybe, there are + * another cases, but I am aware only about this one. + * So, we just ignore error on shmem mounts and proceed. + * Otherwise, checkpointing is prohibited because + * of reference to an invisible file. 
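The check cpt_verify_overmount() performs with path_lookup(), namely "does the name we are about to record still resolve to the same object, or has it been replaced or hidden by an overmount", has a rough user-space analogue: re-resolve the saved name and compare device and inode numbers. A sketch under that assumption (error handling trimmed, paths arbitrary):

#include <stdio.h>
#include <sys/stat.h>

/* Re-resolve a path recorded earlier and confirm it still refers to
 * the same object, i.e. it has not been unlinked, replaced or hidden
 * by an overmount.  Returns 1 if it still matches. */
static int path_still_matches(const char *path, dev_t dev, ino_t ino)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return 0;                /* name no longer resolves */
	return st.st_dev == dev && st.st_ino == ino;
}

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || stat(argv[1], &st) != 0)
		return 1;
	/* Pretend time passes between "dump" and "verify"... */
	printf("%s still matches: %d\n", argv[1],
	       path_still_matches(argv[1], st.st_dev, st.st_ino));
	return 0;
}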
+ */ + if (len != -EINVAL || + mnt != get_exec_env()->shmem_mnt) + eprintk_ctx("d_path err=%d\n", len); + else + len = 0; + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + tmp[0] = 0; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(tmp, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + + __cpt_release_buf(ctx); + return len; + } else { + struct cpt_object_hdr o; + + len = pg + PAGE_SIZE - 1 - path; + if (replaced && + len >= sizeof("(deleted) ") - 1 && + !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { + len -= sizeof("(deleted) ") - 1; + path += sizeof("(deleted) ") - 1; + } + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + path[len] = 0; + + if (cpt_verify_overmount(path, d, mnt, ctx)) { + __cpt_release_buf(ctx); + return -EINVAL; + } + + cpt_push_object(&saved, ctx); + cpt_open_object(NULL, ctx); + ctx->write(&o, sizeof(o), ctx); + ctx->write(path, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved, ctx); + __cpt_release_buf(ctx); + } + return 0; +} + +int cpt_dump_string(const char *s, struct cpt_context *ctx) +{ + int len; + struct cpt_object_hdr o; + + cpt_open_object(NULL, ctx); + len = strlen(s); + o.cpt_next = CPT_NULL; + o.cpt_object = CPT_OBJ_NAME; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&o, sizeof(o), ctx); + ctx->write(s, len+1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +static int +cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx) +{ + return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, ctx); +} + +int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err; + struct cpt_inode_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_INODE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { + cpt_release_buf(ctx); + return err; + } + + v->cpt_dev = d->d_inode->i_sb->s_dev; + v->cpt_ino = d->d_inode->i_ino; + v->cpt_mode = sbuf.mode; + v->cpt_nlink = sbuf.nlink; + v->cpt_uid = sbuf.uid; + v->cpt_gid = sbuf.gid; + v->cpt_rdev = d->d_inode->i_rdev; + v->cpt_size = sbuf.size; + v->cpt_atime = cpt_timespec_export(&sbuf.atime); + v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); + v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); + v->cpt_blksize = sbuf.blksize; + v->cpt_blocks = sbuf.blocks; + v->cpt_sb = d->d_inode->i_sb->s_magic; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + +int cpt_collect_files(cpt_context_t * ctx) +{ + int err; + cpt_object_t *obj; + int index = 0; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) + return -ENOMEM; + } + + /* Collect files from fd sets */ + for_each_object(obj, CPT_OBJ_FILES) { + int fd; + struct files_struct *f = obj->o_obj; + + cpt_obj_setindex(obj, index++, ctx); + + if (obj->o_count != atomic_read(&f->count)) { + eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); + return -EBUSY; + } + + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file && 
cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) + return -ENOMEM; + } + } + + /* Collect files queued by AF_UNIX sockets. */ + if ((err = cpt_collect_passedfds(ctx)) < 0) + return err; + + /* OK. At this point we should count all the references. */ + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct file *parent; + cpt_object_t *ino_obj; + + if (obj->o_count != atomic_read(&file->f_count)) { + eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count)); + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + return -EBUSY; + } + + switch (file->f_dentry->d_inode->i_sb->s_magic) { + case FSMAGIC_FUTEX: + case FSMAGIC_MQUEUE: + case FSMAGIC_BDEV: +#ifndef CONFIG_INOTIFY_USER + case FSMAGIC_INOTIFY: +#endif + eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); + return -EBUSY; + } + + /* Collect inode. It is necessary mostly to resolve deleted + * hard links. */ + ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (ino_obj == NULL) + return -ENOMEM; + + parent = ino_obj->o_parent; + if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) + ino_obj->o_parent = file; + + if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { + int maj = imajor(file->f_dentry->d_inode); + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + err = cpt_collect_tty(file, ctx); + if (err) + return err; + } + } + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + err = cpt_collect_socket(file, ctx); + if (err) + return err; + } + } + + err = cpt_index_sockets(ctx); + + return err; +} + +/* /dev/ptmx is special, all the files share one inode, but real tty backend + * is attached via file->private_data. 
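For reference, the is_cloning_inode() test that follows matches character device 5:2, i.e. /dev/ptmx. A small user-space equivalent of the same major/minor check, using the standard makedev()/major()/minor() macros:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

/* /dev/ptmx is character device 5:2; every open of it is "cloned"
 * into a fresh pty master, so the inode alone does not say which
 * tty a given struct file is really attached to. */
static int is_ptmx(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return 0;
	return S_ISCHR(st.st_mode) &&
	       major(st.st_rdev) == 5 && minor(st.st_rdev) == 2;
}

int main(void)
{
	printf("/dev/ptmx -> %d\n", is_ptmx("/dev/ptmx"));
	printf("/dev/null -> %d\n", is_ptmx("/dev/null"));
	return 0;
}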
+ */ + +static inline int is_cloning_inode(struct inode *ino) +{ + return S_ISCHR(ino->i_mode) && + ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); +} + +static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) +{ + pid_t pid; + struct cpt_flock_image *v = cpt_get_buf(ctx); + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_FLOCK; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_owner = owner; + + pid = fl->fl_pid; + if (pid) { + pid = pid_to_vpid(fl->fl_pid); + if (pid == -1) { + if (!(fl->fl_flags&FL_FLOCK)) { + eprintk_ctx("posix lock from another container?\n"); + cpt_release_buf(ctx); + return -EBUSY; + } + pid = 0; + } + } + + v->cpt_pid = pid; + v->cpt_start = fl->fl_start; + v->cpt_end = fl->fl_end; + v->cpt_flags = fl->fl_flags; + v->cpt_type = fl->fl_type; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + + +int cpt_dump_flock(struct file *file, struct cpt_context *ctx) +{ + int err = 0; + struct file_lock *fl; + + lock_kernel(); + for (fl = file->f_dentry->d_inode->i_flock; + fl; fl = fl->fl_next) { + if (file != fl->fl_file) + continue; + if (fl->fl_flags & FL_LEASE) { + eprintk_ctx("lease lock is not supported\n"); + err = -EINVAL; + break; + } + if (fl->fl_flags & FL_POSIX) { + cpt_object_t *obj; + obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); + if (obj) { + dump_one_flock(fl, obj->o_index, ctx); + continue; + } else { + eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); + err = -EINVAL; + } + } + if (fl->fl_flags & FL_FLOCK) { + dump_one_flock(fl, -1, ctx); + continue; + } + } + unlock_kernel(); + return err; +} + +static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) +{ + int err = 0; + cpt_object_t *iobj; + struct cpt_file_image *v = cpt_get_buf(ctx); + struct kstat sbuf; + int replaced = 0; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILE; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_flags = file->f_flags; + v->cpt_mode = file->f_mode; + v->cpt_pos = file->f_pos; + v->cpt_uid = file->f_uid; + v->cpt_gid = file->f_gid; + + vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); + + v->cpt_i_mode = sbuf.mode; + v->cpt_lflags = 0; + if (IS_ROOT(file->f_dentry)) + v->cpt_lflags |= CPT_DENTRY_ROOT; + else if (d_unhashed(file->f_dentry)) { + if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) { + v->cpt_lflags |= CPT_DENTRY_REPLACED; + replaced = 1; + } else { + v->cpt_lflags |= CPT_DENTRY_DELETED; + } + } + if (is_cloning_inode(file->f_dentry->d_inode)) + v->cpt_lflags |= CPT_DENTRY_CLONING; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) + v->cpt_lflags |= CPT_DENTRY_PROC; + v->cpt_inode = CPT_NULL; + if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) { + iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (iobj) + v->cpt_inode = iobj->o_pos; + } + v->cpt_priv = CPT_NULL; + v->cpt_fown_fd = -1; + if (S_ISCHR(v->cpt_i_mode)) { + iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); + if (iobj) { + v->cpt_priv = iobj->o_pos; + if (file->f_flags&FASYNC) + v->cpt_fown_fd = cpt_tty_fasync(file, ctx); + } +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + if (file->f_op && file->f_op->open == tun_chr_open) + v->cpt_lflags |= CPT_DENTRY_TUNTAP; +#endif + } + if (S_ISSOCK(v->cpt_i_mode)) { + if (obj->o_index < 0) { + eprintk_ctx("BUG: no socket index\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_priv = obj->o_index; + if 
(file->f_flags&FASYNC) + v->cpt_fown_fd = cpt_socket_fasync(file, ctx); + } + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { + v->cpt_priv = file->f_dentry->d_inode->i_ino; + v->cpt_lflags |= CPT_DENTRY_EPOLL; + } + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { + v->cpt_priv = file->f_dentry->d_inode->i_ino; + v->cpt_lflags |= CPT_DENTRY_INOTIFY; + } + + v->cpt_fown_pid = (file->f_owner.pid == NULL ? + CPT_FOWN_STRAY_PID : pid_vnr(file->f_owner.pid)); + v->cpt_fown_uid = file->f_owner.uid; + v->cpt_fown_euid = file->f_owner.euid; + v->cpt_fown_signo = file->f_owner.signum; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (!S_ISSOCK(v->cpt_i_mode)) { + err = cpt_dump_filename(file, replaced, ctx); + if (err) + return err; + if ((file->f_mode & FMODE_WRITE) && + file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS) + vefs_track_notify(file->f_dentry, 1); + } + + if (file->f_dentry->d_inode->i_flock) + err = cpt_dump_flock(file, ctx); + + cpt_close_object(ctx); + + return err; +} + +/* About this weird function... Crappy code dealing with SYSV shared memory + * defines TMPFS inode and file with f_op doing only mmap. So... + * Maybe, this is wrong and leaks something. It is clear access to + * SYSV shmem via mmap is quite unusual and impossible from user space. + */ +static int dump_content_shm(struct file *file, struct cpt_context *ctx) +{ + struct cpt_obj_bits *v; + loff_t saved_pos; + unsigned long addr; + + addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size, + PROT_READ, MAP_SHARED, 0); + if (IS_ERR((void*)addr)) + return PTR_ERR((void*)addr); + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v = cpt_get_buf(ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = file->f_dentry->d_inode->i_size; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx); + ctx->align(ctx); + do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size); + + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + return 0; +} + +static int data_is_zero(char *addr, int len) +{ + int i; + unsigned long zerolong = 0; + + for (i=0; if_op == NULL) + return -EINVAL; + + do_read = file->f_op->read; + if (file->f_op == &shmem_file_operations) { + do_read = file->f_dentry->d_inode->i_fop->read; + cpt_dump_content_sysvshm(file, ctx); + if (!do_read) { + wprintk_ctx("TMPFS is not configured?\n"); + return dump_content_shm(file, ctx); + } + } + + if (!(file->f_mode & FMODE_READ) || + (file->f_flags & O_DIRECT)) { + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), O_RDONLY); + if (IS_ERR(file)) { + cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); + eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } + } else { + atomic_inc(&file->f_count); + } + + for (;;) { + mm_segment_t oldfs; + int err; + + (void)cpt_get_buf(ctx); + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos); + set_fs(oldfs); + if (err < 0) { + eprintk_ctx("dump_content_regular: do_read: %d", err); + fput(file); + __cpt_release_buf(ctx); + return err; + } + if (err == 0) { + __cpt_release_buf(ctx); + break; + } + if (data_is_zero(ctx->tmpbuf, err)) { + if (obj_opened != CPT_NULL) { + ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); + ctx->align(ctx); + cpt_close_object(ctx); + 
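	/* The ctx->pwrite() just above is the backpatch pattern used
	 * throughout this file: a header is first emitted with a
	 * provisional value, the payload is streamed with ctx->write(),
	 * and the final end offset is then written back into the header
	 * at the file position remembered when the object was opened
	 * (obj_opened here). */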
cpt_pop_object(&saved_pos, ctx); + obj_opened = CPT_NULL; + } + } else { + if (obj_opened == CPT_NULL) { + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + obj_opened = ctx->file->f_pos; + pgb.cpt_next = CPT_NULL; + pgb.cpt_object = CPT_OBJ_PAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_DATA; + pgb.cpt_start = pos - err; + pgb.cpt_end = pgb.cpt_start; + ctx->write(&pgb, sizeof(pgb), ctx); + } + ctx->write(ctx->tmpbuf, err, ctx); + pgb.cpt_end += err; + } + __cpt_release_buf(ctx); + } + + fput(file); + + if (obj_opened != CPT_NULL) { + ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + obj_opened = CPT_NULL; + } + return 0; +} + + +static int dump_content_chrdev(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + int maj; + + maj = imajor(ino); + if (maj == MEM_MAJOR) { + /* Well, OK. */ + return 0; + } + if (maj == PTY_MASTER_MAJOR || + (maj >= UNIX98_PTY_MASTER_MAJOR && + maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || + maj == PTY_SLAVE_MAJOR || + maj == UNIX98_PTY_SLAVE_MAJOR || + maj == TTYAUX_MAJOR) { + return cpt_dump_content_tty(file, ctx); + } +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + if (file->f_op && file->f_op->open == tun_chr_open) + return 0; +#endif + eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino)); + return -EINVAL; +} + +static int dump_content_blkdev(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + + /* We are not going to transfer them. */ + eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino)); + return -EINVAL; +} + +static int dump_content_fifo(struct file *file, struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + cpt_object_t *obj; + loff_t saved_pos; + int readers; + int writers; + int anon = 0; + + mutex_lock(&ino->i_mutex); + readers = ino->i_pipe->readers; + writers = ino->i_pipe->writers; + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file1 = obj->o_obj; + if (file1->f_dentry->d_inode == ino) { + if (file1->f_mode & FMODE_READ) + readers--; + if (file1->f_mode & FMODE_WRITE) + writers--; + } + } + mutex_unlock(&ino->i_mutex); + if (readers || writers) { + struct dentry *dr = file->f_dentry->d_sb->s_root; + if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0) + anon = 1; + + if (anon) { + eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers); + return -EBUSY; + } + /* If fifo has external readers/writers, we are in troubles. + * If the buffer is not empty, we must move its content. + * But if the fifo is owned by a service, we cannot do + * this. See? + * + * For now we assume, that if fifo is opened by another + * process, we do not own it and, hence, migrate without + * data. + */ + return 0; + } + + /* OK, we must save fifo state. No semaphores required. 
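The code below walks the pipe's circular buffer array twice, once to size the payload and once to copy it. The index arithmetic can be modelled in user space as follows; the ring size matches 2.6.24's PIPE_BUFFERS, and the buffer contents are invented for the sketch:

#include <stdio.h>
#include <string.h>

#define PIPE_BUFFERS 16          /* same power-of-two ring size as 2.6.24 */

struct pbuf { const char *data; int len; };

int main(void)
{
	/* Three occupied slots that wrap around the end of the ring. */
	struct pbuf bufs[PIPE_BUFFERS] = {
		[14] = { "hello ", 6 },
		[15] = { "pipe ",  5 },
		[0]  = { "world",  5 },
	};
	int curbuf = 14, nrbufs = 3;
	int buf, count = 0;
	char out[64];

	/* Pass 1: total payload size (what v->cpt_size records). */
	for (buf = curbuf; nrbufs > 0;
	     nrbufs--, buf = (buf + 1) & (PIPE_BUFFERS - 1))
		count += bufs[buf].len;
	printf("payload: %d bytes\n", count);

	/* Pass 2: copy the bytes out in FIFO order (what ctx->write streams). */
	nrbufs = 3;
	count = 0;
	for (buf = curbuf; nrbufs > 0;
	     nrbufs--, buf = (buf + 1) & (PIPE_BUFFERS - 1)) {
		memcpy(out + count, bufs[buf].data, bufs[buf].len);
		count += bufs[buf].len;
	}
	out[count] = '\0';
	printf("contents: \"%s\"\n", out);
	return 0;
}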
*/ + + if (ino->i_pipe->nrbufs) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + struct pipe_inode_info *info; + int count, buf, nrbufs; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + if (!info->bufs[buf].ops->can_merge) { + mutex_unlock(&ino->i_mutex); + eprintk_ctx("unknown format of pipe buffer\n"); + return -EINVAL; + } + count += info->bufs[buf].len; + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + if (!count) { + mutex_unlock(&ino->i_mutex); + return 0; + } + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = count; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + count = 0; + buf = info->curbuf; + nrbufs = info->nrbufs; + while (--nrbufs >= 0) { + struct pipe_buffer *b = info->bufs + buf; + /* need to ->pin first? */ + void * addr = b->ops->map(info, b, 0); + ctx->write(addr + b->offset, b->len, ctx); + b->ops->unmap(info, b, addr); + buf = (buf+1) & (PIPE_BUFFERS-1); + } + + mutex_unlock(&ino->i_mutex); + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + return 0; +} + +static int dump_content_socket(struct file *file, struct cpt_context *ctx) +{ + return 0; +} + +struct cpt_dirent { + unsigned long ino; + char *name; + int namelen; + int found; +}; + +static int cpt_filldir(void * __buf, const char * name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct cpt_dirent * dirent = __buf; + + if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) { + memcpy(dirent->name, name, namelen); + dirent->name[namelen] = '\0'; + dirent->namelen = namelen; + dirent->found = 1; + return 1; + } + return 0; +} + +static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt, + struct inode *ino, struct cpt_context *ctx) +{ + int err = -EBUSY; + struct file *f = NULL; + struct cpt_dirent entry; + struct dentry *de, *found = NULL; + + dprintk_ctx("deleted reference to existing inode, try to find file\n"); + /* 1. Try to find not deleted dentry in ino->i_dentry list */ + spin_lock(&dcache_lock); + list_for_each_entry(de, &ino->i_dentry, d_alias) { + if (!IS_ROOT(de) && d_unhashed(de)) + continue; + found = de; + dget_locked(found); + break; + } + spin_unlock(&dcache_lock); + if (found) { + err = cpt_dump_dentry(found, mnt, 0, ctx); + dput(found); + if (!err) { + dprintk_ctx("dentry found in aliases\n"); + return 0; + } + } + + /* 2. Try to find file in current dir */ + de = dget_parent(d); + if (!de) + return -EINVAL; + + mntget(mnt); + f = dentry_open(de, mnt, O_RDONLY); + if (IS_ERR(f)) + return PTR_ERR(f); + + entry.ino = ino->i_ino; + entry.name = cpt_get_buf(ctx); + entry.found = 0; + err = vfs_readdir(f, cpt_filldir, &entry); + if (err || !entry.found) { + err = err ? 
err : -ENOENT; + goto err_readdir; + } + + found = lookup_one_len(entry.name, de, entry.namelen); + if (IS_ERR(found)) { + err = PTR_ERR(found); + goto err_readdir; + } + + err = -ENOENT; + if (found->d_inode != ino) + goto err_lookup; + + dprintk_ctx("dentry found in dir\n"); + __cpt_release_buf(ctx); + err = cpt_dump_dentry(found, mnt, 0, ctx); + +err_lookup: + dput(found); +err_readdir: + fput(f); + __cpt_release_buf(ctx); + return err; +} + +static int dump_one_inode(struct file *file, struct dentry *d, + struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct inode *ino = d->d_inode; + cpt_object_t *iobj; + int dump_it = 0; + + iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); + if (!iobj) + return -EINVAL; + + if (iobj->o_pos >= 0) + return 0; + + if ((!IS_ROOT(d) && d_unhashed(d)) && + !cpt_replaced(d, mnt, ctx)) + dump_it = 1; + if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { + /* One more bug in epoll: invalid inode mode. + * What a load of crap... + */ + if (ino->i_sb->s_magic == FSMAGIC_EPOLL && + (ino->i_mode & S_IFMT) == 0) + return 0; + dump_it = 1; + } + + if (!dump_it) + return 0; + + cpt_open_object(iobj, ctx); + cpt_dump_inode(d, mnt, ctx); + + if (!IS_ROOT(d) && d_unhashed(d)) { + struct file *parent; + parent = iobj->o_parent; + if (!parent || + (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { + /* Inode is not deleted, but it does not + * have references from inside checkpointed + * process group. */ + if (ino->i_nlink != 0) { + err = find_linked_dentry(d, mnt, ino, ctx); + if (err) { + eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err); + return -EBUSY; + } + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } else { + /* Refer to _another_ file name. */ + err = cpt_dump_filename(parent, 0, ctx); + if (err) + return err; + if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) + dump_it = 0; + } + } + if (dump_it) { + if (S_ISREG(ino->i_mode)) { + if ((err = dump_content_regular(file, ctx)) != 0) { + eprintk_ctx("dump_content_regular "); + cpt_printk_dentry(d, mnt); + } + } else if (S_ISDIR(ino->i_mode)) { + /* We cannot do anything. The directory should be + * empty, so it is not a big deal. 
+ */ + } else if (S_ISCHR(ino->i_mode)) { + err = dump_content_chrdev(file, ctx); + } else if (S_ISBLK(ino->i_mode)) { + err = dump_content_blkdev(file, ctx); + } else if (S_ISFIFO(ino->i_mode)) { + err = dump_content_fifo(file, ctx); + } else if (S_ISSOCK(ino->i_mode)) { + err = dump_content_socket(file, ctx); + } else { + eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic); + err = -EINVAL; + } + } + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_files(struct cpt_context *ctx) +{ + int epoll_nr, inotify_nr; + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TTY); + for_each_object(obj, CPT_OBJ_TTY) { + int err; + + if ((err = cpt_dump_tty(obj, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + cpt_open_section(ctx, CPT_SECT_INODE); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_inode(file, file->f_dentry, + file->f_vfsmnt, ctx)) != 0) + return err; + } + for_each_object(obj, CPT_OBJ_FS) { + struct fs_struct *fs = obj->o_obj; + int err; + + if (fs->root && + (err = dump_one_inode(NULL, fs->root, fs->rootmnt, ctx)) != 0) + return err; + if (fs->pwd && + (err = dump_one_inode(NULL, fs->pwd, fs->pwdmnt, ctx)) != 0) + return err; + if (fs->altroot && + (err = dump_one_inode(NULL, fs->altroot, fs->altrootmnt, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + epoll_nr = 0; + inotify_nr = 0; + cpt_open_section(ctx, CPT_SECT_FILES); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + int err; + + if ((err = dump_one_file(obj, file, ctx)) != 0) + return err; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) + epoll_nr++; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) + inotify_nr++; + } + cpt_close_section(ctx); + + if (epoll_nr) { + cpt_open_section(ctx, CPT_SECT_EPOLL); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { + int err; + if ((err = cpt_dump_epolldev(obj, ctx)) != 0) + return err; + } + } + cpt_close_section(ctx); + } + + if (inotify_nr) { + cpt_open_section(ctx, CPT_SECT_INOTIFY); + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { + int err = -EINVAL; +#ifdef CONFIG_INOTIFY_USER + if ((err = cpt_dump_inotify(obj, ctx)) != 0) +#endif + return err; + } + } + cpt_close_section(ctx); + } + + cpt_open_section(ctx, CPT_SECT_SOCKET); + for_each_object(obj, CPT_OBJ_SOCKET) { + int err; + + if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) + return err; + } + cpt_close_section(ctx); + + return 0; +} + +static int dump_filedesc(int fd, struct file *file, + struct files_struct *f, struct cpt_context *ctx) +{ + struct cpt_fd_image *v = cpt_get_buf(ctx); + cpt_object_t *obj; + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILEDESC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_fd = fd; + obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); + if (!obj) BUG(); + v->cpt_file = obj->o_pos; + v->cpt_flags = 0; + if (FD_ISSET(fd, f->fdt->close_on_exec)) + v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + + return 0; +} + +static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct files_struct *f = obj->o_obj; + struct cpt_files_struct_image *v = 
cpt_get_buf(ctx); + int fd; + loff_t saved_obj; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FILES; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = obj->o_index; + v->cpt_max_fds = f->fdt->max_fds; + v->cpt_next_fd = f->next_fd; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + for (fd = 0; fd < f->fdt->max_fds; fd++) { + struct file *file = fcheck_files(f, fd); + if (file) + dump_filedesc(fd, file, f, ctx); + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_files_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); + + for_each_object(obj, CPT_OBJ_FILES) { + int err; + + if ((err = dump_one_file_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +int cpt_collect_fs(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->fs) { + if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->pwd && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd->d_inode, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->root && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->root->d_inode, ctx) == NULL) + return -ENOMEM; + if (tsk->fs->altroot && + cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot->d_inode, ctx) == NULL) + return -ENOMEM; + } + } + return 0; +} + +int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) +{ + struct file file; + + memset(&file, 0, sizeof(file)); + + file.f_dentry = d; + file.f_vfsmnt = mnt; + file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; + return dump_one_file(NULL, &file, ctx); +} + +static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct fs_struct *fs = obj->o_obj; + struct cpt_fs_struct_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + int err; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_FS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_umask = fs->umask; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + err = cpt_dump_dir(fs->root, fs->rootmnt, ctx); + if (!err) + err = cpt_dump_dir(fs->pwd, fs->pwdmnt, ctx); + if (!err && fs->altroot) + err = cpt_dump_dir(fs->altroot, fs->altrootmnt, ctx); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_fs_struct(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_FS); + + for_each_object(obj, CPT_OBJ_FS) { + int err; + + if ((err = dump_one_fs(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct mnt_namespace *n = obj->o_obj; + struct list_head *p; + char *path_buf, *path; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); + + path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) + continue; + + if (check_one_vfsmount(mnt)) { + eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); + err = -EINVAL; + break; + } + } + up_read(&namespace_sem); + + free_page((unsigned long) path_buf); + + return err; +} + +int 
cpt_collect_namespace(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->nsproxy && tsk->nsproxy->mnt_ns && + cpt_object_add(CPT_OBJ_NAMESPACE, + tsk->nsproxy->mnt_ns, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + if ((err = check_one_namespace(obj, ctx)) != 0) + return err; + } + + return 0; +} + +struct args_t +{ + int* pfd; + char* path; +}; + +static int dumptmpfs(void *arg) +{ + int i; + struct args_t *args = arg; + int *pfd = args->pfd; + int fd0, fd2; + char *path = args->path; + char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; + + i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump tmpfs\n"); + module_put(THIS_MODULE); + return 255 << 8; + } + + if (pfd[1] != 1) + sc_dup2(pfd[1], 1); + set_fs(KERNEL_DS); + fd0 = sc_open("/dev/null", O_RDONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); + if (fd0 < 0 || fd2 < 0) { + eprintk("can not open /dev/null for tar: %d %d\n", fd0, fd2); + module_put(THIS_MODULE); + return 255 << 8; + } + if (fd0 != 0) + sc_dup2(fd0, 0); + if (fd2 != 2) + sc_dup2(fd2, 2); + + for (i = 3; i < current->files->fdt->max_fds; i++) { + sc_close(i); + } + + module_put(THIS_MODULE); + + i = sc_execve("/bin/tar", argv, NULL); + eprintk("failed to exec /bin/tar: %d\n", i); + return 255 << 8; +} + +static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx) +{ + int err; + int pid; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + char buf[16]; + int n; + loff_t saved_obj; + struct args_t args; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + err = sc_pipe(pfd); + if (err < 0) + return err; + args.pfd = pfd; + args.path = path; + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + err = pid = local_kernel_thread(dumptmpfs, (void*)&args, + SIGCHLD | CLONE_VFORK, 0); + if (err < 0) { + eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); + goto out; + } + f = fget(pfd[0]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NAME; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&v, sizeof(v), ctx); + + do { + oldfs = get_fs(); set_fs(KERNEL_DS); + n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); + set_fs(oldfs); + if (n > 0) + ctx->write(buf, n, ctx); + } while (n > 0); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("tar exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("tar terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + buf[0] = 0; + ctx->write(buf, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + return n ? 
: err; + +out: + if (pfd[1] >= 0) + sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +static int loopy_root(struct vfsmount *mnt) +{ + struct list_head *p; + + list_for_each(p, &mnt->mnt_ns->list) { + struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list); + if (m == mnt) + return 0; + if (m->mnt_sb == mnt->mnt_sb) + return 1; + } + /* Cannot happen */ + return 0; +} + +static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx) +{ + struct list_head *p; + int err = -EINVAL; + + /* One special case: mount --bind /a /a */ + if (mnt->mnt_root == mnt->mnt_mountpoint) + return cpt_dump_dentry(mnt->mnt_root, mnt, 0, ctx); + + list_for_each_prev(p, &mnt->mnt_list) { + struct vfsmount * m; + + if (p == &mnt->mnt_ns->list) + break; + + m = list_entry(p, struct vfsmount, mnt_list); + + if (m->mnt_sb != mnt->mnt_sb) + continue; + + err = cpt_dump_dentry(mnt->mnt_root, m, 0, ctx); + if (err == 0) + break; + } + return err; +} + +static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_vfsmount_image v; + loff_t saved_obj; + char *path_buf, *path; + + path_buf = (char *) __get_free_page(GFP_KERNEL); + if (!path_buf) + return -ENOMEM; + + path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); + if (IS_ERR(path)) { + free_page((unsigned long) path_buf); + return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path); + } + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_VFSMOUNT; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_mntflags = mnt->mnt_flags; + if (top_beancounter(slab_ub(mnt)) != top_beancounter(get_exec_ub())) { + v.cpt_mntflags |= CPT_MNT_EXT; + } else { + if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt)) + v.cpt_mntflags |= CPT_MNT_BIND; + } + v.cpt_flags = mnt->mnt_sb->s_flags; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_dump_string(mnt->mnt_devname ? 
: "none", ctx); + cpt_dump_string(path, ctx); + cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); + + if (v.cpt_mntflags & CPT_MNT_BIND) + err = cpt_dump_bind_mnt(mnt, ctx); + else if (!(v.cpt_mntflags & CPT_MNT_EXT) && + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { + mntget(mnt); + up_read(&namespace_sem); + err = cpt_dump_tmpfs(path, ctx); + down_read(&namespace_sem); + if (!err) { + if (list_empty(&mnt->mnt_list)) + err = -EBUSY; + } + mntput(mnt); + } + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS) + vefs_track_force_stop(mnt->mnt_sb); + + free_page((unsigned long) path_buf); + + return err; +} + +static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mnt_namespace *n = obj->o_obj; + struct cpt_object_hdr v; + struct list_head *p; + loff_t saved_obj; + int err = 0; + + cpt_open_object(obj, ctx); + + v.cpt_next = -1; + v.cpt_object = CPT_OBJ_NAMESPACE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + + down_read(&namespace_sem); + list_for_each(p, &n->list) { + err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); + if (err) + break; + } + up_read(&namespace_sem); + + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + return err; +} + +int cpt_dump_namespace(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_NAMESPACE); + + for_each_object(obj, CPT_OBJ_NAMESPACE) { + int err; + + if ((err = dump_one_namespace(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_files.h linux-2.6.24.ovz/kernel/cpt/cpt_files.h --- linux-2.6.24/kernel/cpt/cpt_files.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_files.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,71 @@ +int cpt_collect_files(cpt_context_t *); +int cpt_collect_fs(cpt_context_t *); +int cpt_collect_namespace(cpt_context_t *); +int cpt_collect_sysvsem_undo(cpt_context_t *); +int cpt_collect_tty(struct file *, cpt_context_t *); +int cpt_dump_files(struct cpt_context *ctx); +int cpt_dump_files_struct(struct cpt_context *ctx); +int cpt_dump_fs_struct(struct cpt_context *ctx); +int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); +int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); +struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx); +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); + +int rst_posix_locks(struct cpt_context *ctx); + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_restore_fs(struct cpt_context *ctx); + +int cpt_collect_sysv(cpt_context_t *); +int cpt_dump_sysvsem(struct cpt_context *ctx); +int cpt_dump_sysvmsg(struct cpt_context *ctx); +int rst_sysv_ipc(struct cpt_context *ctx); +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_dump_namespace(struct cpt_context *ctx); +int 
rst_root_namespace(struct cpt_context *ctx); + +int rst_stray_files(struct cpt_context *ctx); +int rst_tty_jobcontrol(struct cpt_context *ctx); + +void rst_flush_filejobs(struct cpt_context *); +int rst_do_filejobs(struct cpt_context *); + +int rst_eventpoll(struct cpt_context *); +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); +int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); + +int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx); +int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx); + +int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx); +int rst_inotify(cpt_context_t *ctx); +struct file *rst_open_inotify(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx); + + +int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, + cpt_context_t *ctx); + +#define check_one_vfsmount(mnt) \ + (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \ + strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) + +extern const struct file_operations shmem_file_operations; diff -uprN linux-2.6.24/kernel/cpt/cpt_fsmagic.h linux-2.6.24.ovz/kernel/cpt/cpt_fsmagic.h --- linux-2.6.24/kernel/cpt/cpt_fsmagic.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_fsmagic.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,17 @@ +/* Collected from kernel sources. */ + +#define FSMAGIC_TMPFS 0x01021994 +#define FSMAGIC_PIPEFS 0x50495045 +#define FSMAGIC_SOCKFS 0x534F434B +#define FSMAGIC_PFMFS 0xa0b4d889 +#define FSMAGIC_BDEV 0x62646576 +#define FSMAGIC_EPOLL 0x03111965 +#define FSMAGIC_FUTEX 0x0BAD1DEA +#define FSMAGIC_INOTIFY 0x2BAD1DEA +#define FSMAGIC_MQUEUE 0x19800202 +#define FSMAGIC_PROC 0x9fa0 +#define FSMAGIC_DEVPTS 0x1CD1 +#define FSMAGIC_AUTOFS 0x0187 +#define FSMAGIC_EXT2 0xEF53 +#define FSMAGIC_REISER 0x52654973 +#define FSMAGIC_VEFS 0x565a4653 diff -uprN linux-2.6.24/kernel/cpt/cpt_inotify.c linux-2.6.24.ovz/kernel/cpt/cpt_inotify.c --- linux-2.6.24/kernel/cpt/cpt_inotify.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_inotify.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,144 @@ +/* + * + * kernel/cpt/cpt_inotify.c + * + * Copyright (C) 2000-2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +extern struct file_operations inotify_fops; + +int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx) +{ + int err = 0; + struct file *file = obj->o_obj; + struct inotify_device *dev; + struct inotify_watch *watch; + struct inotify_kernel_event *kev; + struct cpt_inotify_image ii; + + if (file->f_op != &inotify_fops) { + eprintk_ctx("bad inotify file\n"); + return -EINVAL; + } + + dev = file->private_data; + + /* inotify_user.c does not protect open /proc/N/fd, silly. + * Opener will get an invalid file with uninitialized private_data + */ + if (unlikely(dev == NULL)) { + eprintk_ctx("bad inotify dev\n"); + return -EINVAL; + } + + cpt_open_object(NULL, ctx); + + ii.cpt_next = CPT_NULL; + ii.cpt_object = CPT_OBJ_INOTIFY; + ii.cpt_hdrlen = sizeof(ii); + ii.cpt_content = CPT_CONTENT_ARRAY; + ii.cpt_file = obj->o_pos; + ii.cpt_user = dev->user->uid; + ii.cpt_max_events = dev->max_events; + ii.cpt_last_wd = dev->ih->last_wd; + + ctx->write(&ii, sizeof(ii), ctx); + + mutex_lock(&dev->ih->mutex); + list_for_each_entry(watch, &dev->ih->watches, h_list) { + loff_t saved_obj; + loff_t saved_obj2; + struct cpt_inotify_wd_image wi; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + wi.cpt_next = CPT_NULL; + wi.cpt_object = CPT_OBJ_INOTIFY_WATCH; + wi.cpt_hdrlen = sizeof(wi); + wi.cpt_content = CPT_CONTENT_ARRAY; + wi.cpt_wd = watch->wd; + wi.cpt_mask = watch->mask; + + ctx->write(&wi, sizeof(wi), ctx); + + cpt_push_object(&saved_obj2, ctx); + err = cpt_dump_dir(watch->dentry, watch->mnt, ctx); + cpt_pop_object(&saved_obj2, ctx); + if (err) + break; + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&dev->ih->mutex); + + if (err) + return err; + + mutex_lock(&dev->ev_mutex); + list_for_each_entry(kev, &dev->events, list) { + loff_t saved_obj; + struct cpt_inotify_ev_image ei; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + ei.cpt_next = CPT_NULL; + ei.cpt_object = CPT_OBJ_INOTIFY_EVENT; + ei.cpt_hdrlen = sizeof(ei); + ei.cpt_content = CPT_CONTENT_NAME; + ei.cpt_wd = kev->event.wd; + ei.cpt_mask = kev->event.mask; + ei.cpt_cookie = kev->event.cookie; + ei.cpt_namelen = kev->name ? strlen(kev->name) : 0; + + ctx->write(&ei, sizeof(ei), ctx); + + if (kev->name) { + ctx->write(kev->name, ei.cpt_namelen+1, ctx); + ctx->align(ctx); + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + mutex_unlock(&dev->ev_mutex); + + cpt_close_object(ctx); + + return err; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_kernel.c linux-2.6.24.ovz/kernel/cpt/cpt_kernel.c --- linux-2.6.24/kernel/cpt/cpt_kernel.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_kernel.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,177 @@ +/* + * + * kernel/cpt/cpt_kernel.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#define __KERNEL_SYSCALLS__ 1 + +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +int debug_level = 1; + +#ifdef CONFIG_X86_32 + +/* + * Create a kernel thread + */ +extern void kernel_thread_helper(void); +int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; + regs.edx = (unsigned long) arg; + + regs.xds = __USER_DS; + regs.xes = __USER_DS; + regs.xfs = __KERNEL_PERCPU; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + + /* Ok, create the new process.. */ + return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL, pid); +} +#endif + +#ifdef CONFIG_IA64 +pid_t +asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid) +{ + extern void start_kernel_thread (void); + unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ + regs.pt.r9 = (unsigned long) fn; /* 1st argument */ + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; + regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/); + return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL, pid); +} +#endif + +int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) +{ + pid_t ret; + + if (current->fs == NULL) { + /* do_fork_pid() hates processes without fs, oopses. 
*/ + printk("CPT BUG: local_kernel_thread: current->fs==NULL\n"); + return -EINVAL; + } + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + ret = asm_kernel_thread(fn, arg, flags, pid); + if (ret < 0) + module_put(THIS_MODULE); + return ret; +} + +#ifdef __i386__ +int __execve(const char *file, char **argv, char **envp) +{ + long res; + __asm__ volatile ("int $0x80" + : "=a" (res) + : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)), + "d" ((long)(envp)) : "memory"); + return (int)res; +} +#endif + +int sc_execve(char *cmd, char **argv, char **env) +{ + int ret; +#ifndef __i386__ + ret = kernel_execve(cmd, argv, env); +#else + ret = __execve(cmd, argv, env); +#endif + return ret; +} + +unsigned int test_cpu_caps(void) +{ + unsigned int flags = 0; + +#ifdef CONFIG_X86 + if (boot_cpu_has(X86_FEATURE_CMOV)) + flags |= 1 << CPT_CPU_X86_CMOV; + if (cpu_has_fxsr) + flags |= 1 << CPT_CPU_X86_FXSR; + if (cpu_has_xmm) + flags |= 1 << CPT_CPU_X86_SSE; +#ifndef CONFIG_X86_64 + if (cpu_has_xmm2) +#endif + flags |= 1 << CPT_CPU_X86_SSE2; + if (cpu_has_mmx) + flags |= 1 << CPT_CPU_X86_MMX; + if (boot_cpu_has(X86_FEATURE_3DNOW)) + flags |= 1 << CPT_CPU_X86_3DNOW; + if (boot_cpu_has(X86_FEATURE_3DNOWEXT)) + flags |= 1 << CPT_CPU_X86_3DNOW2; + if (boot_cpu_has(X86_FEATURE_SYSCALL)) + flags |= 1 << CPT_CPU_X86_SYSCALL; +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_SYSCALL) && + boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + flags |= 1 << CPT_CPU_X86_SYSCALL32; +#endif + if (boot_cpu_has(X86_FEATURE_SEP) +#ifdef CONFIG_X86_64 + && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL +#endif + ) + flags |= ((1 << CPT_CPU_X86_SEP) | (1 << CPT_CPU_X86_SEP32)); +#ifdef CONFIG_X86_64 + flags |= 1 << CPT_CPU_X86_EMT64; +#endif +#endif +#ifdef CONFIG_IA64 + flags |= 1 << CPT_CPU_X86_IA64; + flags |= 1 << CPT_CPU_X86_FXSR; +#endif + return flags; +} + +unsigned int test_kernel_config(void) +{ + unsigned int flags = 0; +#ifdef CONFIG_X86 +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + flags |= 1 << CPT_KERNEL_CONFIG_PAE; +#endif +#endif + return flags; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_kernel.h linux-2.6.24.ovz/kernel/cpt/cpt_kernel.h --- linux-2.6.24/kernel/cpt/cpt_kernel.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_kernel.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,95 @@ +/* Interface to kernel vars which we had to _add_. 
*/ + +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) +#define TASK_TRACED TASK_STOPPED +#define unix_peer(sk) ((sk)->sk_pair) +#define page_mapcount(pg) ((pg)->mapcount) +#else +#define unix_peer(sk) (unix_sk(sk)->peer) +#endif + +#ifdef CONFIG_IA64 +#define cpu_has_fxsr 1 +#endif + +#define CPT_SIG_IGNORE_MASK (\ + (1 << (SIGCONT - 1)) | (1 << (SIGCHLD - 1)) | \ + (1 << (SIGWINCH - 1)) | (1 << (SIGURG - 1))) + +static inline void do_gettimespec(struct timespec *ts) +{ + struct timeval tv; + do_gettimeofday(&tv); + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = tv.tv_usec*1000; +} + +int local_kernel_thread(int (*fn)(void *), + void * arg, + unsigned long flags, + pid_t pid); +int asm_kernel_thread(int (*fn)(void *), + void * arg, + unsigned long flags, + pid_t pid); + +#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) +void vefs_track_force_stop(struct super_block *super); + +void vefs_track_notify(struct dentry *vdentry, int track_cow); + +struct dentry * vefs_replaced_dentry(struct dentry *de); +int vefs_is_renamed_dentry(struct dentry *vde, struct dentry *pde); +#else +static inline void vefs_track_force_stop(struct super_block *super) { }; + +static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { }; +#endif + +unsigned int test_cpu_caps(void); +unsigned int test_kernel_config(void); + +#define test_one_flag_old(src, dst, flag, message, ret) \ +if (src & (1 << flag)) \ + if (!(dst & (1 << flag))) { \ + wprintk("Destination cpu does not have " message "\n"); \ + ret = 1; \ + } +#define test_one_flag(src, dst, flag, message, ret) \ +if (src & (1 << flag)) \ + if (!(dst & (1 << flag))) { \ + eprintk_ctx("Destination cpu does not have " message "\n"); \ + ret = 1; \ + } + +static inline void +_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static inline struct timespec +_ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); + if (unlikely(nsec < 0)) + _set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); + + return ts; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_mm.c linux-2.6.24.ovz/kernel/cpt/cpt_mm.c --- linux-2.6.24/kernel/cpt/cpt_mm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_mm.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,915 @@ +/* + * + * kernel/cpt/cpt_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif +#include "cpt_ubc.h" + +static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + if (!list_empty(&aio_ctx->run_list)) { + /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */ + eprintk_ctx("run list is not empty, cannot suspend AIO\n"); + return -EBUSY; + } + + /* Wait for pending IOCBs. Linux AIO is mostly _fake_. 
+ * It is actually synchronous, except for direct IO and + * some funny raw USB things, which cannot happen inside VE. + * However, we do this for future. + * + * Later note: in 2.6.16 we may allow O_DIRECT, so that + * it is not meaningless code. + */ + wait_for_all_aios(aio_ctx); + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("were not able to suspend AIO\n"); + return -EBUSY; + } + + return 0; +} + +static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file) { + if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL) + return -ENOMEM; + } + } +#ifdef CONFIG_BEANCOUNTERS + if (cpt_add_ubc(mm->mm_ub, ctx) == NULL) + return -ENOMEM; +#endif + + if (mm->ioctx_list) { + struct kioctx *aio_ctx; + int err; + + for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) + if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + return 0; +} + +int cpt_collect_mm(cpt_context_t * ctx) +{ + cpt_object_t *obj; + int err; + int index; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL) + return -ENOMEM; + } + + index = 1; + for_each_object(obj, CPT_OBJ_MM) { + struct mm_struct *mm = obj->o_obj; + if (obj->o_count != atomic_read(&mm->mm_users)) { + eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users)); + return -EAGAIN; + } + cpt_obj_setindex(obj, index++, ctx); + + if ((err = collect_one_mm(mm, ctx)) != 0) + return err; + } + + return 0; +} + +static int zcnt, scnt, scnt0, ucnt; + +/* Function where_is_anon_page() returns address of a anonymous page in mm + * of already dumped process. This happens f.e. after fork(). We do not use + * this right now, just keep statistics, it is diffucult to restore such state, + * but the most direct use is to save space in dumped image. 
*/ + + +static inline unsigned long +vma_address0(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + address |= 1; + return address; +} + +static int really_this_one(struct vm_area_struct *vma, unsigned long address, + struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + int result; + + pgd = pgd_offset(mm, address); + if (unlikely(!pgd_present(*pgd))) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (unlikely(!pmd_present(*pmd))) + return 0; + + result = 0; + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + pte_unmap(pte); + return 0; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) + result = 1; + pte_unmap_unlock(pte, ptl); + return result; +} + +static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr, + struct page *page, cpt_context_t * ctx) +{ + loff_t mmptr = CPT_NULL; + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + int idx = mmobj->o_index; + + if (!PageAnon(page)) + return CPT_NULL; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return CPT_NULL; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + unsigned long addr = vma_address0(page, vma); + cpt_object_t *obj; + + /* We do not try to support mremapped regions (addr != mapaddr), + * only mmaps directly inherited via fork(). + * With this limitation we may check self-consistency of + * vmas (vm_start, vm_pgoff, anon_vma) before + * doing __copy_page_range() in rst_mm. + */ + if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) { + obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx); + if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) { + if (really_this_one(vma, addr, page)) { + mmptr = obj->o_pos; + idx = obj->o_index; + } + } + } + } + page_unlock_anon_vma(anon_vma); + + return mmptr; +} + +struct page_area +{ + int type; + unsigned long start; + unsigned long end; + pgoff_t pgoff; + loff_t mm; + __u64 list[16]; +}; + +struct page_desc +{ + int type; + pgoff_t index; + loff_t mm; + int shared; +}; + +enum { + PD_ABSENT, + PD_COPY, + PD_ZERO, + PD_CLONE, + PD_FUNKEY, + PD_LAZY, + PD_ITER, + PD_ITERYOUNG, +}; + +/* 0: page can be obtained from backstore, or still not mapped anonymous page, + or something else, which does not requre copy. + 1: page requires copy + 2: page requres copy but its content is zero. Quite useless. + 3: wp page is shared after fork(). It is to be COWed when modified. + 4: page is something unsupported... We copy it right now. 
+ */ + + + +static void page_get_desc(cpt_object_t *mmobj, + struct vm_area_struct *vma, unsigned long addr, + struct page_desc *pdesc, cpt_context_t * ctx) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *pg = NULL; + pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff; + + pdesc->index = linear_index; + pdesc->shared = 0; + pdesc->mm = CPT_NULL; + + if (vma->vm_flags & VM_IO) { + pdesc->type = PD_ABSENT; + return; + } + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out_absent; + pud = pud_offset(pgd, addr); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out_absent; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out_absent; +#ifdef CONFIG_X86 + if (pmd_huge(*pmd)) { + eprintk_ctx("page_huge\n"); + goto out_unsupported; + } +#endif +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +retry: +#endif + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = *ptep; + pte_unmap(ptep); + + if (pte_none(pte)) + goto out_absent_unlock; + + if (!pte_present(pte)) { + if (pte_file(pte)) { + pdesc->index = pte_to_pgoff(pte); + goto out_absent_unlock; + } + if (vma->vm_flags & VM_SHARED) { + /* It is impossible: shared mappings cannot be in swap */ + eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos); + goto out_unsupported_unlock; + } +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + /* Otherwise it is in swap. */ + if (!ctx->lazy_vm) { + int err; + /* If lazy transfer is not enabled, + * raise it from swap now, so that we + * save at least when the page is shared. + */ + spin_unlock(ptl); + err = handle_mm_fault(mm, vma, addr, 0); + if (err == VM_FAULT_SIGBUS) + goto out_absent; + if (err == VM_FAULT_OOM) + goto out_absent; + err = 0; + goto retry; + } +#endif + pdesc->type = PD_LAZY; + goto out_unlock; + } + + if ((pg = vm_normal_page(vma, addr, pte)) == NULL) { + pdesc->type = PD_COPY; + goto out_unlock; + } + + get_page(pg); + spin_unlock(ptl); + + if (pg->mapping && !PageAnon(pg)) { + if (vma->vm_file == NULL) { + eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr); + goto out_unsupported; + } + if (vma->vm_file->f_mapping != pg->mapping) { + eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n", + addr, vma->vm_file->f_mapping, pg->mapping, + mmobj->o_pos); + goto out_unsupported; + } + pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + /* Page is in backstore. For us it is like + * it is not present. + */ + goto out_absent; + } + + if (PageReserved(pg)) { + /* Special case: ZERO_PAGE is used, when an + * anonymous page is accessed but not written. 
*/
+		if (pg == ZERO_PAGE(addr)) {
+			if (pte_write(pte)) {
+				eprintk_ctx("not funny already, writable ZERO_PAGE\n");
+				goto out_unsupported;
+			}
+			zcnt++;
+			goto out_absent;
+		}
+		eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index,
+			    addr, mmobj->o_pos);
+		goto out_unsupported;
+	}
+
+	if (pg == ZERO_PAGE(addr)) {
+		wprintk_ctx("that's how it works now\n");
+	}
+
+	if (!pg->mapping) {
+		eprintk_ctx("page without mapping at %08lx@%Ld\n", addr,
+			    mmobj->o_pos);
+		goto out_unsupported;
+	}
+
+	if (pg->mapping && page_mapcount(pg) > 1) {
+		pdesc->shared = 1;
+		pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
+		if (pdesc->mm != CPT_NULL) {
+			scnt0++;
+			pdesc->type = PD_CLONE;
+			goto out_put;
+		} else {
+			scnt++;
+		}
+	}
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+	if (ctx->iter_done &&
+	    test_bit(PG_checkpointed, &pg->flags)) {
+		if (pte_write(pte)) {
+			wprintk_ctx("writable PG_checkpointed page\n");
+		}
+		pdesc->index = page_to_pfn(pg);
+		pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER;
+		goto out_put;
+	}
+#endif
+	pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY;
+
+out_put:
+	if (pg)
+		put_page(pg);
+	return;
+
+out_unlock:
+	spin_unlock(ptl);
+	goto out_put;
+
+out_absent_unlock:
+	spin_unlock(ptl);
+out_absent:
+	pdesc->type = PD_ABSENT;
+	goto out_put;
+
+out_unsupported_unlock:
+	spin_unlock(ptl);
+out_unsupported:
+	ucnt++;
+	pdesc->type = PD_FUNKEY;
+	goto out_put;
+}
+
+/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages()
+ * does not really need this thing. It just stores some page fault stats there.
+ *
+ * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache pages
+ * before accessing vma.
+ */
+void dump_pages(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, struct cpt_context *ctx)
+{
+#define MAX_PAGE_BATCH 16
+	struct page *pg[MAX_PAGE_BATCH];
+	int npages = (end - start)/PAGE_SIZE;
+	int count = 0;
+
+	while (count < npages) {
+		int copy = npages - count;
+		int n;
+
+		if (copy > MAX_PAGE_BATCH)
+			copy = MAX_PAGE_BATCH;
+		n = get_user_pages(current, vma->vm_mm, start, copy,
+				   0, 1, pg, NULL);
+		if (n == copy) {
+			int i;
+			for (i = 0; i < n; i++) {
+				char *maddr = kmap(pg[i]);
+				ctx->write(maddr, PAGE_SIZE, ctx);
+				kunmap(pg[i]);
+			}
+		} else {
+			eprintk_ctx("get_user_pages fault");
+			for ( ; n > 0; n--)
+				page_cache_release(pg[n-1]);
+			return;
+		}
+		start += n*PAGE_SIZE;
+		count += n;
+		for ( ; n > 0; n--)
+			page_cache_release(pg[n-1]);
+	}
+	return;
+}
+
+int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb,
+		    int copy,
+		    struct cpt_context *ctx)
+{
+	loff_t saved_object;
+
+	cpt_push_object(&saved_object, ctx);
+
+	pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES;
+	pgb->cpt_hdrlen = sizeof(*pgb);
+	pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? 
CPT_CONTENT_DATA : CPT_CONTENT_VOID; + + ctx->write(pgb, sizeof(*pgb), ctx); + if (copy == PD_COPY || copy == PD_LAZY) + dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_remappage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_REMAPPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, + struct cpt_context *ctx) +{ + struct cpt_copypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_COPYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + pgb.cpt_source = pa->mm; + + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_lazypage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = CPT_OBJ_LAZYPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, + (pa->end-pa->start)/PAGE_SIZE, ctx); +#endif + ctx->write(&pgb, sizeof(pgb), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + +int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa, + cpt_context_t *ctx) +{ + struct cpt_iterpage_block pgb; + loff_t saved_object; + + cpt_push_object(&saved_object, ctx); + + pgb.cpt_object = pa->type == PD_ITER ? 
CPT_OBJ_ITERPAGES : + CPT_OBJ_ITERYOUNGPAGES; + pgb.cpt_hdrlen = sizeof(pgb); + pgb.cpt_content = CPT_CONTENT_VOID; + pgb.cpt_start = pa->start; + pgb.cpt_end = pa->end; + ctx->write(&pgb, sizeof(pgb), ctx); + + ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx); + + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + return 0; +} + + +static int can_expand(struct page_area *pa, struct page_desc *pd) +{ + if (pa->start == pa->end) + return 1; + if (pa->type != pd->type) + return 0; + if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) { + if (pa->end - pa->start >= PAGE_SIZE*16) + return 0; + pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index; + } + if (pa->type == PD_ABSENT) + return pd->index == pa->pgoff + 1; + if (pa->type == PD_CLONE) + return pd->mm == pa->mm; + return 1; +} + +static int dump_one_vma(cpt_object_t *mmobj, + struct vm_area_struct *vma, struct cpt_context *ctx) +{ + struct cpt_vma_image *v = cpt_get_buf(ctx); + unsigned long addr; + loff_t saved_object; + struct cpt_page_block pgb; + struct page_area pa; + int cloned_pages = 0; + + cpt_push_object(&saved_object, ctx); + + v->cpt_object = CPT_OBJ_VMA; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start = vma->vm_start; + v->cpt_end = vma->vm_end; + v->cpt_flags = vma->vm_flags; + if (vma->vm_flags&VM_HUGETLB) { + eprintk_ctx("huge TLB VMAs are still not supported\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_pgprot = vma->vm_page_prot.pgprot; + v->cpt_pgoff = vma->vm_pgoff; + v->cpt_file = CPT_NULL; +#ifndef CONFIG_IA64 + if ((void *)vma->vm_start == vma->vm_mm->context.vdso && + vma->vm_ops == &special_mapping_vmops) + v->cpt_type = CPT_VMA_VDSO; + else +#endif + v->cpt_type = CPT_VMA_TYPE_0; + v->cpt_anonvma = 0; + + /* We have to remember what VMAs are bound to one anon_vma. + * So, we store an identifier of group of VMAs. It is handy + * to use absolute address of anon_vma as this identifier. 
*/ + v->cpt_anonvmaid = (unsigned long)vma->anon_vma; + + if (vma->vm_file) { + struct file *filp; + cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); + if (obj == NULL) BUG(); + filp = obj->o_obj; + if (filp->f_op && + filp->f_op->read == NULL && + filp->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_TMPFS) + v->cpt_type = CPT_VMA_TYPE_SHM; + v->cpt_file = obj->o_pos; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + if (v->cpt_type == CPT_VMA_VDSO) + goto out; + + pa.type = PD_ABSENT; + pa.pgoff = vma->vm_pgoff; + pa.mm = CPT_NULL; + pa.start = vma->vm_start; + pa.end = vma->vm_start; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page_desc pd; + + page_get_desc(mmobj, vma, addr, &pd, ctx); + cloned_pages += pd.shared; + + if (pd.type == PD_FUNKEY) { + eprintk_ctx("dump_one_vma: funkey page\n"); + return -EINVAL; + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (pd.type == PD_LAZY && + (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) + pd.type = PD_COPY; +#else + if (pd.type == PD_LAZY) + pd.type = PD_COPY; +#endif + + if (!can_expand(&pa, &pd)) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + pa.start = addr; + } + pa.type = pd.type; + pa.end = addr + PAGE_SIZE; + pa.pgoff = pd.index; + if (addr == pa.start) + pa.list[0] = pd.index; + pa.mm = pd.mm; + } + + if (pa.end > pa.start) { + if (pa.type == PD_COPY || + pa.type == PD_ZERO) { + pgb.cpt_start = pa.start; + pgb.cpt_end = pa.end; + dump_page_block(vma, &pgb, pa.type, ctx); + } else if (pa.type == PD_CLONE) { + dump_copypage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_LAZY) { + dump_lazypage_block(vma, &pa, ctx); + } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { + dump_iterpage_block(vma, &pa, ctx); + cloned_pages++; + } else if (pa.type == PD_ABSENT && + pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { + dump_remappage_block(vma, &pa, ctx); + } + } + + if (cloned_pages) { + __u32 anonvma = 1; + loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); + ctx->pwrite(&anonvma, 4, ctx, anonpos); + } + +out: + cpt_close_object(ctx); + + cpt_pop_object(&saved_object, ctx); + + return 0; +} + +static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, + cpt_context_t *ctx) +{ + loff_t saved_object; + struct cpt_aio_ctx_image aimg; + + if (!list_empty(&aio_ctx->run_list) || + !list_empty(&aio_ctx->active_reqs) || + aio_ctx->reqs_active) { + eprintk_ctx("AIO is active after suspend\n"); + return -EBUSY; + } + + cpt_push_object(&saved_object, ctx); + + aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); + aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; + aimg.cpt_hdrlen = sizeof(aimg); + aimg.cpt_content = CPT_CONTENT_ARRAY; + + aimg.cpt_max_reqs = aio_ctx->max_reqs; + aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages; + aimg.cpt_nr = aio_ctx->ring_info.nr; + aimg.cpt_tail = aio_ctx->ring_info.tail; + aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base; + + ctx->write(&aimg, 
sizeof(aimg), ctx); + + cpt_pop_object(&saved_object, ctx); + return 0; +} + +static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct mm_struct *mm = obj->o_obj; + struct vm_area_struct *vma; + struct cpt_mm_image *v = cpt_get_buf(ctx); + + cpt_open_object(obj, ctx); + + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_MM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_start_code = mm->start_code; + v->cpt_end_code = mm->end_code; + v->cpt_start_data = mm->start_data; + v->cpt_end_data = mm->end_data; + v->cpt_start_brk = mm->start_brk; + v->cpt_brk = mm->brk; + v->cpt_start_stack = mm->start_stack; + v->cpt_start_arg = mm->arg_start; + v->cpt_end_arg = mm->arg_end; + v->cpt_start_env = mm->env_start; + v->cpt_end_env = mm->env_end; + v->cpt_def_flags = mm->def_flags; +#ifdef CONFIG_BEANCOUNTERS + v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); +#endif + /* FIXME when coredump mask exceeds 8 bits */ + WARN_ON(mm->flags >> 8); + v->cpt_dumpable = mm->flags; + v->cpt_vps_dumpable = mm->vps_dumpable; + v->cpt_used_hugetlb = 0; /* not used */ +#ifndef CONFIG_IA64 + v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso; +#endif + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + +#ifdef CONFIG_X86 + if (mm->context.size) { + loff_t saved_object; + struct cpt_obj_bits b; + int size; + + dprintk_ctx("nontrivial LDT\n"); + + cpt_push_object(&saved_object, ctx); + + cpt_open_object(NULL, ctx); + b.cpt_next = CPT_NULL; + b.cpt_object = CPT_OBJ_BITS; + b.cpt_hdrlen = sizeof(b); + b.cpt_content = CPT_CONTENT_MM_CONTEXT; + b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; + + ctx->write(&b, sizeof(b), ctx); + + size = mm->context.size*LDT_ENTRY_SIZE; + +#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) + ctx->write(mm->context.ldt, size, ctx); +#else + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + ctx->write(kaddr, bytes, ctx); + kunmap(mm->context.ldt_pages[nr]); + } +#endif + + cpt_close_object(ctx); + cpt_pop_object(&saved_object, ctx); + } +#endif + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int err; + + if ((err = dump_one_vma(obj, vma, ctx)) != 0) + return err; + } + + if (mm->ioctx_list) { + struct kioctx *aio_ctx; + int err; + + for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) + if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) + return err; + } + + cpt_close_object(ctx); + + return 0; +} + +int cpt_dump_vm(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + scnt = scnt0 = zcnt = 0; + + cpt_open_section(ctx, CPT_SECT_MM); + + for_each_object(obj, CPT_OBJ_MM) { + int err; + + if ((err = dump_one_mm(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + + if (scnt) + dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); + if (scnt0) + dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); + if (zcnt) + dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_mm.h linux-2.6.24.ovz/kernel/cpt/cpt_mm.h --- linux-2.6.24/kernel/cpt/cpt_mm.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_mm.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,43 @@ +int cpt_collect_mm(cpt_context_t *); + +int cpt_dump_vm(struct cpt_context *ctx); + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); +int 
rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx); +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx); + +int cpt_mm_prepare(unsigned long veid); + +int cpt_free_pgin_dir(struct cpt_context *); +int cpt_start_pagein(struct cpt_context *); +int rst_setup_pagein(struct cpt_context *); +int rst_complete_pagein(struct cpt_context *, int); +int rst_pageind(struct cpt_context *); +int cpt_iteration(cpt_context_t *ctx); +int rst_iteration(cpt_context_t *ctx); +void rst_drop_iter_dir(cpt_context_t *ctx); +int rst_iter(struct vm_area_struct *vma, u64 pfn, + unsigned long addr, cpt_context_t * ctx); + +int rst_swapoff(struct cpt_context *); + +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, + unsigned long map_address); +#endif + +#ifdef CONFIG_X86_64 +extern char *syscall32_page; +#define vsyscall_addr syscall32_page +#define CPT_SYSENTER_RETURN VSYSCALL32_SYSEXIT +#elif defined(CONFIG_X86_32) +extern void *syscall_page; +extern struct vm_operations_struct syscall_vm_ops; +extern void SYSENTER_RETURN; +#define vsyscall_addr syscall_page +#define CPT_SYSENTER_RETURN (current->mm->context.vdso + \ + (unsigned long)&SYSENTER_RETURN) +#endif + +extern struct vm_operations_struct special_mapping_vmops; diff -uprN linux-2.6.24/kernel/cpt/cpt_net.c linux-2.6.24.ovz/kernel/cpt/cpt_net.c --- linux-2.6.24/kernel/cpt/cpt_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_net.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,518 @@ +/* + * + * kernel/cpt/cpt_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_syscalls.h" + +static void cpt_dump_tuntap(struct net_device *dev, struct cpt_context * ctx) +{ +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + struct cpt_tuntap_image v; + struct tun_struct *tun; + cpt_object_t *obj; + + if (dev->open != tun_net_open) + return; + + tun = netdev_priv(dev); + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_TUNTAP; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_owner = tun->owner; + v.cpt_flags = tun->flags; + v.cpt_attached = tun->attached; + + if (tun->bind_file) { + obj = lookup_cpt_object(CPT_OBJ_FILE, tun->bind_file, ctx); + BUG_ON(!obj); + v.cpt_bindfile = obj->o_pos; + } + + v.cpt_if_flags = tun->if_flags; + BUG_ON(sizeof(v.cpt_dev_addr) != sizeof(tun->dev_addr)); + memcpy(v.cpt_dev_addr, tun->dev_addr, sizeof(v.cpt_dev_addr)); + BUG_ON(sizeof(v.cpt_chr_filter) != sizeof(tun->chr_filter)); + memcpy(v.cpt_chr_filter, tun->chr_filter, sizeof(v.cpt_chr_filter)); + BUG_ON(sizeof(v.cpt_net_filter) != sizeof(tun->net_filter)); + memcpy(v.cpt_net_filter, tun->net_filter, sizeof(v.cpt_net_filter)); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); +#endif + return; +} + +int cpt_dump_link(struct cpt_context * ctx) +{ + struct net *net = get_exec_env()->ve_ns->net_ns; + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_DEVICE); + for_each_netdev(net, dev) { + struct cpt_netdev_image v; + loff_t saved_obj; + + 
cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_DEVICE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + v.cpt_index = dev->ifindex; + v.cpt_flags = dev->flags; + memcpy(v.cpt_name, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_dump_tuntap(dev, ctx); + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + + if (dev != net->loopback_dev +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) + && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) +#endif +#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + && dev != get_exec_env()->_venet_dev +#endif +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + && dev->open != tun_net_open +#endif + ) { + eprintk_ctx("unsupported netdevice %s\n", dev->name); + cpt_close_section(ctx); + return -EBUSY; + } + } + cpt_close_section(ctx); + return 0; +} + +int cpt_suspend_network(struct cpt_context *ctx) +{ + get_exec_env()->disable_net = 1; + synchronize_net(); + return 0; +} + +int cpt_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +int cpt_dump_ifaddr(struct cpt_context * ctx) +{ + struct net *net = get_exec_env()->ve_ns->net_ns; + struct net_device *dev; + + cpt_open_section(ctx, CPT_SECT_NET_IFADDR); + for_each_netdev(net, dev) { + struct in_device *idev = in_dev_get(dev); + struct in_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { + struct cpt_ifaddr_image v; + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET; + v.cpt_masklen = ifa->ifa_prefixlen; + v.cpt_flags = ifa->ifa_flags; + v.cpt_scope = ifa->ifa_scope; + memset(&v.cpt_address, 0, sizeof(v.cpt_address)); + memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + v.cpt_address[0] = ifa->ifa_local; + v.cpt_peer[0] = ifa->ifa_address; + v.cpt_broadcast[0] = ifa->ifa_broadcast; + memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in_dev_put(idev); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + for_each_netdev(net, dev) { + struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_ifaddr *ifa; + + if (!idev) + continue; + + for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + struct cpt_ifaddr_image v; + + if (dev == net->loopback_dev && + ifa->prefix_len == 128 && + ifa->addr.s6_addr32[0] == 0 && + ifa->addr.s6_addr32[1] == 0 && + ifa->addr.s6_addr32[2] == 0 && + ifa->addr.s6_addr32[3] == htonl(1)) + continue; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_IFADDR; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_index = dev->ifindex; + v.cpt_family = AF_INET6; + v.cpt_masklen = ifa->prefix_len; + v.cpt_flags = ifa->flags; + v.cpt_scope = ifa->scope; + v.cpt_valid_lft = ifa->valid_lft; + v.cpt_prefered_lft = ifa->prefered_lft; + memcpy(&v.cpt_address, &ifa->addr, 16); + memcpy(&v.cpt_peer, &ifa->addr, 16); + memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); + memcpy(v.cpt_label, dev->name, IFNAMSIZ); + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + } + in6_dev_put(idev); + } +#endif + 
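/* Both the IPv4 and IPv6 address lists have now been emitted as
+	 * CPT_OBJ_NET_IFADDR records; close the section. */
+	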
cpt_close_section(ctx); + return 0; +} + +static int cpt_dump_route(struct cpt_context * ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + struct sockaddr_nl nladdr; + struct cpt_object_hdr v; + mm_segment_t oldfs; + char *pg; + + err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETROUTE; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.g.rtgen_family = AF_INET; + + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err < 0) + goto out_sock; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + cpt_open_section(ctx, CPT_SECT_NET_ROUTE); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NET_ROUTE; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NLMARRAY; + + ctx->write(&v, sizeof(v), ctx); + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +restart: +#endif + for (;;) { + struct nlmsghdr *h; + + iov.iov_base = pg; + iov.iov_len = PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + + if (err < 0) + goto out_sock_pg; + if (msg.msg_flags & MSG_TRUNC) { + err = -ENOBUFS; + goto out_sock_pg; + } + + h = (struct nlmsghdr*)pg; + while (NLMSG_OK(h, err)) { + if (h->nlmsg_type == NLMSG_DONE) { + err = 0; + goto done; + } + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h); + err = errm->error; + eprintk_ctx("NLMSG error: %d\n", errm->error); + goto done; + } + if (h->nlmsg_type != RTM_NEWROUTE) { + eprintk_ctx("NLMSG: %d\n", h->nlmsg_type); + err = -EINVAL; + goto done; + } + ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx); + h = NLMSG_NEXT(h, err); + } + if (err) { + eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type); + err = -EINVAL; + break; + } + } +done: +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (!err && req.g.rtgen_family == AF_INET) { + req.g.rtgen_family = AF_INET6; + iov.iov_base=&req; + iov.iov_len=sizeof(req); + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, sizeof(req)); + set_fs(oldfs); + + if (err > 0) + goto restart; + } +#endif + ctx->align(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + +out_sock_pg: + free_page((unsigned long)pg); +out_sock: + sock_release(sock); + return err; +} + +static int dumpfn(void *arg) +{ + int i; + int *pfd = arg; + char *argv[] = { "iptables-save", "-c", NULL }; + + i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); + if (i < 0) { + eprintk("cannot enter ve to dump iptables\n"); + module_put(THIS_MODULE); + return 255 << 8; + } + + if (pfd[1] != 1) + sc_dup2(pfd[1], 1); + + for (i=0; ifiles->fdt->max_fds; i++) { + if (i != 1) + sc_close(i); + } + + 
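/* Only the pipe's write end, already dup'ed to fd 1, is left open
+	 * to carry the iptables-save output back to the dumper. */
+	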
module_put(THIS_MODULE); + + set_fs(KERNEL_DS); + i = sc_execve("/sbin/iptables-save", argv, NULL); + if (i == -ENOENT) + i = sc_execve("/usr/sbin/iptables-save", argv, NULL); + eprintk("failed to exec iptables-save: %d\n", i); + return 255 << 8; +} + + +static int cpt_dump_iptables(struct cpt_context * ctx) +{ + int err = 0; +#ifdef CONFIG_VE_IPTABLES + int pid; + int pfd[2]; + struct file *f; + struct cpt_object_hdr v; + char buf[16]; + loff_t pos; + int n; + int status; + mm_segment_t oldfs; + sigset_t ignore, blocked; + + if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD)) + return 0; + + err = sc_pipe(pfd); + if (err < 0) { + eprintk_ctx("sc_pipe: %d\n", err); + return err; + } + ignore.sig[0] = CPT_SIG_IGNORE_MASK; + sigprocmask(SIG_BLOCK, &ignore, &blocked); + err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); + if (err < 0) { + eprintk_ctx("local_kernel_thread: %d\n", err); + goto out; + } + + f = fget(pfd[0]); + sc_close(pfd[1]); + sc_close(pfd[0]); + + cpt_open_section(ctx, CPT_SECT_NET_IPTABLES); + + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_NAME; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_NAME; + + ctx->write(&v, sizeof(v), ctx); + + pos = ctx->file->f_pos; + do { + oldfs = get_fs(); set_fs(KERNEL_DS); + n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); + set_fs(oldfs); + if (n > 0) + ctx->write(buf, n, ctx); + } while (n > 0); + + if (n < 0) + eprintk_ctx("read: %d\n", n); + + fput(f); + + oldfs = get_fs(); set_fs(KERNEL_DS); + if ((err = sc_waitx(pid, 0, &status)) < 0) + eprintk_ctx("wait4: %d\n", err); + else if ((status & 0x7f) == 0) { + err = (status & 0xff00) >> 8; + if (err != 0) { + eprintk_ctx("iptables-save exited with %d\n", err); + err = -EINVAL; + } + } else { + eprintk_ctx("iptables-save terminated\n"); + err = -EINVAL; + } + set_fs(oldfs); + sigprocmask(SIG_SETMASK, &blocked, NULL); + + if (ctx->file->f_pos != pos) { + buf[0] = 0; + ctx->write(buf, 1, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + cpt_close_section(ctx); + } else { + pos = ctx->current_section; + cpt_close_object(ctx); + cpt_close_section(ctx); + ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL; + ctx->file->f_pos = pos; + } + return n ? 
: err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+#endif
+	return err;
+}
+
+int cpt_dump_ifinfo(struct cpt_context * ctx)
+{
+	int err;
+
+	rtnl_lock();
+	err = cpt_dump_link(ctx);
+	if (!err)
+		err = cpt_dump_ifaddr(ctx);
+	rtnl_unlock();
+	if (!err)
+		err = cpt_dump_route(ctx);
+	if (!err)
+		err = cpt_dump_iptables(ctx);
+	return err;
+}
diff -uprN linux-2.6.24/kernel/cpt/cpt_net.h linux-2.6.24.ovz/kernel/cpt/cpt_net.h
--- linux-2.6.24/kernel/cpt/cpt_net.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.24.ovz/kernel/cpt/cpt_net.h	2008-03-25 18:53:59.000000000 -0500
@@ -0,0 +1,7 @@
+int cpt_dump_ifinfo(struct cpt_context *ctx);
+int rst_restore_net(struct cpt_context *ctx);
+int cpt_suspend_network(struct cpt_context *ctx);
+int cpt_resume_network(struct cpt_context *ctx);
+int rst_resume_network(struct cpt_context *ctx);
+int cpt_dump_ip_conntrack(struct cpt_context *ctx);
+int rst_restore_ip_conntrack(struct cpt_context * ctx);
diff -uprN linux-2.6.24/kernel/cpt/cpt_obj.c linux-2.6.24.ovz/kernel/cpt/cpt_obj.c
--- linux-2.6.24/kernel/cpt/cpt_obj.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.24.ovz/kernel/cpt/cpt_obj.c	2008-03-25 18:53:59.000000000 -0500
@@ -0,0 +1,162 @@
+/*
+ *
+ *  kernel/cpt/cpt_obj.c
+ *
+ *  Copyright (C) 2000-2005  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = kmalloc(sizeof(cpt_object_t), gfp);
+	if (obj) {
+		INIT_LIST_HEAD(&obj->o_list);
+		INIT_LIST_HEAD(&obj->o_hash);
+		INIT_LIST_HEAD(&obj->o_alist);
+		obj->o_count = 1;
+		obj->o_pos = CPT_NULL;
+		obj->o_lock = 0;
+		obj->o_parent = NULL;
+		obj->o_index = CPT_NOINDEX;
+		obj->o_obj = NULL;
+		obj->o_image = NULL;
+		ctx->objcount++;
+	}
+	return obj;
+}
+
+void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_del(&obj->o_alist);
+	kfree(obj);
+	ctx->objcount--;
+}
+
+void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
+{
+	list_add_tail(&obj->o_list, &ctx->object_array[type]);
+}
+
+void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
+		       cpt_object_t *head, cpt_context_t *ctx)
+{
+	list_add(&obj->o_list, &head->o_list);
+}
+
+cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
+				unsigned gfp_mask, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj) {
+		obj->o_count++;
+		return obj;
+	}
+
+	if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
+		if (p)
+			cpt_obj_setobj(obj, p, ctx);
+		intern_cpt_object(type, obj, ctx);
+		return obj;
+	}
+	return NULL;
+}
+
+cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	return __cpt_object_add(type, p, GFP_KERNEL, ctx);
+}
+
+cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+	cpt_object_t *obj;
+
+	obj = lookup_cpt_object(type, p, ctx);
+
+	if (obj)
+		obj->o_count++;
+
+	return obj;
+}
+
+int cpt_object_init(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i = 0; i < CPT_OBJ_MAX; i++) {
+		INIT_LIST_HEAD(&ctx->object_array[i]);
+	}
+	return 0;
+}
+
+int cpt_object_destroy(cpt_context_t *ctx)
+{
+	int i;
+
+	for (i = 0; i < CPT_OBJ_MAX; i++) {
+		while (!list_empty(&ctx->object_array[i])) {
+			struct list_head *head = ctx->object_array[i].next;
+			cpt_object_t *obj = list_entry(head, 
cpt_object_t, o_list); + list_del(head); + if (obj->o_image) + kfree(obj->o_image); + free_cpt_object(obj, ctx); + } + } + if (ctx->objcount != 0) + eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount); + return 0; +} + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_obj == p) + return obj; + } + return NULL; +} + +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_pos == pos) + return obj; + } + return NULL; +} + +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, type) { + if (obj->o_index == index) + return obj; + } + return NULL; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_obj.h linux-2.6.24.ovz/kernel/cpt/cpt_obj.h --- linux-2.6.24/kernel/cpt/cpt_obj.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_obj.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,62 @@ +#ifndef __CPT_OBJ_H_ +#define __CPT_OBJ_H_ 1 + +#include +#include + +typedef struct _cpt_object +{ + struct list_head o_list; + struct list_head o_hash; + int o_count; + int o_index; + int o_lock; + loff_t o_pos; + loff_t o_ppos; + void *o_obj; + void *o_image; + void *o_parent; + struct list_head o_alist; +} cpt_object_t; + +struct cpt_context; + +#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list) + + +extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx); +extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx); + +cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx); +cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx); + +static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx) +{ + cpt->o_pos = pos; + /* Add to pos hash table */ +} + +static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx) +{ + cpt->o_obj = ptr; + /* Add to hash table */ +} + +static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx) +{ + cpt->o_index = index; + /* Add to index hash table */ +} + + +extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx); +extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx); +extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx); +extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); + +extern int cpt_object_init(struct cpt_context *ctx); +extern int cpt_object_destroy(struct cpt_context *ctx); + +#endif /* __CPT_OBJ_H_ */ diff -uprN linux-2.6.24/kernel/cpt/cpt_proc.c linux-2.6.24.ovz/kernel/cpt/cpt_proc.c --- linux-2.6.24/kernel/cpt/cpt_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_proc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,596 @@ +/* + * + * kernel/cpt/cpt_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. 
+ * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void cpt_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + cpt_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); + if (ctx->pgin_dir) + cpt_free_pgin_dir(ctx); + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); +#endif + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + if (ctx->file) + fput(ctx->file); + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } + if (ctx->statusfile) + fput(ctx->statusfile); + if (ctx->lockfile) + fput(ctx->lockfile); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + cpt_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * cpt_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + cpt_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +static cpt_context_t * cpt_context_lookup(unsigned int contextid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == contextid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +int cpt_context_lookup_veid(unsigned int veid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->ve_id == veid && ctx->ctx_state > 0) { + spin_unlock(&cpt_context_lock); + return 1; + } + } + spin_unlock(&cpt_context_lock); + return 0; +} + +static 
int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + int try; + + unlock_kernel(); + + if (cmd == CPT_VMPREP) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = cpt_mm_prepare(arg); +#else + err = -EINVAL; +#endif + goto out_lock; + } + + if (cmd == CPT_TEST_CAPS) { + unsigned int src_flags, dst_flags = arg; + + err = 0; + src_flags = test_cpu_caps(); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = cpt_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + if (ctx->contextid && ctx->contextid != contextid) { + err = -EINVAL; + goto out_nosem; + } + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state == CPT_CTX_DUMPING) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->write == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + fput(ctx->file); + ctx->file = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + 
fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case CPT_SET_PAGEINFDOUT: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_SET_LAZY: + ctx->lazy_vm = arg; + break; + case CPT_ITER: + err = cpt_iteration(ctx); + break; + case CPT_PAGEIND: + err = cpt_start_pagein(ctx); + break; +#endif + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ve_id = arg; + break; + case CPT_SET_CPU_FLAGS: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->dst_cpu_flags = arg; + ctx->src_cpu_flags = test_cpu_caps(); + break; + case CPT_SUSPEND: + if (cpt_context_lookup_veid(ctx->ve_id) || + ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ctx_state = CPT_CTX_SUSPENDING; + try = 0; + do { + err = cpt_vps_suspend(ctx); + if (err) + cpt_resume(ctx); + if (err == -EAGAIN) + msleep(1000); + try++; + } while (err == -EAGAIN && try < 3); + if (err) { + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_SUSPENDED; + } + break; + case CPT_DUMP: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + if (!ctx->file) { + err = -EBADF; + break; + } + err = cpt_dump(ctx); + break; + case CPT_RESUME: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (ctx->ctx_state == CPT_CTX_IDLE) { + err = -ENOENT; + break; + } + err = cpt_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_TEST_VECAPS: + { + __u32 dst_flags = arg; + __u32 src_flags; + + err = cpt_vps_caps(ctx, &src_flags); + if (err) + break; + + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err); + test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err); + if (src_flags & CPT_UNSUPPORTED_MASK) + err = 2; + break; + } + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || + err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) + err = -EINTR; + return err; +} + +static int cpt_open(struct inode *inode, struct file *file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int cpt_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + module_put(THIS_MODULE); + return 0; +} + + +static struct file_operations cpt_fops = { + .owner = THIS_MODULE, + .open = 
cpt_open, + .release = cpt_release, + .ioctl = cpt_ioctl, +}; + +static struct proc_dir_entry *proc_ent; + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .ctl_name = 9475, + .procname = "cpt", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_cpt(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = create_proc_entry_mod("cpt", 0600, NULL, THIS_MODULE); + if (!proc_ent) + goto err_out; + + cpt_fops.read = proc_ent->proc_fops->read; + cpt_fops.write = proc_ent->proc_fops->write; + cpt_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &cpt_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + proc_ent->owner = THIS_MODULE; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_cpt); + +static void __exit exit_cpt(void) +{ + remove_proc_entry("cpt", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_cpt); diff -uprN linux-2.6.24/kernel/cpt/cpt_process.c linux-2.6.24.ovz/kernel/cpt/cpt_process.c --- linux-2.6.24/kernel/cpt/cpt_process.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_process.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1371 @@ +/* + * + * kernel/cpt/cpt_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + +#ifdef CONFIG_X86_32 +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) +#endif + +int check_task_state(struct task_struct *tsk, struct cpt_context *ctx) +{ +#ifdef CONFIG_X86_64 + if (!(task_thread_info(tsk)->flags&_TIF_IA32)) { + if (task_pt_regs(tsk)->rip >= VSYSCALL_START && + task_pt_regs(tsk)->rip < VSYSCALL_END) { + eprintk_ctx(CPT_FID "cannot be checkpointied while vsyscall, try later\n", CPT_TID(tsk)); + return -EAGAIN; + } + } +#endif + return 0; +} + +#ifdef CONFIG_X86 + +static u32 encode_segment(u32 segreg) +{ + segreg &= 0xFFFF; + + if (segreg == 0) + return CPT_SEG_ZERO; + if ((segreg & 3) != 3) { + wprintk("Invalid RPL of a segment reg %x\n", segreg); + return CPT_SEG_ZERO; + } + + /* LDT descriptor, it is just an index to LDT array */ + if (segreg & 4) + return CPT_SEG_LDT + (segreg >> 3); + + /* TLS descriptor. 
*/ + if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN && + (segreg >> 3) <= GDT_ENTRY_TLS_MAX) + return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN); + + /* One of standard desriptors */ +#ifdef CONFIG_X86_64 + if (segreg == __USER32_DS) + return CPT_SEG_USER32_DS; + if (segreg == __USER32_CS) + return CPT_SEG_USER32_CS; + if (segreg == __USER_DS) + return CPT_SEG_USER64_DS; + if (segreg == __USER_CS) + return CPT_SEG_USER64_CS; +#else + if (segreg == __USER_DS) + return CPT_SEG_USER32_DS; + if (segreg == __USER_CS) + return CPT_SEG_USER32_CS; +#endif + wprintk("Invalid segment reg %x\n", segreg); + return CPT_SEG_ZERO; +} + +#ifdef CONFIG_X86_64 +static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s, + struct task_struct *tsk) +{ + d->cpt_ebp = s->rbp; + d->cpt_ebx = s->rbx; + d->cpt_eax = s->rax; + d->cpt_ecx = s->rcx; + d->cpt_edx = s->rdx; + d->cpt_esi = s->rsi; + d->cpt_edi = s->rdi; + d->cpt_orig_eax = s->orig_rax; + d->cpt_eip = s->rip; + d->cpt_xcs = encode_segment(s->cs); + d->cpt_eflags = s->eflags; + d->cpt_esp = s->rsp; + d->cpt_xss = encode_segment(s->ss); + d->cpt_xds = encode_segment(tsk->thread.ds); + d->cpt_xes = encode_segment(tsk->thread.es); +} + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + cpt_open_object(NULL, ctx); + + if (task_thread_info(tsk)->flags & _TIF_IA32) { + struct cpt_x86_regs ri; + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + ri.cpt_fs = encode_segment(tsk->thread.fsindex); + ri.cpt_gs = encode_segment(tsk->thread.gsindex); + + xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk); + + ctx->write(&ri, sizeof(ri), ctx); + } else { + struct cpt_x86_64_regs ri; + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_64_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_fsbase = tsk->thread.fs; + ri.cpt_gsbase = tsk->thread.gs; + ri.cpt_fsindex = encode_segment(tsk->thread.fsindex); + ri.cpt_gsindex = encode_segment(tsk->thread.gsindex); + ri.cpt_ds = encode_segment(tsk->thread.ds); + ri.cpt_es = encode_segment(tsk->thread.es); + ri.cpt_debugreg[0] = tsk->thread.debugreg0; + ri.cpt_debugreg[1] = tsk->thread.debugreg1; + ri.cpt_debugreg[2] = tsk->thread.debugreg2; + ri.cpt_debugreg[3] = tsk->thread.debugreg3; + ri.cpt_debugreg[4] = 0; + ri.cpt_debugreg[5] = 0; + ri.cpt_debugreg[6] = tsk->thread.debugreg6; + ri.cpt_debugreg[7] = tsk->thread.debugreg7; + + memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs)); + + ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs); + ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss); + + ctx->write(&ri, sizeof(ri), ctx); + + } + cpt_close_object(ctx); + + return 0; +} + +#else + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_x86_regs ri; + struct pt_regs *pt_regs; + + cpt_open_object(NULL, ctx); + + ri.cpt_next = sizeof(ri); + ri.cpt_object = CPT_OBJ_X86_REGS; + ri.cpt_hdrlen = sizeof(ri); + ri.cpt_content = CPT_CONTENT_VOID; + + ri.cpt_debugreg[0] = tsk->thread.debugreg[0]; + ri.cpt_debugreg[1] = tsk->thread.debugreg[1]; + ri.cpt_debugreg[2] = tsk->thread.debugreg[2]; + 
ri.cpt_debugreg[3] = tsk->thread.debugreg[3]; + ri.cpt_debugreg[4] = tsk->thread.debugreg[4]; + ri.cpt_debugreg[5] = tsk->thread.debugreg[5]; + ri.cpt_debugreg[6] = tsk->thread.debugreg[6]; + ri.cpt_debugreg[7] = tsk->thread.debugreg[7]; + + pt_regs = task_pt_regs(tsk); + + ri.cpt_fs = encode_segment(pt_regs->xfs); + ri.cpt_gs = encode_segment(tsk->thread.gs); + + ri.cpt_ebx = pt_regs->ebx; + ri.cpt_ecx = pt_regs->ecx; + ri.cpt_edx = pt_regs->edx; + ri.cpt_esi = pt_regs->esi; + ri.cpt_edi = pt_regs->edi; + ri.cpt_ebp = pt_regs->ebp; + ri.cpt_eax = pt_regs->eax; + ri.cpt_xds = pt_regs->xds; + ri.cpt_xes = pt_regs->xes; + ri.cpt_orig_eax = pt_regs->orig_eax; + ri.cpt_eip = pt_regs->eip; + ri.cpt_xcs = pt_regs->xcs; + ri.cpt_eflags = pt_regs->eflags; + ri.cpt_esp = pt_regs->esp; + ri.cpt_xss = pt_regs->xss; + + ri.cpt_xcs = encode_segment(pt_regs->xcs); + ri.cpt_xss = encode_segment(pt_regs->xss); + ri.cpt_xds = encode_segment(pt_regs->xds); + ri.cpt_xes = encode_segment(pt_regs->xes); + + ctx->write(&ri, sizeof(ri), ctx); + cpt_close_object(ctx); + + return 0; +} +#endif +#endif + +#ifdef CONFIG_IA64 + +/* + PMD? + */ + +#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \ + CPT_TID(tsk), err); return -EINVAL; } } while (0) + +static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk, + struct cpt_context *ctx) +{ + int err; + struct unw_frame_info info; + struct ia64_fpreg fpval; + int i; + + unw_init_from_blocked_task(&info, tsk); + _C(unw_unwind_to_user(&info)); + + /* NAT_BITS */ + do { + unsigned long scratch_unat; + + scratch_unat = info.sw->caller_unat; + if (info.pri_unat_loc) + scratch_unat = *info.pri_unat_loc; + + r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat); + /* Just to be on safe side. 
*/ + r->nat[0] &= 0xFFFFFFFFUL; + } while (0); + + /* R4-R7 */ + for (i = 4; i <= 7; i++) { + char nat = 0; + _C(unw_access_gr(&info, i, &r->gr[i], &nat, 0)); + r->nat[0] |= (nat != 0) << i; + } + + /* B1-B5 */ + for (i = 1; i <= 5; i++) { + _C(unw_access_br(&info, i, &r->br[i], 0)); + } + + /* AR_EC, AR_LC */ + _C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0)); + _C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0)); + + /* F2..F5, F16..F31 */ + for (i = 2; i <= 5; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + for (i = 16; i <= 31; i++) { + _C(unw_get_fr(&info, i, &fpval)); + memcpy(&r->fr[i*2], &fpval, 16); + } + return 0; +} + +#undef _C + +static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) +{ + int err; + unsigned long pg; + struct cpt_ia64_regs *r; + struct ia64_psr *psr; + struct switch_stack *sw; + struct pt_regs *pt; + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (tsk->exit_state) + return 0; + + pt = task_pt_regs(tsk); + + sw = (struct switch_stack *) (tsk->thread.ksp + 16); + + if ((pg = __get_free_page(GFP_KERNEL)) == 0) + return -ENOMEM; + + r = (void*)pg; + /* To catch if we forgot some register */ + memset(r, 0xA5, sizeof(*r)); + + r->gr[0] = 0; + r->fr[0] = r->fr[1] = 0; + r->fr[2] = 0x8000000000000000UL; + r->fr[3] = 0xffff; + + r->nat[0] = r->nat[1] = 0; + + err = ass_to_mouth(r, tsk, ctx); + if (err) { + printk("ass_to_mouth error %d\n", err); + goto out; + } + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&r->gr[1], &pt->r1, 8*(2-1)); + memcpy(&r->gr[2], &pt->r2, 8*(4-2)); + memcpy(&r->gr[8], &pt->r8, 8*(12-8)); + memcpy(&r->gr[12], &pt->r12, 8*(14-12)); + memcpy(&r->gr[14], &pt->r14, 8*(15-14)); + memcpy(&r->gr[15], &pt->r15, 8*(16-15)); + memcpy(&r->gr[16], &pt->r16, 8*(32-16)); + + r->br[0] = pt->b0; + r->br[6] = pt->b6; + r->br[7] = pt->b7; + + r->ar_bspstore = pt->ar_bspstore; + r->ar_unat = pt->ar_unat; + r->ar_pfs = pt->ar_pfs; + r->ar_ccv = pt->ar_ccv; + r->ar_fpsr = pt->ar_fpsr; + r->ar_csd = pt->ar_csd; + r->ar_ssd = pt->ar_ssd; + r->ar_rsc = pt->ar_rsc; + + r->cr_iip = pt->cr_iip; + r->cr_ipsr = pt->cr_ipsr; + + r->pr = pt->pr; + + r->cfm = pt->cr_ifs; + r->ar_rnat = pt->ar_rnat; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&r->fr[2*6], &pt->f6, 16*(10-6)); + memcpy(&r->fr[2*10], &pt->f10, 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&r->fr[2*12], &sw->f12, 16*(16-12)); + /* fpregs 32...127 */ + psr = ia64_psr(task_pt_regs(tsk)); + preempt_disable(); + if (ia64_is_local_fpu_owner(tsk) && psr->mfh) { + psr->mfh = 0; + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + ia64_save_fpu(&tsk->thread.fph[0]); + } + preempt_enable(); + memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32)); + + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) { + memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr)); + memcpy(r->dbr, tsk->thread.dbr, sizeof(r->ibr)); + } else { + memset(r->ibr, 0, sizeof(r->ibr)); + memset(r->dbr, 0, sizeof(r->dbr)); + } + + r->loadrs = pt->loadrs; + r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19)); + if ((long)pt->cr_ifs > 0) + r->num_regs += (pt->cr_ifs & 0x7f); + + if (r->num_regs > 96) { + eprintk_ctx(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + return -EINVAL; + } + + for (reg = 0; reg < r->num_regs; reg++) { + unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); + unsigned long *rnatp = ia64_rse_rnat_addr(ptr); + + r->gr[32+reg] = *ptr; + + if ((unsigned long)rnatp >= sw->ar_bspstore) + rnatp = 
&sw->ar_rnat; + if (*rnatp & (1UL<nat[0] |= (1UL<<(reg+32)); + else + r->nat[1] |= (1UL<<(reg-32)); + } + } + if (r->nat[0] | r->nat[1]) + wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk), + r->nat[1], r->nat[0]); + + cpt_open_object(NULL, ctx); + r->cpt_next = sizeof(*r); + r->cpt_object = CPT_OBJ_IA64_REGS; + r->cpt_hdrlen = sizeof(*r); + r->cpt_content = CPT_CONTENT_VOID; + ctx->write(r, sizeof(*r), ctx); + cpt_close_object(ctx); + err = 0; + +out: + free_page(pg); + return err; +} +#endif + +static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + void *start; + + cpt_open_object(NULL, ctx); + +#ifdef CONFIG_X86_64 + size = tsk->thread.rsp0 - tsk->thread.rsp; + start = (void*)tsk->thread.rsp; +#elif defined(CONFIG_X86_32) + size = tsk->thread.esp0 - tsk->thread.esp; + start = (void*)tsk->thread.esp; +#elif defined(CONFIG_IA64) + size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp; + start = (void*)tsk->thread.ksp; +#else +#error Arch is not supported +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = CPT_CONTENT_STACK; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(start, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} + +#ifdef CONFIG_X86 +/* Formats of i387_fxsave_struct are the same for x86_64 + * and i386. Plain luck. */ + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + struct cpt_obj_bits hdr; + unsigned long size; + int type; + + cpt_open_object(NULL, ctx); + + type = CPT_CONTENT_X86_FPUSTATE; + size = sizeof(struct i387_fxsave_struct); +#ifndef CONFIG_X86_64 + if (!cpu_has_fxsr) { + size = sizeof(struct i387_fsave_struct); + type = CPT_CONTENT_X86_FPUSTATE_OLD; + } +#endif + + hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); + hdr.cpt_object = CPT_OBJ_BITS; + hdr.cpt_hdrlen = sizeof(hdr); + hdr.cpt_content = type; + hdr.cpt_size = size; + + ctx->write(&hdr, sizeof(hdr), ctx); + ctx->write(&tsk->thread.i387, size, ctx); + ctx->align(ctx); + cpt_close_object(ctx); + return 0; +} +#endif + +#ifdef CONFIG_IA64 + +static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx) +{ + return 0; +} +#endif + +static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info) +{ + si->cpt_signo = info->si_signo; + si->cpt_errno = info->si_errno; + si->cpt_code = info->si_code; + + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + si->cpt_pid = info->si_tid; + si->cpt_uid = info->si_overrun; + si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr); + si->cpt_utime = info->si_sys_private; + break; + case __SI_POLL: + si->cpt_pid = info->si_band; + si->cpt_uid = info->si_fd; + break; + case __SI_FAULT: + si->cpt_sigval = cpt_ptr_export(info->si_addr); +#ifdef __ARCH_SI_TRAPNO + si->cpt_pid = info->si_trapno; +#endif + break; + case __SI_CHLD: + si->cpt_pid = info->si_pid; + si->cpt_uid = info->si_uid; + si->cpt_sigval = info->si_status; + si->cpt_stime = info->si_stime; + si->cpt_utime = info->si_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + si->cpt_pid = info->si_pid; + si->cpt_uid = info->si_uid; + si->cpt_sigval = cpt_ptr_export(info->si_ptr); + break; + } + return 0; +} + +static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx) +{ + struct sigqueue *q; + loff_t saved_obj; + + if (list_empty(&list->list)) + return 0; + + 
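/* Each queued signal is written out as a nested CPT_OBJ_SIGINFO record. */
+	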
cpt_push_object(&saved_obj, ctx); + list_for_each_entry(q, &list->list, list) { + struct cpt_siginfo_image si; + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_qflags = q->flags; + si.cpt_user = q->user->uid; + + if (encode_siginfo(&si, &q->info)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + } + cpt_pop_object(&saved_obj, ctx); + return 0; +} + + + +static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct signal_struct *sig = obj->o_obj; + struct cpt_signal_image *v = cpt_get_buf(ctx); + struct task_struct *tsk; + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + if (sig->__pgrp <= 0) { + eprintk_ctx("bad pgid\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid(sig->__pgrp); + if (tsk == NULL) + v->cpt_pgrp_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_pgrp = pid_to_vpid(sig->__pgrp); + + v->cpt_old_pgrp = 0; +/* if (!sig->tty_old_pgrp) { + eprintk_ctx("bad tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + }*/ + if (sig->tty_old_pgrp) { + v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PID); + if (tsk == NULL) { + v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; + tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID); + } + read_unlock(&tasklist_lock); + if (tsk == NULL) { + eprintk_ctx("tty_old_pgrp does not exist anymore\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_old_pgrp = pid_vnr(sig->tty_old_pgrp); + if ((int)v->cpt_old_pgrp < 0) { + dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp)); + v->cpt_old_pgrp = -1; + v->cpt_old_pgrp_type = CPT_PGRP_STRAY; + } + } + + if (sig->__session <= 0) { + eprintk_ctx("bad session\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_session_type = CPT_PGRP_NORMAL; + read_lock(&tasklist_lock); + tsk = find_task_by_pid(sig->__session); + if (tsk == NULL) + v->cpt_session_type = CPT_PGRP_ORPHAN; + read_unlock(&tasklist_lock); + v->cpt_session = pid_to_vpid(sig->__session); + + v->cpt_leader = sig->leader; + v->cpt_ctty = CPT_NULL; + if (sig->tty) { + cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); + if (cobj) + v->cpt_ctty = cobj->o_pos; + else { + eprintk_ctx("controlling tty is not found\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); + + v->cpt_curr_target = 0; + if (sig->curr_target) + v->cpt_curr_target = task_pid_vnr(sig->curr_target); + v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); + v->cpt_group_exit_code = sig->group_exit_code; + v->cpt_group_exit_task = 0; + if (sig->group_exit_task) + v->cpt_group_exit_task = task_pid_vnr(sig->group_exit_task); + v->cpt_notify_count = sig->notify_count; + v->cpt_group_stop_count = sig->group_stop_count; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8) + v->cpt_utime = sig->utime; + v->cpt_stime = sig->stime; + v->cpt_cutime = sig->cutime; + v->cpt_cstime = sig->cstime; + v->cpt_nvcsw = sig->nvcsw; + v->cpt_nivcsw = sig->nivcsw; + v->cpt_cnvcsw = sig->cnvcsw; + v->cpt_cnivcsw = sig->cnivcsw; + v->cpt_min_flt = sig->min_flt; + v->cpt_maj_flt = sig->maj_flt; + v->cpt_cmin_flt = sig->cmin_flt; + v->cpt_cmaj_flt = sig->cmaj_flt; + + if 
(RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+		__asm__("undefined\n");
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#endif
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	dump_sigqueue(&sig->shared_pending, ctx);
+
+	cpt_close_object(ctx);
+	return 0;
+}
+
+int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx)
+{
+	if (tsk->splice_pipe) {
+		eprintk_ctx("splice is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#ifdef CONFIG_KEYS
+	if (tsk->request_key_auth || tsk->thread_keyring) {
+		eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+#ifdef CONFIG_NUMA
+	if (tsk->mempolicy) {
+		eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+#ifdef CONFIG_TUX
+	if (tsk->tux_info) {
+		eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk));
+		return -EBUSY;
+	}
+#endif
+	return 0;
+}
+
+static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
+{
+	struct task_struct *tsk = obj->o_obj;
+	int last_thread;
+	struct cpt_task_image *v = cpt_get_buf(ctx);
+	cpt_object_t *tobj;
+	cpt_object_t *tg_obj;
+	loff_t saved_obj;
+	int i;
+	int err;
+	struct timespec delta;
+	struct mm_struct * tsk_mm;
+	struct files_struct * tsk_files;
+	struct fs_struct * tsk_fs;
+	struct mnt_namespace * tsk_ns;
+
+	cpt_open_object(obj, ctx);
+
+	v->cpt_signal = CPT_NULL;
+	tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
+	if (!tg_obj) BUG();
+
+	v->cpt_next = CPT_NULL;
+	v->cpt_object = CPT_OBJ_TASK;
+	v->cpt_hdrlen = sizeof(*v);
+	v->cpt_content = CPT_CONTENT_ARRAY;
+
+	v->cpt_state = tsk->state;
+	if (tsk->state == EXIT_ZOMBIE) {
+		eprintk_ctx("invalid zombie state on" CPT_FID "\n", CPT_TID(tsk));
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	} else if (tsk->state == EXIT_DEAD) {
+		if (tsk->exit_state != EXIT_DEAD &&
+		    tsk->exit_state != EXIT_ZOMBIE) {
+			eprintk_ctx("invalid exit_state %d on" CPT_FID "\n", tsk->exit_state, CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	if (tsk->exit_state) {
+		v->cpt_state = tsk->exit_state;
+		if (tsk->state != EXIT_DEAD) {
+			eprintk_ctx("invalid tsk->state %ld/%d on" CPT_FID "\n",
+				tsk->state, tsk->exit_state, CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+	}
+	if (cpt_check_unsupported(tsk, ctx)) {
+		cpt_release_buf(ctx);
+		return -EBUSY;
+	}
+
+	v->cpt_flags = tsk->flags&~(PF_FROZEN|PF_EXIT_RESTART);
+	v->cpt_ptrace = tsk->ptrace;
+	v->cpt_prio = tsk->prio;
+	v->cpt_exit_code = tsk->exit_code;
+	v->cpt_exit_signal = tsk->exit_signal;
+	v->cpt_pdeath_signal = tsk->pdeath_signal;
+	v->cpt_static_prio = tsk->static_prio;
+	v->cpt_rt_priority = tsk->rt_priority;
+	v->cpt_policy = tsk->policy;
+	if (v->cpt_policy != SCHED_NORMAL) {
+		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	/* Unpleasant moment. When leader of thread group exits,
+	 * it remains in zombie state until all the group exits.
+	 * We save not-NULL pointers to process mm/files/fs, so
+	 * that we can restore this thread group.
+	 */
+	tsk_mm = tsk->mm;
+	tsk_files = tsk->files;
+	tsk_fs = tsk->fs;
+	tsk_ns = tsk->nsproxy ? 
tsk->nsproxy->mnt_ns : NULL; + + if (tsk->exit_state && !thread_group_empty(tsk) && + thread_group_leader(tsk)) { + struct task_struct * p = tsk; + + read_lock(&tasklist_lock); + do { + if (p->mm) + tsk_mm = p->mm; + if (p->files) + tsk_files = p->files; + if (p->fs) + tsk_fs = p->fs; + if (p->nsproxy && p->nsproxy->mnt_ns) + tsk_ns = p->nsproxy->mnt_ns; + p = next_thread(p); + } while (p != tsk); + read_unlock(&tasklist_lock); + } + + v->cpt_mm = CPT_NULL; + if (tsk_mm) { + tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx); + if (!tobj) BUG(); + v->cpt_mm = tobj->o_pos; + } + v->cpt_files = CPT_NULL; + if (tsk_files) { + tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx); + if (!tobj) BUG(); + v->cpt_files = tobj->o_pos; + } + v->cpt_fs = CPT_NULL; + if (tsk_fs) { + tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx); + if (!tobj) BUG(); + v->cpt_fs = tobj->o_pos; + } + v->cpt_namespace = CPT_NULL; + if (tsk_ns) { + tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx); + if (!tobj) BUG(); + v->cpt_namespace = tobj->o_pos; + + if (tsk_ns != current->nsproxy->mnt_ns) + eprintk_ctx("namespaces are not supported:" + "process " CPT_FID "\n", CPT_TID(tsk)); + } + v->cpt_sysvsem_undo = CPT_NULL; + if (tsk->sysvsem.undo_list && !tsk->exit_state) { + tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx); + if (!tobj) BUG(); + v->cpt_sysvsem_undo = tobj->o_pos; + } + v->cpt_sighand = CPT_NULL; + if (tsk->sighand) { + tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx); + if (!tobj) BUG(); + v->cpt_sighand = tobj->o_pos; + } + v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked); + v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked); + v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask); + + v->cpt_pid = task_pid_vnr(tsk); + v->cpt_tgid = task_tgid_vnr(tsk); + v->cpt_ppid = 0; + if (tsk->parent) { + if (tsk->parent != tsk->real_parent && + !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) { + eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); + cpt_release_buf(ctx); + return -EBUSY; + } + v->cpt_ppid = task_pid_vnr(tsk->parent); + } + v->cpt_rppid = tsk->real_parent ? task_pid_vnr(tsk->real_parent) : 0; + v->cpt_pgrp = task_pgrp_vnr(tsk); + v->cpt_session = task_session_vnr(tsk); + v->cpt_old_pgrp = 0; + if (tsk->signal->tty_old_pgrp) + v->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp); + v->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0; + v->cpt_set_tid = (unsigned long)tsk->set_child_tid; + v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid; + memcpy(v->cpt_comm, tsk->comm, 16); + v->cpt_user = tsk->user->uid; + v->cpt_uid = tsk->uid; + v->cpt_euid = tsk->euid; + v->cpt_suid = tsk->suid; + v->cpt_fsuid = tsk->fsuid; + v->cpt_gid = tsk->gid; + v->cpt_egid = tsk->egid; + v->cpt_sgid = tsk->sgid; + v->cpt_fsgid = tsk->fsgid; + v->cpt_ngids = 0; + if (tsk->group_info && tsk->group_info->ngroups != 0) { + int i = tsk->group_info->ngroups; + if (i > 32) { + /* Shame... I did a simplified version and _forgot_ + * about this. Later, later. 
*/
+			eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk));
+			return -EINVAL;
+		}
+		v->cpt_ngids = i;
+		for (i--; i>=0; i--)
+			v->cpt_gids[i] = tsk->group_info->small_block[i];
+	}
+	v->cpt_prctl_uac = 0;
+	v->cpt_prctl_fpemu = 0;
+	v->__cpt_pad1 = 0;
+#ifdef CONFIG_IA64
+	v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT;
+	v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT;
+#endif
+	memcpy(&v->cpt_ecap, &tsk->cap_effective, 8);
+	memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8);
+	memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8);
+	v->cpt_keepcap = tsk->keep_capabilities;
+
+	v->cpt_did_exec = tsk->did_exec;
+	v->cpt_exec_domain = -1;
+	v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
+	v->cpt_64bit = 0;
+#ifdef CONFIG_X86_64
+	/* Clear x86_64 specific flags */
+	v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
+	if (!(task_thread_info(tsk)->flags & _TIF_IA32)) {
+		ctx->tasks64++;
+		v->cpt_64bit = 1;
+	}
+#endif
+#ifdef CONFIG_IA64
+	/* Clear ia64 specific flags */
+	//// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
+	if (!IS_IA32_PROCESS(task_pt_regs(tsk))) {
+		ctx->tasks64++;
+		v->cpt_64bit = 1;
+	}
+#endif
+	v->cpt_thrstatus = task_thread_info(tsk)->status;
+	v->cpt_addr_limit = -1;
+
+	v->cpt_personality = tsk->personality;
+
+#ifdef CONFIG_X86
+	for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
+		if (i>=3) {
+			eprintk_ctx("too many tls descs\n");
+			cpt_release_buf(ctx);
+			return -EINVAL;
+		}
+#ifndef CONFIG_X86_64
+		v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
+#else
+		v->cpt_tls[i] = tsk->thread.tls_array[i];
+#endif
+	}
+#endif
+
+	v->cpt_restart.fn = CPT_RBL_0;
+	if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) {
+		struct restart_block *rb = &task_thread_info(tsk)->restart_block;
+		ktime_t e;
+
+		if (rb->fn == hrtimer_nanosleep_restart) {
+			v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
+
+			e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
+			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			v->cpt_restart.arg0 = rb->arg0;
+			v->cpt_restart.arg1 = rb->arg1;
+			v->cpt_restart.arg2 = ktime_to_ns(e);
+			v->cpt_restart.arg3 = 0;
+			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+			goto continue_dump;
+		}
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+		if (rb->fn == compat_nanosleep_restart) {
+			v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
+
+			e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
+			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			v->cpt_restart.arg0 = rb->arg0;
+			v->cpt_restart.arg1 = rb->arg1;
+			v->cpt_restart.arg2 = ktime_to_ns(e);
+			v->cpt_restart.arg3 = 0;
+			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+			goto continue_dump;
+		}
+#endif
+		if (rb->fn == do_restart_poll) {
+			u64 timeout_jiffies;
+
+			timeout_jiffies = ((u64)rb->arg3 << 32)|(u64)rb->arg2;
+			e.tv64 = timeout_jiffies * TICK_NSEC;
+
+			v->cpt_restart.fn = CPT_RBL_POLL;
+			v->cpt_restart.arg0 = rb->arg0;
+			v->cpt_restart.arg1 = rb->arg1;
+			v->cpt_restart.arg2 = ktime_to_ns(e);
+			v->cpt_restart.arg3 = 0;
+			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+			goto continue_dump;
+		}
+		if (rb->fn == futex_wait_restart) {
+			v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT;
+
+			e.tv64 = rb->futex.time;
+			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+			v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr;
+			v->cpt_restart.arg1 = 
rb->futex.val;
+			v->cpt_restart.arg2 = ktime_to_ns(e);
+			v->cpt_restart.arg3 = rb->futex.flags;
+			goto continue_dump;
+		}
+		eprintk_ctx("unknown restart block %p\n", rb->fn);
+		return -EINVAL;
+	}
+
+continue_dump:
+	v->cpt_it_real_incr = 0;
+	v->cpt_it_prof_incr = 0;
+	v->cpt_it_virt_incr = 0;
+	v->cpt_it_real_value = 0;
+	v->cpt_it_prof_value = 0;
+	v->cpt_it_virt_value = 0;
+	if (thread_group_leader(tsk) && tsk->exit_state == 0) {
+		ktime_t rem;
+
+		v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
+		v->cpt_it_prof_incr = tsk->signal->it_prof_incr;
+		v->cpt_it_virt_incr = tsk->signal->it_virt_incr;
+
+		rem = hrtimer_get_remaining(&tsk->signal->real_timer);
+
+		if (hrtimer_active(&tsk->signal->real_timer)) {
+			if (rem.tv64 <= 0)
+				rem.tv64 = NSEC_PER_USEC;
+			v->cpt_it_real_value = ktime_to_ns(rem);
+			dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value);
+		}
+		v->cpt_it_prof_value = tsk->signal->it_prof_expires;
+		v->cpt_it_virt_value = tsk->signal->it_virt_expires;
+	}
+	v->cpt_used_math = (tsk_used_math(tsk) != 0);
+
+	if (tsk->notifier) {
+		eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+	}
+
+	v->cpt_utime = tsk->utime;
+	v->cpt_stime = tsk->stime;
+	delta = tsk->start_time;
+	_set_normalized_timespec(&delta,
+			delta.tv_sec - get_exec_env()->start_timespec.tv_sec,
+			delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
+	v->cpt_starttime = cpt_timespec_export(&delta);
+	v->cpt_nvcsw = tsk->nvcsw;
+	v->cpt_nivcsw = tsk->nivcsw;
+	v->cpt_min_flt = tsk->min_flt;
+	v->cpt_maj_flt = tsk->maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+	v->cpt_cutime = tsk->cutime;
+	v->cpt_cstime = tsk->cstime;
+	v->cpt_cnvcsw = tsk->cnvcsw;
+	v->cpt_cnivcsw = tsk->cnivcsw;
+	v->cpt_cmin_flt = tsk->cmin_flt;
+	v->cpt_cmaj_flt = tsk->cmaj_flt;
+
+	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+		__asm__("undefined\n");
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#else
+	v->cpt_cutime = tsk->signal->cutime;
+	v->cpt_cstime = tsk->signal->cstime;
+	v->cpt_cnvcsw = tsk->signal->cnvcsw;
+	v->cpt_cnivcsw = tsk->signal->cnivcsw;
+	v->cpt_cmin_flt = tsk->signal->cmin_flt;
+	v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
+
+	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+		__asm__("undefined\n");
+
+	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+		if (i < RLIM_NLIMITS) {
+			v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
+			v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
+		} else {
+			v->cpt_rlim_cur[i] = CPT_NULL;
+			v->cpt_rlim_max[i] = CPT_NULL;
+		}
+	}
+#endif
+
+#ifdef CONFIG_BEANCOUNTERS
+	if (tsk->mm)
+		v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
+	else
+		v->cpt_mm_ub = CPT_NULL;
+	v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
+	v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
+	v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx);
+#endif
+
+	v->cpt_ptrace_message = tsk->ptrace_message;
+	v->cpt_pn_state = tsk->pn_state;
+	v->cpt_stopped_state = tsk->stopped_state;
+	v->cpt_sigsuspend_state = 0;
+
+#ifdef CONFIG_X86_32
+	if (tsk->thread.vm86_info) {
+		eprintk_ctx("vm86 task is running\n");
+		cpt_release_buf(ctx);
+		return -EBUSY;
+	}
+#endif
+
+	v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal);
+
+	ctx->write(v, sizeof(*v), ctx);
+	cpt_release_buf(ctx);
+
+	cpt_push_object(&saved_obj, ctx);
+	dump_kstack(tsk, ctx);
+	
cpt_pop_object(&saved_obj, ctx); + + cpt_push_object(&saved_obj, ctx); + err = dump_registers(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + if (err) + return err; + + if (tsk_used_math(tsk)) { + cpt_push_object(&saved_obj, ctx); + dump_fpustate(tsk, ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->last_siginfo) { + struct cpt_siginfo_image si; + cpt_push_object(&saved_obj, ctx); + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_LASTSIGINFO; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + if (encode_siginfo(&si, tsk->last_siginfo)) + return -EINVAL; + + ctx->write(&si, sizeof(si), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->sas_ss_size) { + struct cpt_sigaltstack_image si; + cpt_push_object(&saved_obj, ctx); + + si.cpt_next = sizeof(si); + si.cpt_object = CPT_OBJ_SIGALTSTACK; + si.cpt_hdrlen = sizeof(si); + si.cpt_content = CPT_CONTENT_VOID; + + si.cpt_stack = tsk->sas_ss_sp; + si.cpt_stacksize = tsk->sas_ss_size; + + ctx->write(&si, sizeof(si), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + if (tsk->robust_list +#ifdef CONFIG_COMPAT + || tsk->compat_robust_list +#endif + ) { + struct cpt_task_aux_image ai; + cpt_push_object(&saved_obj, ctx); + + ai.cpt_next = sizeof(ai); + ai.cpt_object = CPT_OBJ_TASK_AUX; + ai.cpt_hdrlen = sizeof(ai); + ai.cpt_content = CPT_CONTENT_VOID; + + ai.cpt_robust_list = (unsigned long)tsk->robust_list; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (task_thread_info(tsk)->flags & _TIF_IA32) + ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list; +#endif +#endif + ctx->write(&ai, sizeof(ai), ctx); + cpt_pop_object(&saved_obj, ctx); + } + + dump_sigqueue(&tsk->pending, ctx); + + last_thread = 1; + read_lock(&tasklist_lock); + do { + struct task_struct * next = next_thread(tsk); + if (next != tsk && !thread_group_leader(next)) + last_thread = 0; + } while (0); + read_unlock(&tasklist_lock); + + if (last_thread) { + struct task_struct *prev_tsk; + int err; + loff_t pos = ctx->file->f_pos; + + cpt_push_object(&saved_obj, ctx); + err = dump_one_signal_struct(tg_obj, ctx); + cpt_pop_object(&saved_obj, ctx); + if (err) + return err; + + prev_tsk = tsk; + for (;;) { + if (prev_tsk->tgid == tsk->tgid) { + loff_t tg_pos; + + tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal); + ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos); + if (thread_group_leader(prev_tsk)) + break; + } + + if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) { + eprintk_ctx("bug: thread group leader is lost\n"); + return -EINVAL; + } + + obj = list_entry(obj->o_list.prev, cpt_object_t, o_list); + prev_tsk = obj->o_obj; + } + } + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_tasks(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_TASKS); + + for_each_object(obj, CPT_OBJ_TASK) { + int err; + + if ((err = dump_one_process(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} + +int cpt_collect_signals(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + /* Collect process fd sets */ + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) { + eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); + return -EBUSY; + } + if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL) + return -ENOMEM; + if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL) + return -ENOMEM; + } + return 0; +} + 
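+/* dump_one_sighand_struct() below writes a CPT_OBJ_SIGHAND_STRUCT array
+ * header followed by one CPT_OBJ_SIGHANDLER record for each signal whose
+ * sigaction differs from the default (handler != SIG_DFL or non-zero
+ * sa_flags). */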
+ +static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct sighand_struct *sig = obj->o_obj; + struct cpt_sighand_image *v = cpt_get_buf(ctx); + int i; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SIGHAND_STRUCT; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + for (i=0; i< _NSIG; i++) { + if (sig->action[i].sa.sa_handler != SIG_DFL || + sig->action[i].sa.sa_flags) { + loff_t saved_obj; + struct cpt_sighandler_image *o = cpt_get_buf(ctx); + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + o->cpt_next = CPT_NULL; + o->cpt_object = CPT_OBJ_SIGHANDLER; + o->cpt_hdrlen = sizeof(*o); + o->cpt_content = CPT_CONTENT_VOID; + + o->cpt_signo = i; + o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler; + o->cpt_restorer = 0; +#ifdef CONFIG_X86 + o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer; +#endif + o->cpt_flags = sig->action[i].sa.sa_flags; + memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8); + ctx->write(o, sizeof(*o), ctx); + cpt_release_buf(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + } + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_sighand(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT); + + for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) { + int err; + + if ((err = dump_one_sighand_struct(obj, ctx)) != 0) + return err; + } + + cpt_close_section(ctx); + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_process.h linux-2.6.24.ovz/kernel/cpt/cpt_process.h --- linux-2.6.24/kernel/cpt/cpt_process.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_process.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,13 @@ +int cpt_collect_signals(cpt_context_t *); +int cpt_dump_signal(struct cpt_context *); +int cpt_dump_sighand(struct cpt_context *); +int cpt_dump_tasks(struct cpt_context *); + +int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx); +__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx); + +int rst_restore_process(struct cpt_context *ctx); +int rst_process_linkage(struct cpt_context *ctx); + +int check_task_state(struct task_struct *tsk, struct cpt_context *ctx); +struct pid *alloc_vpid_safe(pid_t vnr); diff -uprN linux-2.6.24/kernel/cpt/cpt_socket.c linux-2.6.24.ovz/kernel/cpt/cpt_socket.c --- linux-2.6.24/kernel/cpt/cpt_socket.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_socket.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,787 @@ +/* + * + * kernel/cpt/cpt_socket.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx); + + +/* Sockets are quite different of another kinds of files. + * There is one simplification: only one struct file can refer to a socket, + * so we could store information about socket directly in section FILES as + * a description of a file and append f.e. 
array of not-yet-accepted
+ * connections of a listening socket as an array of auxiliary data.
+ *
+ * Complications are:
+ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
+ * so we have to create a special section for orphans.
+ * 2. AF_UNIX sockets are distinguished objects: the set of links between
+ * AF_UNIX sockets is quite arbitrary.
+ * A. Each socket can refer to many files due to FD passing.
+ * B. Each socket except for connected ones can have skbs in its queue
+ * sent by any other socket.
+ *
+ * 2A is relatively easy: after our tasks are frozen we make an additional
+ * recursive pass through the set of collected files and pick up the files
+ * referenced by passed FDs. After the recursion ends, all the files are
+ * treated in the same way. They will all be stored in section FILES.
+ *
+ * 2B. We have to resolve all those references at some point.
+ * This is where the pipe-like approach to the image fails.
+ *
+ * All this makes socket checkpointing quite cumbersome.
+ * Right now we collect all the sockets and assign some numeric index value
+ * to each of them. The socket section is separate and put after section FILES,
+ * so section FILES refers to sockets by index, and section SOCKET refers to
+ * FILES as usual by position in the image. All the refs inside the socket
+ * section are by index. When restoring we read the socket section and create
+ * objects to hold the index <-> pos mappings. At the second pass we open
+ * sockets (simultaneously with their pairs) and create FILE objects.
+ */
+
+
+/* ====== FD passing ====== */
+
+/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we
+ * have to implement this. The problem is that in the general case we receive
+ * skbs from an unknown context, so new files can arrive at the checkpointed
+ * set of processes even after they are stopped. Well, we are just going
+ * to ignore unknown fds while doing the real checkpointing. It is fair because
+ * links outside the checkpointed set are going to fail anyway.
+ *
+ * ATTN: the procedure is recursive. We linearize the recursion by adding
+ * newly found files to the end of the file list, so they will be analyzed
+ * in the same loop.
+ */
+
+static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct socket *sock;
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	if (!S_ISSOCK(inode->i_mode))
+		return -ENOTSOCK;
+
+	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+	if (sock->ops->family != AF_UNIX)
+		return 0;
+
+	sk = sock->sk;
+
+	/* Subtle locking issue. skbs cannot be removed while
+	 * we are scanning, because all the processes are stopped.
+	 * They still can be added to the tail of the queue. Locking while
+	 * we dereference skb->next is enough to resolve this.
+	 * See above about collision with skbs added after we started
+	 * checkpointing.
+ */ + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + if (UNIXCB(skb).fp && skb->sk && + (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + int i; + + for (i = fpl->count-1; i >= 0; i--) { + if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL) + return -ENOMEM; + } + } + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + + return 0; +} + +int cpt_collect_passedfds(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { + int err; + + if ((err = collect_one_passedfd(file, ctx)) < 0) + return err; + } + } + + return 0; +} + +/* ====== End of FD passing ====== */ + +/* Must be called under bh_lock_sock() */ + +void clear_backlog(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + while (skb) { + struct sk_buff *next = skb->next; + + skb->next = NULL; + kfree_skb(skb); + skb = next; + } +} + +void release_sock_nobacklog(struct sock *sk) +{ + spin_lock_bh(&(sk->sk_lock.slock)); + clear_backlog(sk); + sk->sk_lock.owned = 0; + if (waitqueue_active(&(sk->sk_lock.wq))) + wake_up(&(sk->sk_lock.wq)); + spin_unlock_bh(&(sk->sk_lock.slock)); +} + +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, + struct cpt_context *ctx) +{ + struct cpt_skb_image *v = cpt_get_buf(ctx); + loff_t saved_obj; + struct timeval tmptv; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SKB; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_owner = owner; + v->cpt_queue = type; + skb_get_timestamp(skb, &tmptv); + v->cpt_stamp = cpt_timeval_export(&tmptv); + v->cpt_hspace = skb->data - skb->head; + v->cpt_tspace = skb->end - skb->tail; + v->cpt_h = skb_transport_header(skb) - skb->head; + v->cpt_nh = skb_network_header(skb) - skb->head; + v->cpt_mac = skb_mac_header(skb) - skb->head; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb)); + memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb)); + if (sizeof(skb->cb) > sizeof(v->cpt_cb)) { + int i; + for (i=sizeof(v->cpt_cb); icb); i++) { + if (skb->cb[i]) { + wprintk_ctx("dirty skb cb"); + break; + } + } + } + v->cpt_len = skb->len; + v->cpt_mac_len = skb->mac_len; + v->cpt_csum = skb->csum; + v->cpt_local_df = skb->local_df; + v->cpt_pkt_type = skb->pkt_type; + v->cpt_ip_summed = skb->ip_summed; + v->cpt_priority = skb->priority; + v->cpt_protocol = skb->protocol; + v->cpt_security = 0; + v->cpt_gso_segs = skb_shinfo(skb)->gso_segs; + v->cpt_gso_size = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type) { + eprintk_ctx("skb ufo is not supported\n"); + return -EINVAL; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (skb->len + (skb->data - skb->head) > 0) { + struct cpt_obj_bits ob; + loff_t saved_obj2; + + cpt_push_object(&saved_obj2, ctx); + cpt_open_object(NULL, ctx); + ob.cpt_next = CPT_NULL; + ob.cpt_object = CPT_OBJ_BITS; + ob.cpt_hdrlen = sizeof(ob); + ob.cpt_content = CPT_CONTENT_DATA; + ob.cpt_size = skb->len + v->cpt_hspace; + + ctx->write(&ob, sizeof(ob), ctx); + + ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx); + if (skb->data_len) { + int offset = skb->len - skb->data_len; + while (offset < skb->len) { + int copy = skb->len - offset; + 
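+			/* Non-linear (paged) skb data is pulled out with
+			 * skb_copy_bits() in chunks of at most PAGE_SIZE bytes
+			 * via the per-context scratch buffer and written to
+			 * the image. */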
if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy)) + BUG(); + ctx->write(ctx->tmpbuf, copy, ctx); + __cpt_release_buf(ctx); + offset += copy; + } + } + + ctx->align(ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj2, ctx); + } + + if (skb->sk && skb->sk->sk_family == AF_UNIX) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + + if (fpl) { + int i; + + for (i = 0; i < fpl->count; i++) { + struct cpt_fd_image v; + cpt_object_t *obj; + loff_t saved_obj2; + + obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx); + + if (!obj) { + eprintk_ctx("lost passed FD\n"); + return -EINVAL; + } + + cpt_push_object(&saved_obj2, ctx); + cpt_open_object(NULL, ctx); + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_FILEDESC; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_VOID; + + v.cpt_fd = i; + v.cpt_file = obj->o_pos; + v.cpt_flags = 0; + ctx->write(&v, sizeof(v), ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj2, ctx); + } + } + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + return 0; +} + +static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + struct sock *sk_cache = NULL; + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + int err; + + if (sk->sk_family == AF_UNIX) { + cpt_object_t *obj; + if (skb->sk != sk_cache) { + idx = -1; + sk_cache = NULL; + obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx); + if (obj) { + idx = obj->o_index; + sk_cache = skb->sk; + } else if (unix_peer(sk) != skb->sk) + goto next_skb; + } + } + + err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx); + if (err) + return err; + +next_skb: + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + return 0; +} + +static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + + skb = skb_peek(&sk->sk_write_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { + int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx); + if (err) + return err; + + spin_lock_irq(&sk->sk_write_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_write_queue.lock); + } + return 0; +} + +void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx) +{ + loff_t saved_obj; + if (sk->sk_filter) { + struct cpt_obj_bits v; + + cpt_push_object(&saved_obj, ctx); + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SKFILTER; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_DATA; + v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter); + + ctx->write(&v, sizeof(v), ctx); + ctx->write(sk->sk_filter->insns, v.cpt_size, ctx); + cpt_close_object(ctx); + cpt_pop_object(&saved_obj, ctx); + } + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + cpt_push_object(&saved_obj, ctx); + cpt_dump_mcfilter(sk, ctx); + cpt_pop_object(&saved_obj, ctx); + } +} + +/* Dump socket content */ + +int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx) +{ + struct cpt_sock_image *v = cpt_get_buf(ctx); + struct socket *sock; + struct timeval tmptv; + + cpt_open_object(obj, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_SOCKET; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_file = CPT_NULL; + sock = sk->sk_socket; + if (sock && sock->file) { + cpt_object_t *tobj; + tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, 
ctx); + if (tobj) + v->cpt_file = tobj->o_pos; + } + v->cpt_index = index; + v->cpt_parent = parent; + + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + if (sock && !obj->o_lock) { + lockdep_off(); + lock_sock(sk); + lockdep_on(); + obj->o_lock = 1; + } + } + + /* Some bits stored in inode */ + v->cpt_ssflags = sock ? sock->flags : 0; + v->cpt_sstate = sock ? sock->state : 0; + v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0; + + /* Common data */ + v->cpt_family = sk->sk_family; + v->cpt_type = sk->sk_type; + v->cpt_state = sk->sk_state; + v->cpt_reuse = sk->sk_reuse; + v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED); + v->cpt_shutdown = sk->sk_shutdown; + v->cpt_userlocks = sk->sk_userlocks; + v->cpt_no_check = sk->sk_no_check; + v->cpt_zapped = sock_flag(sk, SOCK_DBG); + v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP); + v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE); + v->cpt_protocol = sk->sk_protocol; + v->cpt_err = sk->sk_err; + v->cpt_err_soft = sk->sk_err_soft; + v->cpt_max_ack_backlog = sk->sk_max_ack_backlog; + v->cpt_priority = sk->sk_priority; + v->cpt_rcvlowat = sk->sk_rcvlowat; + v->cpt_rcvtimeo = CPT_NULL; + if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT) + v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo; + v->cpt_sndtimeo = CPT_NULL; + if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT) + v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo; + v->cpt_rcvbuf = sk->sk_rcvbuf; + v->cpt_sndbuf = sk->sk_sndbuf; + v->cpt_bound_dev_if = sk->sk_bound_dev_if; + v->cpt_flags = sk->sk_flags; + v->cpt_lingertime = CPT_NULL; + if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT) + v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime; + v->cpt_peer_pid = sk->sk_peercred.pid; + v->cpt_peer_uid = sk->sk_peercred.uid; + v->cpt_peer_gid = sk->sk_peercred.gid; + tmptv = ktime_to_timeval(sk->sk_stamp); + v->cpt_stamp = cpt_timeval_export(&tmptv); + + v->cpt_peer = -1; + v->cpt_socketpair = 0; + v->cpt_deleted = 0; + + v->cpt_laddrlen = 0; + if (sock) { + int alen = sizeof(v->cpt_laddr); + int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0); + if (err) { + cpt_release_buf(ctx); + return err; + } + v->cpt_laddrlen = alen; + } + v->cpt_raddrlen = 0; + if (sock) { + int alen = sizeof(v->cpt_raddr); + int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2); + if (!err) + v->cpt_raddrlen = alen; + } + + if (sk->sk_family == AF_UNIX) { + if (unix_sk(sk)->dentry) { + struct dentry *d = unix_sk(sk)->dentry; + v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d); + if (!v->cpt_deleted) { + int err = 0; + char *path; + unsigned long pg = __get_free_page(GFP_KERNEL); + + if (!pg) { + cpt_release_buf(ctx); + return -ENOMEM; + } + + path = d_path(d, unix_sk(sk)->mnt, (char *)pg, PAGE_SIZE); + + if (!IS_ERR(path)) { + int len = strlen(path); + if (len < 126) { + strcpy(((char*)v->cpt_laddr)+2, path); + v->cpt_laddrlen = len + 2; + } else { + wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); + } + err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx); + } else { + eprintk_ctx("cannot get path of an af_unix socket\n"); + err = PTR_ERR(path); + } + free_page(pg); + if (err) { + cpt_release_buf(ctx); + return err; + } + } + } + + /* If the socket is connected, find its peer. If peer is not + * in our table, the socket is connected to external process + * and we consider it disconnected. 
+ */ + if (unix_peer(sk)) { + cpt_object_t *pobj; + pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx); + if (pobj) + v->cpt_peer = pobj->o_index; + else + v->cpt_shutdown = SHUTDOWN_MASK; + + if (unix_peer(unix_peer(sk)) == sk) + v->cpt_socketpair = 1; + } + + /* If the socket shares address with another socket it is + * child of some listening socket. Find and record it. */ + if (unix_sk(sk)->addr && + atomic_read(&unix_sk(sk)->addr->refcnt) > 1 && + sk->sk_state != TCP_LISTEN) { + cpt_object_t *pobj; + for_each_object(pobj, CPT_OBJ_SOCKET) { + struct sock *psk = pobj->o_obj; + if (psk->sk_family == AF_UNIX && + psk->sk_state == TCP_LISTEN && + unix_sk(psk)->addr == unix_sk(sk)->addr) { + v->cpt_parent = pobj->o_index; + break; + } + } + } + } + + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + cpt_dump_socket_in(v, sk, ctx); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_dump_sock_attr(sk, ctx); + + dump_rqueue(index, sk, ctx); + if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { + dump_wqueue(index, sk, ctx); + cpt_dump_ofo_queue(index, sk, ctx); + } + + if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + && sk->sk_state == TCP_LISTEN) + cpt_dump_synwait_queue(sk, index, ctx); + + cpt_close_object(ctx); + + if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) + && sk->sk_state == TCP_LISTEN) + cpt_dump_accept_queue(sk, index, ctx); + + return 0; +} + +int cpt_dump_orphaned_sockets(struct cpt_context *ctx) +{ + int i; + + cpt_open_section(ctx, CPT_SECT_ORPHANS); + + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_node *node; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); +retry: + read_lock_bh(lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) { + + if (sk->owner_env != get_exec_env()) + continue; + if (sk->sk_socket) + continue; + if (!sock_flag(sk, SOCK_DEAD)) + continue; + if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx)) + continue; + sock_hold(sk); + read_unlock_bh(lock); + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + eprintk_ctx("BUG: sk locked by whom?\n"); + sk->sk_lock.owned = 1; + bh_unlock_sock(sk); + local_bh_enable(); + + cpt_dump_socket(NULL, sk, -1, -1, ctx); + + local_bh_disable(); + bh_lock_sock(sk); + sk->sk_lock.owned = 0; + clear_backlog(sk); + tcp_done(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + + goto retry; + } + read_unlock_bh(lock); + } + cpt_close_section(ctx); + return 0; +} + +static int can_dump(struct sock *sk, cpt_context_t *ctx) +{ + switch (sk->sk_family) { + case AF_NETLINK: + if (((struct netlink_sock *)sk)->cb) { + eprintk_ctx("netlink socket has active callback\n"); + return 0; + } + break; + } + return 1; +} + +/* We are not going to block suspend when we have external AF_UNIX connections. + * But we cannot stop feed of new packets/connections to our environment + * from outside. Taking into account that it is intrincically unreliable, + * we collect some amount of data, but when checkpointing/restoring we + * are going to drop everything, which does not make sense: skbs sent + * by outside processes, connections from outside etc. etc. + */ + +/* The first pass. 
When we see socket referenced by a file, we just + * add it to socket table */ +int cpt_collect_socket(struct file *file, cpt_context_t * ctx) +{ + cpt_object_t *obj; + struct socket *sock; + struct sock *sk; + + if (!S_ISSOCK(file->f_dentry->d_inode->i_mode)) + return -ENOTSOCK; + sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket; + sk = sock->sk; + if (!can_dump(sk, ctx)) + return -EAGAIN; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL) + return -ENOMEM; + obj->o_parent = file; + + return 0; +} + +/* + * We should end with table containing: + * * all sockets opened by our processes in the table. + * * all the sockets queued in listening queues on _our_ listening sockets, + * which are connected to our opened sockets. + */ + +static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx) +{ + struct sock *sk = obj->o_obj; + cpt_object_t *cobj; + struct sk_buff *skb; + + skb = skb_peek(&sk->sk_receive_queue); + while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { + struct sock *lsk = skb->sk; + if (unix_peer(lsk) && + lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) { + if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL) + return -ENOMEM; + cobj->o_parent = obj->o_parent; + } + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb->next; + spin_unlock_irq(&sk->sk_receive_queue.lock); + } + + return 0; +} + +int cpt_index_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + unsigned long index = 0; + + /* Collect not-yet-accepted children of listening sockets. */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + + if (sk->sk_state != TCP_LISTEN) + continue; + + if (sk->sk_family == AF_UNIX) + collect_one_unix_listening_sock(obj, ctx); + } + + /* Assign indices to all the sockets. 
*/ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + cpt_obj_setindex(obj, index++, ctx); + + if (sk->sk_socket && sk->sk_socket->file) { + cpt_object_t *tobj; + tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx); + if (tobj) + cpt_obj_setindex(tobj, obj->o_index, ctx); + } + } + + return 0; +} + +void cpt_unlock_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + lockdep_off(); + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && obj->o_lock) { + if (sk->sk_socket) + release_sock(sk); + } + } + lockdep_on(); +} + +void cpt_kill_sockets(cpt_context_t * ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && obj->o_lock) { + struct ve_struct *old_env; + old_env = set_exec_env(sk->owner_env); + cpt_kill_socket(sk, ctx); + if (sk->sk_socket) + release_sock_nobacklog(sk); + set_exec_env(old_env); + } + } +} + +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx) +{ + struct fasync_struct *fa; + struct inode *inode = file->f_dentry->d_inode; + struct socket *sock; + + sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; + + for (fa = sock->fasync_list; fa; fa = fa->fa_next) { + if (fa->fa_file == file) + return fa->fa_fd; + } + return -1; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_socket.h linux-2.6.24.ovz/kernel/cpt/cpt_socket.h --- linux-2.6.24/kernel/cpt/cpt_socket.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_socket.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,33 @@ +struct sock; + +int cpt_collect_passedfds(cpt_context_t *); +int cpt_index_sockets(cpt_context_t *); +int cpt_collect_socket(struct file *, cpt_context_t *); +int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx); +int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx); +int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx); +int rst_sockets(struct cpt_context *ctx); +int rst_sockets_complete(struct cpt_context *ctx); +int cpt_dump_orphaned_sockets(struct cpt_context *ctx); + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx); +struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx); + +void cpt_unlock_sockets(cpt_context_t *); +void cpt_kill_sockets(cpt_context_t *); + + +int cpt_kill_socket(struct sock *, cpt_context_t *); +int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); +__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); +int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx); +int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx); + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx); +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx); diff -uprN linux-2.6.24/kernel/cpt/cpt_socket_in.c linux-2.6.24.ovz/kernel/cpt/cpt_socket_in.c --- linux-2.6.24/kernel/cpt/cpt_socket_in.c 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6.24.ovz/kernel/cpt/cpt_socket_in.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,449 @@ +/* + * + * kernel/cpt/cpt_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline __u32 jiffies_export(unsigned long tmo) +{ + __s32 delta = (long)(tmo - jiffies); + return delta; +} + +static inline __u32 tcp_jiffies_export(__u32 tmo) +{ + __s32 delta = tmo - tcp_time_stamp; + return delta; +} + +int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx) +{ + struct sk_buff *skb; + struct tcp_sock *tp; + + if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP) + return 0; + + tp = tcp_sk(sk); + + skb = skb_peek(&tp->out_of_order_queue); + while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { + int err; + + err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx); + if (err) + return err; + + spin_lock_irq(&tp->out_of_order_queue.lock); + skb = skb->next; + spin_unlock_irq(&tp->out_of_order_queue.lock); + } + return 0; +} + +static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + + si->cpt_pred_flags = tp->pred_flags; + si->cpt_rcv_nxt = tp->rcv_nxt; + si->cpt_snd_nxt = tp->snd_nxt; + si->cpt_snd_una = tp->snd_una; + si->cpt_snd_sml = tp->snd_sml; + si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp); + si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime); + si->cpt_tcp_header_len = tp->tcp_header_len; + si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending; + si->cpt_quick = inet_csk(sk)->icsk_ack.quick; + si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong; + si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked; + si->cpt_ato = inet_csk(sk)->icsk_ack.ato; + si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout); + si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime); + si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size; + si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss; + si->cpt_snd_wl1 = tp->snd_wl1; + si->cpt_snd_wnd = tp->snd_wnd; + si->cpt_max_window = tp->max_window; + si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie; + si->cpt_mss_cache = tp->mss_cache; + si->cpt_mss_cache_std = tp->mss_cache; /* FIXMW was tp->mss_cache_std */ + si->cpt_mss_clamp = tp->rx_opt.mss_clamp; + si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len; + si->cpt_ext2_header_len = 0; + si->cpt_ca_state = inet_csk(sk)->icsk_ca_state; + si->cpt_retransmits = inet_csk(sk)->icsk_retransmits; + si->cpt_reordering = tp->reordering; + si->cpt_frto_counter = tp->frto_counter; + si->cpt_frto_highmark = tp->frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + // // si->cpt_adv_cong = tp->adv_cong; +#endif + si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept; + si->cpt_backoff = inet_csk(sk)->icsk_backoff; + si->cpt_srtt = tp->srtt; + si->cpt_mdev = tp->mdev; + si->cpt_mdev_max = tp->mdev_max; + si->cpt_rttvar = tp->rttvar; + si->cpt_rtt_seq = tp->rtt_seq; + si->cpt_rto = inet_csk(sk)->icsk_rto; + si->cpt_packets_out = tp->packets_out; + si->cpt_left_out = tp->sacked_out + tp->lost_out; + si->cpt_retrans_out = tp->retrans_out; + 
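+	/* cpt_left_out was filled above as sacked_out + lost_out; the
+	 * individual SACK/loss/FACK counters are stored separately below. */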
si->cpt_lost_out = tp->lost_out; + si->cpt_sacked_out = tp->sacked_out; + si->cpt_fackets_out = tp->fackets_out; + si->cpt_snd_ssthresh = tp->snd_ssthresh; + si->cpt_snd_cwnd = tp->snd_cwnd; + si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt; + si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp; + si->cpt_snd_cwnd_used = tp->snd_cwnd_used; + si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp); + si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout); + si->cpt_ka_timeout = 0; + si->cpt_rcv_wnd = tp->rcv_wnd; + si->cpt_rcv_wup = tp->rcv_wup; + si->cpt_write_seq = tp->write_seq; + si->cpt_pushed_seq = tp->pushed_seq; + si->cpt_copied_seq = tp->copied_seq; + si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok; + si->cpt_wscale_ok = tp->rx_opt.wscale_ok; + si->cpt_sack_ok = tp->rx_opt.sack_ok; + si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp; + si->cpt_snd_wscale = tp->rx_opt.snd_wscale; + si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale; + si->cpt_nonagle = tp->nonagle; + si->cpt_keepalive_probes = tp->keepalive_probes; + si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval; + si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr; + si->cpt_ts_recent = tp->rx_opt.ts_recent; + si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + si->cpt_user_mss = tp->rx_opt.user_mss; + si->cpt_dsack = tp->rx_opt.dsack; + si->cpt_eff_sacks = tp->rx_opt.eff_sacks; + si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq; + si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq; + si->cpt_sack_array[2] = tp->selective_acks[0].start_seq; + si->cpt_sack_array[3] = tp->selective_acks[0].end_seq; + si->cpt_sack_array[4] = tp->selective_acks[1].start_seq; + si->cpt_sack_array[5] = tp->selective_acks[1].end_seq; + si->cpt_sack_array[6] = tp->selective_acks[2].start_seq; + si->cpt_sack_array[7] = tp->selective_acks[2].end_seq; + si->cpt_sack_array[8] = tp->selective_acks[3].start_seq; + si->cpt_sack_array[9] = tp->selective_acks[3].end_seq; + si->cpt_window_clamp = tp->window_clamp; + si->cpt_rcv_ssthresh = tp->rcv_ssthresh; + si->cpt_probes_out = inet_csk(sk)->icsk_probes_out; + si->cpt_num_sacks = tp->rx_opt.num_sacks; + si->cpt_advmss = tp->advmss; + si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries; + si->cpt_ecn_flags = tp->ecn_flags; + si->cpt_prior_ssthresh = tp->prior_ssthresh; + si->cpt_high_seq = tp->high_seq; + si->cpt_retrans_stamp = tp->retrans_stamp; + si->cpt_undo_marker = tp->undo_marker; + si->cpt_undo_retrans = tp->undo_retrans; + si->cpt_urg_seq = tp->urg_seq; + si->cpt_urg_data = tp->urg_data; + si->cpt_pending = inet_csk(sk)->icsk_pending; + si->cpt_urg_mode = tp->urg_mode; + si->cpt_snd_up = tp->snd_up; + si->cpt_keepalive_time = tp->keepalive_time; + si->cpt_keepalive_intvl = tp->keepalive_intvl; + si->cpt_linger2 = tp->linger2; + + if (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_CLOSE && + sock_flag(sk, SOCK_KEEPOPEN)) { + si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires); + } + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_family == AF_INET6 && + inet_csk(sk)->icsk_af_ops == &ipv6_mapped) + si->cpt_mapped = 1; + } +#endif + + return 0; +} + + +int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + if (sk->sk_family == AF_INET) { + struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr); + sin->sin_family = AF_INET; + sin->sin_port = inet->sport; + sin->sin_addr.s_addr = inet->rcv_saddr; + 
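+		/* The socket's bound local address is exported in sockaddr
+		 * form in cpt_laddr, with its length in cpt_laddrlen. */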
si->cpt_laddrlen = sizeof(*sin); + } else if (sk->sk_family == AF_INET6) { + struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = inet->sport; + memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16); + si->cpt_laddrlen = sizeof(*sin6); + } + if (!inet->num) + si->cpt_laddrlen = 0; + + si->cpt_daddr = inet->daddr; + si->cpt_dport = inet->dport; + si->cpt_saddr = inet->saddr; + si->cpt_rcv_saddr = inet->rcv_saddr; + si->cpt_sport = inet->sport; + si->cpt_uc_ttl = inet->uc_ttl; + si->cpt_tos = inet->tos; + si->cpt_cmsg_flags = inet->cmsg_flags; + si->cpt_mc_index = inet->mc_index; + si->cpt_mc_addr = inet->mc_addr; + si->cpt_hdrincl = inet->hdrincl; + si->cpt_mc_ttl = inet->mc_ttl; + si->cpt_mc_loop = inet->mc_loop; + si->cpt_pmtudisc = inet->pmtudisc; + si->cpt_recverr = inet->recverr; + si->cpt_freebind = inet->freebind; + si->cpt_idcounter = inet->id; + + si->cpt_cork_flags = inet->cork.flags; + si->cpt_cork_fragsize = 0; + si->cpt_cork_length = inet->cork.length; + si->cpt_cork_addr = inet->cork.addr; + si->cpt_cork_saddr = inet->cork.fl.fl4_src; + si->cpt_cork_daddr = inet->cork.fl.fl4_dst; + si->cpt_cork_oif = inet->cork.fl.oif; + if (inet->cork.rt) { + si->cpt_cork_fragsize = inet->cork.fragsize; + si->cpt_cork_saddr = inet->cork.rt->fl.fl4_src; + si->cpt_cork_daddr = inet->cork.rt->fl.fl4_dst; + si->cpt_cork_oif = inet->cork.rt->fl.oif; + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + si->cpt_udp_pending = up->pending; + si->cpt_udp_corkflag = up->corkflag; + si->cpt_udp_encap = up->encap_type; + si->cpt_udp_len = up->len; + } + + if (sk->sk_family == AF_INET6) { + memcpy(si->cpt_saddr6, &np->saddr, 16); + memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16); + memcpy(si->cpt_daddr6, &np->daddr, 16); + si->cpt_flow_label6 = np->flow_label; + si->cpt_frag_size6 = np->frag_size; + si->cpt_hop_limit6 = np->hop_limit; + si->cpt_mcast_hops6 = np->mcast_hops; + si->cpt_mcast_oif6 = np->mcast_oif; + si->cpt_rxopt6 = np->rxopt.all; + si->cpt_mc_loop6 = np->mc_loop; + si->cpt_recverr6 = np->recverr; + si->cpt_sndflow6 = np->sndflow; + si->cpt_pmtudisc6 = np->pmtudisc; + si->cpt_ipv6only6 = np->ipv6only; + si->cpt_mapped = 0; + } + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + cpt_dump_socket_tcp(si, sk, ctx); + + return 0; +} + +int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx) +{ + struct request_sock *req; + + for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) + cpt_dump_socket(NULL, req->sk, -1, index, ctx); + return 0; +} + + +static int dump_openreq(struct request_sock *req, struct sock *sk, int index, + struct cpt_context *ctx) +{ + struct cpt_openreq_image *v = cpt_get_buf(ctx); + + cpt_open_object(NULL, ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_OPENREQ; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn; + v->cpt_snt_isn = tcp_rsk(req)->snt_isn; + v->cpt_rmt_port = inet_rsk(req)->rmt_port; + v->cpt_mss = req->mss; + // // v->cpt_family = (req->class == &or_ipv4 ? 
AF_INET : AF_INET6); + v->cpt_retrans = req->retrans; + v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; + v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; + v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok; + v->cpt_sack_ok = inet_rsk(req)->sack_ok; + v->cpt_wscale_ok = inet_rsk(req)->wscale_ok; + v->cpt_ecn_ok = inet_rsk(req)->ecn_ok; + v->cpt_acked = inet_rsk(req)->acked; + v->cpt_window_clamp = req->window_clamp; + v->cpt_rcv_wnd = req->rcv_wnd; + v->cpt_ts_recent = req->ts_recent; + v->cpt_expires = jiffies_export(req->expires); + + if (v->cpt_family == AF_INET) { + memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4); + memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4); + } else { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16); + memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16); + v->cpt_iif = inet6_rsk(req)->iif; +#endif + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + cpt_close_object(ctx); + return 0; +} + +int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx) +{ + struct inet_connection_sock *icsk; + struct listen_sock *lopt; + struct request_sock *req; + int nr_entries; + int i; + + icsk = inet_csk(sk); + lopt = icsk->icsk_accept_queue.listen_opt; + nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries; + + for (i=0; i < nr_entries; i++) { + for (req=lopt->syn_table[i]; req; req=req->dl_next) { + loff_t saved_obj; + cpt_push_object(&saved_obj, ctx); + dump_openreq(req, sk, index, ctx); + cpt_pop_object(&saved_obj, ctx); + } + } + return 0; +} + + +int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx) +{ + if (sk->sk_state != TCP_CLOSE && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + tcp_set_state(sk, TCP_CLOSE); + else + sk->sk_prot->disconnect(sk, 0); + } + return 0; +} + +int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml; + + for (iml = inet->mc_list; iml; iml = iml->next) { + struct cpt_sockmc_image smi; + int scnt = 0; + int i; + + if (iml->sflist) + scnt = iml->sflist->sl_count*16; + + smi.cpt_next = sizeof(smi) + scnt; + smi.cpt_object = CPT_OBJ_SOCK_MCADDR; + smi.cpt_hdrlen = sizeof(smi); + smi.cpt_content = CPT_CONTENT_DATA; + + smi.cpt_family = AF_INET; + smi.cpt_mode = iml->sfmode; + smi.cpt_ifindex = iml->multi.imr_ifindex; + memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr)); + smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr; + + ctx->write(&smi, sizeof(smi), ctx); + + for (i = 0; i < scnt; i++) { + u32 addr[4]; + memset(&addr, 0, sizeof(addr)); + addr[0] = iml->sflist->sl_addr[i]; + ctx->write(&addr, sizeof(addr), ctx); + } + } + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->sk_family == AF_INET6) { + struct ipv6_mc_socklist *mcl; + struct ipv6_pinfo *np = inet6_sk(sk); + + for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) { + struct cpt_sockmc_image smi; + int scnt = 0; + int i; + + if (mcl->sflist) + scnt = mcl->sflist->sl_count*16; + + smi.cpt_next = sizeof(smi) + scnt; + smi.cpt_object = CPT_OBJ_SOCK_MCADDR; + smi.cpt_hdrlen = sizeof(smi); + smi.cpt_content = CPT_CONTENT_DATA; + + smi.cpt_family = AF_INET6; + smi.cpt_mode = mcl->sfmode; + smi.cpt_ifindex = mcl->ifindex; + memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr)); + + ctx->write(&smi, sizeof(smi), ctx); + for (i = 0; i < scnt; i++) + ctx->write(&mcl->sflist->sl_addr[i], 16, 
ctx); + } + } +#endif + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_syscalls.h linux-2.6.24.ovz/kernel/cpt/cpt_syscalls.h --- linux-2.6.24/kernel/cpt/cpt_syscalls.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_syscalls.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,100 @@ +#include +#include +#include + +#define WRAP(c, args) return sys_##c args +#define WRAP2(c, args) int err; mm_segment_t oldfs; \ + oldfs = get_fs(); set_fs(KERNEL_DS); \ + err = sys_##c args ;\ + set_fs(oldfs); \ + return err + +static inline int sc_close(int fd) +{ + WRAP(close, (fd)); +} + +static inline int sc_dup2(int fd1, int fd2) +{ + WRAP(dup2, (fd1, fd2)); +} + +static inline int sc_unlink(char *name) +{ + WRAP2(unlink, (name)); +} + +static inline int sc_pipe(int *pfd) +{ + return do_pipe(pfd); +} + +static inline int sc_mknod(char *name, int mode, int dev) +{ + WRAP2(mknod, (name, mode, dev)); +} + +static inline int sc_chmod(char *name, int mode) +{ + WRAP2(mkdir, (name, mode)); +} + +static inline int sc_chown(char *name, int uid, int gid) +{ + WRAP2(chown, (name, uid, gid)); +} + +static inline int sc_mkdir(char *name, int mode) +{ + WRAP2(mkdir, (name, mode)); +} + +static inline int sc_rmdir(char *name) +{ + WRAP2(rmdir, (name)); +} + +static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags) +{ + WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL)); +} + +static inline int sc_mprotect(unsigned long start, size_t len, + unsigned long prot) +{ + WRAP(mprotect, (start, len, prot)); +} + +static inline int sc_mlock(unsigned long start, size_t len) +{ + WRAP(mlock, (start, len)); +} + +static inline int sc_munlock(unsigned long start, size_t len) +{ + WRAP(munlock, (start, len)); +} + +static inline int sc_remap_file_pages(unsigned long start, size_t len, + unsigned long prot, unsigned long pgoff, + unsigned long flags) +{ + WRAP(remap_file_pages, (start, len, prot, pgoff, flags)); +} + +static inline int sc_waitx(int pid, int opt, int *stat_addr) +{ + WRAP(wait4, (pid, stat_addr, opt, NULL)); +} + +static inline int sc_flock(int fd, int flags) +{ + WRAP(flock, (fd, flags)); +} + +static inline int sc_open(char* path, int flags, int mode) +{ + WRAP(open, (path, flags, mode)); +} + +extern int sc_execve(char *cms, char **argv, char **env); diff -uprN linux-2.6.24/kernel/cpt/cpt_sysvipc.c linux-2.6.24.ovz/kernel/cpt/cpt_sysvipc.c --- linux-2.6.24/kernel/cpt/cpt_sysvipc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_sysvipc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,403 @@ +/* + * + * kernel/cpt/cpt_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int dump_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v; + + if (shp->shm_file != warg->file) + return 0; + + v->cpt_key = shp->shm_perm.key; + v->cpt_uid = shp->shm_perm.uid; + v->cpt_gid = shp->shm_perm.gid; + v->cpt_cuid = shp->shm_perm.cuid; + v->cpt_cgid = shp->shm_perm.cgid; + v->cpt_mode = shp->shm_perm.mode; + v->cpt_seq = shp->shm_perm.seq; + + v->cpt_id = shp->shm_perm.id; + v->cpt_segsz = shp->shm_segsz; + v->cpt_atime = shp->shm_atim; + v->cpt_ctime = shp->shm_ctim; + v->cpt_dtime = shp->shm_dtim; + v->cpt_creator = shp->shm_cprid; + v->cpt_last = shp->shm_lprid; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1; +#else + v->cpt_mlockuser = -1; +#endif + return 1; +} + +int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx) +{ + struct cpt_sysvshm_image *v = cpt_get_buf(ctx); + struct _warg warg; + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSV_SHM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + warg.file = file; + warg.v = v; + if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) { + cpt_release_buf(ctx); + return -ESRCH; + } + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + return 0; +} + + +int match_sem(int id, struct sem_array *sema, void *arg) +{ + if (id != (unsigned long)arg) + return 0; + return sema->sem_nsems + 1; +} + +static int get_sem_nsem(int id, cpt_context_t *ctx) +{ + int res; + res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id); + if (res > 0) + return res - 1; + eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id); + return -ESRCH; +} + +static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx) +{ + struct cpt_sysvsem_undo_image v; + loff_t saved_obj; + + cpt_open_object(NULL, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_SEMUNDO; + v.cpt_id = su->semid; + v.cpt_nsem = get_sem_nsem(su->semid, ctx); + if ((int)v.cpt_nsem < 0) + return -ESRCH; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx); + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + return 0; +} + +struct sem_warg { + int last_id; + struct cpt_sysvsem_image *v; +}; + +static int dump_one_sem(int id, struct sem_array *sma, void *arg) +{ + struct sem_warg * warg = (struct sem_warg *)arg; + struct cpt_sysvsem_image *v = warg->v; + int i; + + if (warg->last_id != -1) { + if ((id % IPCMNI) <= warg->last_id) + return 0; + } + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSV_SEM; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_SEMARRAY; + + v->cpt_key = sma->sem_perm.key; + v->cpt_uid = sma->sem_perm.uid; + v->cpt_gid = sma->sem_perm.gid; + v->cpt_cuid = sma->sem_perm.cuid; + v->cpt_cgid = sma->sem_perm.cgid; + v->cpt_mode = sma->sem_perm.mode; + v->cpt_seq = sma->sem_perm.seq; + + v->cpt_id = id; + v->cpt_ctime = sma->sem_ctime; + v->cpt_otime = sma->sem_otime; + + for (i=0; isem_nsems; i++) { + struct { + __u32 
semval; + __u32 sempid; + } *s = (void*)v + v->cpt_next; + if (v->cpt_next >= PAGE_SIZE - sizeof(*s)) + return -EINVAL; + s->semval = sma->sem_base[i].semval; + s->sempid = sma->sem_base[i].sempid; + v->cpt_next += sizeof(*s); + } + + warg->last_id = id % IPCMNI; + return 1; +} + + +int cpt_dump_sysvsem(struct cpt_context *ctx) +{ + cpt_object_t *obj; + struct sem_warg warg; + + /* Dumping semaphores is quite tricky because we cannot + * write to dump file under lock inside sysvipc_walk_sem(). + */ + cpt_open_section(ctx, CPT_SECT_SYSV_SEM); + warg.last_id = -1; + warg.v = cpt_get_buf(ctx); + for (;;) { + if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0) + break; + ctx->write(warg.v, warg.v->cpt_next, ctx); + } + cpt_release_buf(ctx); + cpt_close_section(ctx); + + cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO); + for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { + struct sem_undo_list *semu = obj->o_obj; + struct sem_undo *su; + struct cpt_object_hdr v; + loff_t saved_obj; + + cpt_open_object(obj, ctx); + + v.cpt_next = CPT_NULL; + v.cpt_object = CPT_OBJ_SYSVSEM_UNDO; + v.cpt_hdrlen = sizeof(v); + v.cpt_content = CPT_CONTENT_ARRAY; + + ctx->write(&v, sizeof(v), ctx); + + cpt_push_object(&saved_obj, ctx); + for (su = semu->proc_list; su; su = su->proc_next) { + if (su->semid != -1) { + int err; + err = dump_one_semundo(su, ctx); + if (err < 0) + return err; + } + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + } + cpt_close_section(ctx); + return 0; +} + +struct msg_warg { + int last_id; + struct msg_queue *msq; + struct cpt_sysvmsg_image *v; +}; + +static int dump_one_msg(int id, struct msg_queue *msq, void *arg) +{ + struct msg_warg * warg = (struct msg_warg *)arg; + struct cpt_sysvmsg_image *v = warg->v; + + if (warg->last_id != -1) { + if ((id % IPCMNI) <= warg->last_id) + return 0; + } + + v->cpt_next = sizeof(*v); + v->cpt_object = CPT_OBJ_SYSVMSG; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_key = msq->q_perm.key; + v->cpt_uid = msq->q_perm.uid; + v->cpt_gid = msq->q_perm.gid; + v->cpt_cuid = msq->q_perm.cuid; + v->cpt_cgid = msq->q_perm.cgid; + v->cpt_mode = msq->q_perm.mode; + v->cpt_seq = msq->q_perm.seq; + + v->cpt_id = id; + v->cpt_stime = msq->q_stime; + v->cpt_rtime = msq->q_rtime; + v->cpt_ctime = msq->q_ctime; + v->cpt_last_sender = msq->q_lspid; + v->cpt_last_receiver = msq->q_lrpid; + v->cpt_qbytes = msq->q_qbytes; + + warg->msq = msq; + warg->last_id = id % IPCMNI; + return 1; +} + +static int do_store(void * src, int len, int offset, void * data) +{ + cpt_context_t * ctx = data; + ctx->write(src, len, ctx); + return 0; +} + +static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx) +{ + loff_t saved_obj; + struct cpt_sysvmsg_msg_image mv; + + cpt_open_object(NULL, ctx); + mv.cpt_next = CPT_NULL; + mv.cpt_object = CPT_OBJ_SYSVMSG_MSG; + mv.cpt_hdrlen = sizeof(mv); + mv.cpt_content = CPT_CONTENT_DATA; + + mv.cpt_type = m->m_type; + mv.cpt_size = m->m_ts; + + ctx->write(&mv, sizeof(mv), ctx); + + cpt_push_object(&saved_obj, ctx); + sysv_msg_store(m, do_store, m->m_ts, ctx); + cpt_pop_object(&saved_obj, ctx); + cpt_close_object(ctx); +} + +int cpt_dump_sysvmsg(struct cpt_context *ctx) +{ + struct msg_warg warg; + + /* Dumping msg queues is tricky because we cannot + * write to dump file under lock inside sysvipc_walk_msg(). + * + * And even worse, we have to access msg list in an unserialized + * context. It is fragile. But VE is still frozen, remember? 
+ */ + cpt_open_section(ctx, CPT_SECT_SYSV_MSG); + warg.last_id = -1; + warg.v = cpt_get_buf(ctx); + for (;;) { + loff_t saved_obj; + struct msg_msg * m; + + if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0) + break; + + cpt_open_object(NULL, ctx); + + ctx->write(warg.v, warg.v->cpt_next, ctx); + + cpt_push_object(&saved_obj, ctx); + list_for_each_entry(m, &warg.msq->q_messages, m_list) { + cpt_dump_one_sysvmsg(m, ctx); + } + cpt_pop_object(&saved_obj, ctx); + + cpt_close_object(ctx); + } + cpt_release_buf(ctx); + cpt_close_section(ctx); + return 0; +} + +static int cpt_collect_sysvsem_undo(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + if (tsk->exit_state) { + /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list + * on exit. Grrr... */ + continue; + } + if (tsk->sysvsem.undo_list && + cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL) + return -ENOMEM; + } + + for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { + struct sem_undo_list *semu = obj->o_obj; + + if (atomic_read(&semu->refcnt) != obj->o_count) { + eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt)); + return -EBUSY; + } + } + return 0; +} + +static int collect_one_shm(struct shmid_kernel *shp, void *arg) +{ + cpt_context_t *ctx = arg; + + if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL) + return -ENOMEM; + return 0; +} + +int cpt_collect_sysvshm(cpt_context_t * ctx) +{ + int err; + + err = sysvipc_walk_shm(collect_one_shm, ctx); + + return err < 0 ? err : 0; +} + +int cpt_collect_sysv(cpt_context_t * ctx) +{ + int err; + + err = cpt_collect_sysvsem_undo(ctx); + if (err) + return err; + err = cpt_collect_sysvshm(ctx); + if (err) + return err; + + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_tty.c linux-2.6.24.ovz/kernel/cpt/cpt_tty.c --- linux-2.6.24/kernel/cpt/cpt_tty.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_tty.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,215 @@ +/* + * + * kernel/cpt/cpt_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +/* We must support at least N_TTY. 
*/ + +int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx) +{ + struct tty_struct *tty = file->private_data; + cpt_object_t *obj; + struct cpt_obj_ref o; + loff_t saved_pos; + + obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx); + if (!obj) + return -EINVAL; + + cpt_push_object(&saved_pos, ctx); + + o.cpt_next = sizeof(o); + o.cpt_object = CPT_OBJ_REF; + o.cpt_hdrlen = sizeof(o); + o.cpt_content = CPT_CONTENT_VOID; + o.cpt_pos = obj->o_pos; + ctx->write(&o, sizeof(o), ctx); + + cpt_pop_object(&saved_pos, ctx); + + return 0; +} + +int cpt_collect_tty(struct file *file, cpt_context_t * ctx) +{ + struct tty_struct *tty = file->private_data; + + if (tty) { + if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL) + return -ENOMEM; + if (tty->link) { + cpt_object_t *obj; + + obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx); + if (obj == NULL) + return -ENOMEM; + /* Undo o_count, tty->link is not a reference */ + obj->o_count--; + } + } + return 0; +} + +int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct tty_struct *tty = obj->o_obj; + struct cpt_tty_image *v; + + if (tty->link) { + if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) { + eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE); + return -EINVAL; + } + if (tty->link->link != tty) { + eprintk_ctx("bad pty pair\n"); + return -EINVAL; + } + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_SLAVE && + tty->link->count) + obj->o_count++; + } + if (obj->o_count != tty->count) { + eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count); + return -EBUSY; + } + + cpt_open_object(obj, ctx); + + v = cpt_get_buf(ctx); + v->cpt_next = -1; + v->cpt_object = CPT_OBJ_TTY; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_ARRAY; + + v->cpt_index = tty->index; + v->cpt_link = -1; + if (tty->link) + v->cpt_link = tty->link->index; + v->cpt_drv_type = tty->driver->type; + v->cpt_drv_subtype = tty->driver->subtype; + v->cpt_drv_flags = tty->driver->flags; + v->cpt_packet = tty->packet; + v->cpt_stopped = tty->stopped; + v->cpt_hw_stopped = tty->hw_stopped; + v->cpt_flow_stopped = tty->flow_stopped; + v->cpt_flags = tty->flags; + v->cpt_ctrl_status = tty->ctrl_status; + v->cpt_canon_data = tty->canon_data; + v->cpt_canon_head = tty->canon_head - tty->read_tail; + v->cpt_canon_column = tty->canon_column; + v->cpt_column = tty->column; + v->cpt_erasing = tty->erasing; + v->cpt_lnext = tty->lnext; + v->cpt_icanon = tty->icanon; + v->cpt_raw = tty->raw; + v->cpt_real_raw = tty->real_raw; + v->cpt_closing = tty->closing; + v->cpt_minimum_to_wake = tty->minimum_to_wake; + v->cpt_pgrp = 0; + if (tty->pgrp) { + v->cpt_pgrp = pid_vnr(tty->pgrp); + if ((int)v->cpt_pgrp < 0) { + dprintk_ctx("cannot map tty->pgrp %d -> %d\n", pid_vnr(tty->pgrp), (int)v->cpt_pgrp); + v->cpt_pgrp = -1; + } + } + v->cpt_session = 0; + if (tty->session) { + v->cpt_session = pid_vnr(tty->session); + if ((int)v->cpt_session < 0) { + eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session); + cpt_release_buf(ctx); + return -EINVAL; + } + } + memcpy(v->cpt_name, tty->name, 64); + v->cpt_ws_row = tty->winsize.ws_row; + v->cpt_ws_col = tty->winsize.ws_col; + v->cpt_ws_prow = tty->winsize.ws_ypixel; + v->cpt_ws_pcol = tty->winsize.ws_xpixel; + if (tty->termios == NULL) { + eprintk_ctx("NULL termios"); + cpt_release_buf(ctx); + return -EINVAL; + } + v->cpt_c_line = tty->termios->c_line; + v->cpt_c_iflag = 
tty->termios->c_iflag; + v->cpt_c_oflag = tty->termios->c_oflag; + v->cpt_c_cflag = tty->termios->c_cflag; + v->cpt_c_lflag = tty->termios->c_lflag; + memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS); + if (NCCS < 32) + memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS); + memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags)); + + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (tty->read_buf && tty->read_cnt) { + struct cpt_obj_bits *v = cpt_get_buf(ctx); + loff_t saved_pos; + + cpt_push_object(&saved_pos, ctx); + cpt_open_object(NULL, ctx); + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_BITS; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_DATA; + v->cpt_size = tty->read_cnt; + ctx->write(v, sizeof(*v), ctx); + cpt_release_buf(ctx); + + if (tty->read_cnt) { + int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail); + ctx->write(tty->read_buf + tty->read_tail, n, ctx); + if (tty->read_cnt > n) + ctx->write(tty->read_buf, tty->read_cnt-n, ctx); + ctx->align(ctx); + } + + cpt_close_object(ctx); + cpt_pop_object(&saved_pos, ctx); + } + + cpt_close_object(ctx); + + return 0; +} + +__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx) +{ + struct tty_struct * tty; + struct fasync_struct *fa; + + tty = (struct tty_struct *)file->private_data; + + for (fa = tty->fasync; fa; fa = fa->fa_next) { + if (fa->fa_file == file) + return fa->fa_fd; + } + return -1; +} diff -uprN linux-2.6.24/kernel/cpt/cpt_ubc.c linux-2.6.24.ovz/kernel/cpt/cpt_ubc.c --- linux-2.6.24/kernel/cpt/cpt_ubc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_ubc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,132 @@ +/* + * + * kernel/cpt/cpt_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx); + if (obj != NULL) { + if (obj->o_count == 1) + get_beancounter(bc); + if (bc->parent != NULL && obj->o_parent == NULL) + obj->o_parent = cpt_add_ubc(bc->parent, ctx); + } + return obj; +} + +__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx); + if (obj == NULL) { + char buf[48]; + print_ub_uid(bc, buf, sizeof(buf)); + eprintk("CPT: unknown ub %s (%p)\n", buf, bc); + dump_stack(); + return CPT_NULL; + } + return obj->o_pos; +} + +static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL); + dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL); + dmp->held = (held ? prm->held : CPT_NULL); + dmp->maxheld = prm->maxheld; + dmp->minheld = prm->minheld; + dmp->failcnt = prm->failcnt; +} + +static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + struct cpt_beancounter_image *v; + int i; + + bc = obj->o_obj; + v = cpt_get_buf(ctx); + + v->cpt_next = CPT_NULL; + v->cpt_object = CPT_OBJ_UBC; + v->cpt_hdrlen = sizeof(*v); + v->cpt_content = CPT_CONTENT_VOID; + + if (obj->o_parent != NULL) + v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; + else + v->cpt_parent = CPT_NULL; + v->cpt_id = (obj->o_parent != NULL) ? 
bc->ub_uid : 0; + for (i = 0; i < UB_RESOURCES; i++) { + dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1); + } + memset(v->cpt_parms + UB_RESOURCES * 2, 0, + sizeof(v->cpt_parms) + - UB_RESOURCES * 2 * sizeof(v->cpt_parms[0])); + + cpt_open_object(obj, ctx); + ctx->write(v, sizeof(*v), ctx); + cpt_close_object(ctx); + + cpt_release_buf(ctx); + return 0; +} + +int cpt_dump_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int skipped; + int top; + + cpt_open_section(ctx, CPT_SECT_UBC); + + do { + skipped = 0; + top = 0; + for_each_object(obj, CPT_OBJ_UBC) { + if (obj->o_parent == NULL) + top++; + if (obj->o_pos != CPT_NULL) + continue; + if (obj->o_parent != NULL && + ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL) + skipped++; + else + dump_one_bc(obj, ctx); + } + } while (skipped && (top < 2)); + + cpt_close_section(ctx); + if (top > 1) { + eprintk_ctx("More than one top level ub exist"); + return -EINVAL; + } + + return 0; +} + +void cpt_finish_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_UBC) + put_beancounter(obj->o_obj); +} diff -uprN linux-2.6.24/kernel/cpt/cpt_ubc.h linux-2.6.24.ovz/kernel/cpt/cpt_ubc.h --- linux-2.6.24/kernel/cpt/cpt_ubc.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_ubc.h 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,23 @@ +#ifdef CONFIG_BEANCOUNTERS +cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx); +__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx); +int cpt_dump_ubc(struct cpt_context *ctx); + +struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx); +int rst_undump_ubc(struct cpt_context *ctx); + +void cpt_finish_ubc(struct cpt_context *ctx); +void rst_finish_ubc(struct cpt_context *ctx); +void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id); +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id); +#else +static int inline cpt_dump_ubc(struct cpt_context *ctx) +{ return 0; } +static int inline rst_undump_ubc(struct cpt_context *ctx) +{ return 0; } +static void inline cpt_finish_ubc(struct cpt_context *ctx) +{ return; } +static void inline rst_finish_ubc(struct cpt_context *ctx) +{ return; } +#endif + diff -uprN linux-2.6.24/kernel/cpt/cpt_x8664.S linux-2.6.24.ovz/kernel/cpt/cpt_x8664.S --- linux-2.6.24/kernel/cpt/cpt_x8664.S 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/cpt_x8664.S 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,67 @@ +#define ASSEMBLY 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .code64 + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorq %rax, %rax + pushq %rax /* ss */ + pushq %rax /* rsp */ + pushq $(1<<9) /* eflags - interrupts on */ + pushq $__KERNEL_CS /* cs */ + pushq \child_rip /* rip */ + pushq %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + .endm + +ENTRY(asm_kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq $0x00800000,%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + pushq %rcx + call do_fork_pid + addq $8, %rsp + /* call do_fork */ + movq %rax,RAX(%rsp) + xorl %edi,%edi + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(asm_kernel_thread) + +child_rip: + pushq $0 # fake 
return address + CFI_STARTPROC + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + movq %rax, %rdi + call do_exit + CFI_ENDPROC +ENDPROC(child_rip) + diff -uprN linux-2.6.24/kernel/cpt/rst_conntrack.c linux-2.6.24.ovz/kernel/cpt/rst_conntrack.c --- linux-2.6.24/kernel/cpt/rst_conntrack.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_conntrack.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,283 @@ +/* + * + * kernel/cpt/rst_conntrack.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_VE_IPTABLES) && \ + (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) do { } while (0) +#define ASSERT_WRITE_LOCK(x) do { } while (0) + + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct ct_holder +{ + struct ct_holder *next; + struct ip_conntrack *ct; + int index; +}; + +static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) +{ + tuple->dst.ip = v->cpt_dst; + tuple->dst.u.all = v->cpt_dstport; + tuple->dst.protonum = v->cpt_protonum; + tuple->dst.dir = v->cpt_dir; + if (dir != tuple->dst.dir) + wprintk("dir != tuple->dst.dir\n"); + + tuple->src.ip = v->cpt_src; + tuple->src.u.all = v->cpt_srcport; +} + + +static int undump_expect_list(struct ip_conntrack *ct, + struct cpt_ip_conntrack_image *ci, + loff_t pos, struct ct_holder *ct_list, + cpt_context_t *ctx) +{ + loff_t end; + int err; + + end = pos + ci->cpt_next; + pos += ci->cpt_hdrlen; + while (pos < end) { + struct cpt_ip_connexpect_image v; + struct ip_conntrack_expect *exp; + struct ip_conntrack *sibling; + + err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); + if (err) + return err; + + sibling = NULL; + if (v.cpt_sibling_conntrack) { + struct ct_holder *c; + + for (c = ct_list; c; c = c->next) { + if (c->index == v.cpt_sibling_conntrack) { + sibling = c->ct; + break; + } + } + if (!sibling) { + eprintk_ctx("lost sibling of expectation\n"); + return -EINVAL; + } + } + + write_lock_bh(&ip_conntrack_lock); + + /* It is possible. Helper module could be just unregistered, + * if expectation were on the list, it would be destroyed. */ + if (ct->helper == NULL) { + write_unlock_bh(&ip_conntrack_lock); + dprintk_ctx("conntrack: no helper and non-trivial expectation\n"); + continue; + } + + exp = ip_conntrack_expect_alloc(NULL); + if (exp == NULL) { + write_unlock_bh(&ip_conntrack_lock); + return -ENOMEM; + } + + if (ct->helper->timeout && !del_timer(&exp->timeout)) { + /* Dying already. We can do nothing. 
*/ + write_unlock_bh(&ip_conntrack_lock); + dprintk_ctx("conntrack expectation is dying\n"); + continue; + } + + decode_tuple(&v.cpt_tuple, &exp->tuple, 0); + decode_tuple(&v.cpt_mask, &exp->mask, 0); + + exp->master = ct; + nf_conntrack_get(&ct->ct_general); + ip_conntrack_expect_insert(exp); +#if 0 + if (sibling) { + exp->sibling = sibling; + sibling->master = exp; + LIST_DELETE(&ve_ip_conntrack_expect_list, exp); + ct->expecting--; + nf_conntrack_get(&master_ct(sibling)->infos[0]); + } else +#endif + if (ct->helper->timeout) { + exp->timeout.expires = jiffies + v.cpt_timeout; + add_timer(&exp->timeout); + } + write_unlock_bh(&ip_conntrack_lock); + + pos += v.cpt_next; + } + return 0; +} + +static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos, + struct ct_holder **ct_list, cpt_context_t *ctx) +{ + int err = 0; + struct ip_conntrack *conntrack; + struct ct_holder *c; + struct ip_conntrack_tuple orig, repl; + + c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); + if (c == NULL) + return -ENOMEM; + + decode_tuple(&ci->cpt_tuple[0], &orig, 0); + decode_tuple(&ci->cpt_tuple[1], &repl, 1); + + conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); + if (!conntrack || IS_ERR(conntrack)) { + kfree(c); + return -ENOMEM; + } + + c->ct = conntrack; + c->next = *ct_list; + *ct_list = c; + c->index = ci->cpt_index; + + decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); + decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); + + conntrack->status = ci->cpt_status; + + memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); + memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); + +#ifdef CONFIG_IP_NF_NAT_NEEDED +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + conntrack->nat.masq_index = ci->cpt_masq_index; +#endif + if (ci->cpt_initialized) { + conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos; + conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before; + conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after; + conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos; + conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before; + conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after; + } + if (conntrack->status & IPS_NAT_DONE_MASK) + ip_nat_hash_conntrack(conntrack); +#endif + + if (ci->cpt_ct_helper) { + conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple); + if (conntrack->helper == NULL) { + eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n"); + err = -EINVAL; + } + } + + ip_conntrack_hash_insert(conntrack); + conntrack->timeout.expires = jiffies + ci->cpt_timeout; + + if (err == 0 && ci->cpt_next > ci->cpt_hdrlen) + err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); + + return err; +} + +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ip_conntrack_image ci; + struct ct_holder *c; + struct ct_holder *ct_list = NULL; + + if (sec == CPT_NULL) + return 0; + + if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) { + eprintk_ctx("conntrack module ct->proto version mismatch\n"); + return -EINVAL; + } + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != 
CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); + if (err) + break; + err = undump_one_ct(&ci, sec, &ct_list, ctx); + if (err) + break; + sec += ci.cpt_next; + } + + while ((c = ct_list) != NULL) { + ct_list = c->next; + if (c->ct) + add_timer(&c->ct->timeout); + kfree(c); + } + + return err; +} + +#else + +#include "cpt_obj.h" +#include "cpt_context.h" + +int rst_restore_ip_conntrack(struct cpt_context * ctx) +{ + if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL) + return -EINVAL; + return 0; +} + +#endif diff -uprN linux-2.6.24/kernel/cpt/rst_context.c linux-2.6.24.ovz/kernel/cpt/rst_context.c --- linux-2.6.24/kernel/cpt/rst_context.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_context.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,323 @@ +/* + * + * kernel/cpt/rst_context.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &file->f_pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? -EIO : err; + return 0; +} + +static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) +{ + mm_segment_t oldfs; + ssize_t err = -EBADF; + struct file *file = ctx->file; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (file) + err = file->f_op->read(file, addr, count, &pos); + set_fs(oldfs); + if (err != count) + return err >= 0 ? 
-EIO : err; + return 0; +} + +static void file_align(struct cpt_context *ctx) +{ + struct file *file = ctx->file; + + if (file) + file->f_pos = CPT_ALIGN(file->f_pos); +} + +int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) +{ + struct cpt_section_hdr hdr; + int err; + loff_t pos; + + pos = ctx->sections[type]; + *start = *end = pos; + + if (pos != CPT_NULL) { + if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) + return err; + if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) + return -EINVAL; + *start = pos + hdr.cpt_hdrlen; + *end = pos + hdr.cpt_next; + } + return 0; +} +EXPORT_SYMBOL(rst_get_section); + +void rst_context_init(struct cpt_context *ctx) +{ + int i; + + memset(ctx, 0, sizeof(*ctx)); + + init_MUTEX(&ctx->main_sem); + ctx->refcount = 1; + + ctx->current_section = -1; + ctx->current_object = -1; + ctx->pagesize = PAGE_SIZE; + ctx->read = file_read; + ctx->pread = file_pread; + ctx->align = file_align; + for (i=0; i < CPT_SECT_MAX; i++) + ctx->sections[i] = CPT_NULL; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + init_completion(&ctx->pgin_notify); +#endif + cpt_object_init(ctx); +} + +static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) +{ + struct cpt_section_hdr h; + + while (start < end) { + int err; + + err = ctx->pread(&h, sizeof(h), ctx, start); + if (err) + return err; + if (h.cpt_hdrlen < sizeof(h) || + h.cpt_next < h.cpt_hdrlen || + start + h.cpt_next > end) + return -EINVAL; + if (h.cpt_section >= CPT_SECT_MAX) + return -EINVAL; + ctx->sections[h.cpt_section] = start; + start += h.cpt_next; + } + return 0; +} + +int rst_open_dumpfile(struct cpt_context *ctx) +{ + int err; + struct cpt_major_tail *v; + struct cpt_major_hdr h; + unsigned long size; + + err = -EBADF; + if (!ctx->file) + goto err_out; + + err = -ENOMEM; + ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); + if (ctx->tmpbuf == NULL) + goto err_out; + __cpt_release_buf(ctx); + + size = ctx->file->f_dentry->d_inode->i_size; + + if (size & 7) { + err = -EINVAL; + goto err_out; + } + if (size < sizeof(struct cpt_major_hdr) + + sizeof(struct cpt_major_tail)) { + err = -EINVAL; + goto err_out; + } + err = ctx->pread(&h, sizeof(h), ctx, 0); + if (err) { + eprintk_ctx("too short image 1 %d\n", err); + goto err_out; + } + if (h.cpt_signature[0] != CPT_SIGNATURE0 || + h.cpt_signature[1] != CPT_SIGNATURE1 || + h.cpt_signature[2] != CPT_SIGNATURE2 || + h.cpt_signature[3] != CPT_SIGNATURE3) { + err = -EINVAL; + goto err_out; + } + if (h.cpt_hz != HZ) { + err = -EINVAL; + eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); + goto err_out; + } + ctx->virt_jiffies64 = h.cpt_start_jiffies64; + ctx->start_time.tv_sec = h.cpt_start_sec; + ctx->start_time.tv_nsec = h.cpt_start_nsec; + ctx->kernel_config_flags = h.cpt_kernel_config[0]; + ctx->iptables_mask = h.cpt_iptables_mask; + if (h.cpt_image_version > CPT_VERSION_20 || + CPT_VERSION_MINOR(h.cpt_image_version) > 1) { + eprintk_ctx("Unknown image version: %x. 
Can't restore.\n", + h.cpt_image_version); + err = -EINVAL; + goto err_out; + } + ctx->image_version = h.cpt_image_version; + ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features); + ctx->image_arch = h.cpt_os_arch; + + v = cpt_get_buf(ctx); + err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); + if (err) { + eprintk_ctx("too short image 2 %d\n", err); + cpt_release_buf(ctx); + goto err_out; + } + if (v->cpt_signature[0] != CPT_SIGNATURE0 || + v->cpt_signature[1] != CPT_SIGNATURE1 || + v->cpt_signature[2] != CPT_SIGNATURE2 || + v->cpt_signature[3] != CPT_SIGNATURE3 || + v->cpt_nsect != CPT_SECT_MAX_INDEX) { + err = -EINVAL; + cpt_release_buf(ctx); + goto err_out; + } + if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) { + cpt_release_buf(ctx); + goto err_out; + } +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + ctx->lazypages = v->cpt_lazypages; +#endif + ctx->tasks64 = v->cpt_64bit; + cpt_release_buf(ctx); + return 0; + +err_out: + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } + return err; +} + +void rst_close_dumpfile(struct cpt_context *ctx) +{ + if (ctx->file) { + fput(ctx->file); + ctx->file = NULL; + } + if (ctx->tmpbuf) { + free_page((unsigned long)ctx->tmpbuf); + ctx->tmpbuf = NULL; + } +} + +int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr *hdr = tmp; + err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); + if (err) + return err; + if (type > 0 && type != hdr->cpt_object) + return -EINVAL; + if (hdr->cpt_hdrlen > hdr->cpt_next) + return -EINVAL; + if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return -EINVAL; + if (size < sizeof(*hdr)) + return -EINVAL; + if (size > hdr->cpt_hdrlen) + size = hdr->cpt_hdrlen; + if (size > sizeof(*hdr)) + err = ctx->pread(hdr+1, size - sizeof(*hdr), + ctx, pos + sizeof(*hdr)); + return err; +} +EXPORT_SYMBOL(_rst_get_object); + +void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) +{ + int err; + void *tmp; + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); + if (err) + return NULL; + if (type > 0 && type != hdr.cpt_object) + return NULL; + if (hdr.cpt_hdrlen > hdr.cpt_next) + return NULL; + if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) + return NULL; + tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); + if (!tmp) + return NULL; + err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); + if (!err) + return tmp; + kfree(tmp); + return NULL; +} +EXPORT_SYMBOL(__rst_get_object); + +__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) +{ + int err; + struct cpt_object_hdr hdr; + __u8 *name; + + err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); + if (err) + return NULL; + if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) + return NULL; + name = (void*)__get_free_page(GFP_KERNEL); + if (!name) + return NULL; + err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, + ctx, *pos_p + hdr.cpt_hdrlen); + if (err) { + free_page((unsigned long)name); + return NULL; + } + *pos_p += hdr.cpt_next; + return name; +} + +__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) +{ + return __rst_get_name(&pos, ctx); +} + +void rst_put_name(__u8 *name, struct cpt_context *ctx) +{ + unsigned long addr = (unsigned long)name; + + if (addr) + free_page(addr&~(PAGE_SIZE-1)); +} diff -uprN linux-2.6.24/kernel/cpt/rst_epoll.c linux-2.6.24.ovz/kernel/cpt/rst_epoll.c --- linux-2.6.24/kernel/cpt/rst_epoll.c 1969-12-31 19:00:00.000000000 
-0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_epoll.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,170 @@ +/* + * + * kernel/cpt/rst_epoll.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +/* Those funcations are static in fs/eventpoll.c */ +extern struct file_operations eventpoll_fops; +extern int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd); +extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); +extern void ep_release_epitem(struct epitem *epi); + + +struct file *cpt_open_epolldev(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + struct file *file; + int efd; + + /* Argument "size" is ignored, use just 1 */ + efd = sys_epoll_create(1); + if (efd < 0) + return ERR_PTR(efd); + + file = fget(efd); + sys_close(efd); + return file; +} + +static int restore_one_epoll(cpt_object_t *obj, + loff_t pos, + struct cpt_epoll_image *ebuf, + cpt_context_t *ctx) +{ + int err = 0; + loff_t endpos; + struct file *file = obj->o_obj; + struct eventpoll *ep; + + if (file->f_op != &eventpoll_fops) { + eprintk_ctx("bad epoll file\n"); + return -EINVAL; + } + + ep = file->private_data; + + if (unlikely(ep == NULL)) { + eprintk_ctx("bad epoll device\n"); + return -EINVAL; + } + + endpos = pos + ebuf->cpt_next; + pos += ebuf->cpt_hdrlen; + while (pos < endpos) { + struct cpt_epoll_file_image efi; + struct epoll_event epds; + + cpt_object_t *tobj; + + err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx); + if (err) + return err; + tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx); + if (!tobj) { + eprintk_ctx("epoll file not found\n"); + return -EINVAL; + } + epds.events = efi.cpt_events; + epds.data = efi.cpt_data; + mutex_lock(&ep->mtx); + err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd); + if (!err) { + struct epitem *epi; + epi = ep_find(ep, tobj->o_obj, efi.cpt_fd); + if (epi) { + if (efi.cpt_ready) { + unsigned long flags; + spin_lock_irqsave(&ep->lock, flags); + if (list_empty(&epi->rdllink)) + list_add_tail(&epi->rdllink, &ep->rdllist); + spin_unlock_irqrestore(&ep->lock, flags); + } + } + } + mutex_unlock(&ep->mtx); + if (err) + break; + pos += efi.cpt_next; + } + return err; +} + +int rst_eventpoll(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_EPOLL]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_epoll_image *ebuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx); + if (obj == NULL) { + eprintk_ctx("cannot find epoll file object\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + err = restore_one_epoll(obj, sec, ebuf, ctx); + cpt_release_buf(ctx); + if (err) + return 
err; + sec += ebuf->cpt_next; + } + + return 0; + +} diff -uprN linux-2.6.24/kernel/cpt/rst_files.c linux-2.6.24.ovz/kernel/cpt/rst_files.c --- linux-2.6.24/kernel/cpt/rst_files.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_files.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1656 @@ +/* + * + * kernel/cpt/rst_files.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" + +#include "cpt_syscalls.h" + + +struct filejob { + struct filejob *next; + int pid; + loff_t fdi; +}; + +static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) +{ + struct filejob *j; + + j = kmalloc(sizeof(*j), GFP_KERNEL); + if (j == NULL) + return -ENOMEM; + j->pid = current->pid; + j->fdi = pos; + j->next = ctx->filejob_queue; + ctx->filejob_queue = j; + return 0; +} + +static void _anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. (Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); + + module_put(THIS_MODULE); +} + +static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + + return kmap(buf->page); +} + +static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); +} + +static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} + +static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + +static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + return 0; +} + +static struct pipe_buf_operations _anon_pipe_buf_ops = { + .can_merge = 1, + .map = _anon_pipe_buf_map, + .unmap = _anon_pipe_buf_unmap, + .release = _anon_pipe_buf_release, + .confirm = _anon_pipe_buf_confirm, + .get = _anon_pipe_buf_get, + .steal = _anon_pipe_buf_steal, +}; + +/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer + * many times. We need to mark it in CPT_OBJ_INODE table in some way. 
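+ * For now fixup_pipe_data() below refuses to touch a pipe that already
+ * has buffered data, and otherwise re-reads the CPT_OBJ_BITS payload
+ * stored after the inode image into freshly allocated pipe pages.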
+ */ +static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi, + struct cpt_context *ctx) +{ + struct inode *ino = file->f_dentry->d_inode; + struct cpt_inode_image ii; + struct cpt_obj_bits b; + struct pipe_inode_info *info; + int err; + int count; + + if (!S_ISFIFO(ino->i_mode)) { + eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode); + return -EINVAL; + } + if (fi->cpt_inode == CPT_NULL) + return 0; + + err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); + if (err) + return err; + + if (ii.cpt_next <= ii.cpt_hdrlen) + return 0; + + err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx); + if (err) + return err; + + if (b.cpt_size == 0) + return 0; + + mutex_lock(&ino->i_mutex); + info = ino->i_pipe; + if (info->nrbufs) { + mutex_unlock(&ino->i_mutex); + eprintk("pipe buffer is restored already\n"); + return -EINVAL; + } + info->curbuf = 0; + count = 0; + while (count < b.cpt_size) { + struct pipe_buffer *buf = info->bufs + info->nrbufs; + void * addr; + int chars; + + chars = b.cpt_size - count; + if (chars > PAGE_SIZE) + chars = PAGE_SIZE; + if (!try_module_get(THIS_MODULE)) { + err = -EBUSY; + break; + } + + buf->page = alloc_page(GFP_HIGHUSER); + if (buf->page == NULL) { + err = -ENOMEM; + break; + } + buf->ops = &_anon_pipe_buf_ops; + buf->offset = 0; + buf->len = chars; + info->nrbufs++; + addr = kmap(buf->page); + err = ctx->pread(addr, chars, ctx, + fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count); + if (err) + break; + count += chars; + } + mutex_unlock(&ino->i_mutex); + + return err; +} + +static int make_flags(struct cpt_file_image *fi) +{ + int flags = O_NOFOLLOW; + switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + flags |= O_RDWR; break; + case FMODE_WRITE: + flags |= O_WRONLY; break; + case FMODE_READ: + flags |= O_RDONLY; break; + default: break; + } + flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC); + flags |= O_NONBLOCK|O_NOCTTY; + return flags; +} + +static struct file *open_pipe(char *name, + struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct cpt_inode_image ii; + struct file *rf, *wf; + + err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); + if (err) + return ERR_PTR(err); + + if (ii.cpt_sb == FSMAGIC_PIPEFS) { + int pfd[2]; + + if ((err = sc_pipe(pfd)) < 0) + return ERR_PTR(err); + + rf = fcheck(pfd[0]); + wf = fcheck(pfd[1]); + get_file(rf); + get_file(wf); + sc_close(pfd[0]); + sc_close(pfd[1]); + + if (fi->cpt_mode&FMODE_READ) { + struct file *tf; + tf = wf; wf = rf; rf = tf; + } + } else { + if (fi->cpt_mode&FMODE_READ) { + rf = filp_open(name, flags, 0); + if (IS_ERR(rf)) { + dprintk_ctx("filp_open\n"); + return rf; + } + dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), + (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode); + return rf; + } + + dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode); + + rf = filp_open(name, O_RDWR|O_NONBLOCK, 0); + if (IS_ERR(rf)) + return rf; + wf = dentry_open(dget(rf->f_dentry), + mntget(rf->f_vfsmnt), flags); + } + + /* Add pipe inode to obj table. 
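+ * The object is keyed by the position of the inode image and keeps the
+ * read side in o_parent as the keeper file for later reopens.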
*/ + obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx); + if (obj == NULL) { + fput(rf); fput(wf); + return ERR_PTR(-ENOMEM); + } + cpt_obj_setpos(obj, fi->cpt_inode, ctx); + obj->o_parent = rf; + + /* Add the other side of the pipe to the obj table; it will not be used + * (o_pos = CPT_NULL), other processes opening the pipe will find the + * inode and open it with dentry_open(). */ + obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx); + if (obj == NULL) { + fput(wf); + return ERR_PTR(-ENOMEM); + } + return wf; +} + +static struct file *open_special(struct cpt_file_image *fi, + unsigned flags, + int deleted, + struct cpt_context *ctx) +{ + struct cpt_inode_image *ii; + struct file *file; + + /* Directories and named pipes are not special actually */ + if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode)) + return NULL; + + /* No support for block devices at the moment. */ + if (S_ISBLK(fi->cpt_i_mode)) + return ERR_PTR(-EINVAL); + + if (S_ISSOCK(fi->cpt_i_mode)) { + eprintk_ctx("bug: socket is not open\n"); + return ERR_PTR(-EINVAL); + } + + /* Support only (some) character devices at the moment. */ + if (!S_ISCHR(fi->cpt_i_mode)) + return ERR_PTR(-EINVAL); + + ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx); + if (ii == NULL) + return ERR_PTR(-ENOMEM); + + /* Do not worry about this right now. /dev/null,zero,*random are here. + * To prohibit at least /dev/mem? + */ + if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) { + kfree(ii); + return NULL; + } + + /* /dev/net/tun will be opened by caller */ + if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) { + kfree(ii); + return NULL; + } + + file = rst_open_tty(fi, ii, flags, ctx); + kfree(ii); + return file; +} + +static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx) +{ + struct file_lock lock; + cpt_object_t *obj; + + memset(&lock, 0, sizeof(lock)); + lock.fl_type = fli->cpt_type; + lock.fl_flags = fli->cpt_flags & ~FL_SLEEP; + lock.fl_start = fli->cpt_start; + lock.fl_end = fli->cpt_end; + obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx); + if (!obj) { + eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner); + return -EINVAL; + } + lock.fl_owner = obj->o_obj; + lock.fl_pid = vpid_to_pid(fli->cpt_pid); + if (lock.fl_pid < 0) { + eprintk_ctx("unknown lock pid %d\n", lock.fl_pid); + return -EINVAL; + } + lock.fl_file = file; + + if (lock.fl_owner == NULL) + eprintk_ctx("no lock owner\n"); + return posix_lock_file(file, &lock, NULL); +} + +static int restore_flock(struct file *file, struct cpt_flock_image *fli, + cpt_context_t *ctx) +{ + int cmd, err, fd; + fd = get_unused_fd(); + if (fd < 0) { + eprintk_ctx("BSD flock cannot be restored\n"); + return fd; + } + get_file(file); + fd_install(fd, file); + if (fli->cpt_type == F_RDLCK) { + cmd = LOCK_SH; + } else if (fli->cpt_type == F_WRLCK) { + cmd = LOCK_EX; + } else { + eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type); + sc_close(fd); + return -EINVAL; + } + + err = sc_flock(fd, LOCK_NB | cmd); + sc_close(fd); + return err; +} + + +static int fixup_posix_locks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_POSIX)) { + err = restore_posix_lock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("posix lock restored\n"); + } + pos += 
fli.cpt_next; + } + return 0; +} + +int rst_posix_locks(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + struct cpt_file_image fi; + + if (obj->o_pos == CPT_NULL) + continue; + + err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); + if (err < 0) + return err; + if (fi.cpt_next > fi.cpt_hdrlen) + fixup_posix_locks(file, &fi, obj->o_pos, ctx); + } + return 0; +} + +static int fixup_flocks(struct file *file, + struct cpt_file_image *fi, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end; + struct cpt_flock_image fli; + + end = pos + fi->cpt_next; + pos += fi->cpt_hdrlen; + while (pos < end) { + err = rst_get_object(-1, pos, &fli, ctx); + if (err) + return err; + if (fli.cpt_object == CPT_OBJ_FLOCK && + (fli.cpt_flags&FL_FLOCK)) { + err = restore_flock(file, &fli, ctx); + if (err) + return err; + dprintk_ctx("bsd lock restored\n"); + } + pos += fli.cpt_next; + } + return 0; +} + + +static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + int err; + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_op->write; + if (do_write == NULL) { + eprintk_ctx("no write method. Cannot restore contents of the file.\n"); + return -EINVAL; + } + + atomic_inc(&file->f_count); + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + goto out; + dprintk_ctx("restoring file data block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + goto out; + } + if (!(file->f_mode & FMODE_WRITE) || + (file->f_flags&O_DIRECT)) { + fput(file); + file = dentry_open(dget(file->f_dentry), + mntget(file->f_vfsmnt), O_WRONLY); + if (IS_ERR(file)) { + __cpt_release_buf(ctx); + return PTR_ERR(file); + } + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + if (err >= 0) + err = -EIO; + goto out; + } + count -= copy; + } + pos += pgb.cpt_next; + } + err = 0; + +out: + fput(file); + return err; +} + + +static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, + struct cpt_inode_image *ii, + struct cpt_context *ctx) +{ + int err; + struct file *file = *file_p; + struct iattr newattrs; + + if (!S_ISREG(fi->cpt_i_mode)) + return 0; + + if (file == NULL) { + file = shmem_file_setup("dev/zero", ii->cpt_size, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + *file_p = file; + } + + if (ii->cpt_next > ii->cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen); + if (err) + return err; + if (hdr.cpt_object == CPT_OBJ_PAGES) { + err = fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen, + fi->cpt_inode+ii->cpt_next, ctx); + if (err) + return err; + } + } + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + /* stage 1 - update size like do_truncate does */ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + newattrs.ia_size = ii->cpt_size; + 
cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime); + err = notify_change(file->f_dentry, &newattrs); + if (err) + goto out; + + /* stage 2 - update times, owner and mode */ + newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME_SET | + ATTR_MODE | ATTR_UID | ATTR_GID; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT; + newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT); + cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime); + cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime); + err = notify_change(file->f_dentry, &newattrs); + +out: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return err; +} + +static int fixup_file_flags(struct file *file, struct cpt_file_image *fi, + int was_dentry_open, loff_t pos, + cpt_context_t *ctx) +{ + if (fi->cpt_pos != file->f_pos) { + int err = -ESPIPE; + if (file->f_op->llseek) + err = file->f_op->llseek(file, fi->cpt_pos, 0); + if (err < 0) { + dprintk_ctx("file %Ld lseek %Ld - %Ld\n", + (long long)pos, + (long long)file->f_pos, + (long long)fi->cpt_pos); + file->f_pos = fi->cpt_pos; + } + } + file->f_uid = fi->cpt_uid; + file->f_gid = fi->cpt_gid; + file->f_owner.pid = 0; + if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) { + file->f_owner.pid = find_get_pid(fi->cpt_fown_pid); + if (file->f_owner.pid == NULL) { + wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", + fi->cpt_fown_pid); + return -EINVAL; + } + } + file->f_owner.uid = fi->cpt_fown_uid; + file->f_owner.euid = fi->cpt_fown_euid; + file->f_owner.signum = fi->cpt_fown_signo; + + if (file->f_mode != fi->cpt_mode) { + if (was_dentry_open && + ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { + file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); + file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); + } + if (file->f_mode != fi->cpt_mode) + wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); + } + if (file->f_flags != fi->cpt_flags) { + if (!(fi->cpt_flags&O_NOFOLLOW)) + file->f_flags &= ~O_NOFOLLOW; + if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { + file->f_flags &= ~O_NONBLOCK; + file->f_flags |= fi->cpt_flags&O_NONBLOCK; + } + if (fi->cpt_flags&FASYNC) { + if (fi->cpt_fown_fd == -1) { + wprintk_ctx("No fd for FASYNC\n"); + return -EINVAL; + } else if (file->f_op && file->f_op->fasync) { + if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { + wprintk_ctx("FASYNC problem\n"); + return -EINVAL; + } else { + file->f_flags |= FASYNC; + } + } + } + if (file->f_flags != fi->cpt_flags) { + eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); + return -EINVAL; + } + } + return 0; +} + +static struct file * +open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, + struct cpt_inode_image *ii, cpt_context_t *ctx) +{ + struct file * file; + char *suffix = NULL; + int attempt = 0; + int tmp_pass = 0; + mode_t mode = fi->cpt_i_mode; + + /* Strip (deleted) part... 
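+ * Both the " (deleted)" suffix and the "(deleted) " prefix forms are
+ * recognized below.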
*/ + if (strlen(name) > strlen(" (deleted)")) { + if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { + suffix = &name[strlen(name) - strlen(" (deleted)")]; + *suffix = 0; + } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { + memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); + suffix = name + strlen(name); + } + } + +try_again: + for (;;) { + if (attempt) { + if (attempt > 1000) { + eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); + return ERR_PTR(-EEXIST); + } + if (suffix == NULL) { + eprintk_ctx("open_deleted: no suffix\n"); + return ERR_PTR(-EEXIST); + } + sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); + } + attempt++; + + if (S_ISFIFO(mode)) { + int err; + err = sc_mknod(name, S_IFIFO|(mode&017777), 0); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = open_pipe(name, fi, flags, ctx); + sc_unlink(name); + } else if (S_ISCHR(mode)) { + int err; + err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_unlink(name); + } else if (S_ISDIR(mode)) { + int err; + err = sc_mkdir(name, mode&017777); + if (err == -EEXIST) + continue; + if (err < 0 && !tmp_pass) + goto change_dir; + if (err < 0) + return ERR_PTR(err); + file = filp_open(name, flags, mode&017777); + sc_rmdir(name); + } else { + file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); + if (IS_ERR(file)) { + if (PTR_ERR(file) == -EEXIST) + continue; + if (!tmp_pass) + goto change_dir; + } else { + sc_unlink(name); + } + } + break; + } + + if (IS_ERR(file)) { + eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); + return file; + } else { + dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode); + } + return file; + +change_dir: + sprintf(name, "/tmp/rst%u", current->pid); + suffix = name + strlen(name); + attempt = 1; + tmp_pass = 1; + goto try_again; +} + +struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) +{ + int err; + int was_dentry_open = 0; + cpt_object_t *obj; + cpt_object_t *iobj; + struct cpt_file_image fi; + __u8 *name = NULL; + struct file *file; + int flags; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); + if (obj) { + file = obj->o_obj; + if (obj->o_index >= 0) { + dprintk_ctx("file is attached to a socket\n"); + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + fixup_file_flags(file, &fi, 0, pos, ctx); + } + get_file(file); + return file; + } + + err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); + if (err < 0) + goto err_out; + + flags = make_flags(&fi); + + /* Easy way, inode has been already open. 
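+ * The keeper file remembered in the inode object's o_parent (e.g. by
+ * open_pipe()) is reopened with dentry_open().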
*/ + if (fi.cpt_inode != CPT_NULL && + !(fi.cpt_lflags & CPT_DENTRY_CLONING) && + (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && + iobj->o_parent) { + struct file *filp = iobj->o_parent; + file = dentry_open(dget(filp->f_dentry), + mntget(filp->f_vfsmnt), flags); + dprintk_ctx("rst_file: file obtained by dentry_open\n"); + was_dentry_open = 1; + goto map_file; + } + + if (fi.cpt_next > fi.cpt_hdrlen) + name = rst_get_name(pos + sizeof(fi), ctx); + + if (!name) { + eprintk_ctx("no name for file?\n"); + err = -EINVAL; + goto err_out; + } + + if (fi.cpt_lflags & CPT_DENTRY_DELETED) { + struct cpt_inode_image ii; + if (fi.cpt_inode == CPT_NULL) { + eprintk_ctx("deleted file and no inode.\n"); + err = -EINVAL; + goto err_out; + } + + err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx); + if (err) + goto err_out; + + if (ii.cpt_next > ii.cpt_hdrlen) { + struct cpt_object_hdr hdr; + err = ctx->pread(&hdr, sizeof(hdr), ctx, + fi.cpt_inode + ii.cpt_hdrlen); + if (err) + goto err_out; + if (hdr.cpt_object == CPT_OBJ_NAME) { + rst_put_name(name, ctx); + name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen, + ctx); + if (!name) { + eprintk_ctx("no name for link?\n"); + err = -EINVAL; + goto err_out; + } + goto open_file; + } + } + + /* One very special case... */ + if (S_ISREG(fi.cpt_i_mode) && + (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { + /* MAP_ANON|MAP_SHARED mapping. + * kernel makes this damn ugly way, when file which + * is passed to mmap by user does not match + * file finally attached to VMA. Ok, rst_mm + * has to take care of this. Otherwise, it will fail. + */ + file = NULL; + } else if (S_ISREG(fi.cpt_i_mode) || + S_ISCHR(fi.cpt_i_mode) || + S_ISFIFO(fi.cpt_i_mode) || + S_ISDIR(fi.cpt_i_mode)) { + if (S_ISCHR(fi.cpt_i_mode)) { + file = open_special(&fi, flags, 1, ctx); + if (file != NULL) + goto map_file; + } + file = open_deleted(name, flags, &fi, &ii, ctx); + if (IS_ERR(file)) + goto out; + } else { + eprintk_ctx("not a regular deleted file.\n"); + err = -EINVAL; + goto err_out; + } + + err = fixup_file_content(&file, &fi, &ii, ctx); + if (err) + goto err_put; + goto map_file; + } else { +open_file: + if (!name[0]) { + eprintk_ctx("empty name for file?\n"); + err = -EINVAL; + goto err_out; + } + if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && + (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) + goto map_file; +#ifdef CONFIG_INOTIFY_USER + if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) && + (file = rst_open_inotify(&fi, flags, ctx)) != NULL) + goto map_file; +#else + if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) { + err = -EINVAL; + goto err_out; + } +#endif + if (S_ISFIFO(fi.cpt_i_mode) && + (file = open_pipe(name, &fi, flags, ctx)) != NULL) + goto map_file; + if (!S_ISREG(fi.cpt_i_mode) && + (file = open_special(&fi, flags, 0, ctx)) != NULL) + goto map_file; + } + + file = filp_open(name, flags, 0); + +map_file: + if (!IS_ERR(file)) { + fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); + + if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { + err = fixup_pipe_data(file, &fi, ctx); + if (err) + goto err_put; + } + + /* This is very special hack. Logically, cwd/root are + * nothing but open directories. Nevertheless, this causes + * failures of restores, when number of open files in VE + * is close to limit. So, if it is rst_file() of cwd/root + * (fd = -2) and the directory is not deleted, we skip + * adding files to object table. If the directory is + * not unlinked, this cannot cause any problems. 
+ */ + if (fd != -2 || + !S_ISDIR(file->f_dentry->d_inode->i_mode) || + (fi.cpt_lflags & CPT_DENTRY_DELETED)) { + obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); + if (!obj) { + obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); + if (obj) + get_file(file); + } + if (obj) + cpt_obj_setpos(obj, pos, ctx); + + obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); + if (obj) { + cpt_obj_setpos(obj, fi.cpt_inode, ctx); + if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED)) + obj->o_parent = file; + } + } + + if (fi.cpt_next > fi.cpt_hdrlen) { + err = fixup_flocks(file, &fi, pos, ctx); + if (err) + goto err_put; + } + } else { + if (fi.cpt_lflags & CPT_DENTRY_PROC) { + dprintk_ctx("rst_file /proc delayed\n"); + file = NULL; + } else if (name) + eprintk_ctx("can't open file %s\n", name); + } + +out: + if (name) + rst_put_name(name, ctx); + return file; + +err_put: + if (file) + fput(file); +err_out: + if (name) + rst_put_name(name, ctx); + return ERR_PTR(err); +} + + +__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + + if (ti->cpt_files == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) + flag |= CLONE_FILES; + if (ti->cpt_fs == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) + flag |= CLONE_FS; + return flag; +} + +static void local_close_files(struct files_struct * files) +{ + int i, j; + + j = 0; + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= files->fdt->max_fds) + break; + set = files->fdt->open_fds->fds_bits[j]; + while (set) { + if (set & 1) { + struct file * file = xchg(&files->fdt->fd[i], NULL); + if (file) + filp_close(file, files); + } + i++; + set >>= 1; + } + files->fdt->open_fds->fds_bits[j] = 0; + files->fdt->close_on_exec->fds_bits[j] = 0; + j++; + } +} + +extern int expand_fdtable(struct files_struct *files, int nr); + + +int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct cpt_files_struct_image fi; + struct files_struct *f = current->files; + cpt_object_t *obj; + loff_t pos, endpos; + int err; + + if (ti->cpt_files == CPT_NULL) { + current->files = NULL; + if (f) + put_files_struct(f); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); + if (obj) { + if (obj->o_obj != f) { + put_files_struct(f); + f = obj->o_obj; + atomic_inc(&f->count); + current->files = f; + } + return 0; + } + + err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); + if (err) + return err; + + local_close_files(f); + + if (fi.cpt_max_fds > f->fdt->max_fds) { + spin_lock(&f->file_lock); + err = expand_fdtable(f, fi.cpt_max_fds-1); + spin_unlock(&f->file_lock); + if (err < 0) + return err; + } + + pos = ti->cpt_files + fi.cpt_hdrlen; + endpos = ti->cpt_files + fi.cpt_next; + while (pos < endpos) { + struct cpt_fd_image fdi; + struct file *filp; + + err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); + if (err) + return err; + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), + (long long)fdi.cpt_file); + return PTR_ERR(filp); + } + if (filp == NULL) { + int err = rst_filejob_queue(pos, ctx); + if (err) + return err; + } else { + if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); + f->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, f->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); + } + pos += fdi.cpt_next; + } + f->next_fd = fi.cpt_next_fd; + + obj = cpt_object_add(CPT_OBJ_FILES, f, ctx); + if 
(obj) { + cpt_obj_setpos(obj, ti->cpt_files, ctx); + cpt_obj_setindex(obj, fi.cpt_index, ctx); + } + return 0; +} + +int rst_do_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + int err; + struct task_struct *tsk; + struct cpt_fd_image fdi; + struct file *filp; + + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(j->pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (!tsk) + return -EINVAL; + + err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); + if (err) { + put_task_struct(tsk); + return err; + } + + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + if (tsk->files->fdt->fd[fdi.cpt_fd] || + FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { + eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi); + put_task_struct(tsk); + return -EBUSY; + } + + filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); + if (IS_ERR(filp)) { + eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file); + put_task_struct(tsk); + return PTR_ERR(filp); + } + if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); + tsk->files->fdt->fd[fdi.cpt_fd] = filp; + FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); + if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) + FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); + + dprintk_ctx("filejob %Ld done\n", j->fdi); + + put_task_struct(tsk); + ctx->filejob_queue = j->next; + kfree(j); + } + return 0; +} + +void rst_flush_filejobs(cpt_context_t *ctx) +{ + struct filejob *j; + + while ((j = ctx->filejob_queue) != NULL) { + ctx->filejob_queue = j->next; + kfree(j); + } +} + +int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct fs_struct *f = current->fs; + cpt_object_t *obj; + + if (ti->cpt_fs == CPT_NULL) { + exit_fs(current); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_fs(current); + f = obj->o_obj; + atomic_inc(&f->count); + current->fs = f; + } + return 0; + } + + /* Do _not_ restore root. Image contains absolute pathnames. + * So, we fix it in context of rst process. 
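+ * (rst_restore_fs() below re-reads the saved root/cwd/altroot entries
+ * with cpt_get_dentry() and installs them into the fs_struct.)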
+ */ + + obj = cpt_object_add(CPT_OBJ_FS, f, ctx); + if (obj) + cpt_obj_setpos(obj, ti->cpt_fs, ctx); + + return 0; +} + +int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, + loff_t *pos, struct cpt_context *ctx) +{ + struct cpt_file_image fi; + struct file * file; + int err; + + err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); + if (err) + return err; + + file = rst_file(*pos, -2, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + + *dp = dget(file->f_dentry); + *mp = mntget(file->f_vfsmnt); + *pos += fi.cpt_next; + fput(file); + return 0; +} + +static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_root; + struct vfsmount *old_rootmnt; + write_lock(&fs->lock); + old_root = fs->root; + old_rootmnt = fs->rootmnt; + fs->rootmnt = mnt; + fs->root = dentry; + write_unlock(&fs->lock); + if (old_root) { + dput(old_root); + mntput(old_rootmnt); + } +} + +static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct dentry *old_pwd; + struct vfsmount *old_pwdmnt; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + old_pwdmnt = fs->pwdmnt; + fs->pwdmnt = mnt; + fs->pwd = dentry; + write_unlock(&fs->lock); + + if (old_pwd) { + dput(old_pwd); + mntput(old_pwdmnt); + } +} + + +int rst_restore_fs(struct cpt_context *ctx) +{ + loff_t pos; + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FS) { + struct cpt_fs_struct_image fi; + struct fs_struct *fs = obj->o_obj; + int i; + struct dentry *d[3]; + struct vfsmount *m[3]; + + err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); + if (err) + return err; + + fs->umask = fi.cpt_umask; + + pos = obj->o_pos + fi.cpt_hdrlen; + d[0] = d[1] = d[2] = NULL; + m[0] = m[1] = m[2] = NULL; + i = 0; + while (pos < obj->o_pos + fi.cpt_next && i<3) { + err = cpt_get_dentry(d+i, m+i, &pos, ctx); + if (err) { + eprintk_ctx("cannot get_dir: %d", err); + for (--i; i >= 0; i--) { + if (d[i]) + dput(d[i]); + if (m[i]) + mntput(m[i]); + } + return err; + } + i++; + } + if (d[0]) + __set_fs_root(fs, m[0], d[0]); + if (d[1]) + __set_fs_pwd(fs, m[1], d[1]); + if (d[2]) { + struct dentry *olddentry; + struct vfsmount *oldmnt; + write_lock(&fs->lock); + oldmnt = fs->altrootmnt; + olddentry = fs->altroot; + fs->altrootmnt = m[2]; + fs->altroot = d[2]; + write_unlock(&fs->lock); + + if (olddentry) { + dput(olddentry); + mntput(oldmnt); + } + } + } + return err; +} + +int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, + unsigned long flags, unsigned long mnt_flags, + struct cpt_context *ctx) +{ + int err; + + if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) + mntbind = NULL; + + if (mntbind) + flags |= MS_BIND; + /* Join per-mountpoint flags with global flags */ + if (mnt_flags & MNT_NOSUID) + flags |= MS_NOSUID; + if (mnt_flags & MNT_NODEV) + flags |= MS_NODEV; + if (mnt_flags & MNT_NOEXEC) + flags |= MS_NOEXEC; + + err = sc_mount(mntbind, mntpnt, mnttype, flags); + if (err < 0) { + eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); + return err; + } + return 0; +} + +static int undumptmpfs(void *arg) +{ + int i; + int *pfd = arg; + int fd1, fd2, err; + char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; + + if (pfd[0] != 0) + sc_dup2(pfd[0], 0); + + set_fs(KERNEL_DS); + fd1 = sc_open("/dev/null", O_WRONLY, 0); + fd2 = sc_open("/dev/null", O_WRONLY, 0); +try: + if (fd1 < 0 || fd2 < 0) { + if (fd1 == -ENOENT && fd2 == -ENOENT) { + err = sc_mknod("/dev/null", S_IFCHR|0666, + 
			       new_encode_dev((MEM_MAJOR<<8)|3));
+			if (err < 0) {
+				module_put(THIS_MODULE);
+				return 255 << 8;
+			}
+			fd1 = sc_open("/dev/null", O_WRONLY, 0);
+			fd2 = sc_open("/dev/null", O_WRONLY, 0);
+			goto try;
+		}
+		module_put(THIS_MODULE);
+		return 255 << 8;
+	}
+	if (fd1 != 1)
+		sc_dup2(fd1, 1);
+	if (fd2 != 2)
+		sc_dup2(fd2, 2);
+
+	for (i = 3; i < current->files->fdt->max_fds; i++)
+		sc_close(i);
+
+	module_put(THIS_MODULE);
+
+	i = sc_execve("/bin/tar", argv, NULL);
+	eprintk("failed to exec /bin/tar: %d\n", i);
+	return 255 << 8;
+}
+
+static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx)
+{
+	int err;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	int n;
+	loff_t end;
+	int pid;
+	int status;
+	mm_segment_t oldfs;
+	sigset_t ignore, blocked;
+
+	err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx);
+	if (err < 0)
+		return err;
+
+	err = sc_pipe(pfd);
+	if (err < 0)
+		return err;
+	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+	sigprocmask(SIG_BLOCK, &ignore, &blocked);
+	pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0);
+	if (err < 0) {
+		eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
+		goto out;
+	}
+	f = fget(pfd[1]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	ctx->file->f_pos = *pos + v.cpt_hdrlen;
+	end = *pos + v.cpt_next;
+	*pos += v.cpt_next;
+	do {
+		char buf[16];
+
+		n = end - ctx->file->f_pos;
+		if (n > sizeof(buf))
+			n = sizeof(buf);
+
+		if (ctx->read(buf, n, ctx))
+			break;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		f->f_op->write(f, buf, n, &f->f_pos);
+		set_fs(oldfs);
+	} while (ctx->file->f_pos < end);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("tar exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("tar terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+	return err;
+
+out:
+	if (pfd[1] >= 0)
+		sc_close(pfd[1]);
+	if (pfd[0] >= 0)
+		sc_close(pfd[0]);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+	return err;
+}
+
+int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx)
+{
+	struct mnt_namespace *n;
+	struct list_head *p;
+	struct vfsmount *t;
+	char *path, *path_buf;
+	int ret;
+
+	n = current->nsproxy->mnt_ns;
+	ret = -ENOENT;
+	path_buf = cpt_get_buf(ctx);
+	down_read(&namespace_sem);
+	list_for_each(p, &n->list) {
+		t = list_entry(p, struct vfsmount, mnt_list);
+		path = d_path(t->mnt_root, t, path_buf, PAGE_SIZE);
+		if (IS_ERR(path))
+			continue;
+		if (!strcmp(path, mntpnt) &&
+		    !strcmp(t->mnt_sb->s_type->name, mnttype)) {
+			ret = 0;
+			break;
+		}
+	}
+	up_read(&namespace_sem);
+	__cpt_release_buf(ctx);
+	return ret;
+}
+
+int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx)
+{
+	int err;
+	loff_t endpos;
+
+	endpos = pos + mi->cpt_next;
+	pos += mi->cpt_hdrlen;
+
+	while (pos < endpos) {
+		char *mntdev;
+		char *mntpnt;
+		char *mnttype;
+		char *mntbind;
+
+		mntdev = __rst_get_name(&pos, ctx);
+		mntpnt = __rst_get_name(&pos, ctx);
+		mnttype = __rst_get_name(&pos, ctx);
+		mntbind = NULL;
+		if (mi->cpt_mntflags & CPT_MNT_BIND)
+			mntbind = __rst_get_name(&pos, ctx);
+		err = -EINVAL;
+		if (mnttype && mntpnt) {
+			err = 0;
+			if (!(mi->cpt_mntflags & CPT_MNT_EXT) &&
+			    strcmp(mntpnt, "/")) {
+				err = do_one_mount(mntpnt, mnttype, mntbind,
+						   mi->cpt_flags,
+						   mi->cpt_mntflags, ctx);
+				if (!err &&
+				    strcmp(mnttype, "tmpfs") == 0 &&
+				    !(mi->cpt_mntflags & (CPT_MNT_BIND)))
+					err = rst_restore_tmpfs(&pos, ctx);
+			} else if (mi->cpt_mntflags & CPT_MNT_EXT) {
+				err = check_ext_mount(mntpnt, mnttype, ctx);
+				if (err)
+					eprintk_ctx("mount point is missing: %s\n", mntpnt);
+			}
+		}
+		if (mntdev)
+			rst_put_name(mntdev, ctx);
+		if (mntpnt)
+			rst_put_name(mntpnt, ctx);
+ if (mnttype) + rst_put_name(mnttype, ctx); + if (mntbind) + rst_put_name(mntbind, ctx); + if (err) + return err; + } + return 0; +} + +int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx) +{ + int err; + struct cpt_vfsmount_image mi; + + while (pos < endpos) { + err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); + if (err) + return err; + err = restore_one_vfsmount(&mi, pos, ctx); + if (err) + return err; + pos += mi.cpt_next; + } + return 0; +} + +int rst_root_namespace(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_NAMESPACE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr sbuf; + int done = 0; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); + if (err) + return err; + if (done) { + eprintk_ctx("multiple namespaces are not supported\n"); + break; + } + done++; + err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + + return 0; +} + +int rst_stray_files(struct cpt_context *ctx) +{ + int err = 0; + loff_t sec = ctx->sections[CPT_SECT_FILES]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_object_hdr sbuf; + cpt_object_t *obj; + + err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); + if (err) + break; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); + if (!obj) { + struct file *file; + + dprintk_ctx("stray file %Ld\n", sec); + + file = rst_sysv_shm(sec, ctx); + + if (IS_ERR(file)) { + eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } else { + fput(file); + } + } + sec += sbuf.cpt_next; + } + + return err; +} diff -uprN linux-2.6.24/kernel/cpt/rst_inotify.c linux-2.6.24.ovz/kernel/cpt/rst_inotify.c --- linux-2.6.24/kernel/cpt/rst_inotify.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_inotify.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,198 @@ +/* + * + * kernel/cpt/rst_inotify.c + * + * Copyright (C) 2000-2007 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_kernel.h" +#include "cpt_fsmagic.h" +#include "cpt_syscalls.h" + +extern struct file_operations inotify_fops; + +struct file *rst_open_inotify(struct cpt_file_image *fi, + unsigned flags, + struct cpt_context *ctx) +{ + struct file *file; + int fd; + + fd = sys_inotify_init(); + if (fd < 0) + return ERR_PTR(fd); + + file = fget(fd); + sys_close(fd); + return file; +} + +static int restore_one_inotify(cpt_object_t *obj, + loff_t pos, + struct cpt_inotify_image *ibuf, + cpt_context_t *ctx) +{ + int err = 0; + loff_t endpos; + struct file *file = obj->o_obj; + struct inotify_device *dev; + + if (file->f_op != &inotify_fops) { + eprintk_ctx("bad inotify file\n"); + return -EINVAL; + } + + dev = file->private_data; + + if (unlikely(dev == NULL)) { + eprintk_ctx("bad inotify device\n"); + return -EINVAL; + } + + endpos = pos + ibuf->cpt_next; + pos += ibuf->cpt_hdrlen; + while (pos < endpos) { + union { + struct cpt_inotify_wd_image wi; + struct cpt_inotify_ev_image ei; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + eprintk_ctx("rst_get_object: %d\n", err); + return err; + } + if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) { + struct dentry *d; + struct vfsmount *mnt; + loff_t fpos = pos + u.wi.cpt_hdrlen; + + err = cpt_get_dentry(&d, &mnt, &fpos, ctx); + if (err) { + eprintk_ctx("cpt_get_dentry: %d\n", err); + return err; + } + + mutex_lock(&dev->up_mutex); + dev->ih->last_wd = u.wi.cpt_wd - 1; + err = inotify_create_watch(dev, d, mnt, u.wi.cpt_mask); + dev->ih->last_wd = ibuf->cpt_last_wd; + if (err != u.wi.cpt_wd) { + eprintk_ctx("wrong inotify descriptor %u %u\n", err, u.wi.cpt_wd); + if (err >= 0) + err = -EINVAL; + } else + err = 0; + mutex_unlock(&dev->up_mutex); + dput(d); + mntput(mnt); + if (err) + break; + } else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) { + struct inotify_user_watch dummy_watch; + struct inotify_watch *w; + char *name = NULL; + + if (u.ei.cpt_namelen) { + name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL); + if (name == NULL) { + err = -ENOMEM; + break; + } + name[u.ei.cpt_namelen] = 0; + err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen); + if (err) { + kfree(name); + break; + } + } + + w = &dummy_watch.wdata; + dummy_watch.dev = dev; + atomic_set(&w->count, 2); + + /* Trick to avoid destruction due to exit event */ + if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT)) + atomic_inc(&w->count); + dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask, + u.ei.cpt_cookie, name, NULL); + if (name) + kfree(name); + } else { + eprintk_ctx("bad object: %u\n", u.wi.cpt_object); + err = -EINVAL; + break; + } + pos += u.wi.cpt_next; + } + return err; +} + +int rst_inotify(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_INOTIFY]; + loff_t endsec; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_inotify_image ibuf; + + err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx); + if (err) + return err; + obj = 
lookup_cpt_obj_bypos(CPT_OBJ_FILE, ibuf.cpt_file, ctx); + if (obj == NULL) { + eprintk_ctx("cannot find inotify file object\n"); + return -EINVAL; + } + err = restore_one_inotify(obj, sec, &ibuf, ctx); + if (err) + return err; + sec += ibuf.cpt_next; + } + + return 0; + +} diff -uprN linux-2.6.24/kernel/cpt/rst_mm.c linux-2.6.24.ovz/kernel/cpt/rst_mm.c --- linux-2.6.24/kernel/cpt/rst_mm.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_mm.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1144 @@ +/* + * + * kernel/cpt/rst_mm.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#include +#endif +#include +#include +#include +#include + +#ifdef CONFIG_VE +#include +#include +#endif + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_ubc.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" +#ifdef CONFIG_VZ_CHECKPOINT_LAZY +#include "cpt_pagein.h" +#endif + +#include "cpt_syscalls.h" + +#define __PAGE_NX (1ULL<<63) + +static unsigned long make_prot(struct cpt_vma_image *vmai) +{ + unsigned long prot = 0; + + if (vmai->cpt_flags&VM_READ) + prot |= PROT_READ; + if (vmai->cpt_flags&VM_WRITE) + prot |= PROT_WRITE; + if (vmai->cpt_flags&VM_EXEC) + prot |= PROT_EXEC; + if (vmai->cpt_flags&VM_GROWSDOWN) + prot |= PROT_GROWSDOWN; + if (vmai->cpt_flags&VM_GROWSUP) + prot |= PROT_GROWSUP; + return prot; +} + +static unsigned long make_flags(struct cpt_vma_image *vmai) +{ + unsigned long flags = MAP_FIXED; + + if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) + flags |= MAP_SHARED; + else + flags |= MAP_PRIVATE; + + if (vmai->cpt_file == CPT_NULL) + flags |= MAP_ANONYMOUS; + if (vmai->cpt_flags&VM_GROWSDOWN) + flags |= MAP_GROWSDOWN; +#ifdef MAP_GROWSUP + if (vmai->cpt_flags&VM_GROWSUP) + flags |= MAP_GROWSUP; +#endif + if (vmai->cpt_flags&VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vmai->cpt_flags&VM_EXECUTABLE) + flags |= MAP_EXECUTABLE; + if (!(vmai->cpt_flags&VM_ACCOUNT)) + flags |= MAP_NORESERVE; + return flags; +} + +#ifdef CONFIG_X86 +#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \ + && !defined(CONFIG_XEN) +static int __alloc_ldt(mm_context_t *pc, int mincount) +{ + int oldsize, newsize, nr; + + if (mincount <= pc->size) + return 0; + /* + * LDT got larger - reallocate if necessary. 
+ */ + oldsize = pc->size; + mincount = (mincount+511)&(~511); + newsize = mincount*LDT_ENTRY_SIZE; + for (nr = 0; nr * PAGE_SIZE < newsize; nr++) { + BUG_ON(nr * PAGE_SIZE >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); + if (!pc->ldt_pages[nr]) + goto nomem; + clear_highpage(pc->ldt_pages[nr]); + } + } + pc->size = mincount; + return 0; + +nomem: + while (--nr >= 0) + __free_page(pc->ldt_pages[nr]); + pc->size = 0; + return -ENOMEM; +} + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int i; + int err; + int size; + + err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); + if (err) + return err; + + size = mm->context.size*LDT_ENTRY_SIZE; + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); + kunmap(mm->context.ldt_pages[nr]); + if (err) + return err; + } + + load_LDT(&mm->context); + return 0; +} + +#else + +static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) +{ + struct mm_struct *mm = current->mm; + int oldsize = mm->context.size; + void *oldldt; + void *newldt; + int err; + + if (li->cpt_size > PAGE_SIZE) + newldt = vmalloc(li->cpt_size); + else + newldt = kmalloc(li->cpt_size, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); + if (err) + return err; + + oldldt = mm->context.ldt; + mm->context.ldt = newldt; + mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; + + load_LDT(&mm->context); + + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} +#endif +#endif + +static int +restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) +{ + struct aio_ring_info *info = &aio_ctx->ring_info; + unsigned nr_events = aio_ctx->max_reqs; + unsigned long size; + int nr_pages; + + /* We recalculate parameters of the ring exactly like + * fs/aio.c does and then compare calculated values + * with ones, stored in dump. They must be the same. */ + + nr_events += 2; + + size = sizeof(struct aio_ring); + size += sizeof(struct io_event) * nr_events; + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + + if (nr_pages != aimg->cpt_ring_pages) + return -EINVAL; + + info->nr_pages = nr_pages; + + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + + if (nr_events != aimg->cpt_nr) + return -EINVAL; + + info->nr = 0; + info->ring_pages = info->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + if (!info->ring_pages) + return -ENOMEM; + memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + } + + info->mmap_size = nr_pages * PAGE_SIZE; + + /* This piece of shit is not entirely my fault. Kernel aio.c makes + * something odd mmap()ping some pages and then pinning them. + * I guess it is just some mud remained of failed attempt to show ring + * to user space. The result is odd. :-) Immediately after + * creation of AIO context, kernel shares those pages with user + * and user can read and even write there. But after the first + * fork, pages are marked COW with evident consequences. 
+	 * I remember, I did the same mistake in the first version
+	 * of mmapped packet socket, luckily that crap never reached
+	 * mainstream.
+	 *
+	 * So, what are we going to do? I can simulate this odd behaviour
+	 * exactly, but I am not insane yet. For now just take the pages
+	 * from user space. Alternatively, we could keep kernel copy
+	 * in AIO context image, which would be more correct.
+	 *
+	 * What is wrong now? If the pages are COWed, ring is transferred
+	 * incorrectly.
+	 */
+	down_read(&current->mm->mmap_sem);
+	info->mmap_base = aimg->cpt_mmap_base;
+	info->nr_pages = get_user_pages(current, current->mm,
+					info->mmap_base, nr_pages,
+					1, 0, info->ring_pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (unlikely(info->nr_pages != nr_pages)) {
+		int i;
+
+		for (i=0; i<info->nr_pages; i++)
+			put_page(info->ring_pages[i]);
+		if (info->ring_pages && info->ring_pages != info->internal_pages)
+			kfree(info->ring_pages);
+		return -EFAULT;
+	}
+
+	aio_ctx->user_id = info->mmap_base;
+
+	info->nr = nr_events;
+	info->tail = aimg->cpt_tail;
+
+	return 0;
+}
+
+static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
+{
+	int err;
+	struct kioctx *aio_ctx;
+	extern spinlock_t aio_nr_lock;
+
+	aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+	if (!aio_ctx)
+		return -ENOMEM;
+
+	memset(aio_ctx, 0, sizeof(*aio_ctx));
+	aio_ctx->max_reqs = aimg->cpt_max_reqs;
+
+	if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
+		kmem_cache_free(kioctx_cachep, aio_ctx);
+		eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
+		return err;
+	}
+
+	aio_ctx->mm = current->mm;
+	atomic_inc(&aio_ctx->mm->mm_count);
+	atomic_set(&aio_ctx->users, 1);
+	spin_lock_init(&aio_ctx->ctx_lock);
+	spin_lock_init(&aio_ctx->ring_info.ring_lock);
+	init_waitqueue_head(&aio_ctx->wait);
+	INIT_LIST_HEAD(&aio_ctx->active_reqs);
+	INIT_LIST_HEAD(&aio_ctx->run_list);
+	INIT_WORK(&aio_ctx->wq.work, aio_kick_handler);
+
+	spin_lock(&aio_nr_lock);
+	aio_nr += aio_ctx->max_reqs;
+	spin_unlock(&aio_nr_lock);
+
+	write_lock(&aio_ctx->mm->ioctx_list_lock);
+	aio_ctx->next = aio_ctx->mm->ioctx_list;
+	aio_ctx->mm->ioctx_list = aio_ctx;
+	write_unlock(&aio_ctx->mm->ioctx_list_lock);
+
+	return 0;
+}
+
+struct anonvma_map
+{
+	struct hlist_node list;
+	struct anon_vma *avma;
+	__u64 id;
+};
+
+static int verify_create_anonvma(struct mm_struct *mm,
+				 struct cpt_vma_image *vmai,
+				 cpt_context_t *ctx)
+{
+	struct anon_vma *avma = NULL;
+	struct anon_vma *new_avma;
+	struct vm_area_struct *vma;
+	int h;
+
+	if (!ctx->anonvmas) {
+		if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
+			return -EINVAL;
+		if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
+			return -ENOMEM;
+		for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
+			INIT_HLIST_HEAD(&ctx->anonvmas[h]);
+	} else {
+		struct anonvma_map *map;
+		struct hlist_node *elem;
+
+		h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
+		hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
+			if (map->id == vmai->cpt_anonvmaid) {
+				avma = map->avma;
+				break;
+			}
+		}
+	}
+
+	down_read(&mm->mmap_sem);
+	if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
+		up_read(&mm->mmap_sem);
+		return -ESRCH;
+	}
+	if (vma->vm_start != vmai->cpt_start) {
+		up_read(&mm->mmap_sem);
+		eprintk_ctx("vma start mismatch\n");
+		return -EINVAL;
+	}
+	if (vma->vm_pgoff != vmai->cpt_pgoff) {
+		dprintk_ctx("vma pgoff mismatch, fixing\n");
+		if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) {
+			eprintk_ctx("cannot fixup vma pgoff\n");
+			up_read(&mm->mmap_sem);
return -EINVAL; + } + vma->vm_pgoff = vmai->cpt_pgoff; + } + + if (!vma->anon_vma) { + if (avma) { + vma->anon_vma = avma; + anon_vma_link(vma); + } else { + int err; + + err = anon_vma_prepare(vma); + + if (err) { + up_read(&mm->mmap_sem); + return err; + } + } + } else { + /* Note, we _can_ arrive to the situation, when two + * different anonvmaid's point to one anon_vma, this happens + * f.e. when mmap() merged new area to previous one and + * they will share one anon_vma even if they did not on + * original host. + * + * IT IS OK. To all that I understand, we may merge all + * the anon_vma's and rmap can scan all the huge list of vmas + * searching for page. It is just "suboptimal". + * + * Real disaster would happen, if vma already got an anon_vma + * with different id. It is very rare case, kernel does the + * best efforts to merge anon_vmas when some attributes are + * different. In this case we will fall to copying memory. + */ + if (avma && vma->anon_vma != avma) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch\n"); + return 0; + } + } + + new_avma = vma->anon_vma; + up_read(&mm->mmap_sem); + + if (!avma) { + struct anonvma_map *map; + + if (!new_avma) + return -EINVAL; + + if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) + return -ENOMEM; + + map->id = vmai->cpt_anonvmaid; + map->avma = new_avma; + h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); + hlist_add_head(&map->list, &ctx->anonvmas[h]); + } + return 0; +} + +static int copy_mm_pages(struct mm_struct *src, unsigned long start, + unsigned long end) +{ + int err; + + for (; start < end; start += PAGE_SIZE) { + struct page *page; + struct page *spage; + void *maddr, *srcaddr; + + err = get_user_pages(current, current->mm, + start, 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) + return err; + + err = get_user_pages(current, src, + start, 1, 0, 1, &spage, NULL); + + if (err == 0) + err = -EFAULT; + if (err < 0) { + page_cache_release(page); + return err; + } + + srcaddr = kmap(spage); + maddr = kmap(page); + memcpy(maddr, srcaddr, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + kunmap(spage); + page_cache_release(page); + page_cache_release(spage); + } + return 0; +} + +static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) +{ + int err = 0; + unsigned long addr; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct file *file = NULL; + unsigned long prot; + int checked = 0; + + if (vmai->cpt_type == CPT_VMA_VDSO) { + if (ctx->vdso == NULL) { +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES + err = arch_setup_additional_pages(NULL, 0, + vmai->cpt_start); +#endif + goto out; + } + } + + prot = make_prot(vmai); + + if (vmai->cpt_file != CPT_NULL) { + if (vmai->cpt_type == CPT_VMA_TYPE_0) { + file = rst_file(vmai->cpt_file, -1, ctx); + if (IS_ERR(file)) { + eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file); + return PTR_ERR(file); + } + } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { + file = rst_sysv_shm(vmai->cpt_file, ctx); + if (IS_ERR(file)) + return PTR_ERR(file); + } + } + + down_write(&mm->mmap_sem); + addr = do_mmap_pgoff(file, vmai->cpt_start, + vmai->cpt_end-vmai->cpt_start, + prot, make_flags(vmai), + vmai->cpt_pgoff); + + if (addr != vmai->cpt_start) { + up_write(&mm->mmap_sem); + + err = -EINVAL; + if (IS_ERR((void*)addr)) + err = addr; + goto out; + } + + vma = find_vma(mm, vmai->cpt_start); + if (vma == NULL) { + up_write(&mm->mmap_sem); + 
eprintk_ctx("cannot find mmapped vma\n"); + err = -ESRCH; + goto out; + } + + /* do_mmap_pgoff() can merge new area to previous one (not to the next, + * we mmap in order, the rest of mm is still unmapped). This can happen + * f.e. if flags are to be adjusted later, or if we had different + * anon_vma on two adjacent regions. Split it by brute force. */ + if (vma->vm_start != vmai->cpt_start) { + dprintk_ctx("vma %Ld merged, split\n", vmapos); + err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); + if (err) { + up_write(&mm->mmap_sem); + eprintk_ctx("cannot split vma\n"); + goto out; + } + } + up_write(&mm->mmap_sem); + + if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { + err = verify_create_anonvma(mm, vmai, ctx); + if (err) { + eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos); + goto out; + } + } + + if (vmai->cpt_type == CPT_VMA_VDSO) { + struct page *page; + void *maddr; + + err = get_user_pages(current, current->mm, + (unsigned long)vmai->cpt_start, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("can't get vdso: get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + memcpy(maddr, ctx->vdso, PAGE_SIZE); + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + goto out; + } + + if (vmai->cpt_next > vmai->cpt_hdrlen) { + loff_t offset = vmapos + vmai->cpt_hdrlen; + + do { + union { + struct cpt_page_block pb; + struct cpt_remappage_block rpb; + struct cpt_copypage_block cpb; + struct cpt_lazypage_block lpb; + struct cpt_iterpage_block ipb; + } u; + loff_t pos; + + err = rst_get_object(-1, offset, &u, ctx); + if (err) { + eprintk_ctx("vma fix object: %d\n", err); + goto out; + } + if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { + err = sc_remap_file_pages(u.rpb.cpt_start, + u.rpb.cpt_end-u.rpb.cpt_start, + 0, u.rpb.cpt_pgoff, 0); + if (err < 0) { + eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, + (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), + (__u32)u.rpb.cpt_pgoff); + goto out; + } + offset += u.rpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + unsigned long ptr = u.lpb.cpt_start; + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE, + ptr, ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) { + struct vm_area_struct *vma, *vma1; + struct mm_struct *src; + struct anon_vma *src_anon; + cpt_object_t *mobj; + + if (!vmai->cpt_anonvmaid) { + err = -EINVAL; + eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); + if (!mobj) { + eprintk_ctx("lost mm_struct to clone pages from\n"); + err = -ESRCH; + goto out; + } + src = mobj->o_obj; + + down_read(&src->mmap_sem); + src_anon = NULL; + vma1 = find_vma(src, u.cpb.cpt_start); + if (vma1) + src_anon = vma1->anon_vma; + up_read(&src->mmap_sem); + + if (!vma1) { + eprintk_ctx("lost src vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, 
u.cpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + + if (!src_anon || + !vma->anon_vma || + vma->anon_vma != src_anon || + vma->vm_start - vma1->vm_start != + (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { + up_read(&mm->mmap_sem); + wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); + err = copy_mm_pages(mobj->o_obj, + u.cpb.cpt_start, + u.cpb.cpt_end); + } else { + err = __copy_page_range(vma, vma1, + u.cpb.cpt_start, + u.cpb.cpt_end-u.cpb.cpt_start); + up_read(&mm->mmap_sem); + } + if (err) { + eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, + (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), + (long)u.cpb.cpt_source); + goto out; + } + + offset += u.cpb.cpt_next; + continue; + } else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES || + u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES + ) { +#ifdef CONFIG_VZ_CHECKPOINT_ITER + unsigned long ptr = u.lpb.cpt_start; + u64 page_pos[16]; + pos = offset + sizeof(u.pb); + + err = ctx->pread(&page_pos, + 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE, + ctx, + pos); + if (err) { + eprintk_ctx("Oops\n"); + goto out; + } + + down_read(&mm->mmap_sem); + if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { + up_read(&mm->mmap_sem); + eprintk_ctx("lost vm_area_struct\n"); + err = -ESRCH; + goto out; + } + err = anon_vma_prepare(vma); + if (err) { + up_read(&mm->mmap_sem); + goto out; + } + while (ptr < u.lpb.cpt_end) { + err = rst_iter(vma, + page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE], + ptr, + ctx); + if (err) + break; + ptr += PAGE_SIZE; + } + if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) { + make_pages_present((unsigned long)u.lpb.cpt_start, + (unsigned long)u.lpb.cpt_end); + } + up_read(&mm->mmap_sem); +#else + err = -EINVAL; +#endif + if (err) + goto out; + offset += u.cpb.cpt_next; + continue; + } + if (u.pb.cpt_object != CPT_OBJ_PAGES) { + eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); + err = -EINVAL; + goto out; + } + pos = offset + sizeof(u.pb); + if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { + /* I guess this is get_user_pages() messed things, + * this happens f.e. when gdb inserts breakpoints. 
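+			 * For these read-only, unaccounted areas the saved page
+			 * contents are pushed in through get_user_pages() and
+			 * kmap() below, instead of temporarily making the
+			 * region writable as the else branch does.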
+ */ + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { + struct page *page; + void *maddr; + err = get_user_pages(current, current->mm, + (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, + 1, 1, 1, &page, NULL); + if (err == 0) + err = -EFAULT; + if (err < 0) { + eprintk_ctx("get_user_pages: %d\n", err); + goto out; + } + err = 0; + maddr = kmap(page); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + memset(maddr, 0, PAGE_SIZE); + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + err = ctx->pread(maddr, PAGE_SIZE, + ctx, pos + i*PAGE_SIZE); + if (err) { + kunmap(page); + goto out; + } + } else { + err = -EINVAL; + kunmap(page); + goto out; + } + set_page_dirty_lock(page); + kunmap(page); + page_cache_release(page); + } + } else { + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + if (u.pb.cpt_content == CPT_CONTENT_VOID) { + int i; + for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { + err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); + if (err) { + eprintk_ctx("__put_user 2 %d\n", err); + goto out; + } + } + } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { + loff_t tpos = pos; + err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), + u.pb.cpt_end-u.pb.cpt_start, + &tpos); + if (err != u.pb.cpt_end-u.pb.cpt_start) { + if (err >= 0) + err = -EIO; + goto out; + } + } else { + err = -EINVAL; + goto out; + } + if (!(prot&PROT_WRITE)) + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + } + err = 0; + offset += u.pb.cpt_next; + } while (offset < vmapos + vmai->cpt_next); + } + +check: + do { + struct vm_area_struct *vma; + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + if (vma) { + if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { + VM_ClearReadHint(vma); + vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; + } + if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { + dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos); + up_read(&mm->mmap_sem); + if (vma->vm_flags&VM_LOCKED) + err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + else + err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); + /* When mlock fails with EFAULT, it means + * that it could not bring in pages. + * It can happen after mlock() on unreadable + * VMAs. But VMA is correctly locked, + * so that this error can be ignored. */ + if (err == -EFAULT) + err = 0; + if (err) + goto out; + goto check; + } + if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (unsigned long long)vma->vm_page_prot.pgprot, + (unsigned long long)vmai->cpt_pgprot); +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && + (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) + wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, + (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); +#endif + if (vma->vm_flags != vmai->cpt_flags) { + unsigned long x = vma->vm_flags ^ vmai->cpt_flags; + if (x & VM_EXEC) { + /* Crap. On i386 this is OK. + * It is impossible to make via mmap/mprotect + * exec.c clears VM_EXEC on stack. 
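+					 * so simply clear VM_EXEC on the
+					 * restored VMA to match the saved flags.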
*/ + vma->vm_flags &= ~VM_EXEC; + } else if ((x & VM_ACCOUNT) && !checked) { + checked = 1; + if (!(prot&PROT_WRITE)) { + up_read(&mm->mmap_sem); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); + sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); + goto check; + } + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } else { + wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, + (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); + } + } + } else { + wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); + } + up_read(&mm->mmap_sem); + } while (0); + +out: + if (file) + fput(file); + return err; +} + +#ifndef CONFIG_IA64 +#define TASK_UNMAP_START 0 +#else +/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping + * used to accelerate speculative dereferences of NULL pointer. */ +#define TASK_UNMAP_START PAGE_SIZE +#endif + +static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) +{ + int err = 0; + unsigned int def_flags; + struct mm_struct *mm = current->mm; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + down_write(&mm->mmap_sem); + do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START); + +#ifdef CONFIG_BEANCOUNTERS + /* + * MM beancounter is usually correct from the fork time, + * but not for init, for example. + * Luckily, mm_ub can be changed for a completely empty MM. + */ + bc = rst_lookup_ubc(vmi->cpt_mmub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc); + if (err & NOTIFY_FAIL) { + up_write(&mm->mmap_sem); + return -ECHRNG; + } + if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) { + struct user_beancounter *old_bc; + + old_bc = mm->mm_ub; + mm->mm_ub = bc; + bc = old_bc; + } + err = 0; + put_beancounter(bc); +#endif + + mm->start_code = vmi->cpt_start_code; + mm->end_code = vmi->cpt_end_code; + mm->start_data = vmi->cpt_start_data; + mm->end_data = vmi->cpt_end_data; + mm->start_brk = vmi->cpt_start_brk; + mm->brk = vmi->cpt_brk; + mm->start_stack = vmi->cpt_start_stack; + mm->arg_start = vmi->cpt_start_arg; + mm->arg_end = vmi->cpt_end_arg; + mm->env_start = vmi->cpt_start_env; + mm->env_end = vmi->cpt_end_env; + mm->def_flags = 0; + def_flags = vmi->cpt_def_flags; + + mm->flags = vmi->cpt_dumpable; + if (ctx->image_version < CPT_VERSION_24) + mm->flags |= MMF_DUMP_FILTER_DEFAULT << MMF_DUMPABLE_BITS; + + mm->vps_dumpable = vmi->cpt_vps_dumpable; +#ifndef CONFIG_IA64 + if (ctx->image_version >= CPT_VERSION_9) { + mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso); + current_thread_info()->sysenter_return = CPT_SYSENTER_RETURN; + } +#endif + +#if 0 /* def CONFIG_HUGETLB_PAGE*/ +/* NB: ? */ + int used_hugetlb; +#endif + up_write(&mm->mmap_sem); + + if (vmi->cpt_next > vmi->cpt_hdrlen) { + loff_t offset = pos + vmi->cpt_hdrlen; + do { + union { + struct cpt_vma_image vmai; + struct cpt_aio_ctx_image aioi; + struct cpt_obj_bits bits; + } u; + err = rst_get_object(-1, offset, &u, ctx); + if (err) + goto out; + if (u.vmai.cpt_object == CPT_OBJ_VMA) { +#ifdef CONFIG_IA64 + //// Later... 
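+			/* A VMA starting at address zero is the reserved IA64
+			 * mapping mentioned at TASK_UNMAP_START above; it is
+			 * not recreated here. */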
+ if (u.vmai.cpt_start) +#endif + err = do_rst_vma(&u.vmai, offset, pos, ctx); + if (err) + goto out; +#ifdef CONFIG_X86 + } else if (u.bits.cpt_object == CPT_OBJ_BITS && + u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { + err = do_rst_ldt(&u.bits, offset, ctx); + if (err) + goto out; +#endif + } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { + err = do_rst_aio(&u.aioi, offset, ctx); + if (err) + goto out; + } else { + eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); + err = -EINVAL; + goto out; + } + offset += u.vmai.cpt_next; + } while (offset < pos + vmi->cpt_next); + } + + down_write(&mm->mmap_sem); + mm->def_flags = def_flags; + up_write(&mm->mmap_sem); + + +out: + return err; +} + +extern void exit_mm(struct task_struct * tsk); + +int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err = 0; + cpt_object_t *mobj; + void *tmp = (void*)__get_free_page(GFP_KERNEL); + struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; + + if (!tmp) + return -ENOMEM; + + if (ti->cpt_mm == CPT_NULL) { + if (current->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + current); + exit_mm(current); + } + goto out; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + if (current->mm != mobj->o_obj) BUG(); + goto out; + } + + if (current->mm == NULL) { + struct mm_struct *mm = mm_alloc(); + if (mm == NULL) { + err = -ENOMEM; + goto out; + } + err = init_new_context(current, mm); + if (err) { + mmdrop(mm); + goto out; + } + current->mm = mm; + } + + if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) + goto out; + if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) { + eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm); + goto out; + } + err = -ENOMEM; + mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); + if (mobj != NULL) { + err = 0; + cpt_obj_setpos(mobj, ti->cpt_mm, ctx); + } + +out: + if (tmp) + free_page((unsigned long)tmp); + return err; +} + +/* This is part of mm setup, made in parent context. Mostly, it is the place, + * where we graft mm of another process to child. + */ + +int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + cpt_object_t *mobj; + + /* Task without mm. Just get rid of this. */ + if (ti->cpt_mm == CPT_NULL) { + if (tsk->mm) { + virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, + tsk); + mmput(tsk->mm); + tsk->mm = NULL; + } + return 0; + } + + mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); + if (mobj) { + struct mm_struct *newmm = mobj->o_obj; + /* Good, the MM is already created. */ + if (newmm == tsk->mm) { + /* Already done by clone(). */ + return 0; + } + mmput(tsk->mm); + atomic_inc(&newmm->mm_users); + tsk->mm = newmm; + tsk->active_mm = newmm; + } + return 0; +} + +/* We use CLONE_VM when mm of child is going to be shared with parent. + * Otherwise mm is copied. + */ + +__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + if (ti->cpt_mm == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) + return CLONE_VM; + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/rst_net.c linux-2.6.24.ovz/kernel/cpt/rst_net.c --- linux-2.6.24/kernel/cpt/rst_net.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_net.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,580 @@ +/* + * + * kernel/cpt/rst_net.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" +#include "cpt_net.h" +#include "cpt_files.h" + +#include "cpt_syscalls.h" + +extern struct in_ifaddr *inet_alloc_ifa(void); +extern int inet_insert_ifa(struct in_ifaddr *ifa); +extern struct in_device *inetdev_init(struct net_device *dev); + +int rst_restore_ifaddr(struct cpt_context *ctx) +{ + struct net *net = get_exec_env()->ve_ns->net_ns; + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_ifaddr_image di; + struct net_device *dev; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int cindex = -1; + int err; + err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); + if (err) + return err; + cindex = di.cpt_index; + rtnl_lock(); + dev = __dev_get_by_index(net, cindex); + if (dev && di.cpt_family == AF_INET) { + struct in_device *in_dev; + struct in_ifaddr *ifa; + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + in_dev = inetdev_init(dev); + ifa = inet_alloc_ifa(); + if (ifa) { + ifa->ifa_local = di.cpt_address[0]; + ifa->ifa_address = di.cpt_peer[0]; + ifa->ifa_broadcast = di.cpt_broadcast[0]; + ifa->ifa_prefixlen = di.cpt_masklen; + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + ifa->ifa_flags = di.cpt_flags; + ifa->ifa_scope = di.cpt_scope; + memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + err = inet_insert_ifa(ifa); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + } else if (dev && di.cpt_family == AF_INET6) { + __u32 prefered_lft; + __u32 valid_lft; + prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ? + 0 : di.cpt_prefered_lft; + valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ? 
+ 0xFFFFFFFF : di.cpt_valid_lft; + err = inet6_addr_add(dev->ifindex, + (struct in6_addr *)di.cpt_address, + di.cpt_masklen, 0, + prefered_lft, + valid_lft); + if (err && err != -EEXIST) { + rtnl_unlock(); + eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); + return err; + } +#endif + } else { + rtnl_unlock(); + eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); + return -EINVAL; + } + rtnl_unlock(); + sec += di.cpt_next; + } + return 0; +} + +static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) +{ + int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + struct rtmsg *rtm = NLMSG_DATA(nlh); + __u32 prefix0 = 0; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(rta, attrlen)) { + if (rta->rta_type == RTA_DST) { + prefix0 = *(__u32*)RTA_DATA(rta); + } + rta = RTA_NEXT(rta, attrlen); + } + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (rtm->rtm_family == AF_INET6) { + if (rtm->rtm_type == RTN_LOCAL) + return 2; + if (rtm->rtm_flags & RTM_F_CLONED) + return 2; + if (rtm->rtm_protocol == RTPROT_UNSPEC || + rtm->rtm_protocol == RTPROT_RA || + rtm->rtm_protocol == RTPROT_REDIRECT || + rtm->rtm_protocol == RTPROT_KERNEL) + return 2; + if (rtm->rtm_protocol == RTPROT_BOOT && + ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) || + (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000)))) + return 2; + } +#endif + return rtm->rtm_protocol == RTPROT_KERNEL; +} + +int rst_restore_route(struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct msghdr msg; + struct iovec iov; + struct sockaddr_nl nladdr; + mm_segment_t oldfs; + loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr v; + char *pg; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + if (h.cpt_hdrlen >= h.cpt_next) + return 0; + + sec += h.cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); + if (err < 0) + return err; + + err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); + if (err) + return err; + + pg = (char*)__get_free_page(GFP_KERNEL); + if (pg == NULL) { + err = -ENOMEM; + goto out_sock; + } + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + endsec = sec + v.cpt_next; + sec += v.cpt_hdrlen; + + while (sec < endsec) { + struct nlmsghdr *n; + struct nlmsghdr nh; + int kernel_flag; + + if (endsec - sec < sizeof(nh)) + break; + + err = ctx->pread(&nh, sizeof(nh), ctx, sec); + if (err) + goto out_sock_pg; + if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE || + endsec - sec < nh.nlmsg_len) { + err = -EINVAL; + goto out_sock_pg; + } + err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); + if (err) + goto out_sock_pg; + + n = (struct nlmsghdr*)pg; + n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; + + err = rewrite_rtmsg(n, ctx); + if (err < 0) + goto out_sock_pg; + kernel_flag = err; + + if (kernel_flag == 2) + goto do_next; + + iov.iov_base=n; + iov.iov_len=nh.nlmsg_len; + msg.msg_name=&nladdr; + msg.msg_namelen=sizeof(nladdr); + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_flags=MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(sock, &msg, nh.nlmsg_len); + set_fs(oldfs); + + if 
(err < 0) + goto out_sock_pg; + err = 0; + + iov.iov_base=pg; + iov.iov_len=PAGE_SIZE; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); + set_fs(oldfs); + if (err != -EAGAIN) { + if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) && + n->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *e = NLMSG_DATA(n); + if (e->error != -EEXIST || !kernel_flag) + eprintk_ctx("NLMERR: %d\n", e->error); + } else { + eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type); + } + } +do_next: + err = 0; + sec += NLMSG_ALIGN(nh.nlmsg_len); + } + +out_sock_pg: + free_page((unsigned long)pg); +out_sock: + sock_release(sock); + return err; +} + +int rst_resume_network(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + env->disable_net = 0; + put_ve(env); + return 0; +} + +/* We do not restore skb queue, just reinit it */ +static int rst_restore_tuntap(loff_t pos, struct cpt_netdev_image *di, + struct cpt_context *ctx) +{ + int err = -ENODEV; +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) + struct cpt_tuntap_image ti; + struct net_device *dev; + struct file *bind_file = NULL; + struct tun_struct *tun; + + pos += di->cpt_hdrlen; + err = rst_get_object(CPT_OBJ_NET_TUNTAP, pos, &ti, ctx); + if (err) + return err; + + if (ti.cpt_bindfile) { + bind_file = rst_file(ti.cpt_bindfile, -1, ctx); + if (IS_ERR(bind_file)) { + eprintk_ctx("rst_restore_tuntap:" + "rst_file: %Ld\n", + (unsigned long long)ti.cpt_bindfile); + return PTR_ERR(bind_file); + } + } + + rtnl_lock(); + err = -ENOMEM; + dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup); + if (!dev) + goto out; + + tun = netdev_priv(dev); + + tun->dev = dev; + tun->owner = ti.cpt_owner; + tun->flags = ti.cpt_flags; + tun->attached = ti.cpt_attached; + tun->if_flags = ti.cpt_if_flags; + tun_net_init(dev); + BUG_ON(sizeof(ti.cpt_dev_addr) != sizeof(tun->dev_addr)); + memcpy(tun->dev_addr, ti.cpt_dev_addr, sizeof(ti.cpt_dev_addr)); + BUG_ON(sizeof(ti.cpt_chr_filter) != sizeof(tun->chr_filter)); + memcpy(tun->chr_filter, ti.cpt_chr_filter, sizeof(ti.cpt_chr_filter)); + BUG_ON(sizeof(ti.cpt_net_filter) != sizeof(tun->net_filter)); + memcpy(tun->net_filter, ti.cpt_net_filter, sizeof(ti.cpt_net_filter)); + + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + eprintk_ctx("failed to register tun/tap net device\n"); + goto out; + } + list_add(&tun->list, &tun_dev_list); + + bind_file->private_data = tun; + tun->bind_file = bind_file; + +out: + fput(bind_file); + rtnl_unlock(); +#endif + return err; +} + +int rst_restore_netdev(struct cpt_context *ctx) +{ + struct net *net = get_exec_env()->ve_ns->net_ns; + int err; + loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_netdev_image di; + struct net_device *dev; + + get_exec_env()->disable_net = 1; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + struct net_device *dev_new; + err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx); + if (err) + return err; + + if (di.cpt_next > sizeof(di)) { + err = rst_restore_tuntap(sec, &di, ctx); + if (err) + return err; + } + + rtnl_lock(); + dev = __dev_get_by_name(net, di.cpt_name); + if (dev) { + if (dev->ifindex != di.cpt_index) { + dev_new = 
__dev_get_by_index(net, di.cpt_index);
+				if (!dev_new) {
+					write_lock_bh(&dev_base_lock);
+					hlist_del(&dev->index_hlist);
+					if (dev->iflink == dev->ifindex)
+						dev->iflink = di.cpt_index;
+					dev->ifindex = di.cpt_index;
+					hlist_add_head(&dev->index_hlist,
+						       dev_index_hash(net, dev->ifindex));
+					write_unlock_bh(&dev_base_lock);
+				} else {
+					write_lock_bh(&dev_base_lock);
+					hlist_del(&dev->index_hlist);
+					hlist_del(&dev_new->index_hlist);
+					if (dev_new->iflink == dev_new->ifindex)
+						dev_new->iflink = dev->ifindex;
+					dev_new->ifindex = dev->ifindex;
+					if (dev->iflink == dev->ifindex)
+						dev->iflink = di.cpt_index;
+					dev->ifindex = di.cpt_index;
+					hlist_add_head(&dev->index_hlist,
+						       dev_index_hash(net, dev->ifindex));
+					hlist_add_head(&dev_new->index_hlist,
+						       dev_index_hash(net, dev_new->ifindex));
+					write_unlock_bh(&dev_base_lock);
+				}
+			}
+			if (di.cpt_flags^dev->flags) {
+				err = dev_change_flags(dev, di.cpt_flags);
+				if (err)
+					eprintk_ctx("dev_change_flags err: %d\n", err);
+			}
+		} else {
+			eprintk_ctx("unknown interface 2 %s\n", di.cpt_name);
+		}
+		rtnl_unlock();
+		sec += di.cpt_next;
+	}
+	return 0;
+}
+
+static int dumpfn(void *arg)
+{
+	int i;
+	int *pfd = arg;
+	char *argv[] = { "iptables-restore", "-c", NULL };
+
+	if (pfd[0] != 0)
+		sc_dup2(pfd[0], 0);
+
+	for (i = 1; i < current->files->fdt->max_fds; i++)
+		sc_close(i);
+
+	module_put(THIS_MODULE);
+
+	set_fs(KERNEL_DS);
+	i = sc_execve("/sbin/iptables-restore", argv, NULL);
+	if (i == -ENOENT)
+		i = sc_execve("/usr/sbin/iptables-restore", argv, NULL);
+	eprintk("failed to exec iptables-restore: %d\n", i);
+	return 255 << 8;
+}
+
+static int rst_restore_iptables(struct cpt_context * ctx)
+{
+	int err;
+	int pfd[2];
+	struct file *f;
+	struct cpt_object_hdr v;
+	int n;
+	struct cpt_section_hdr h;
+	loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES];
+	loff_t end;
+	int pid;
+	int status;
+	mm_segment_t oldfs;
+	sigset_t ignore, blocked;
+
+	if (sec == CPT_NULL)
+		return 0;
+
+	err = ctx->pread(&h, sizeof(h), ctx, sec);
+	if (err)
+		return err;
+	if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h))
+		return -EINVAL;
+
+	if (h.cpt_hdrlen == h.cpt_next)
+		return 0;
+	if (h.cpt_hdrlen > h.cpt_next)
+		return -EINVAL;
+	sec += h.cpt_hdrlen;
+	err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx);
+	if (err < 0)
+		return err;
+
+	err = sc_pipe(pfd);
+	if (err < 0)
+		return err;
+	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+	sigprocmask(SIG_BLOCK, &ignore, &blocked);
+	pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
+	if (err < 0) {
+		eprintk_ctx("iptables local_kernel_thread: %d\n", err);
+		goto out;
+	}
+	f = fget(pfd[1]);
+	sc_close(pfd[1]);
+	sc_close(pfd[0]);
+
+	ctx->file->f_pos = sec + v.cpt_hdrlen;
+	end = sec + v.cpt_next;
+	do {
+		char *p;
+		char buf[16];
+
+		n = end - ctx->file->f_pos;
+		if (n > sizeof(buf))
+			n = sizeof(buf);
+
+		if (ctx->read(buf, n, ctx))
+			break;
+		if ((p = memchr(buf, 0, n)) != NULL)
+			n = p - buf;
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		f->f_op->write(f, buf, n, &f->f_pos);
+		set_fs(oldfs);
+	} while (ctx->file->f_pos < end);
+
+	fput(f);
+
+	oldfs = get_fs(); set_fs(KERNEL_DS);
+	if ((err = sc_waitx(pid, 0, &status)) < 0)
+		eprintk_ctx("wait4: %d\n", err);
+	else if ((status & 0x7f) == 0) {
+		err = (status & 0xff00) >> 8;
+		if (err != 0) {
+			eprintk_ctx("iptables-restore exited with %d\n", err);
+			err = -EINVAL;
+		}
+	} else {
+		eprintk_ctx("iptables-restore terminated\n");
+		err = -EINVAL;
+	}
+	set_fs(oldfs);
+	sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+	return err;
+
+out:
+	if (pfd[1] >= 0)
sc_close(pfd[1]); + if (pfd[0] >= 0) + sc_close(pfd[0]); + sigprocmask(SIG_SETMASK, &blocked, NULL); + return err; +} + +int rst_restore_net(struct cpt_context *ctx) +{ + int err; + + err = rst_restore_netdev(ctx); + if (!err) + err = rst_restore_ifaddr(ctx); + if (!err) + err = rst_restore_route(ctx); + if (!err) + err = rst_restore_iptables(ctx); + if (!err) + err = rst_restore_ip_conntrack(ctx); + return err; +} diff -uprN linux-2.6.24/kernel/cpt/rst_proc.c linux-2.6.24.ovz/kernel/cpt/rst_proc.c --- linux-2.6.24/kernel/cpt/rst_proc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_proc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,581 @@ +/* + * + * kernel/cpt/rst_proc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_dump.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_kernel.h" + +MODULE_AUTHOR("Alexey Kuznetsov "); +MODULE_LICENSE("GPL"); + +/* List of contexts and lock protecting the list */ +static struct list_head cpt_context_list; +static spinlock_t cpt_context_lock; + +static int proc_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + cpt_context_t *ctx; + + len += sprintf(buffer, "Ctx Id VE State\n"); + + spin_lock(&cpt_context_lock); + + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + len += sprintf(buffer+len,"%p %08x %-8u %d", + ctx, + ctx->contextid, + ctx->ve_id, + ctx->ctx_state + ); +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + len += pagein_info_printf(buffer+len, ctx); +#endif + + buffer[len++] = '\n'; + + pos = begin+len; + if (pos < offset) { + len = 0; + begin = pos; + } + if (pos > offset+length) + goto done; + } + *eof = 1; + +done: + spin_unlock(&cpt_context_lock); + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) + len = length; + if(len < 0) + len = 0; + return len; +} + +void rst_context_release(cpt_context_t *ctx) +{ + list_del(&ctx->ctx_list); + spin_unlock(&cpt_context_lock); + + if (ctx->ctx_state > 0) + rst_resume(ctx); + ctx->ctx_state = CPT_CTX_ERROR; + + rst_close_dumpfile(ctx); + + if (ctx->anonvmas) { + int h; + for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { + while (!hlist_empty(&ctx->anonvmas[h])) { + struct hlist_node *elem = ctx->anonvmas[h].first; + hlist_del(elem); + kfree(elem); + } + } + free_page((unsigned long)ctx->anonvmas); + } + cpt_flush_error(ctx); + if (ctx->errorfile) { + fput(ctx->errorfile); + ctx->errorfile = NULL; + } + if (ctx->error_msg) { + free_page((unsigned long)ctx->error_msg); + ctx->error_msg = NULL; + } +#ifdef CONFIG_VZ_CHECKPOINT_ITER + rst_drop_iter_dir(ctx); +#endif +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + if (ctx->pgin_task) + put_task_struct(ctx->pgin_task); +#endif + if (ctx->filejob_queue) + rst_flush_filejobs(ctx); + if (ctx->vdso) + free_page((unsigned long)ctx->vdso); + if (ctx->objcount) + eprintk_ctx("%d objects leaked\n", ctx->objcount); + kfree(ctx); + + spin_lock(&cpt_context_lock); +} + +static void __cpt_context_put(cpt_context_t *ctx) +{ + if (!--ctx->refcount) + rst_context_release(ctx); +} + +static void cpt_context_put(cpt_context_t *ctx) +{ + 
spin_lock(&cpt_context_lock); + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); +} + +cpt_context_t * rst_context_open(void) +{ + cpt_context_t *ctx; + + if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { + rst_context_init(ctx); + spin_lock(&cpt_context_lock); + list_add_tail(&ctx->ctx_list, &cpt_context_list); + spin_unlock(&cpt_context_lock); + ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); + if (ctx->error_msg != NULL) + ctx->error_msg[0] = 0; + } + return ctx; +} + +void rst_report_error(int err, cpt_context_t *ctx) +{ + if (ctx->statusfile) { + mm_segment_t oldfs; + int status = 7 /* VZ_ENVCREATE_ERROR */; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) + ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); + set_fs(oldfs); + fput(ctx->statusfile); + ctx->statusfile = NULL; + } +} + + +static cpt_context_t * cpt_context_lookup(unsigned int ctxid) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + list_for_each_entry(ctx, &cpt_context_list, ctx_list) { + if (ctx->contextid == ctxid) { + ctx->refcount++; + spin_unlock(&cpt_context_lock); + return ctx; + } + } + spin_unlock(&cpt_context_lock); + return NULL; +} + +static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) +{ + int err = 0; + cpt_context_t *ctx; + struct file *dfile = NULL; + + unlock_kernel(); + + if (cmd == CPT_TEST_CAPS) { + err = test_cpu_caps(); + goto out_lock; + } + + if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { + cpt_context_t *old_ctx; + + ctx = NULL; + if (cmd == CPT_JOIN_CONTEXT) { + err = -ENOENT; + ctx = cpt_context_lookup(arg); + if (!ctx) + goto out_lock; + } + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + file->private_data = ctx; + + if (old_ctx) { + if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { + old_ctx->sticky = 0; + old_ctx->refcount--; + } + __cpt_context_put(old_ctx); + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_lock; + } + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + if (ctx) + ctx->refcount++; + spin_unlock(&cpt_context_lock); + + if (!ctx) { + cpt_context_t *old_ctx; + + err = -ENOMEM; + ctx = rst_context_open(); + if (!ctx) + goto out_lock; + + spin_lock(&cpt_context_lock); + old_ctx = (cpt_context_t*)file->private_data; + if (!old_ctx) { + ctx->refcount++; + file->private_data = ctx; + } else { + old_ctx->refcount++; + } + if (old_ctx) { + __cpt_context_put(ctx); + ctx = old_ctx; + } + spin_unlock(&cpt_context_lock); + } + + if (cmd == CPT_GET_CONTEXT) { + unsigned int contextid = (unsigned int)arg; + + err = -EINVAL; + if (ctx->contextid && ctx->contextid != contextid) + goto out_nosem; + if (!ctx->contextid) { + cpt_context_t *c1 = cpt_context_lookup(contextid); + if (c1) { + cpt_context_put(c1); + err = -EEXIST; + goto out_nosem; + } + ctx->contextid = contextid; + } + spin_lock(&cpt_context_lock); + if (!ctx->sticky) { + ctx->sticky = 1; + ctx->refcount++; + } + spin_unlock(&cpt_context_lock); + err = 0; + goto out_nosem; + } + + down(&ctx->main_sem); + + err = -EBUSY; + if (ctx->ctx_state < 0) + goto out; + + err = 0; + switch (cmd) { + case CPT_SET_DUMPFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + err = -EBADF; + dfile = fget(arg); + if (dfile == NULL) + break; + if (dfile->f_op == NULL || + dfile->f_op->read == NULL) { + fput(dfile); + break; + } + err = 0; + } + if (ctx->file) + 
fput(ctx->file); + ctx->file = dfile; + break; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + case CPT_SET_PAGEINFDIN: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_in) + fput(ctx->pagein_file_in); + ctx->pagein_file_in = dfile; + break; + case CPT_SET_PAGEINFDOUT: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->pagein_file_out) + fput(ctx->pagein_file_out); + ctx->pagein_file_out = dfile; + break; + case CPT_PAGEIND: + err = rst_pageind(ctx); + break; +#endif +#ifdef CONFIG_VZ_CHECKPOINT_ITER + case CPT_ITER: + err = rst_iteration(ctx); + break; +#endif + case CPT_SET_LOCKFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->lockfile) + fput(ctx->lockfile); + ctx->lockfile = dfile; + break; + case CPT_SET_STATUSFD: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->statusfile) + fput(ctx->statusfile); + ctx->statusfile = dfile; + break; + case CPT_SET_ERRORFD: + if (arg >= 0) { + dfile = fget(arg); + if (dfile == NULL) { + err = -EBADF; + break; + } + } + if (ctx->errorfile) + fput(ctx->errorfile); + ctx->errorfile = dfile; + break; + case CPT_SET_VEID: + if (ctx->ctx_state > 0) { + err = -EBUSY; + break; + } + ctx->ve_id = arg; + break; + case CPT_UNDUMP: + if (ctx->ctx_state > 0) { + err = -ENOENT; + break; + } + ctx->ctx_state = CPT_CTX_UNDUMPING; + err = vps_rst_undump(ctx); + if (err) { + rst_report_error(err, ctx); + if (rst_kill(ctx) == 0) + ctx->ctx_state = CPT_CTX_IDLE; + } else { + ctx->ctx_state = CPT_CTX_UNDUMPED; + } + break; + case CPT_RESUME: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_resume(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + case CPT_KILL: + if (!ctx->ctx_state) { + err = -ENOENT; + break; + } + err = rst_kill(ctx); + if (!err) + ctx->ctx_state = CPT_CTX_IDLE; + break; + default: + err = -EINVAL; + break; + } + +out: + cpt_flush_error(ctx); + up(&ctx->main_sem); +out_nosem: + cpt_context_put(ctx); +out_lock: + lock_kernel(); + if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || + err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) + err = -EINTR; + return err; +} + +static int rst_open(struct inode * inode, struct file * file) +{ + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + return 0; +} + +static int rst_release(struct inode * inode, struct file * file) +{ + cpt_context_t *ctx; + + spin_lock(&cpt_context_lock); + ctx = (cpt_context_t*)file->private_data; + file->private_data = NULL; + if (ctx) + __cpt_context_put(ctx); + spin_unlock(&cpt_context_lock); + + + module_put(THIS_MODULE); + return 0; +} + +static struct file_operations rst_fops = +{ + .owner = THIS_MODULE, + .ioctl = rst_ioctl, + .open = rst_open, + .release = rst_release, +}; + + +static struct proc_dir_entry *proc_ent; +extern void *schedule_tail_p; +extern void schedule_tail_hook(void); + +static struct ctl_table_header *ctl_header; + +static ctl_table debug_table[] = { + { + .ctl_name = 9476, + .procname = "rst", + .data = &debug_level, + .maxlen = sizeof(debug_level), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +static ctl_table root_table[] = { + { + 
.ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { .ctl_name = 0 } +}; + +static int __init init_rst(void) +{ + int err; + + err = -ENOMEM; + ctl_header = register_sysctl_table(root_table); + if (!ctl_header) + goto err_mon; + + spin_lock_init(&cpt_context_lock); + INIT_LIST_HEAD(&cpt_context_list); + + err = -EINVAL; + proc_ent = create_proc_entry_mod("rst", 0600, NULL, THIS_MODULE); + if (!proc_ent) + goto err_out; + + rst_fops.read = proc_ent->proc_fops->read; + rst_fops.write = proc_ent->proc_fops->write; + rst_fops.llseek = proc_ent->proc_fops->llseek; + proc_ent->proc_fops = &rst_fops; + + proc_ent->read_proc = proc_read; + proc_ent->data = NULL; + proc_ent->owner = THIS_MODULE; + return 0; + +err_out: + unregister_sysctl_table(ctl_header); +err_mon: + return err; +} +module_init(init_rst); + +static void __exit exit_rst(void) +{ + remove_proc_entry("rst", NULL); + unregister_sysctl_table(ctl_header); + + spin_lock(&cpt_context_lock); + while (!list_empty(&cpt_context_list)) { + cpt_context_t *ctx; + ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); + + if (!ctx->sticky) + ctx->refcount++; + ctx->sticky = 0; + + BUG_ON(ctx->refcount != 1); + + __cpt_context_put(ctx); + } + spin_unlock(&cpt_context_lock); +} +module_exit(exit_rst); diff -uprN linux-2.6.24/kernel/cpt/rst_process.c linux-2.6.24.ovz/kernel/cpt/rst_process.c --- linux-2.6.24/kernel/cpt/rst_process.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_process.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1627 @@ +/* + * + * kernel/cpt/rst_process.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_ubc.h" +#include "cpt_process.h" +#include "cpt_kernel.h" + + +#define HOOK_RESERVE 256 + +struct resume_info +{ + asmlinkage void (*hook)(struct resume_info *); + unsigned long hooks; +#define HOOK_TID 0 +#define HOOK_CONT 1 +#define HOOK_LSI 2 +#define HOOK_RESTART 3 + unsigned long tid_ptrs[2]; + siginfo_t last_siginfo; +}; + +#ifdef CONFIG_X86_32 + +#define IN_SYSCALL(regs) ((long)(regs)->orig_eax >= 0) +#define IN_ERROR(regs) ((long)(regs)->eax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->eax)) +#define SYSCALL_RETVAL(regs) ((regs)->eax) +#define SYSCALL_NR(regs) ((regs)->orig_eax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->eax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->eax = (new); \ + (regs)->eip -= 2; } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +/* In new kernels task_pt_regs() is define to something inappropriate */ +#undef task_pt_regs +#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) + +#elif defined(CONFIG_X86_64) + +#define IN_SYSCALL(regs) ((long)(regs)->orig_rax >= 0) +#define IN_ERROR(regs) ((long)(regs)->rax < 0) +#define SYSCALL_ERRNO(regs) (-(long)((regs)->rax)) +#define SYSCALL_RETVAL(regs) ((regs)->rax) +#define SYSCALL_NR(regs) ((regs)->orig_rax) + +#define SYSCALL_SETRET(regs,val) do { (regs)->rax = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->rax = (new); \ + (regs)->rip -= 2; } while (0) + 
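[Editorial note, not part of the patch] The arch-specific IN_SYSCALL/IN_ERROR/SYSCALL_ERRNO/SYSCALL_RESTART2 macros in this block are the core of how an interrupted system call is re-armed during restore: the original syscall number is put back into the accumulator register and the instruction pointer is stepped back over the two-byte trap/syscall opcode so the call is issued again when the task resumes. A minimal sketch combining these macros (the function name is invented for illustration; the real logic lives in rst_restart_sys() further down):

    /* Illustrative only: shows how the SYSCALL_* macros defined in this
     * block are meant to be combined.  'regs' is the saved user-mode
     * register frame of the task being restored. */
    static void example_restart_interrupted_syscall(struct pt_regs *regs)
    {
            if (!IN_SYSCALL(regs) || !IN_ERROR(regs))
                    return;
            if (SYSCALL_ERRNO(regs) == ERESTARTSYS ||
                SYSCALL_ERRNO(regs) == ERESTARTNOINTR) {
                    /* Reload the syscall number and move the instruction
                     * pointer back over the trap instruction so it is
                     * executed again on return to user space. */
                    SYSCALL_RESTART2(regs, SYSCALL_NR(regs));
            }
    }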
+#define __NR32_restart_syscall 0 +#define __NR32_rt_sigtimedwait 177 +#define __NR32_pause 29 +#define __NR32_futex 240 + +#define syscall_is(tsk,regs,name) ((!(task_thread_info(tsk)->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR_##name) || \ + ((task_thread_info(tsk)->flags&_TIF_IA32) && \ + SYSCALL_NR(regs) == __NR32_##name)) + +#elif defined (CONFIG_IA64) + +#define IN_SYSCALL(regs) ((long)(regs)->cr_ifs >= 0) +#define IN_ERROR(regs) ((long)(regs)->r10 == -1) +#define SYSCALL_ERRNO(regs) ((regs)->r10 == -1 ? (long)((regs)->r8) : 0) +#define SYSCALL_RETVAL(regs) ((regs)->r8) +#define SYSCALL_NR(regs) ((regs)->cr_ifs >= 0 ? (regs)->r15 : -1) + +#define SYSCALL_SETRET(regs,val) do { (regs)->r8 = (val); } while (0) + +#define SYSCALL_RESTART2(regs,new) do { (regs)->r15 = (new); \ + (regs)->r10 = 0; \ + ia64_decrement_ip(regs); } while (0) + +#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) + +#else + +#error This arch is not supported + +#endif + +#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs)) + +pid_t vpid_to_pid(pid_t nr) +{ + pid_t vnr; + struct pid *pid; + + rcu_read_lock(); + pid = find_vpid(nr); + vnr = (pid == NULL ? -1 : pid->numbers[0].nr); + rcu_read_unlock(); + return vnr; +} + +static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) +{ + memset(info, 0, sizeof(*info)); + switch(si->cpt_code & __SI_MASK) { + case __SI_TIMER: + info->si_tid = si->cpt_pid; + info->si_overrun = si->cpt_uid; + info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); + info->si_sys_private = si->cpt_utime; + break; + case __SI_POLL: + info->si_band = si->cpt_pid; + info->si_fd = si->cpt_uid; + break; + case __SI_FAULT: + info->si_addr = cpt_ptr_import(si->cpt_sigval); +#ifdef __ARCH_SI_TRAPNO + info->si_trapno = si->cpt_pid; +#endif + break; + case __SI_CHLD: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_status = si->cpt_sigval; + info->si_stime = si->cpt_stime; + info->si_utime = si->cpt_utime; + break; + case __SI_KILL: + case __SI_RT: + case __SI_MESGQ: + default: + info->si_pid = si->cpt_pid; + info->si_uid = si->cpt_uid; + info->si_ptr = cpt_ptr_import(si->cpt_sigval); + break; + } + info->si_signo = si->cpt_signo; + info->si_errno = si->cpt_errno; + info->si_code = si->cpt_code; +} + +static int restore_sigqueue(struct task_struct *tsk, + struct sigpending *queue, unsigned long start, + unsigned long end) +{ + while (start < end) { + struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; + if (si->cpt_object == CPT_OBJ_SIGINFO) { + struct sigqueue *q = NULL; + struct user_struct *up; + + up = alloc_uid(get_exec_env()->ve_ns->user_ns, si->cpt_user); + if (!up) + return -ENOMEM; + q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); + if (!q) { + free_uid(up); + return -ENOMEM; + } + if (ub_siginfo_charge(q, get_exec_ub())) { + kmem_cache_free(sigqueue_cachep, q); + free_uid(up); + return -ENOMEM; + } + + INIT_LIST_HEAD(&q->list); + /* Preallocated elements (posix timers) are not + * supported yet. It is safe to replace them with + * a private one. 
*/ + q->flags = 0; + q->user = up; + atomic_inc(&q->user->sigpending); + + decode_siginfo(&q->info, si); + list_add_tail(&q->list, &queue->list); + } + start += si->cpt_next; + } + return 0; +} + +int rst_process_linkage(cpt_context_t *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (tsk == NULL) { + eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm); + return -EINVAL; + } + + if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) { + struct pid *pid; + + pid = get_pid(find_vpid(ti->cpt_pgrp)); + if (!pid) { + eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + write_lock_irq(&tasklist_lock); + if (task_pgrp_nr(tsk) != pid_nr(pid)) { + detach_pid(tsk, PIDTYPE_PGID); + set_task_pgrp(tsk, pid_nr(pid)); + if (thread_group_leader(tsk)) { + get_pid(pid); + attach_pid(tsk, PIDTYPE_PGID, pid); + } + } + write_unlock_irq(&tasklist_lock); + if (task_pgrp_nr(tsk) != pid_nr(pid)) { + put_pid(pid); + eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + put_pid(pid); + } + if (task_session_vnr(tsk) != ti->cpt_session) { + struct pid *pid; + + pid = get_pid(find_vpid(ti->cpt_session)); + if (!pid) { + eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + + write_lock_irq(&tasklist_lock); + if (task_session_nr(tsk) != pid_nr(pid)) { + detach_pid(tsk, PIDTYPE_SID); + set_task_session(tsk, pid_nr(pid)); + if (thread_group_leader(tsk)) { + get_pid(pid); + attach_pid(tsk, PIDTYPE_SID, pid); + } + } + write_unlock_irq(&tasklist_lock); + if (task_session_nr(tsk) != pid_nr(pid)) { + put_pid(pid); + eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + put_pid(pid); + } + if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) { + struct pid *pid; + + pid = get_pid(find_vpid(ti->cpt_old_pgrp)); + if (!pid) { + eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk)); + return -EINVAL; + } + tsk->signal->tty_old_pgrp = pid; + } + } + + return 0; +} + +struct pid *alloc_vpid_safe(pid_t vnr) +{ + struct pid *pid; + + pid = alloc_pid(current->nsproxy->pid_ns, vnr); + if (!pid) + pid = get_pid(find_vpid(vnr)); + return pid; +} + +static int +restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx) +{ + int err; + struct cpt_signal_image *si = cpt_get_buf(ctx); + + current->signal->tty = NULL; + + err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (task_pgrp_vnr(current) != si->cpt_pgrp) { + struct pid * pid = NULL, *free = NULL; + + if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { +#if 0 + if (!is_virtual_pid(si->cpt_pgrp)) { + eprintk_ctx("external process group " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + pid = alloc_vpid_safe(si->cpt_pgrp); + free = pid; + } + write_lock_irq(&tasklist_lock); + if (pid != NULL) { + if (task_pgrp_nr(current) != pid_nr(pid)) { + detach_pid(current, PIDTYPE_PGID); + set_task_pgrp(current, pid_nr(pid)); + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_PGID, pid); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + if (free != NULL) + free_pid(free); + } + + current->signal->tty_old_pgrp = NULL; + if ((int)si->cpt_old_pgrp > 0) { + if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { + current->signal->tty_old_pgrp = + alloc_pid(current->nsproxy->pid_ns, 0); + if 
(!current->signal->tty_old_pgrp) { + eprintk_ctx("failed to allocate stray tty_old_pgrp\n"); + cpt_release_buf(ctx); + return -EINVAL; + } + } else { + current->signal->tty_old_pgrp = + alloc_vpid_safe(si->cpt_old_pgrp); + if (!current->signal->tty_old_pgrp) { + dprintk_ctx("forward old tty PGID\n"); + current->signal->tty_old_pgrp = NULL; + } + } + } + + if (task_session_vnr(current) != si->cpt_session) { + struct pid * pid = NULL, *free = NULL; + + if (si->cpt_session_type == CPT_PGRP_ORPHAN) { +#if 0 + if (!is_virtual_pid(si->cpt_session)) { + eprintk_ctx("external process session " CPT_FID, CPT_TID(current)); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + pid = alloc_vpid_safe(si->cpt_session); + free = pid; + } + write_lock_irq(&tasklist_lock); + if (pid == NULL) + pid = find_vpid(si->cpt_session); + if (pid != NULL) { + if (task_session_nr(current) != pid_nr(pid)) { + detach_pid(current, PIDTYPE_SID); + set_task_session(current, pid_nr(pid)); + if (thread_group_leader(current)) { + attach_pid(current, PIDTYPE_SID, pid); + free = NULL; + } + } + } + write_unlock_irq(&tasklist_lock); + if (free != NULL) + free_pid(free); + } + + cpt_sigset_import(¤t->signal->shared_pending.signal, si->cpt_sigpending); + current->signal->leader = si->cpt_leader; + if (si->cpt_ctty != CPT_NULL) { + cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx); + if (obj) { + struct tty_struct *tty = obj->o_obj; + if (!tty->session || tty->session == + task_session(current)) { + tty->session = task_session(current); + current->signal->tty = tty; + } else { + wprintk_ctx("tty session mismatch\n"); + } + } + } + + if (si->cpt_curr_target) + current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target); + current->signal->flags = 0; + *exiting = si->cpt_group_exit; + current->signal->group_exit_code = si->cpt_group_exit_code; + if (si->cpt_group_exit_task) { + current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task); + if (current->signal->group_exit_task == NULL) { + eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task); + cpt_release_buf(ctx); + return -EINVAL; + } + } + current->signal->notify_count = si->cpt_notify_count; + current->signal->group_stop_count = si->cpt_group_stop_count; + + if (si->cpt_next > si->cpt_hdrlen) { + char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL); + if (buf == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx, + ti->cpt_signal + si->cpt_hdrlen); + if (err) { + kfree(buf); + cpt_release_buf(ctx); + return err; + } + restore_sigqueue(current, + ¤t->signal->shared_pending, (unsigned long)buf, + (unsigned long)buf + si->cpt_next - si->cpt_hdrlen); + kfree(buf); + } + cpt_release_buf(ctx); + return 0; +} + +int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err; + struct cpt_sighand_image si; + int i; + loff_t pos, endpos; + + err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx); + if (err) + return err; + + for (i=0; i<_NSIG; i++) { + current->sighand->action[i].sa.sa_handler = SIG_DFL; +#ifndef CONFIG_IA64 + current->sighand->action[i].sa.sa_restorer = 0; +#endif + current->sighand->action[i].sa.sa_flags = 0; + memset(¤t->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t)); + } + + pos = ti->cpt_sighand + si.cpt_hdrlen; + endpos = ti->cpt_sighand + si.cpt_next; + while (pos < endpos) { + struct cpt_sighandler_image shi; + + err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, 
&shi, ctx); + if (err) + return err; + current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler; +#ifndef CONFIG_IA64 + current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer; +#endif + current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags; + cpt_sigset_import(¤t->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask); + pos += shi.cpt_next; + } + + return 0; +} + + +__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + + if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx)) + flag |= CLONE_THREAD; + if (ti->cpt_sighand == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx)) + flag |= CLONE_SIGHAND; + return flag; +} + +int +rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx) +{ + int err; + cpt_object_t *obj; + + if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) { + return -EINVAL; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx); + if (obj) { + struct sighand_struct *sig = current->sighand; + if (obj->o_obj != sig) { + return -EINVAL; + } + } else { + obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setpos(obj, ti->cpt_sighand, ctx); + err = restore_one_sighand_struct(ti, ctx); + if (err) + return err; + } + + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx); + if (obj) { + struct signal_struct *sig = current->signal; + if (obj->o_obj != sig) { + return -EINVAL; + } +/* if (current->signal) { + pid_t session; + + session = process_session(current); + set_process_vgroup(current, session); + set_signal_vsession(current->signal, session); + }*/ + } else { + obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setpos(obj, ti->cpt_signal, ctx); + err = restore_one_signal_struct(ti, exiting, ctx); + if (err) + return err; + } + + return 0; +} + +#ifdef CONFIG_X86 +static u32 decode_segment(u32 segid) +{ + if (segid == CPT_SEG_ZERO) + return 0; + + /* TLS descriptors */ + if (segid <= CPT_SEG_TLS3) + return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3; + + /* LDT descriptor, it is just an index to LDT array */ + if (segid >= CPT_SEG_LDT) + return ((segid - CPT_SEG_LDT) << 3) | 7; + + /* Check for one of standard descriptors */ +#ifdef CONFIG_X86_64 + if (segid == CPT_SEG_USER32_DS) + return __USER32_DS; + if (segid == CPT_SEG_USER32_CS) + return __USER32_CS; + if (segid == CPT_SEG_USER64_DS) + return __USER_DS; + if (segid == CPT_SEG_USER64_CS) + return __USER_CS; +#else + if (segid == CPT_SEG_USER32_DS) + return __USER_DS; + if (segid == CPT_SEG_USER32_CS) + return __USER_CS; +#endif + wprintk("Invalid segment reg %d\n", segid); + return 0; +} +#endif + +#if defined (CONFIG_IA64) +void ia64_decrement_ip (struct pt_regs *regs) +{ + unsigned long w0, ri = ia64_psr(regs)->ri - 1; + + if (ia64_psr(regs)->ri == 0) { + regs->cr_iip -= 16; + ri = 2; + get_user(w0, (char __user *) regs->cr_iip + 0); + if (((w0 >> 1) & 0xf) == 2) { + /* + * rfi'ing to slot 2 of an MLX bundle causes + * an illegal operation fault. We don't want + * that to happen... 
+ */ + ri = 1; + } + } + ia64_psr(regs)->ri = ri; +} +#endif + +static void rst_child_tid(unsigned long *child_tids) +{ + dprintk("rct: " CPT_FID "\n", CPT_TID(current)); + current->clear_child_tid = (void*)child_tids[0]; + current->set_child_tid = (void*)child_tids[1]; +} + +static void rst_last_siginfo(void) +{ + int signr; + siginfo_t *info = current->last_siginfo; + struct pt_regs *regs = task_pt_regs(current); + struct k_sigaction *ka; + int ptrace_id; + + dprintk("rlsi: " CPT_FID "\n", CPT_TID(current)); + + spin_lock_irq(¤t->sighand->siglock); + current->last_siginfo = NULL; + recalc_sigpending(); + + ptrace_id = current->pn_state; + clear_pn_state(current); + + switch (ptrace_id) { + case PN_STOP_TF: + case PN_STOP_TF_RT: + /* frame_*signal */ + dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %u %lu\n", + task_pid_vnr(current), current->pid, current->comm, + info->si_signo, info->si_code, + current->exit_code, SYSCALL_NR(regs), + current->ptrace, current->ptrace_message); + goto out; + case PN_STOP_ENTRY: + case PN_STOP_LEAVE: + /* do_syscall_trace */ + spin_unlock_irq(¤t->sighand->siglock); + dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code); + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } + if (IN_SYSCALL(regs)) { + if (ptrace_id == PN_STOP_ENTRY +#ifdef CONFIG_X86 + && SYSCALL_ERRNO(regs) == ENOSYS +#endif + ) + SYSCALL_RESTART(regs); + else if (IN_ERROR(regs) && + syscall_is(current, regs, rt_sigtimedwait) && + (SYSCALL_ERRNO(regs) == EAGAIN || + SYSCALL_ERRNO(regs) == EINTR)) + SYSCALL_RESTART(regs); + } + return; + case PN_STOP_FORK: + /* fork */ + SYSCALL_SETRET(regs, current->ptrace_message); + dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs)); + goto out; + case PN_STOP_VFORK: + /* after vfork */ + SYSCALL_SETRET(regs, current->ptrace_message); + dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs)); + goto out; + case PN_STOP_SIGNAL: + /* normal case : dequeue signal */ + break; + case PN_STOP_EXIT: + dprintk("ptrace exit caught\n"); + current->ptrace &= ~PT_TRACE_EXIT; + spin_unlock_irq(¤t->sighand->siglock); + module_put(THIS_MODULE); + complete_and_exit(NULL, current->ptrace_message); + BUG(); + case PN_STOP_EXEC: + eprintk("ptrace after exec caught: must not happen\n"); + BUG(); + default: + eprintk("ptrace with unknown identity %d\n", ptrace_id); + BUG(); + } + + signr = current->exit_code; + if (signr == 0) { + dprintk("rlsi: canceled signal %d\n", info->si_signo); + goto out; + } + current->exit_code = 0; + + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + dprintk("going to requeue signal %d\n", signr); + goto out_resend_sig; + } + + ka = ¤t->sighand->action[signr-1]; + if (ka->sa.sa_handler == SIG_IGN) { + dprintk("going to resend signal %d (ignored)\n", signr); + goto out; + } + if (ka->sa.sa_handler != SIG_DFL) { + dprintk("going to resend signal %d (not SIG_DFL)\n", signr); + goto out_resend_sig; + } + if (signr == SIGCONT || + signr == SIGCHLD || + signr == SIGWINCH || + signr == SIGURG || + current->pid == 1) + goto out; + + /* All the rest, which we cannot handle are requeued. 
*/ + dprintk("going to resend signal %d (sigh)\n", signr); +out_resend_sig: + spin_unlock_irq(¤t->sighand->siglock); + send_sig_info(signr, info, current); + return; + +out: + spin_unlock_irq(¤t->sighand->siglock); +} + +static void rst_finish_stop(void) +{ + /* ... + * do_signal() -> + * get_signal_to_deliver() -> + * do_signal_stop() -> + * finish_stop() + * + * Normally after SIGCONT it will dequeue the next signal. If no signal + * is found, do_signal restarts syscall unconditionally. + * Otherwise signal handler is pushed on user stack. + */ + + dprintk("rfs: " CPT_FID "\n", CPT_TID(current)); + + clear_stop_state(current); + current->exit_code = 0; +} + +static void rst_restart_sys(void) +{ + struct pt_regs *regs = task_pt_regs(current); + + /* This hook is supposed to be executed, when we have + * to complete some interrupted syscall. + */ + dprintk("rrs: " CPT_FID "\n", CPT_TID(current)); + + if (!IN_SYSCALL(regs) || !IN_ERROR(regs)) + return; + +#ifdef __NR_pause + if (syscall_is(current,regs,pause)) { + if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + } else +#else + /* On this arch pause() is simulated with sigsuspend(). */ + if (syscall_is(current,regs,rt_sigsuspend)) { + if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + } else +#endif + if (syscall_is(current,regs,rt_sigtimedwait)) { + if (SYSCALL_ERRNO(regs) == EAGAIN || + SYSCALL_ERRNO(regs) == EINTR) { + SYSCALL_RESTART(regs); + } + } else if (syscall_is(current,regs,futex)) { + if (SYSCALL_ERRNO(regs) == EINTR && + !signal_pending(current)) { + SYSCALL_RESTART(regs); + } + } + + if (!signal_pending(current) && + !test_thread_flag(TIF_RESTORE_SIGMASK)) { + if (SYSCALL_ERRNO(regs) == ERESTARTSYS || + SYSCALL_ERRNO(regs) == ERESTARTNOINTR || + SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { + SYSCALL_RESTART(regs); + } else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) { + int new = __NR_restart_syscall; +#ifdef CONFIG_X86_64 + if (task_thread_info(current)->flags&_TIF_IA32) + new = __NR32_restart_syscall; +#endif + SYSCALL_RESTART2(regs, new); + } + } +} + +#ifdef CONFIG_X86_32 + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_x86_regs *b, + struct resume_info **rip, struct cpt_context *ctx) +{ + extern char i386_ret_from_resume; + + if (b->cpt_object != CPT_OBJ_X86_REGS) + return -EINVAL; + + tsk->thread.esp = (unsigned long) regs; + tsk->thread.esp0 = (unsigned long) (regs+1); + tsk->thread.eip = (unsigned long) &i386_ret_from_resume; + + tsk->thread.gs = decode_segment(b->cpt_gs); + tsk->thread.debugreg[0] = b->cpt_debugreg[0]; + tsk->thread.debugreg[1] = b->cpt_debugreg[1]; + tsk->thread.debugreg[2] = b->cpt_debugreg[2]; + tsk->thread.debugreg[3] = b->cpt_debugreg[3]; + tsk->thread.debugreg[4] = b->cpt_debugreg[4]; + tsk->thread.debugreg[5] = b->cpt_debugreg[5]; + tsk->thread.debugreg[6] = b->cpt_debugreg[6]; + tsk->thread.debugreg[7] = b->cpt_debugreg[7]; + + regs->ebx = b->cpt_ebx; + regs->ecx = b->cpt_ecx; + regs->edx = b->cpt_edx; + regs->esi = b->cpt_esi; + regs->edi = b->cpt_edi; + regs->ebp = b->cpt_ebp; + regs->eax = b->cpt_eax; + regs->xds = b->cpt_xds; + regs->xes = b->cpt_xes; + regs->orig_eax = b->cpt_orig_eax; + regs->eip = b->cpt_eip; + regs->xcs = b->cpt_xcs; + regs->eflags = b->cpt_eflags; + regs->esp = b->cpt_esp; + regs->xss = b->cpt_xss; + + regs->xcs = decode_segment(b->cpt_xcs); + regs->xss = decode_segment(b->cpt_xss); + 
regs->xds = decode_segment(b->cpt_xds); + regs->xes = decode_segment(b->cpt_xes); + regs->xfs = decode_segment(b->cpt_fs); + + tsk->thread.esp -= HOOK_RESERVE; + memset((void*)tsk->thread.esp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.esp; + + return 0; +} + +#elif defined(CONFIG_X86_64) + +static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s) +{ + memset(d, 0, sizeof(struct pt_regs)); + d->rbp = s->cpt_ebp; + d->rbx = s->cpt_ebx; + d->rax = (s32)s->cpt_eax; + d->rcx = s->cpt_ecx; + d->rdx = s->cpt_edx; + d->rsi = s->cpt_esi; + d->rdi = s->cpt_edi; + d->orig_rax = (s32)s->cpt_orig_eax; + d->rip = s->cpt_eip; + d->cs = s->cpt_xcs; + d->eflags = s->cpt_eflags; + d->rsp = s->cpt_esp; + d->ss = s->cpt_xss; +} + +static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, + struct cpt_task_image *ti, struct cpt_obj_bits *hdr, + struct resume_info **rip, struct cpt_context *ctx) +{ + if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { + struct cpt_x86_64_regs *b = (void*)hdr; + + tsk->thread.rsp = (unsigned long) regs; + tsk->thread.rsp0 = (unsigned long) (regs+1); + + tsk->thread.fs = b->cpt_fsbase; + tsk->thread.gs = b->cpt_gsbase; + tsk->thread.fsindex = decode_segment(b->cpt_fsindex); + tsk->thread.gsindex = decode_segment(b->cpt_gsindex); + tsk->thread.ds = decode_segment(b->cpt_ds); + tsk->thread.es = decode_segment(b->cpt_es); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); + + tsk->thread.userrsp = regs->rsp; + regs->cs = decode_segment(b->cpt_cs); + regs->ss = decode_segment(b->cpt_ss); + } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { + struct cpt_x86_regs *b = (void*)hdr; + + tsk->thread.rsp = (unsigned long) regs; + tsk->thread.rsp0 = (unsigned long) (regs+1); + + tsk->thread.fs = 0; + tsk->thread.gs = 0; + tsk->thread.fsindex = decode_segment(b->cpt_fs); + tsk->thread.gsindex = decode_segment(b->cpt_gs); + tsk->thread.debugreg0 = b->cpt_debugreg[0]; + tsk->thread.debugreg1 = b->cpt_debugreg[1]; + tsk->thread.debugreg2 = b->cpt_debugreg[2]; + tsk->thread.debugreg3 = b->cpt_debugreg[3]; + tsk->thread.debugreg6 = b->cpt_debugreg[6]; + tsk->thread.debugreg7 = b->cpt_debugreg[7]; + + xlate_ptregs_32_to_64(regs, b); + + tsk->thread.userrsp = regs->rsp; + regs->cs = decode_segment(b->cpt_xcs); + regs->ss = decode_segment(b->cpt_xss); + tsk->thread.ds = decode_segment(b->cpt_xds); + tsk->thread.es = decode_segment(b->cpt_xes); + } else { + return -EINVAL; + } + + tsk->thread.rsp -= HOOK_RESERVE; + memset((void*)tsk->thread.rsp, 0, HOOK_RESERVE); + *rip = (void*)tsk->thread.rsp; + return 0; +} + +#elif defined(CONFIG_IA64) + +#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ + +#define PUT_BITS(first, last, nat) \ + ({ \ + unsigned long bit = ia64_unat_pos(&pt->r##first); \ + unsigned long nbits = (last - first + 1); \ + unsigned long mask = MASK(nbits) << first; \ + long dist; \ + if (bit < first) \ + dist = 64 + bit - first; \ + else \ + dist = bit - first; \ + ia64_rotl(nat & mask, dist); \ + }) + +unsigned long +ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + /* + * Registers that are stored consecutively in struct pt_regs + * can be handled in parallel. 
If the register order in + * struct_pt_regs changes, this code MUST be updated. + */ + scratch_unat = PUT_BITS( 1, 1, nat); + scratch_unat |= PUT_BITS( 2, 3, nat); + scratch_unat |= PUT_BITS(12, 13, nat); + scratch_unat |= PUT_BITS(14, 14, nat); + scratch_unat |= PUT_BITS(15, 15, nat); + scratch_unat |= PUT_BITS( 8, 11, nat); + scratch_unat |= PUT_BITS(16, 31, nat); + + return scratch_unat; + +} + +static unsigned long +ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat) +{ + unsigned long scratch_unat; + + scratch_unat = PUT_BITS( 4, 7, nat); + + return scratch_unat; + +} + +#undef PUT_BITS + + +static int restore_registers(struct task_struct *tsk, struct pt_regs *pt, + struct cpt_task_image *ti, + struct cpt_ia64_regs *r, + struct resume_info **rip, + struct cpt_context *ctx) +{ + extern char ia64_ret_from_resume; + struct switch_stack *sw; + struct resume_info *ri; + struct ia64_psr *psr = ia64_psr(pt); + void *krbs = (void *)tsk + IA64_RBS_OFFSET; + unsigned long reg; + + if (r->cpt_object != CPT_OBJ_IA64_REGS) + return -EINVAL; + + if (r->num_regs > 96) { + eprintk(CPT_FID " too much RSE regs %lu\n", + CPT_TID(tsk), r->num_regs); + return -EINVAL; + } + + *rip = ri = ((void*)pt) - HOOK_RESERVE; + sw = ((struct switch_stack *) ri) - 1; + + memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack)); + memset(ri, 0, HOOK_RESERVE); + + /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ + memcpy(&pt->r1, &r->gr[1], 8*(2-1)); + memcpy(&pt->r2, &r->gr[2], 8*(4-2)); + memcpy(&pt->r8, &r->gr[8], 8*(12-8)); + memcpy(&pt->r12, &r->gr[12], 8*(14-12)); + memcpy(&pt->r14, &r->gr[14], 8*(15-14)); + memcpy(&pt->r15, &r->gr[15], 8*(16-15)); + memcpy(&pt->r16, &r->gr[16], 8*(32-16)); + + pt->b0 = r->br[0]; + pt->b6 = r->br[6]; + pt->b7 = r->br[7]; + + pt->ar_bspstore = r->ar_bspstore; + pt->ar_unat = r->ar_unat; + pt->ar_pfs = r->ar_pfs; + pt->ar_ccv = r->ar_ccv; + pt->ar_fpsr = r->ar_fpsr; + pt->ar_csd = r->ar_csd; + pt->ar_ssd = r->ar_ssd; + pt->ar_rsc = r->ar_rsc; + + pt->cr_iip = r->cr_iip; + pt->cr_ipsr = r->cr_ipsr; + + pt->pr = r->pr; + + pt->cr_ifs = r->cfm; + + /* fpregs 6..9,10..11 are in pt_regs */ + memcpy(&pt->f6, &r->fr[2*6], 16*(10-6)); + memcpy(&pt->f10, &r->fr[2*10], 16*(12-10)); + /* fpreg 12..15 are on switch stack */ + memcpy(&sw->f12, &r->fr[2*12], 16*(16-12)); + /* fpregs 32...127 */ + tsk->thread.flags |= IA64_THREAD_FPH_VALID; + memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32)); + ia64_drop_fpu(tsk); + psr->dfh = 1; + + memcpy(&sw->r4, &r->gr[4], 8*(8-4)); + memcpy(&sw->b1, &r->br[1], 8*(6-1)); + sw->ar_lc = r->ar_lc; + + memcpy(&sw->f2, &r->fr[2*2], 16*(6-2)); + memcpy(&sw->f16, &r->fr[2*16], 16*(32-16)); + + sw->caller_unat = 0; + sw->ar_fpsr = pt->ar_fpsr; + sw->ar_unat = 0; + if (r->nat[0] & 0xFFFFFF0FUL) + sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]); + if (r->nat[0] & 0xF0) + sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]); + + sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs); + memset(krbs, 0, (void*)sw->ar_bspstore - krbs); + sw->ar_rnat = 0; + sw->ar_pfs = 0; + + /* This is tricky. When we are in syscall, we have frame + * of output register (sometimes, plus one input reg sometimes). + * It is not so easy to restore such frame, RSE optimizes + * and does not fetch those regs from backstore. So, we restore + * the whole frame as local registers, and then repartition it + * in ia64_ret_from_resume(). 
+ */ + if ((long)pt->cr_ifs >= 0) { + unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F); + sw->ar_pfs = out | (out<<7); + } + if (r->ar_ec) + sw->ar_pfs |= (r->ar_ec & 0x3F) << 52; + + for (reg = 0; reg < r->num_regs; reg++) { + unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); + unsigned long *rnatp; + unsigned long set_rnat = 0; + + *ptr = r->gr[32+reg]; + + if (reg < 32) + set_rnat = (r->nat[0] & (1UL<<(reg+32))); + else + set_rnat = (r->nat[1] & (1UL<<(reg-32))); + + if (set_rnat) { + rnatp = ia64_rse_rnat_addr(ptr); + if ((unsigned long)rnatp >= sw->ar_bspstore) + rnatp = &sw->ar_rnat; + *rnatp |= (1UL<b0 = (unsigned long) &ia64_ret_from_resume; + tsk->thread.ksp = (unsigned long) sw - 16; + +#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ +#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ +#define PRED_USER_STACK 3 /* returning to user-stacks? */ +#define PRED_SYSCALL 4 /* inside a system call? */ +#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ + + pt->loadrs = r->loadrs; + sw->pr = 0; + sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL); + sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL)); + sw->pr &= ~(1UL << PRED_KERNEL_STACK); + sw->pr |= (1UL << PRED_USER_STACK); + if ((long)pt->cr_ifs < 0) { + sw->pr |= (1UL << PRED_NON_SYSCALL); + } else { + sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL)); + } + + return 0; +} +#endif + +asmlinkage void rst_resume_work(struct resume_info *ri) +{ + if (ri->hooks & (1<tid_ptrs); + if (ri->hooks & (1<hooks & (1<hooks & (1<thread.i387.fxsave.mxcsr &= 0x0000ffbf; +#endif +} + +int rst_restore_process(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + struct pt_regs * regs; + struct cpt_object_hdr *b; + struct cpt_siginfo_image *lsi = NULL; + struct group_info *gids, *ogids; + struct resume_info *ri = NULL; + int i; + int err = 0; +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter *tbc; + struct user_beancounter *new_bc, *old_bc; +#endif + + if (tsk == NULL) { + eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm); + return -EFAULT; + } + + wait_task_inactive(tsk); +#ifdef CONFIG_BEANCOUNTERS + tbc = &tsk->task_bc; + new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx); + err = virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTTSK, new_bc); + if (err & NOTIFY_FAIL) { + put_beancounter(new_bc); + return -ECHRNG; + } + old_bc = tbc->exec_ub; + if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) { + dprintk(" *** replacing ub %p by %p for %p (%d %s)\n", + old_bc, new_bc, tsk, + tsk->pid, tsk->comm); + tbc->exec_ub = new_bc; + new_bc = old_bc; + } + put_beancounter(new_bc); +#endif + regs = task_pt_regs(tsk); + + if (!tsk->exit_state) { + tsk->lock_depth = -1; +#ifdef CONFIG_PREEMPT + task_thread_info(tsk)->preempt_count--; +#endif + } + + if (tsk->static_prio != ti->cpt_static_prio) + set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio)); + + cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked); + cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked); + cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked); + cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending); + + tsk->uid = ti->cpt_uid; + tsk->euid = ti->cpt_euid; + tsk->suid = ti->cpt_suid; + tsk->fsuid = ti->cpt_fsuid; + tsk->gid = ti->cpt_gid; + tsk->egid = ti->cpt_egid; + tsk->sgid = ti->cpt_sgid; + tsk->fsgid = ti->cpt_fsgid; +#ifdef CONFIG_IA64 + 
SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac); + SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu); +#endif + memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective)); + memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable)); + memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted)); + tsk->keep_capabilities = (ti->cpt_keepcap != 0); + tsk->did_exec = (ti->cpt_did_exec != 0); + gids = groups_alloc(ti->cpt_ngids); + ogids = tsk->group_info; + if (gids) { + int i; + for (i=0; i<32; i++) + gids->small_block[i] = ti->cpt_gids[i]; + tsk->group_info = gids; + } + if (ogids) + put_group_info(ogids); + tsk->utime = ti->cpt_utime; + tsk->stime = ti->cpt_stime; + if (ctx->image_version == CPT_VERSION_8) + tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC); + else + cpt_timespec_import(&tsk->start_time, ti->cpt_starttime); + _set_normalized_timespec(&tsk->start_time, + tsk->start_time.tv_sec + + VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec, + tsk->start_time.tv_nsec + + VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec); + + tsk->nvcsw = ti->cpt_nvcsw; + tsk->nivcsw = ti->cpt_nivcsw; + tsk->min_flt = ti->cpt_min_flt; + tsk->maj_flt = ti->cpt_maj_flt; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) + tsk->cutime = ti->cpt_cutime; + tsk->cstime = ti->cpt_cstime; + tsk->cnvcsw = ti->cpt_cnvcsw; + tsk->cnivcsw = ti->cpt_cnivcsw; + tsk->cmin_flt = ti->cpt_cmin_flt; + tsk->cmaj_flt = ti->cpt_cmaj_flt; + + BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS); + + for (i=0; irlim[i].rlim_cur = ti->cpt_rlim_cur[i]; + tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i]; + } +#else + if (thread_group_leader(tsk) && tsk->signal) { + tsk->signal->utime = ti->cpt_utime; + tsk->signal->stime = ti->cpt_stime; + tsk->signal->cutime = ti->cpt_cutime; + tsk->signal->cstime = ti->cpt_cstime; + tsk->signal->nvcsw = ti->cpt_nvcsw; + tsk->signal->nivcsw = ti->cpt_nivcsw; + tsk->signal->cnvcsw = ti->cpt_cnvcsw; + tsk->signal->cnivcsw = ti->cpt_cnivcsw; + tsk->signal->min_flt = ti->cpt_min_flt; + tsk->signal->maj_flt = ti->cpt_maj_flt; + tsk->signal->cmin_flt = ti->cpt_cmin_flt; + tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; + + for (i=0; isignal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; + tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i]; + } + } +#endif + +#ifdef CONFIG_X86 + for (i=0; i<3; i++) { + if (i >= GDT_ENTRY_TLS_ENTRIES) { + eprintk_ctx("too many tls descs\n"); + } else { +#ifndef CONFIG_X86_64 + tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF; + tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32; +#else + tsk->thread.tls_array[i] = ti->cpt_tls[i]; +#endif + } + } +#endif + + clear_stopped_child_used_math(tsk); + + b = (void *)(ti+1); + while ((void*)b < ((void*)ti) + ti->cpt_next) { + /* Siginfo objects are at the end of obj array */ + if (b->cpt_object == CPT_OBJ_SIGINFO) { + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next); + set_exec_env(env); + break; + } + + switch (b->cpt_object) { +#ifdef CONFIG_X86 + case CPT_OBJ_BITS: + if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE && + cpu_has_fxsr) { + memcpy(&tsk->thread.i387, + (void*)b + b->cpt_hdrlen, + sizeof(struct i387_fxsave_struct)); + rst_apply_mxcsr_mask(tsk); + if (ti->cpt_used_math) + set_stopped_child_used_math(tsk); + } +#ifndef CONFIG_X86_64 + else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD && + !cpu_has_fxsr) { + memcpy(&tsk->thread.i387, + (void*)b + b->cpt_hdrlen, + sizeof(struct 
i387_fsave_struct)); + if (ti->cpt_used_math) + set_stopped_child_used_math(tsk); + } +#endif + break; +#endif + case CPT_OBJ_LASTSIGINFO: + lsi = (void*)b; + break; + case CPT_OBJ_X86_REGS: + case CPT_OBJ_X86_64_REGS: + case CPT_OBJ_IA64_REGS: + if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) { + eprintk_ctx("cannot restore registers: image is corrupted\n"); + return -EINVAL; + } + break; + case CPT_OBJ_SIGALTSTACK: { + struct cpt_sigaltstack_image *sas; + sas = (struct cpt_sigaltstack_image *)b; + tsk->sas_ss_sp = sas->cpt_stack; + tsk->sas_ss_size = sas->cpt_stacksize; + break; + } + case CPT_OBJ_TASK_AUX: { + struct cpt_task_aux_image *ai; + ai = (struct cpt_task_aux_image *)b; + tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list); +#ifdef CONFIG_X86_64 +#ifdef CONFIG_COMPAT + if (task_thread_info(tsk)->flags&_TIF_IA32) { + tsk->robust_list = (void __user *)NULL; + tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list); + } +#endif +#endif + break; + } + } + b = ((void*)b) + b->cpt_next; + } + + if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + eprintk_ctx("missing register info\n"); + return -EINVAL; + } + + if (ti->cpt_ppid != ti->cpt_rppid) { + struct task_struct *parent; + struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); + write_lock_irq(&tasklist_lock); + parent = find_task_by_vpid(ti->cpt_ppid); + if (parent && parent != tsk->parent) { + list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children); + remove_parent(tsk); + tsk->parent = parent; + add_parent(tsk); + } + write_unlock_irq(&tasklist_lock); + set_exec_env(env); + } + + tsk->ptrace_message = ti->cpt_ptrace_message; + tsk->pn_state = ti->cpt_pn_state; + tsk->stopped_state = ti->cpt_stopped_state; + task_thread_info(tsk)->flags = ti->cpt_thrflags; + + /* The image was created with kernel < 2.6.16, while + * task hanged in sigsuspend -> do_signal. + * + * FIXME! This needs more brain efforts... 
+ */ + if (ti->cpt_sigsuspend_state) { + task_thread_info(tsk)->flags |= _TIF_RESTORE_SIGMASK; + } + +#ifdef CONFIG_X86_64 + task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME; + if (!ti->cpt_64bit) + task_thread_info(tsk)->flags |= _TIF_IA32; +#endif + +#ifdef CONFIG_X86_32 + do { + if (regs->orig_eax == __NR__newselect && regs->edi) { + struct timeval tv; + if (access_process_vm(tsk, regs->edi, &tv, + sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + regs->edi); + break; + } + dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, + tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, regs->edi, &tv, + sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", + task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->edi); + } + + } else if (regs->orig_eax == __NR_select && regs->edi) { + struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; + } a; + struct timeval tv; + if (access_process_vm(tsk, regs->ebx, &a, + sizeof(a), 0) != sizeof(a)) { + wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); + break; + } + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 0) != sizeof(tv)) { + wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid); + break; + } + dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n", + tsk->pid, tv.tv_sec, tv.tv_usec); + tv.tv_sec -= ctx->delta_time.tv_sec; + if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { + tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; + tv.tv_sec--; + } else { + tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; + } + if (tv.tv_sec < 0) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } + dprintk_ctx("task %d: New timeval in select: %ld.%ld\n", + tsk->pid, tv.tv_sec, tv.tv_usec); + if (access_process_vm(tsk, (unsigned long)a.tvp, + &tv, sizeof(tv), 1) != sizeof(tv)) { + wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid); + } + } + } while (0); +#endif + + if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) { + switch (SYSCALL_ERRNO(regs)) { + case ERESTARTSYS: + case ERESTARTNOINTR: + case ERESTARTNOHAND: + case ERESTART_RESTARTBLOCK: + case EAGAIN: + case EINTR: + ri->hooks |= (1<pn_state)) { + /* ... -> ptrace_notify() + * or + * ... 
-> do_signal() -> get_signal_to_deliver() -> + * ptrace stop + */ + tsk->last_siginfo = &ri->last_siginfo; + ri->hooks |= (1<last_siginfo, lsi); + } + + tsk->ptrace = ti->cpt_ptrace; + tsk->flags = ti->cpt_flags & ~PF_FROZEN; + clear_tsk_thread_flag(tsk, TIF_FREEZE); + tsk->exit_signal = ti->cpt_exit_signal; + + if (ri && tsk->stopped_state) { + dprintk_ctx("finish_stop\n"); + if (ti->cpt_state != TASK_STOPPED) + eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state); + ri->hooks |= (1<cpt_set_tid || ti->cpt_clear_tid)) { + ri->hooks |= (1<tid_ptrs[0] = ti->cpt_clear_tid; + ri->tid_ptrs[1] = ti->cpt_set_tid; + dprintk_ctx("settids\n"); + } + + if (ri && ri->hooks && + !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + if (try_module_get(THIS_MODULE)) + ri->hook = rst_resume_work; + } + + if (ti->cpt_state == TASK_TRACED) + tsk->state = TASK_TRACED; + else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) { + tsk->signal->it_virt_expires = 0; + tsk->signal->it_prof_expires = 0; + if (tsk->state != EXIT_DEAD) + eprintk_ctx("oops, schedule() did not make us dead\n"); + } + + if (thread_group_leader(tsk) && + ti->cpt_it_real_value && + !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + ktime_t val; + s64 nsec; + + nsec = ti->cpt_it_real_value; + val.tv64 = 0; + + if (ctx->image_version < CPT_VERSION_9) + nsec *= TICK_NSEC; + + val = ktime_add_ns(val, nsec - ctx->delta_nsec); + if (val.tv64 <= 0) + val.tv64 = NSEC_PER_USEC; + dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk), + (long long)val.tv64, + (unsigned long long)ti->cpt_it_real_value); + + spin_lock_irq(&tsk->sighand->siglock); + if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) { + /* FIXME. Check!!!! */ + hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL); + } else { + wprintk_ctx("Timer clash. Impossible?\n"); + } + spin_unlock_irq(&tsk->sighand->siglock); + + dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), + (unsigned long long)val.tv64); + } + + module_put(THIS_MODULE); + } + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/rst_socket.c linux-2.6.24.ovz/kernel/cpt/rst_socket.c --- linux-2.6.24/kernel/cpt/rst_socket.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_socket.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,918 @@ +/* + * + * kernel/cpt/rst_socket.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_files.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +#include "cpt_syscalls.h" + + +static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + struct timeval tmptv; + + if (sk->sk_socket) { + sk->sk_socket->flags = si->cpt_ssflags; + sk->sk_socket->state = si->cpt_sstate; + } + sk->sk_reuse = si->cpt_reuse; + sk->sk_shutdown = si->cpt_shutdown; + sk->sk_userlocks = si->cpt_userlocks; + sk->sk_no_check = si->cpt_no_check; + sock_reset_flag(sk, SOCK_DBG); + if (si->cpt_debug) + sock_set_flag(sk, SOCK_DBG); + sock_reset_flag(sk, SOCK_RCVTSTAMP); + if (si->cpt_rcvtstamp) + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_LOCALROUTE); + if (si->cpt_localroute) + sock_set_flag(sk, SOCK_LOCALROUTE); + sk->sk_protocol = si->cpt_protocol; + sk->sk_err = si->cpt_err; + sk->sk_err_soft = si->cpt_err_soft; + sk->sk_priority = si->cpt_priority; + sk->sk_rcvlowat = si->cpt_rcvlowat; + sk->sk_rcvtimeo = si->cpt_rcvtimeo; + if (si->cpt_rcvtimeo == CPT_NULL) + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = si->cpt_sndtimeo; + if (si->cpt_sndtimeo == CPT_NULL) + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_rcvbuf = si->cpt_rcvbuf; + sk->sk_sndbuf = si->cpt_sndbuf; + sk->sk_bound_dev_if = si->cpt_bound_dev_if; + sk->sk_flags = si->cpt_flags; + sk->sk_lingertime = si->cpt_lingertime; + if (si->cpt_lingertime == CPT_NULL) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + sk->sk_peercred.pid = si->cpt_peer_pid; + sk->sk_peercred.uid = si->cpt_peer_uid; + sk->sk_peercred.gid = si->cpt_peer_gid; + cpt_timeval_import(&tmptv, si->cpt_stamp); + sk->sk_stamp = timeval_to_ktime(tmptv); + return 0; +} + +static struct file *sock_mapfile(struct socket *sock) +{ + int fd = sock_map_fd(sock); + + if (fd >= 0) { + struct file *file = sock->file; + get_file(file); + sc_close(fd); + return file; + } + return ERR_PTR(fd); +} + +/* Assumption is that /tmp exists and writable. + * In previous versions we assumed that listen() will autobind + * the socket. It does not do this for AF_UNIX by evident reason: + * socket in abstract namespace is accessible, unlike socket bound + * to deleted FS object. 
+ */ + +static int +select_deleted_name(char * name, cpt_context_t *ctx) +{ + int i; + + for (i=0; i<100; i++) { + struct nameidata nd; + unsigned int rnd = net_random(); + + sprintf(name, "/tmp/SOCK.%08x", rnd); + + if (path_lookup(name, 0, &nd) != 0) + return 0; + + path_release(&nd); + } + + eprintk_ctx("failed to allocate deleted socket inode\n"); + return -ELOOP; +} + +static int +bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, + cpt_context_t *ctx) +{ + int err; + char *name; + struct sockaddr* addr; + int addrlen; + struct sockaddr_un sun; + struct nameidata nd; + + if ((addrlen = si->cpt_laddrlen) <= 2) + return 0; + + nd.dentry = NULL; + name = ((char*)si->cpt_laddr) + 2; + addr = (struct sockaddr *)si->cpt_laddr; + + if (name[0]) { + if (path_lookup(name, 0, &nd)) + nd.dentry = NULL; + + if (si->cpt_deleted) { + if (nd.dentry == NULL && + sock->ops->bind(sock, addr, addrlen) == 0) { + sc_unlink(name); + return 0; + } + + addr = (struct sockaddr*)&sun; + addr->sa_family = AF_UNIX; + name = ((char*)addr) + 2; + err = select_deleted_name(name, ctx); + if (err) + goto out; + addrlen = 2 + strlen(name); + } else if (nd.dentry) { + if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) { + eprintk_ctx("bind_unix_socket: not a socket dentry\n"); + err = -EINVAL; + goto out; + } + sc_unlink(name); + } + } + + err = sock->ops->bind(sock, addr, addrlen); + + if (!err && name[0]) { + if (nd.dentry) { + sc_chown(name, nd.dentry->d_inode->i_uid, + nd.dentry->d_inode->i_gid); + sc_chmod(name, nd.dentry->d_inode->i_mode); + } + if (si->cpt_deleted) + sc_unlink(name); + } + +out: + if (nd.dentry) + path_release(&nd); + return err; +} + +static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + struct sock *sk = sock->sk; + cpt_object_t *obj; + struct sock *parent; + + if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) + return 0; + + if (si->cpt_parent == -1) + return bind_unix_socket(sock, si, ctx); + + obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (!obj) + return 0; + + parent = obj->o_obj; + if (unix_sk(parent)->addr) { + if (unix_sk(sk)->addr && + atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) + kfree(unix_sk(sk)->addr); + atomic_inc(&unix_sk(parent)->addr->refcnt); + unix_sk(sk)->addr = unix_sk(parent)->addr; + } + return 0; +} + +static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(&pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else { + wprintk_ctx("strange socket queue type %u\n", type); + kfree_skb(skb); + } + } + return 0; +} + +static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct socket *sock2 = NULL; + struct file *file; + cpt_object_t *fobj; + cpt_object_t *pobj = NULL; + + err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) + return err; + + if (si->cpt_socketpair) { + err = sock_create_kern(si->cpt_family, si->cpt_type, + si->cpt_protocol, &sock2); + if (err) + goto err_out; + + err = 
sock->ops->socketpair(sock, sock2); + if (err < 0) + goto err_out; + + /* Socketpair with a peer outside our environment. + * So, we create real half-open pipe and do not worry + * about dead end anymore. */ + if (si->cpt_peer == -1) { + sock_release(sock2); + sock2 = NULL; + } + } + + cpt_obj_setobj(obj, sock->sk, ctx); + + if (si->cpt_file != CPT_NULL) { + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto err_out; + + err = -ENOMEM; + + obj->o_parent = file; + + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + } + + if (sock2) { + struct file *file2; + + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); + if (!pobj) BUG(); + if (pobj->o_obj) BUG(); + cpt_obj_setobj(pobj, sock2->sk, ctx); + + if (pobj->o_ppos != CPT_NULL) { + file2 = sock_mapfile(sock2); + err = PTR_ERR(file2); + if (IS_ERR(file2)) + goto err_out; + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(fobj, pobj->o_ppos, ctx); + cpt_obj_setindex(fobj, si->cpt_peer, ctx); + + pobj->o_parent = file2; + } + } + + setup_sock_common(sock->sk, si, obj->o_pos, ctx); + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { + int saved_reuse = sock->sk->sk_reuse; + + inet_sk(sock->sk)->freebind = 1; + sock->sk->sk_reuse = 2; + if (si->cpt_laddrlen) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + dprintk_ctx("binding failed: %d, do not worry\n", err); + } + } + sock->sk->sk_reuse = saved_reuse; + rst_socket_in(si, obj->o_pos, sock->sk, ctx); + } else if (sock->sk->sk_family == AF_NETLINK) { + struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr; + if (nl->nl_pid) { + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + if (err) { + eprintk_ctx("AF_NETLINK binding failed: %d\n", err); + } + } + if (si->cpt_raddrlen && nl->nl_pid) { + err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); + if (err) { + eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } else if (sock->sk->sk_family == PF_PACKET) { + struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr; + if (ll->sll_protocol || ll->sll_ifindex) { + int alen = si->cpt_laddrlen; + if (alen < sizeof(struct sockaddr_ll)) + alen = sizeof(struct sockaddr_ll); + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen); + if (err) { + eprintk_ctx("AF_PACKET binding failed: %d\n", err); + } + } + generic_restore_queues(sock->sk, si, obj->o_pos, ctx); + } + fixup_unix_address(sock, si, ctx); + + if (sock2) { + err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); + if (err) + return err; + setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); + fixup_unix_address(sock2, si, ctx); + } + + if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + && (int)si->cpt_parent != -1) { + cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) + sock->sk = NULL; + } + + + if (si->cpt_file == CPT_NULL && sock->sk && + sock->sk->sk_family == AF_INET) { + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + + local_bh_disable(); + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + eprintk_ctx("oops, sock is locked by 
user\n"); + + sock_hold(sk); + sock_orphan(sk); + ub_inc_orphan_count(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + dprintk_ctx("orphaning socket %p\n", sk); + } + } + + if (si->cpt_file == CPT_NULL && sock->sk == NULL) + sock_release(sock); + + return 0; + +err_out: + if (sock2) + sock_release(sock2); + sock_release(sock); + return err; +} + +static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, + struct cpt_context *ctx) +{ + int err; + struct socket *sock; + struct file *file; + cpt_object_t *obj, *fobj; + + err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, + &sock); + if (err) { + eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err); + return err; + } + + sock->sk->sk_reuse = 2; + sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; + + if (sock->sk->sk_family == AF_UNIX) { + err = bind_unix_socket(sock, si, ctx); + } else if (si->cpt_laddrlen) { + if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) + inet_sk(sock->sk)->freebind = 1; + + err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); + + if (err) { + eprintk_ctx("open_listening_socket: bind: %d\n", err); + goto err_out; + } + } + + err = sock->ops->listen(sock, si->cpt_max_ack_backlog); + if (err) { + eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); + goto err_out; + } + + /* Now we may access socket body directly and fixup all the things. */ + + file = sock_mapfile(sock); + err = PTR_ERR(file); + if (IS_ERR(file)) { + eprintk_ctx("open_listening_socket: map: %d\n", err); + goto err_out; + } + + err = -ENOMEM; + if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) + goto err_out; + if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) + goto err_out; + cpt_obj_setpos(obj, pos, ctx); + cpt_obj_setindex(obj, si->cpt_index, ctx); + obj->o_parent = file; + cpt_obj_setpos(fobj, si->cpt_file, ctx); + cpt_obj_setindex(fobj, si->cpt_index, ctx); + + setup_sock_common(sock->sk, si, pos, ctx); + + if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) + rst_restore_synwait_queue(sock->sk, si, pos, ctx); + + return 0; + +err_out: + sock_release(sock); + return err; +} + +static int +rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + struct cpt_sockmc_image v; + + err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_family == AF_INET) + return rst_sk_mcfilter_in(sk, &v, pos, ctx); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (v.cpt_family == AF_INET6) + return rst_sk_mcfilter_in6(sk, &v, pos, ctx); +#endif + else + return -EAFNOSUPPORT; +} + + +static int +rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + struct sk_filter *fp, *old_fp; + loff_t pos = *pos_p; + struct cpt_obj_bits v; + + err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx); + if (err) + return err; + + *pos_p += v.cpt_next; + + if (v.cpt_size % sizeof(struct sock_filter)) + return -EINVAL; + + fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC); + if (fp == NULL) + return -ENOMEM; + atomic_set(&fp->refcnt, 1); + fp->len = v.cpt_size/sizeof(struct sock_filter); + + err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen); + if (err) { + sk_filter_uncharge(sk, fp); + return err; + } + + old_fp = sk->sk_filter; + sk->sk_filter = fp; + if (old_fp) + sk_filter_uncharge(sk, old_fp); + return 0; 
+} + + +int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) +{ + int err; + loff_t pos = *pos_p; + + err = rst_sock_attr_skfilter(pos_p, sk, ctx); + if (err && pos == *pos_p) + err = rst_sock_attr_mcfilter(pos_p, sk, ctx); + return err; +} + +struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) +{ + int err; + struct sk_buff *skb; + struct cpt_skb_image v; + loff_t pos = *pos_p; + struct scm_fp_list *fpl = NULL; + struct timeval tmptv; + + err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); + if (err) + return ERR_PTR(err); + *pos_p = pos + v.cpt_next; + + if (owner) + *owner = v.cpt_owner; + if (queue) + *queue = v.cpt_queue; + + skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); + if (skb == NULL) + return ERR_PTR(-ENOMEM); + skb_reserve(skb, v.cpt_hspace); + skb_put(skb, v.cpt_len); +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->transport_header = v.cpt_h; + skb->network_header = v.cpt_nh; + skb->mac_header = v.cpt_mac; +#else + skb->transport_header = skb->head + v.cpt_h; + skb->network_header = skb->head + v.cpt_nh; + skb->mac_header = skb->head + v.cpt_mac; +#endif + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb)); + memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); + skb->mac_len = v.cpt_mac_len; + + skb->csum = v.cpt_csum; + skb->local_df = v.cpt_local_df; + skb->pkt_type = v.cpt_pkt_type; + skb->ip_summed = v.cpt_ip_summed; + skb->priority = v.cpt_priority; + skb->protocol = v.cpt_protocol; + cpt_timeval_import(&tmptv, v.cpt_stamp); + skb->tstamp = timeval_to_ktime(tmptv); + + skb_shinfo(skb)->gso_segs = v.cpt_gso_segs; + skb_shinfo(skb)->gso_size = v.cpt_gso_size; + if (ctx->image_version == 0) { + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + } + + if (v.cpt_next > v.cpt_hdrlen) { + pos = pos + v.cpt_hdrlen; + while (pos < *pos_p) { + union { + struct cpt_obj_bits b; + struct cpt_fd_image f; + } u; + + err = rst_get_object(-1, pos, &u, ctx); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + if (u.b.cpt_object == CPT_OBJ_BITS) { + if (u.b.cpt_size != v.cpt_hspace + skb->len) { + eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len); + kfree_skb(skb); + return ERR_PTR(-EINVAL); + } + + err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) { + if (!fpl) { + fpl = kmalloc(sizeof(struct scm_fp_list), + GFP_KERNEL_UBC); + if (!fpl) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + fpl->count = 0; + UNIXCB(skb).fp = fpl; + } + fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx); + if (!IS_ERR(fpl->fp[fpl->count])) + fpl->count++; + } + pos += u.b.cpt_next; + } + } + + return skb; +} + +static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + struct sock *owner_sk; + __u32 owner; + + skb = rst_skb(&pos, &owner, NULL, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + owner_sk = unix_peer(sk); + if (owner != -1) { + cpt_object_t *pobj; + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); + if (pobj == NULL) { + eprintk_ctx("orphan af_unix skb?\n"); + kfree_skb(skb); + continue; + } + owner_sk = pobj->o_obj; + } + if (owner_sk 
== NULL) { + dprintk_ctx("orphan af_unix skb 2?\n"); + kfree_skb(skb); + continue; + } + skb_set_owner_w(skb, owner_sk); + if (UNIXCB(skb).fp) + skb->destructor = unix_destruct_fds; + skb_queue_tail(&sk->sk_receive_queue, skb); + if (sk->sk_state == TCP_LISTEN) { + struct socket *sock = skb->sk->sk_socket; + if (sock == NULL) BUG(); + if (sock->file) BUG(); + skb->sk->sk_socket = NULL; + skb->sk->sk_sleep = NULL; + sock->sk = NULL; + sock_release(sock); + } + } + return 0; +} + + +/* All the sockets are created before we start to open files */ + +int rst_sockets(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SOCKET]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) { + eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); + return err; + } + if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { + eprintk_ctx("rst_sockets: hdr err\n"); + return -EINVAL; + } + + /* The first pass: we create socket index and open listening sockets. */ + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) { + err = open_listening_socket(sec, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err); + return err; + } + } else { + cpt_release_buf(ctx); + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) + return -ENOMEM; + cpt_obj_setindex(obj, sbuf->cpt_index, ctx); + cpt_obj_setpos(obj, sec, ctx); + obj->o_ppos = sbuf->cpt_file; + intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx); + } + sec += sbuf->cpt_next; + } + + /* Pass 2: really restore sockets */ + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + if (obj->o_obj != NULL) + continue; + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); + cpt_release_buf(ctx); + return err; + } + if (sbuf->cpt_state == TCP_LISTEN) BUG(); + err = open_socket(obj, sbuf, ctx); + cpt_release_buf(ctx); + if (err) { + eprintk_ctx("rst_sockets: open_socket: %d\n", err); + return err; + } + } + + return 0; +} + +int rst_orphans(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_ORPHANS]; + loff_t endsec; + cpt_object_t *obj; + struct cpt_section_hdr h; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + struct cpt_sock_image *sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + obj = alloc_cpt_object(GFP_KERNEL, ctx); + if (obj == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + obj->o_pos = sec; + obj->o_ppos = sbuf->cpt_file; + err = open_socket(obj, sbuf, ctx); + dprintk_ctx("Restoring orphan: %d\n", err); + free_cpt_object(obj, ctx); + cpt_release_buf(ctx); + if (err) + return err; + sec += sbuf->cpt_next; + } + + return 0; +} + + +/* Pass 3: I understand, this is not funny already :-), + * but we have to do another pass to 
establish links between + * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX + * skb queues with proper skb->sk links. + * + * This could be made at the end of rst_sockets(), but we defer + * restoring af_unix queues up to the end of restoring files to + * make restoring passed FDs cleaner. + */ + +int rst_sockets_complete(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_SOCKET) { + struct cpt_sock_image *sbuf; + struct sock *sk = obj->o_obj; + struct sock *peer; + + if (!sk) BUG(); + + if (sk->sk_family != AF_UNIX) + continue; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_next > sbuf->cpt_hdrlen) + restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx); + + cpt_release_buf(ctx); + + if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) { + cpt_object_t *pobj; + + sbuf = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + if (sbuf->cpt_peer != -1) { + pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx); + if (pobj) { + peer = pobj->o_obj; + sock_hold(peer); + unix_peer(sk) = peer; + } + } + cpt_release_buf(ctx); + } + } + + rst_orphans(ctx); + + return 0; +} + diff -uprN linux-2.6.24/kernel/cpt/rst_socket_in.c linux-2.6.24.ovz/kernel/cpt/rst_socket_in.c --- linux-2.6.24/kernel/cpt/rst_socket_in.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_socket_in.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,487 @@ +/* + * + * kernel/cpt/rst_socket_in.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_socket.h" +#include "cpt_kernel.h" + +static inline unsigned long jiffies_import(__u32 tmo) +{ + __s32 delta = tmo; + return jiffies + (long)delta; +} + +static inline __u32 tcp_jiffies_import(__u32 tmo) +{ + return ((__u32)jiffies) + tmo; +} + + +static int restore_queues(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + loff_t endpos; + + pos = pos + si->cpt_hdrlen; + endpos = pos + si->cpt_next; + while (pos < endpos) { + struct sk_buff *skb; + __u32 type; + + skb = rst_skb(&pos, NULL, &type, ctx); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -EINVAL) { + int err; + + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + } + return PTR_ERR(skb); + } + + if (sk->sk_type == SOCK_STREAM) { + if (type == CPT_SKB_RQ) { + sk_stream_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_OFOQ) { + struct tcp_sock *tp = tcp_sk(sk); + sk_stream_set_owner_r(skb, sk); + ub_tcprcvbuf_charge_forced(sk, skb); + skb_queue_tail(&tp->out_of_order_queue, skb); + } else if (type == CPT_SKB_WQ) { + sk->sk_wmem_queued += skb->truesize; + sk->sk_forward_alloc -= skb->truesize; + ub_tcpsndbuf_charge_forced(sk, skb); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + wprintk_ctx("strange stream queue type %u\n", type); + kfree_skb(skb); + } + } else { + if (type == CPT_SKB_RQ) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + } else if (type == CPT_SKB_WQ) { + struct inet_sock *inet = inet_sk(sk); + if (inet->cork.fragsize) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&sk->sk_write_queue, skb); + } else { + eprintk_ctx("cork skb is dropped\n"); + kfree_skb(skb); + } + } else { + wprintk_ctx("strange dgram queue type %u\n", type); + kfree_skb(skb); + } + } + } + return 0; +} + +static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) +{ + cpt_object_t *obj; + for_each_object(obj, CPT_OBJ_SOCKET) { + struct sock *sk = obj->o_obj; + if (sk && + sk->sk_state == TCP_LISTEN && + (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && + inet_sk(sk)->sport == sport) + return sk; + } + return NULL; +} + +static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + tp->pred_flags = si->cpt_pred_flags; + tp->rcv_nxt = si->cpt_rcv_nxt; + tp->snd_nxt = si->cpt_snd_nxt; + tp->snd_una = si->cpt_snd_una; + tp->snd_sml = si->cpt_snd_sml; + tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); + tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); + tp->tcp_header_len = si->cpt_tcp_header_len; + inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; + inet_csk(sk)->icsk_ack.quick = si->cpt_quick; + inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; + inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; + inet_csk(sk)->icsk_ack.ato = si->cpt_ato; + inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); + inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; + inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; + tp->snd_wl1 = si->cpt_snd_wl1; + tp->snd_wnd = si->cpt_snd_wnd; 
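+ /* The jiffies-based stamps above (rcv_tstamp, lsndtime, icsk_ack.timeout,
+  * icsk_ack.lrcvtime) are rebased onto the local clock via jiffies_import()
+  * and tcp_jiffies_import(); the window and sequence fields below are taken
+  * verbatim from the image. */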
+ tp->max_window = si->cpt_max_window; + inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; + tp->mss_cache = si->cpt_mss_cache; + tp->rx_opt.mss_clamp = si->cpt_mss_clamp; + inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; + inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; + inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; + tp->reordering = si->cpt_reordering; + tp->frto_counter = si->cpt_frto_counter; + tp->frto_highmark = si->cpt_frto_highmark; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + // // tp->adv_cong = si->cpt_adv_cong; +#endif + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; + inet_csk(sk)->icsk_backoff = si->cpt_backoff; + tp->srtt = si->cpt_srtt; + tp->mdev = si->cpt_mdev; + tp->mdev_max = si->cpt_mdev_max; + tp->rttvar = si->cpt_rttvar; + tp->rtt_seq = si->cpt_rtt_seq; + inet_csk(sk)->icsk_rto = si->cpt_rto; + tp->packets_out = si->cpt_packets_out; + tp->retrans_out = si->cpt_retrans_out; + tp->lost_out = si->cpt_lost_out; + tp->sacked_out = si->cpt_sacked_out; + tp->fackets_out = si->cpt_fackets_out; + tp->snd_ssthresh = si->cpt_snd_ssthresh; + tp->snd_cwnd = si->cpt_snd_cwnd; + tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; + tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; + tp->snd_cwnd_used = si->cpt_snd_cwnd_used; + tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); + inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); + tp->rcv_wnd = si->cpt_rcv_wnd; + tp->rcv_wup = si->cpt_rcv_wup; + tp->write_seq = si->cpt_write_seq; + tp->pushed_seq = si->cpt_pushed_seq; + tp->copied_seq = si->cpt_copied_seq; + tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; + tp->rx_opt.wscale_ok = si->cpt_wscale_ok; + tp->rx_opt.sack_ok = si->cpt_sack_ok; + tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; + tp->rx_opt.snd_wscale = si->cpt_snd_wscale; + tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; + tp->nonagle = si->cpt_nonagle; + tp->keepalive_probes = si->cpt_keepalive_probes; + tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; + tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; + tp->rx_opt.ts_recent = si->cpt_ts_recent; + tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp; + tp->rx_opt.user_mss = si->cpt_user_mss; + tp->rx_opt.dsack = si->cpt_dsack; + tp->rx_opt.eff_sacks = si->cpt_num_sacks; + tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; + tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; + tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; + tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; + tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; + tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; + tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; + tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; + tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; + tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; + + tp->window_clamp = si->cpt_window_clamp; + tp->rcv_ssthresh = si->cpt_rcv_ssthresh; + inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; + tp->rx_opt.num_sacks = si->cpt_num_sacks; + tp->advmss = si->cpt_advmss; + inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; + tp->ecn_flags = si->cpt_ecn_flags; + tp->prior_ssthresh = si->cpt_prior_ssthresh; + tp->high_seq = si->cpt_high_seq; + tp->retrans_stamp = si->cpt_retrans_stamp; + tp->undo_marker = si->cpt_undo_marker; + tp->undo_retrans = si->cpt_undo_retrans; + tp->urg_seq = si->cpt_urg_seq; + tp->urg_data = si->cpt_urg_data; + inet_csk(sk)->icsk_pending = si->cpt_pending; + tp->urg_mode = si->cpt_urg_mode; + tp->snd_up = si->cpt_snd_up; + 
tp->keepalive_time = si->cpt_keepalive_time; + tp->keepalive_intvl = si->cpt_keepalive_intvl; + tp->linger2 = si->cpt_linger2; + + sk->sk_send_head = NULL; + for (skb = skb_peek(&sk->sk_write_queue); + skb && skb != (struct sk_buff*)&sk->sk_write_queue; + skb = skb->next) { + if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { + sk->sk_send_head = skb; + break; + } + } + + if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { + struct inet_sock *inet = inet_sk(sk); + if (inet->num == 0) { + cpt_object_t *lobj = NULL; + + if ((int)si->cpt_parent != -1) + lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); + + if (lobj && lobj->o_obj) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(&tcp_hashinfo, lobj->o_obj, sk); + local_bh_enable(); + dprintk_ctx("port inherited from parent\n"); + } else { + struct sock *lsk = find_parent(inet->sport, ctx); + if (lsk) { + inet->num = ntohs(inet->sport); + local_bh_disable(); + __inet_inherit_port(&tcp_hashinfo, lsk, sk); + local_bh_enable(); + dprintk_ctx("port inherited\n"); + } else { + eprintk_ctx("we are kinda lost...\n"); + } + } + } + + sk->sk_prot->hash(sk); + + if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) + sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); + if (inet_csk(sk)->icsk_pending) + sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, + inet_csk(sk)->icsk_timeout); + if (sock_flag(sk, SOCK_KEEPOPEN)) { + unsigned long expires = jiffies_import(si->cpt_ka_timeout); + if (time_after(jiffies, expires)) + expires = jiffies + HZ; + sk_reset_timer(sk, &sk->sk_timer, expires); + } + } + + return 0; +} + + +int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, + struct cpt_context *ctx) +{ + struct inet_sock *inet = inet_sk(sk); + + lock_sock(sk); + + sk->sk_state = si->cpt_state; + + inet->daddr = si->cpt_daddr; + inet->dport = si->cpt_dport; + inet->saddr = si->cpt_saddr; + inet->rcv_saddr = si->cpt_rcv_saddr; + inet->sport = si->cpt_sport; + inet->uc_ttl = si->cpt_uc_ttl; + inet->tos = si->cpt_tos; + inet->cmsg_flags = si->cpt_cmsg_flags; + inet->mc_index = si->cpt_mc_index; + inet->mc_addr = si->cpt_mc_addr; + inet->hdrincl = si->cpt_hdrincl; + inet->mc_ttl = si->cpt_mc_ttl; + inet->mc_loop = si->cpt_mc_loop; + inet->pmtudisc = si->cpt_pmtudisc; + inet->recverr = si->cpt_recverr; + inet->freebind = si->cpt_freebind; + inet->id = si->cpt_idcounter; + + inet->cork.flags = si->cpt_cork_flags; + inet->cork.fragsize = si->cpt_cork_fragsize; + inet->cork.length = si->cpt_cork_length; + inet->cork.addr = si->cpt_cork_addr; + inet->cork.fl.fl4_src = si->cpt_cork_saddr; + inet->cork.fl.fl4_dst = si->cpt_cork_daddr; + inet->cork.fl.oif = si->cpt_cork_oif; + if (inet->cork.fragsize) { + if (ip_route_output_key(&inet->cork.rt, &inet->cork.fl)) { + eprintk_ctx("failed to restore cork route\n"); + inet->cork.fragsize = 0; + } + } + + if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { + struct udp_sock *up = udp_sk(sk); + up->pending = si->cpt_udp_pending; + up->corkflag = si->cpt_udp_corkflag; + up->encap_type = si->cpt_udp_encap; + up->len = si->cpt_udp_len; + } + + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + memcpy(&np->saddr, si->cpt_saddr6, 16); + memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); + memcpy(&np->daddr, si->cpt_daddr6, 16); + np->flow_label = si->cpt_flow_label6; + np->frag_size = si->cpt_frag_size6; + np->hop_limit = si->cpt_hop_limit6; + np->mcast_hops = si->cpt_mcast_hops6; 
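+ /* The remaining IPv6 fields are plain per-socket option copies; a
+  * v4-mapped TCP socket additionally has its icsk_af_ops switched back
+  * to ipv6_mapped below. */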
+ np->mcast_oif = si->cpt_mcast_oif6; + np->rxopt.all = si->cpt_rxopt6; + np->mc_loop = si->cpt_mc_loop6; + np->recverr = si->cpt_recverr6; + np->sndflow = si->cpt_sndflow6; + np->pmtudisc = si->cpt_pmtudisc6; + np->ipv6only = si->cpt_ipv6only6; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (si->cpt_mapped) { + extern struct inet_connection_sock_af_ops ipv6_mapped; + if (sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) { + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + } + } +#endif + } + + restore_queues(sk, si, pos, ctx); + + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + rst_socket_tcp(si, pos, sk, ctx); + + release_sock(sk); + return 0; +} + +int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx) +{ + struct request_sock *req; + + if (lsk->sk_state != TCP_LISTEN) + return -EINVAL; + + req = reqsk_alloc(&tcp_request_sock_ops); + if (!req) + return -ENOMEM; + + sk->sk_socket = NULL; + sk->sk_sleep = NULL; + inet_csk_reqsk_queue_add(lsk, req, sk); + return 0; +} + +int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, + loff_t pos, struct cpt_context *ctx) +{ + int err; + loff_t end = si->cpt_next; + + pos += si->cpt_hdrlen; + while (pos < end) { + struct cpt_openreq_image oi; + + err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); + if (err) { + err = rst_sock_attr(&pos, sk, ctx); + if (err) + return err; + continue; + } + + if (oi.cpt_object == CPT_OBJ_OPENREQ) { + struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops); + if (req == NULL) + return -ENOMEM; + + memset(req, 0, sizeof(*req)); + tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; + tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; + inet_rsk(req)->rmt_port = oi.cpt_rmt_port; + req->mss = oi.cpt_mss; + req->retrans = oi.cpt_retrans; + inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale; + inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale; + inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok; + inet_rsk(req)->sack_ok = oi.cpt_sack_ok; + inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; + inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; + inet_rsk(req)->acked = oi.cpt_acked; + req->window_clamp = oi.cpt_window_clamp; + req->rcv_wnd = oi.cpt_rcv_wnd; + req->ts_recent = oi.cpt_ts_recent; + req->expires = jiffies_import(oi.cpt_expires); + + if (oi.cpt_family == AF_INET) { + memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); + memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } else { +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); + memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); + inet6_rsk(req)->iif = oi.cpt_iif; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); +#endif + } + } + pos += oi.cpt_next; + } + return 0; +} + +int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + struct ip_mreqn imr; + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not supported\n"); + return -EINVAL; + } + + memset(&imr, 0, sizeof(imr)); + imr.imr_ifindex = v->cpt_ifindex; + imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0]; + return ip_mc_join_group(sk, &imr); +} + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, + loff_t pos, cpt_context_t *ctx) +{ + + if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { + eprintk_ctx("IGMPv3 is still not 
supported\n"); + return -EINVAL; + } + + return ipv6_sock_mc_join(sk, v->cpt_ifindex, + (struct in6_addr*)v->cpt_mcaddr); +} +#endif diff -uprN linux-2.6.24/kernel/cpt/rst_sysvipc.c linux-2.6.24.ovz/kernel/cpt/rst_sysvipc.c --- linux-2.6.24/kernel/cpt/rst_sysvipc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_sysvipc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,554 @@ +/* + * + * kernel/cpt/rst_sysvipc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_kernel.h" + +struct _warg { + struct file *file; + struct cpt_sysvshm_image *v; +}; + +static int fixup_one_shm(struct shmid_kernel *shp, void *arg) +{ + struct _warg *warg = arg; + + if (shp->shm_file != warg->file) + return 0; + if (shp->shm_nattch) + return -EEXIST; + + shp->shm_perm.uid = warg->v->cpt_uid; + shp->shm_perm.gid = warg->v->cpt_gid; + shp->shm_perm.cuid = warg->v->cpt_cuid; + shp->shm_perm.cgid = warg->v->cpt_cgid; + shp->shm_perm.mode = warg->v->cpt_mode; + + shp->shm_atim = warg->v->cpt_atime; + shp->shm_dtim = warg->v->cpt_dtime; + shp->shm_ctim = warg->v->cpt_ctime; + shp->shm_cprid = warg->v->cpt_creator; + shp->shm_lprid = warg->v->cpt_last; + + /* TODO: fix shp->mlock_user? */ + return 1; +} + +static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) +{ + struct _warg warg; + + warg.file = file; + warg.v = v; + + return sysvipc_walk_shm(fixup_one_shm, &warg); +} + +static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, + struct cpt_context *ctx) +{ + struct cpt_page_block pgb; + ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); + + do_write = file->f_dentry->d_inode->i_fop->write; + if (do_write == NULL) { + eprintk_ctx("No TMPFS? 
Cannot restore content of SYSV SHM\n"); + return -EINVAL; + } + + while (pos < end) { + loff_t opos; + loff_t ipos; + int count; + int err; + + err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); + if (err) + return err; + dprintk_ctx("restoring SHM block: %08x-%08x\n", + (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); + ipos = pos + pgb.cpt_hdrlen; + opos = pgb.cpt_start; + count = pgb.cpt_end-pgb.cpt_start; + while (count > 0) { + mm_segment_t oldfs; + int copy = count; + + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + (void)cpt_get_buf(ctx); + oldfs = get_fs(); set_fs(KERNEL_DS); + err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); + set_fs(oldfs); + if (err) { + __cpt_release_buf(ctx); + return err; + } + oldfs = get_fs(); set_fs(KERNEL_DS); + ipos += copy; + err = do_write(file, ctx->tmpbuf, copy, &opos); + set_fs(oldfs); + __cpt_release_buf(ctx); + if (err != copy) { + eprintk_ctx("write() failure\n"); + if (err >= 0) + err = -EIO; + return err; + } + count -= copy; + } + pos += pgb.cpt_next; + } + return 0; +} + +struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx) +{ + struct file *file; + int err; + loff_t dpos, epos; + union { + struct cpt_file_image fi; + struct cpt_sysvshm_image shmi; + struct cpt_inode_image ii; + } u; + + err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); + if (err < 0) + goto err_out; + pos = u.fi.cpt_inode; + err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); + if (err < 0) + goto err_out; + dpos = pos + u.ii.cpt_hdrlen; + epos = pos + u.ii.cpt_next; + err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); + if (err < 0) + goto err_out; + dpos += u.shmi.cpt_next; + + file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, + u.shmi.cpt_segsz, u.shmi.cpt_mode); + if (!IS_ERR(file)) { + err = fixup_shm(file, &u.shmi); + if (err != -EEXIST && dpos < epos) + err = fixup_shm_data(file, dpos, epos, ctx); + } + + return file; + +err_out: + return ERR_PTR(err); +} + +static int attach_one_undo(int semid, struct sem_array *sma, void *arg) +{ + struct sem_undo *su = arg; + struct sem_undo_list *undo_list = current->sysvsem.undo_list; + + if (semid != su->semid) + return 0; + + su->proc_next = undo_list->proc_list; + undo_list->proc_list = su; + + su->id_next = sma->undo; + sma->undo = su; + + return 1; +} + +static int attach_undo(struct sem_undo *su) +{ + return sysvipc_walk_sem(attach_one_undo, su); +} + +static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *undo_list; + + if (current->sysvsem.undo_list) { + eprintk_ctx("Funny undo_list\n"); + return 0; + } + + undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC); + if (undo_list == NULL) + return -ENOMEM; + + atomic_set(&undo_list->refcnt, 1); + spin_lock_init(&undo_list->lock); + current->sysvsem.undo_list = undo_list; + + if (sui->cpt_next > sui->cpt_hdrlen) { + loff_t offset = pos + sui->cpt_hdrlen; + do { + struct sem_undo *new; + struct cpt_sysvsem_undo_image spi; + err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); + if (err) + goto out; + new = kmalloc(sizeof(struct sem_undo) + + sizeof(short)*spi.cpt_nsem, + GFP_KERNEL_UBC); + if (!new) { + err = -ENOMEM; + goto out; + } + + memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); + new->semadj = (short *) &new[1]; + new->semid = spi.cpt_id; + err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); + if (err) { + kfree(new); + goto out; + } + err = attach_undo(new); + if (err <= 0) 
{ + if (err == 0) + err = -ENOENT; + kfree(new); + goto out; + } + offset += spi.cpt_next; + } while (offset < pos + sui->cpt_next); + } + err = 0; + +out: + return err; +} + +__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + __u32 flag = 0; + +#if 0 + if (ti->cpt_sysvsem_undo == CPT_NULL || + lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo)) + flag |= CLONE_SYSVSEM; +#endif + return flag; +} + +int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + int err; + struct sem_undo_list *f = current->sysvsem.undo_list; + cpt_object_t *obj; + struct cpt_object_hdr sui; + + if (ti->cpt_sysvsem_undo == CPT_NULL) { + exit_sem(current); + return 0; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx); + if (obj) { + if (obj->o_obj != f) { + exit_sem(current); + f = obj->o_obj; + atomic_inc(&f->refcnt); + current->sysvsem.undo_list = f; + } + return 0; + } + + if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0) + goto out; + + if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0) + goto out; + + err = -ENOMEM; + obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx); + if (obj) { + err = 0; + cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx); + } + + return 0; + +out: + return err; +} + +struct _sarg { + int semid; + struct cpt_sysvsem_image *v; + __u32 *arr; +}; + +static int fixup_one_sem(int semid, struct sem_array *sma, void *arg) +{ + struct _sarg *warg = arg; + + if (semid != warg->semid) + return 0; + + sma->sem_perm.uid = warg->v->cpt_uid; + sma->sem_perm.gid = warg->v->cpt_gid; + sma->sem_perm.cuid = warg->v->cpt_cuid; + sma->sem_perm.cgid = warg->v->cpt_cgid; + sma->sem_perm.mode = warg->v->cpt_mode; + sma->sem_perm.seq = warg->v->cpt_seq; + + sma->sem_ctime = warg->v->cpt_ctime; + sma->sem_otime = warg->v->cpt_otime; + memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8); + return 1; +} + +static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr) +{ + struct _sarg warg; + + warg.semid = semid; + warg.v = v; + warg.arr = arr; + + return sysvipc_walk_sem(fixup_one_sem, &warg); +} + + +static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si, + struct cpt_context *ctx) +{ + int err; + __u32 *arr; + int nsems = (si->cpt_next - si->cpt_hdrlen)/8; + + arr = kmalloc(nsems*8, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); + if (err) + goto out; + err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); + if (err < 0) { + eprintk_ctx("SEM 3\n"); + goto out; + } + err = fixup_sem(si->cpt_id, si, arr); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + kfree(arr); + return err; +} + +static int rst_sysv_sem(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvsem_image sbuf; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); + if (err) + return err; + err = restore_sem(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + +struct _marg { + int msqid; + struct cpt_sysvmsg_image *v; + struct msg_queue *m; +}; + +static int 
fixup_one_msg(int msqid, struct msg_queue *msq, void *arg) +{ + struct _marg *warg = arg; + + if (msqid != warg->msqid) + return 0; + + msq->q_perm.uid = warg->v->cpt_uid; + msq->q_perm.gid = warg->v->cpt_gid; + msq->q_perm.cuid = warg->v->cpt_cuid; + msq->q_perm.cgid = warg->v->cpt_cgid; + msq->q_perm.mode = warg->v->cpt_mode; + msq->q_perm.seq = warg->v->cpt_seq; + + msq->q_stime = warg->v->cpt_stime; + msq->q_rtime = warg->v->cpt_rtime; + msq->q_ctime = warg->v->cpt_ctime; + msq->q_lspid = warg->v->cpt_last_sender; + msq->q_lrpid = warg->v->cpt_last_receiver; + msq->q_qbytes = warg->v->cpt_qbytes; + + warg->m = msq; + return 1; +} + +struct _larg +{ + cpt_context_t * ctx; + loff_t pos; +}; + +static int do_load_msg(void * dst, int len, int offset, void * data) +{ + struct _larg * arg = data; + return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset); +} + +static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos, + cpt_context_t * ctx) +{ + int err; + struct _marg warg; + loff_t endpos = pos + v->cpt_next; + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + + pos += v->cpt_hdrlen; + + warg.msqid = msqid; + warg.v = v; + + err = sysvipc_walk_msg(fixup_one_msg, &warg); + if (err <= 0) + return err; + + while (pos < endpos) { + struct cpt_sysvmsg_msg_image mi; + struct msg_msg *m; + struct _larg data = { + .ctx = ctx + }; + + err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx); + if (err) + return err; + data.pos = pos + mi.cpt_hdrlen; + m = sysv_msg_load(do_load_msg, mi.cpt_size, &data); + if (IS_ERR(m)) + return PTR_ERR(m); + m->m_type = mi.cpt_type; + m->m_ts = mi.cpt_size; + list_add_tail(&m->m_list, &warg.m->q_messages); + warg.m->q_cbytes += m->m_ts; + warg.m->q_qnum++; + atomic_add(m->m_ts, &ns->msg_bytes); + atomic_inc(&ns->msg_hdrs); + + pos += mi.cpt_next; + } + return 1; +} + +static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si, + struct cpt_context *ctx) +{ + int err; + + err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode); + if (err < 0) { + eprintk_ctx("MSG 3\n"); + goto out; + } + err = fixup_msg(si->cpt_id, si, pos, ctx); + if (err == 0) + err = -ESRCH; + if (err > 0) + err = 0; +out: + return err; +} + +static int rst_sysv_msg(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_sysvmsg_image sbuf; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int err; + err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx); + if (err) + return err; + err = restore_msg(sec, &sbuf, ctx); + if (err) + return err; + sec += sbuf.cpt_next; + } + return 0; +} + + +int rst_sysv_ipc(struct cpt_context *ctx) +{ + int err; + + err = rst_sysv_sem(ctx); + if (!err) + err = rst_sysv_msg(ctx); + + return err; +} diff -uprN linux-2.6.24/kernel/cpt/rst_tty.c linux-2.6.24.ovz/kernel/cpt/rst_tty.c --- linux-2.6.24/kernel/cpt/rst_tty.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_tty.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,379 @@ +/* + * + * kernel/cpt/rst_tty.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_files.h" +#include "cpt_kernel.h" + +static int pty_setup(struct tty_struct *stty, loff_t pos, + struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + unsigned long flags; + + stty->pgrp = NULL; + stty->session = NULL; + stty->packet = pi->cpt_packet; + stty->stopped = pi->cpt_stopped; + stty->hw_stopped = pi->cpt_hw_stopped; + stty->flow_stopped = pi->cpt_flow_stopped; +#define DONOT_CHANGE ((1<flags & DONOT_CHANGE; + stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE); + stty->ctrl_status = pi->cpt_ctrl_status; + stty->winsize.ws_row = pi->cpt_ws_row; + stty->winsize.ws_col = pi->cpt_ws_col; + stty->winsize.ws_ypixel = pi->cpt_ws_prow; + stty->winsize.ws_xpixel = pi->cpt_ws_pcol; + stty->canon_column = pi->cpt_canon_column; + stty->column = pi->cpt_column; + stty->raw = pi->cpt_raw; + stty->real_raw = pi->cpt_real_raw; + stty->erasing = pi->cpt_erasing; + stty->lnext = pi->cpt_lnext; + stty->icanon = pi->cpt_icanon; + stty->closing = pi->cpt_closing; + stty->minimum_to_wake = pi->cpt_minimum_to_wake; + + stty->termios->c_iflag = pi->cpt_c_iflag; + stty->termios->c_oflag = pi->cpt_c_oflag; + stty->termios->c_lflag = pi->cpt_c_lflag; + stty->termios->c_cflag = pi->cpt_c_cflag; + memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS); + memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags)); + + if (pi->cpt_next > pi->cpt_hdrlen) { + int err; + struct cpt_obj_bits b; + err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx); + if (err) + return err; + if (b.cpt_size == 0) + return 0; + err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen); + if (err) + return err; + + spin_lock_irq(&stty->read_lock); + stty->read_tail = 0; + stty->read_cnt = b.cpt_size; + stty->read_head = b.cpt_size; + stty->canon_head = stty->read_tail + pi->cpt_canon_head; + stty->canon_data = pi->cpt_canon_data; + spin_unlock_irq(&stty->read_lock); + } + + return 0; +} + +/* Find slave/master tty in image, when we already know master/slave. + * It might be optimized, of course. 
*/ +static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_tty_image *pibuf; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return CPT_NULL; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return CPT_NULL; + pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); + if (pibuf == NULL) { + eprintk_ctx("cannot allocate buffer\n"); + return CPT_NULL; + } + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) + return CPT_NULL; + if (pibuf->cpt_index == pi->cpt_index && + !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && + pos != sec) { + pty_setup(stty, sec, pibuf, ctx); + return sec; + } + sec += pibuf->cpt_next; + } + kfree(pibuf); + return CPT_NULL; +} + +static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, + struct cpt_context *ctx) +{ + int err; + struct iattr newattrs; + struct dentry *d = master->f_dentry; + + newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; + newattrs.ia_uid = ii->cpt_uid; + newattrs.ia_gid = ii->cpt_gid; + newattrs.ia_mode = ii->cpt_mode; + + mutex_lock(&d->d_inode->i_mutex); + err = notify_change(d, &newattrs); + mutex_unlock(&d->d_inode->i_mutex); + + return err; +} + +/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open + * /dev/ptmx until we get pty with desired index. + */ + +struct file *ptmx_open(int index, unsigned int flags) +{ + struct file *file; + struct file **stack = NULL; + int depth = 0; + + for (;;) { + struct tty_struct *tty; + + file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + if (IS_ERR(file)) + break; + tty = file->private_data; + if (tty->index == index) + break; + + if (depth == PAGE_SIZE/sizeof(struct file *)) { + fput(file); + file = ERR_PTR(-EBUSY); + break; + } + if (stack == NULL) { + stack = (struct file **)__get_free_page(GFP_KERNEL); + if (!stack) { + fput(file); + file = ERR_PTR(-ENOMEM); + break; + } + } + stack[depth] = file; + depth++; + } + while (depth > 0) { + depth--; + fput(stack[depth]); + } + if (stack) + free_page((unsigned long)stack); + return file; +} + + +struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, + unsigned flags, struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct file *master, *slave; + struct tty_struct *stty; + struct cpt_tty_image *pi; + static char *a = "pqrstuvwxyzabcde"; + static char *b = "0123456789abcdef"; + char pairname[16]; + unsigned master_flags, slave_flags; + + if (fi->cpt_priv == CPT_NULL) + return ERR_PTR(-EINVAL); + + obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx); + if (obj && obj->o_parent) { + dprintk_ctx("obtained pty as pair to existing\n"); + master = obj->o_parent; + stty = master->private_data; + + if (stty->driver->subtype == PTY_TYPE_MASTER && + (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) { + wprintk_ctx("cloning ptmx\n"); + get_file(master); + return master; + } + + master = dentry_open(dget(master->f_dentry), + mntget(master->f_vfsmnt), flags); + if (!IS_ERR(master)) { + stty = master->private_data; + if (stty->driver->subtype != PTY_TYPE_MASTER) + fixup_tty_attrs(ii, master, ctx); + } + return master; + } + + pi = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx); + if (err) { + cpt_release_buf(ctx); + return ERR_PTR(err); + } + + master_flags = 
slave_flags = 0; + if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) + master_flags = flags; + else + slave_flags = flags; + + /* + * Open pair master/slave. + */ + if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) { + master = ptmx_open(pi->cpt_index, master_flags); + } else { + sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]); + master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + } + if (IS_ERR(master)) { + eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master)); + cpt_release_buf(ctx); + return master; + } + stty = master->private_data; + clear_bit(TTY_PTY_LOCK, &stty->flags); + if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) + sprintf(pairname, "/dev/pts/%d", stty->index); + else + sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]); + slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); + if (IS_ERR(slave)) { + eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave)); + fput(master); + cpt_release_buf(ctx); + return slave; + } + + if (pi->cpt_drv_subtype != PTY_TYPE_MASTER) + fixup_tty_attrs(ii, slave, ctx); + + cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx); + cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx); + cpt_object_add(CPT_OBJ_FILE, master, ctx); + cpt_object_add(CPT_OBJ_FILE, slave, ctx); + + if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) { + loff_t pos; + obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); + obj->o_parent = master; + cpt_obj_setpos(obj, fi->cpt_priv, ctx); + pty_setup(stty, fi->cpt_priv, pi, ctx); + + obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); + obj->o_parent = slave; + pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx); + cpt_obj_setpos(obj, pos, ctx); + + obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx); + cpt_obj_setpos(obj, CPT_NULL, ctx); + get_file(master); + cpt_release_buf(ctx); + return master; + } else { + loff_t pos; + obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); + obj->o_parent = slave; + cpt_obj_setpos(obj, fi->cpt_priv, ctx); + pty_setup(stty->link, fi->cpt_priv, pi, ctx); + + obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); + obj->o_parent = master; + pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx); + cpt_obj_setpos(obj, pos, ctx); + + obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx); + cpt_obj_setpos(obj, CPT_NULL, ctx); + get_file(slave); + cpt_release_buf(ctx); + return slave; + } +} + +int rst_tty_jobcontrol(struct cpt_context *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_TTY]; + loff_t endsec; + struct cpt_section_hdr h; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + cpt_object_t *obj; + struct cpt_tty_image *pibuf = cpt_get_buf(ctx); + + if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) { + cpt_release_buf(ctx); + return -EINVAL; + } + + obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx); + if (obj) { + struct tty_struct *stty = obj->o_obj; + if ((int)pibuf->cpt_pgrp > 0) { + stty->pgrp = alloc_vpid_safe(pibuf->cpt_pgrp); + if (!stty->pgrp) + dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp); + } else if (pibuf->cpt_pgrp) { + stty->pgrp = alloc_pid(current->nsproxy->pid_ns, + 0); + if (!stty->pgrp) { + eprintk_ctx("cannot allocate stray tty->pgrp"); + cpt_release_buf(ctx); + return -EINVAL; + } + } + if ((int)pibuf->cpt_session > 0) { + 
struct pid *sess; + sess = alloc_vpid_safe(pibuf->cpt_session); + if (!sess) { + dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session); + } else if (!stty->session) { + stty->session = sess; + } + } + } + sec += pibuf->cpt_next; + cpt_release_buf(ctx); + } + return 0; +} diff -uprN linux-2.6.24/kernel/cpt/rst_ubc.c linux-2.6.24.ovz/kernel/cpt/rst_ubc.c --- linux-2.6.24/kernel/cpt/rst_ubc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_ubc.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,131 @@ +/* + * + * kernel/cpt/rst_ubc.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" + +struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) +{ + cpt_object_t *obj; + + obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); + if (obj == NULL) { + eprintk("RST: unknown ub @%Ld\n", (long long)pos); + return get_beancounter(get_exec_ub()); + } + return get_beancounter(obj->o_obj); +} + +void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id) +{ + to[bc_parm_id].barrier = from[bc_parm_id].barrier; + to[bc_parm_id].limit = from[bc_parm_id].limit; +} + +void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id) +{ + ubprm[bc_parm_id].barrier = UB_MAXVALUE; + ubprm[bc_parm_id].limit = UB_MAXVALUE; +} + +static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, + int held) +{ + prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier); + prm->limit = (dmp->limit == CPT_NULL ? UB_MAXVALUE : dmp->limit); + if (held) + prm->held = dmp->held; + prm->maxheld = dmp->maxheld; + prm->minheld = dmp->minheld; + prm->failcnt = dmp->failcnt; +} + +static int restore_one_bc(struct cpt_beancounter_image *v, + cpt_object_t *obj, struct cpt_context *ctx) +{ + struct user_beancounter *bc; + cpt_object_t *pobj; + int i; + + if (v->cpt_parent != CPT_NULL) { + pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); + if (pobj == NULL) + return -ESRCH; + bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); + } else { + bc = get_exec_ub(); + while (bc->parent) + bc = bc->parent; + get_beancounter(bc); + } + if (bc == NULL) + return -ENOMEM; + obj->o_obj = bc; + + if (ctx->image_version < CPT_VERSION_18 && + CPT_VERSION_MINOR(ctx->image_version) < 1) + goto out; + + for (i = 0; i < UB_RESOURCES; i++) { + restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); + restore_one_bc_parm(v->cpt_parms + i * 2 + 1, + bc->ub_store + i, 1); + } + +out: + if (!bc->parent) + for (i = 0; i < UB_RESOURCES; i++) + copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i); + + return 0; +} + +int rst_undump_ubc(struct cpt_context *ctx) +{ + loff_t start, end; + struct cpt_beancounter_image *v; + cpt_object_t *obj; + int err; + + err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + v = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } + + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_UBC, obj, ctx); + + restore_one_bc(v, obj, ctx); + + cpt_release_buf(ctx); + start += v->cpt_next; + } + return 0; +} + +void rst_finish_ubc(struct cpt_context *ctx) +{ + cpt_object_t *obj; + + for_each_object(obj, CPT_OBJ_UBC) + put_beancounter(obj->o_obj); +} diff -uprN linux-2.6.24/kernel/cpt/rst_undump.c 
linux-2.6.24.ovz/kernel/cpt/rst_undump.c --- linux-2.6.24/kernel/cpt/rst_undump.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpt/rst_undump.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,1005 @@ +/* + * + * kernel/cpt/rst_undump.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include +#include + +#include "cpt_obj.h" +#include "cpt_context.h" +#include "cpt_files.h" +#include "cpt_mm.h" +#include "cpt_process.h" +#include "cpt_socket.h" +#include "cpt_net.h" +#include "cpt_ubc.h" +#include "cpt_kernel.h" + +static int rst_utsname(cpt_context_t *ctx); + + +struct thr_context { + struct completion init_complete; + struct completion task_done; + int error; + struct cpt_context *ctx; + cpt_object_t *tobj; +}; + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); + +static int vps_rst_veinfo(struct cpt_context *ctx) +{ + int err; + struct cpt_veinfo_image *i; + struct ve_struct *ve; + struct timespec delta; + loff_t start, end; + struct ipc_namespace *ns; + + err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); + if (err) + goto out; + + i = cpt_get_buf(ctx); + memset(i, 0, sizeof(*i)); + err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); + if (err) + goto out_rel; + + ve = get_exec_env(); + ns = ve->ve_ns->ipc_ns; + + /* Damn. Fatal mistake, these two values are size_t! */ + ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU; + ns->shm_ctlmax = i->shm_ctl_max ? : 0xFFFFFFFFU; + ns->shm_ctlmni = i->shm_ctl_mni; + + ns->msg_ctlmax = i->msg_ctl_max; + ns->msg_ctlmni = i->msg_ctl_mni; + ns->msg_ctlmnb = i->msg_ctl_mnb; + + BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); + ns->sem_ctls[0] = i->sem_ctl_arr[0]; + ns->sem_ctls[1] = i->sem_ctl_arr[1]; + ns->sem_ctls[2] = i->sem_ctl_arr[2]; + ns->sem_ctls[3] = i->sem_ctl_arr[3]; + + cpt_timespec_import(&delta, i->start_timespec_delta); + _set_normalized_timespec(&ve->start_timespec, + ve->start_timespec.tv_sec - delta.tv_sec, + ve->start_timespec.tv_nsec - delta.tv_nsec); + ve->start_jiffies -= i->start_jiffies_delta; + // // FIXME: what??? 
+ // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy; + + ctx->last_vpid = i->last_pid; + + err = 0; +out_rel: + cpt_release_buf(ctx); +out: + return err; +} + +static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err; + struct env_create_param3 param; + + do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); + do_gettimespec(&ctx->delta_time); + + _set_normalized_timespec(&ctx->delta_time, + ctx->delta_time.tv_sec - ctx->start_time.tv_sec, + ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec); + ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec; + if (ctx->delta_nsec < 0) { + wprintk_ctx("Wall time is behind source by %Ld ns, " + "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec); + } + + _set_normalized_timespec(&ctx->cpt_monotonic_time, + ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec, + ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec); + + memset(¶m, 0, sizeof(param)); + param.iptables_mask = ctx->iptables_mask; + param.feature_mask = ctx->features; + + /* feature_mask is set as required - pretend we know everything */ + param.known_features = (ctx->image_version < CPT_VERSION_18) ? + VE_FEATURES_OLD : ~(__u64)0; + + err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, + ¶m, sizeof(param)); + if (err < 0) + eprintk_ctx("real_env_create: %d\n", err); + + get_exec_env()->jiffies_fixup = + (ctx->delta_time.tv_sec < 0 ? + 0 : timespec_to_jiffies(&ctx->delta_time)) - + (unsigned long)(get_jiffies_64() - ctx->virt_jiffies64); + dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup, + (long long)ctx->delta_nsec); + return err < 0 ? err : 0; +} + +static int hook(void *arg) +{ + struct thr_context *thr_ctx = arg; + struct cpt_context *ctx; + cpt_object_t *tobj; + struct cpt_task_image *ti; + int err = 0; + int exiting = 0; + + current->state = TASK_UNINTERRUPTIBLE; + complete(&thr_ctx->init_complete); + schedule(); + + ctx = thr_ctx->ctx; + tobj = thr_ctx->tobj; + ti = tobj->o_image; + + current->fs->umask = 0; + + if (ti->cpt_pid == 1) { +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + err = vps_rst_reparent_root(tobj, ctx); + + if (err) { + rst_report_error(err, ctx); + goto out; + } + + memcpy(&cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t)); + + if (ctx->statusfile) { + fput(ctx->statusfile); + ctx->statusfile = NULL; + } + + if (ctx->lockfile) { + char b; + mm_segment_t oldfs; + err = -EINVAL; + + oldfs = get_fs(); set_fs(KERNEL_DS); + if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) + err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); + set_fs(oldfs); + fput(ctx->lockfile); + ctx->lockfile = NULL; + } + + if (err) { + eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err); + goto out; + } + err = vps_rst_veinfo(ctx); + if (err) { + eprintk_ctx("rst_veinfo: %d\n", err); + goto out; + } + + err = rst_utsname(ctx); + if (err) { + eprintk_ctx("rst_utsname: %d\n", err); + goto out; + } + + err = rst_root_namespace(ctx); + if (err) { + eprintk_ctx("rst_namespace: %d\n", err); + goto out; + } + + if ((err = rst_restore_net(ctx)) != 0) { + eprintk_ctx("rst_restore_net: %d\n", err); + goto out; + } + + err = rst_sockets(ctx); + if (err) { + eprintk_ctx("rst_sockets: %d\n", err); + goto out; + } + err = rst_sysv_ipc(ctx); + if (err) { + eprintk_ctx("rst_sysv_ipc: %d\n", err); + goto out; + } +#ifdef CONFIG_BEANCOUNTERS + bc = get_exec_ub(); + set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE); 
+ set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC); + set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE); + set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE); +#endif + } + + do { + if (current->user->uid != ti->cpt_user) { + struct user_struct *u; + + u = alloc_uid(get_exec_env()->ve_ns->user_ns, ti->cpt_user); + if (!u) { + eprintk_ctx("alloc_user\n"); + } else { + switch_uid(u); + } + } + } while (0); + + if ((err = rst_mm_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_mm: %d\n", err); + goto out; + } + + if ((err = rst_files_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_files: %d\n", err); + goto out; + } + + if ((err = rst_fs_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_fs: %d\n", err); + goto out; + } + + if ((err = rst_semundo_complete(ti, ctx)) != 0) { + eprintk_ctx("rst_semundo: %d\n", err); + goto out; + } + + if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) { + eprintk_ctx("rst_signal: %d\n", err); + goto out; + } + + if (ti->cpt_personality != 0) + __set_personality(ti->cpt_personality); + +#ifdef CONFIG_X86_64 + /* 32bit app from 32bit OS, won't have PER_LINUX32 set... :/ */ + if (!ti->cpt_64bit) + __set_personality(PER_LINUX32); +#endif + + current->set_child_tid = NULL; + current->clear_child_tid = NULL; + current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); + current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); + current->exit_code = ti->cpt_exit_code; + current->pdeath_signal = ti->cpt_pdeath_signal; + + if (ti->cpt_restart.fn != CPT_RBL_0) { + if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP +#ifdef CONFIG_COMPAT + || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP +#endif + ) { + struct restart_block *rb; + ktime_t e; + + e.tv64 = 0; + + if (ctx->image_version >= CPT_VERSION_20) + e = ktime_add_ns(e, ti->cpt_restart.arg2); + else if (ctx->image_version >= CPT_VERSION_9) + e = ktime_add_ns(e, ti->cpt_restart.arg0); + else + e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC); + if (e.tv64 < 0) + e.tv64 = TICK_NSEC; + e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + + rb = &task_thread_info(current)->restart_block; + if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP) + rb->fn = hrtimer_nanosleep_restart; +#ifdef CONFIG_COMPAT + else + rb->fn = compat_nanosleep_restart; +#endif + if (ctx->image_version >= CPT_VERSION_20) { + rb->arg0 = ti->cpt_restart.arg0; + rb->arg1 = ti->cpt_restart.arg1; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } else if (ctx->image_version >= CPT_VERSION_9) { + rb->arg0 = ti->cpt_restart.arg2; + rb->arg1 = ti->cpt_restart.arg3; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } else { + rb->arg0 = ti->cpt_restart.arg1; + rb->arg1 = CLOCK_MONOTONIC; + rb->arg2 = e.tv64 & 0xFFFFFFFF; + rb->arg3 = e.tv64 >> 32; + } + } else if (ti->cpt_restart.fn == CPT_RBL_POLL) { + struct restart_block *rb; + ktime_t e; + struct timespec ts; + unsigned long timeout_jiffies; + + e.tv64 = 0; + e = ktime_add_ns(e, ti->cpt_restart.arg2); + e = ktime_sub(e, timespec_to_ktime(ctx->delta_time)); + ts = ns_to_timespec(ktime_to_ns(e)); + timeout_jiffies = timespec_to_jiffies(&ts); + + rb = &task_thread_info(current)->restart_block; + rb->fn = do_restart_poll; + rb->arg0 = ti->cpt_restart.arg0; + rb->arg1 = ti->cpt_restart.arg1; + rb->arg2 = timeout_jiffies & 0xFFFFFFFF; + rb->arg3 = (u64)timeout_jiffies >> 32; + } else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) { + struct restart_block *rb; + ktime_t e; + + e.tv64 = 0; + e = ktime_add_ns(e, ti->cpt_restart.arg2); + e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); + 
+ rb = &task_thread_info(current)->restart_block; + rb->fn = futex_wait_restart; + rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0; + rb->futex.val = ti->cpt_restart.arg1; + rb->futex.time = e.tv64; + rb->futex.flags = ti->cpt_restart.arg3; + } else + eprintk_ctx("unknown restart block\n"); + } + + if (thread_group_leader(current)) { + current->signal->it_real_incr.tv64 = 0; + if (ctx->image_version >= CPT_VERSION_9) { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); + } else { + current->signal->it_real_incr = + ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); + } + current->signal->it_prof_incr = ti->cpt_it_prof_incr; + current->signal->it_virt_incr = ti->cpt_it_virt_incr; + current->signal->it_prof_expires = ti->cpt_it_prof_value; + current->signal->it_virt_expires = ti->cpt_it_virt_value; + } + + err = rst_clone_children(tobj, ctx); + if (err) { + eprintk_ctx("rst_clone_children\n"); + goto out; + } + + if (exiting) + current->signal->flags |= SIGNAL_GROUP_EXIT; + + if (ti->cpt_pid == 1) { + if ((err = rst_process_linkage(ctx)) != 0) { + eprintk_ctx("rst_process_linkage: %d\n", err); + goto out; + } + if ((err = rst_do_filejobs(ctx)) != 0) { + eprintk_ctx("rst_do_filejobs: %d\n", err); + goto out; + } + if ((err = rst_eventpoll(ctx)) != 0) { + eprintk_ctx("rst_eventpoll: %d\n", err); + goto out; + } +#ifdef CONFIG_INOTIFY_USER + if ((err = rst_inotify(ctx)) != 0) { + eprintk_ctx("rst_inotify: %d\n", err); + goto out; + } +#endif + if ((err = rst_sockets_complete(ctx)) != 0) { + eprintk_ctx("rst_sockets_complete: %d\n", err); + goto out; + } + if ((err = rst_stray_files(ctx)) != 0) { + eprintk_ctx("rst_stray_files: %d\n", err); + goto out; + } + if ((err = rst_posix_locks(ctx)) != 0) { + eprintk_ctx("rst_posix_locks: %d\n", err); + goto out; + } + if ((err = rst_tty_jobcontrol(ctx)) != 0) { + eprintk_ctx("rst_tty_jobcontrol: %d\n", err); + goto out; + } + if ((err = rst_restore_fs(ctx)) != 0) { + eprintk_ctx("rst_restore_fs: %d\n", err); + goto out; + } + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) { + err = -ECHRNG; + eprintk_ctx("scp_restore failed\n"); + goto out; + } + if (ctx->last_vpid) + get_exec_env()->ve_ns->pid_ns->last_pid = + ctx->last_vpid; + } + +out: + thr_ctx->error = err; + complete(&thr_ctx->task_done); + + if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { + current->flags |= PF_EXIT_RESTART; + do_exit(ti->cpt_exit_code); + } else { + __set_current_state(TASK_UNINTERRUPTIBLE); + } + + schedule(); + + dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm); + + module_put(THIS_MODULE); + complete_and_exit(NULL, 0); + return 0; +} + +#if 0 +static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) +{ + struct task_beancounter *tbc; + + tbc = task_bc(current); + + put_beancounter(tbc->fork_sub); + tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); + if (ti->cpt_mm_ub != CPT_NULL) { + put_beancounter(tbc->exec_ub); + tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); + } +} +#endif + +static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, + struct thr_context *thr_ctx) +{ + struct task_struct *tsk; + int pid; + + thr_ctx->ctx = ctx; + thr_ctx->error = 0; + init_completion(&thr_ctx->init_complete); + init_completion(&thr_ctx->task_done); +#if 0 + set_task_ubs(obj->o_image, ctx); +#endif + + pid = local_kernel_thread(hook, thr_ctx, 0, 0); + if (pid < 0) + 
return pid; + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(obj, tsk, ctx); + thr_ctx->tobj = obj; + return 0; +} + +static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) +{ + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); + rst_mm_basic(obj, ti, ctx); + return 0; +} + +static int make_baby(cpt_object_t *cobj, + struct cpt_task_image *pi, + struct cpt_context *ctx) +{ + unsigned long flags; + struct cpt_task_image *ci = cobj->o_image; + struct thr_context thr_ctx; + struct task_struct *tsk; + pid_t pid; + struct fs_struct *tfs = NULL; + + flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) + | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); + if (ci->cpt_rppid != pi->cpt_pid) { + flags |= CLONE_THREAD|CLONE_PARENT; + if (ci->cpt_signal != pi->cpt_signal || + !(flags&CLONE_SIGHAND) || + (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { + eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", + (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, + (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags + ); + return -EINVAL; + } + } + + thr_ctx.ctx = ctx; + thr_ctx.error = 0; + init_completion(&thr_ctx.init_complete); + init_completion(&thr_ctx.task_done); + thr_ctx.tobj = cobj; + +#if 0 + set_task_ubs(ci, ctx); +#endif + + if (current->fs == NULL) { + tfs = get_exec_env()->ve_ns->pid_ns->child_reaper->fs; + if (tfs == NULL) + return -EINVAL; + atomic_inc(&tfs->count); + current->fs = tfs; + } + pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); + if (tfs) { + current->fs = NULL; + atomic_dec(&tfs->count); + } + if (pid < 0) + return pid; + + read_lock(&tasklist_lock); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + read_unlock(&tasklist_lock); + if (tsk == NULL) + return -ESRCH; + cpt_obj_setobj(cobj, tsk, ctx); + thr_ctx.tobj = cobj; + wait_for_completion(&thr_ctx.init_complete); + wait_task_inactive(cobj->o_obj); + rst_basic_init_task(cobj, ctx); + + /* clone() increases group_stop_count if it was not zero and + * CLONE_THREAD was asked. Undo. 
+ */ + if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { + if (tsk->signal != current->signal) BUG(); + current->signal->group_stop_count--; + } + + wake_up_process(tsk); + wait_for_completion(&thr_ctx.task_done); + wait_task_inactive(tsk); + + return thr_ctx.error; +} + +static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) +{ + int err = 0; + struct cpt_task_image *ti = obj->o_image; + cpt_object_t *cobj; + + for_each_object(cobj, CPT_OBJ_TASK) { + struct cpt_task_image *ci = cobj->o_image; + if (cobj == obj) + continue; + if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || + (ci->cpt_leader == ti->cpt_pid && + ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { + err = make_baby(cobj, ti, ctx); + if (err) { + eprintk_ctx("make_baby: %d\n", err); + return err; + } + } + } + return 0; +} + +static int read_task_images(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + + err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); + if (err) + return err; + + while (start < end) { + cpt_object_t *obj; + struct cpt_task_image *ti = cpt_get_buf(ctx); + + err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); + if (err) { + cpt_release_buf(ctx); + return err; + } +#if 0 + if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { + eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); + cpt_release_buf(ctx); + return -EINVAL; + } +#endif + obj = alloc_cpt_object(GFP_KERNEL, ctx); + cpt_obj_setpos(obj, start, ctx); + intern_cpt_object(CPT_OBJ_TASK, obj, ctx); + obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); + if (obj->o_image == NULL) { + cpt_release_buf(ctx); + return -ENOMEM; + } + memcpy(obj->o_image, ti, sizeof(*ti)); + err = ctx->pread(obj->o_image + sizeof(*ti), + ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); + cpt_release_buf(ctx); + if (err) + return err; + start += ti->cpt_next; + } + return 0; +} + + +static int vps_rst_restore_tree(struct cpt_context *ctx) +{ + int err; + cpt_object_t *obj; + struct thr_context thr_ctx_root; + + err = read_task_images(ctx); + if (err) + return err; + + err = rst_undump_ubc(ctx); + if (err) + return err; + + if (virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL) + return -ECHRNG; +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); + if (err) + return err; +#endif + for_each_object(obj, CPT_OBJ_TASK) { + err = create_root_task(obj, ctx, &thr_ctx_root); + if (err) + return err; + + wait_for_completion(&thr_ctx_root.init_complete); + wait_task_inactive(obj->o_obj); + rst_basic_init_task(obj, ctx); + + wake_up_process(obj->o_obj); + wait_for_completion(&thr_ctx_root.task_done); + wait_task_inactive(obj->o_obj); + err = thr_ctx_root.error; + if (err) + return err; + break; + } + + return err; +} + +#ifndef CONFIG_IA64 +int rst_read_vdso(struct cpt_context *ctx) +{ + int err; + loff_t start, end; + struct cpt_page_block *pgb; + + ctx->vdso = NULL; + err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end); + if (err) + return err; + if (start == CPT_NULL) + return 0; + if (end < start + sizeof(*pgb) + PAGE_SIZE) + return -EINVAL; + + pgb = cpt_get_buf(ctx); + err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx); + if (err) { + goto err_buf; + } + ctx->vdso = (char*)__get_free_page(GFP_KERNEL); + if (ctx->vdso == NULL) { + err = -ENOMEM; + goto err_buf; + } + err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb)); + if (err) + goto err_page; + if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) { + free_page((unsigned 
long)ctx->vdso); + ctx->vdso = NULL; + } + + cpt_release_buf(ctx); + return 0; +err_page: + free_page((unsigned long)ctx->vdso); + ctx->vdso = NULL; +err_buf: + cpt_release_buf(ctx); + return err; +} +#endif + +int vps_rst_undump(struct cpt_context *ctx) +{ + int err; + unsigned long umask; + + err = rst_open_dumpfile(ctx); + if (err) + return err; + + if (ctx->tasks64) { +#if defined(CONFIG_IA64) + if (ctx->image_arch != CPT_OS_ARCH_IA64) +#elif defined(CONFIG_X86_64) + if (ctx->image_arch != CPT_OS_ARCH_EMT64) +#else + if (1) +#endif + { + eprintk_ctx("Cannot restore 64 bit container on this architecture\n"); + return -EINVAL; + } + } + + umask = current->fs->umask; + current->fs->umask = 0; + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + err = rst_setup_pagein(ctx); +#endif +#ifndef CONFIG_IA64 + if (err == 0) + err = rst_read_vdso(ctx); +#endif + if (err == 0) + err = vps_rst_restore_tree(ctx); + + if (err == 0) + err = rst_restore_process(ctx); + + if (err) + virtinfo_notifier_call(VITYPE_SCP, + VIRTINFO_SCP_RSTFAIL, ctx); + + current->fs->umask = umask; + + return err; +} + +static int rst_unlock_ve(struct cpt_context *ctx) +{ + struct ve_struct *env; + + env = get_ve_by_id(ctx->ve_id); + if (!env) + return -ESRCH; + down_write(&env->op_sem); + env->is_locked = 0; + up_write(&env->op_sem); + put_ve(env); + return 0; +} + +int recalc_sigpending_tsk(struct task_struct *t); + +int rst_resume(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *bc; +#endif + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + +#ifdef CONFIG_BEANCOUNTERS + bc = get_beancounter_byuid(ctx->ve_id, 0); + BUG_ON(!bc); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_KMEMSIZE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMFILE); + copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE); + put_beancounter(bc); +#endif + + rst_resume_network(ctx); + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + struct cpt_task_image *ti = obj->o_image; + + if (!tsk) + continue; + + if (ti->cpt_state == TASK_UNINTERRUPTIBLE) { + dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm); + + /* Weird... If a signal is sent to stopped task, + * nobody makes recalc_sigpending(). We have to do + * this by hands after wake_up_process(). + * if we did this before a signal could arrive before + * wake_up_process() and stall. 
+ */ + spin_lock_irq(&tsk->sighand->siglock); + if (!signal_pending(tsk)) + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } else { + if (ti->cpt_state == TASK_STOPPED || + ti->cpt_state == TASK_TRACED) { + set_task_state(tsk, ti->cpt_state); + } + } + put_task_struct(tsk); + } + + rst_unlock_ve(ctx); + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 0); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +int rst_kill(struct cpt_context *ctx) +{ + cpt_object_t *obj; + int err = 0; + + for_each_object(obj, CPT_OBJ_FILE) { + struct file *file = obj->o_obj; + + fput(file); + } + + for_each_object(obj, CPT_OBJ_TASK) { + struct task_struct *tsk = obj->o_obj; + + if (tsk == NULL) + continue; + + if (tsk->exit_state == 0) { + send_sig(SIGKILL, tsk, 1); + + spin_lock_irq(&tsk->sighand->siglock); + sigfillset(&tsk->blocked); + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + set_tsk_thread_flag(tsk, TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_FREEZE); + if (tsk->flags & PF_FROZEN) + tsk->flags &= ~PF_FROZEN; + spin_unlock_irq(&tsk->sighand->siglock); + + wake_up_process(tsk); + } + + put_task_struct(tsk); + } + +#ifdef CONFIG_VZ_CHECKPOINT_LAZY + rst_complete_pagein(ctx, 1); +#endif + + rst_finish_ubc(ctx); + cpt_object_destroy(ctx); + + return err; +} + +static int rst_utsname(cpt_context_t *ctx) +{ + int err; + loff_t sec = ctx->sections[CPT_SECT_UTSNAME]; + loff_t endsec; + struct cpt_section_hdr h; + struct cpt_object_hdr o; + struct ve_struct *ve; + struct uts_namespace *ns; + int i; + + if (sec == CPT_NULL) + return 0; + + err = ctx->pread(&h, sizeof(h), ctx, sec); + if (err) + return err; + if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h)) + return -EINVAL; + + ve = get_exec_env(); + ns = ve->ve_ns->uts_ns; + + i = 0; + endsec = sec + h.cpt_next; + sec += h.cpt_hdrlen; + while (sec < endsec) { + int len; + char *ptr; + err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx); + if (err) + return err; + len = o.cpt_next - o.cpt_hdrlen; + if (len > __NEW_UTS_LEN + 1) + return -ENAMETOOLONG; + switch (i) { + case 0: + ptr = ns->name.nodename; break; + case 1: + ptr = ns->name.domainname; break; + default: + return -EINVAL; + } + err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen); + if (err) + return err; + i++; + sec += o.cpt_next; + } + + return 0; +} diff -uprN linux-2.6.24/kernel/cpu.c linux-2.6.24.ovz/kernel/cpu.c --- linux-2.6.24/kernel/cpu.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/cpu.c 2008-03-25 18:53:59.000000000 -0500 @@ -92,7 +92,7 @@ static inline void check_for_tasks(int c struct task_struct *p; write_lock_irq(&tasklist_lock); - for_each_process(p) { + for_each_process_all(p) { if (task_cpu(p) == cpu && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) diff -uprN linux-2.6.24/kernel/exit.c linux-2.6.24.ovz/kernel/exit.c --- linux-2.6.24/kernel/exit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/exit.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -42,8 +43,14 @@ #include #include /* for audit_free() */ #include +#include #include #include +#include +#include + +#include +#include #include #include @@ -52,7 +59,7 @@ extern void sem_exit (void); -static void exit_mm(struct task_struct * tsk); +void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) { @@ -63,6 +70,9 @@ static void 
__unhash_process(struct task detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); +#ifdef CONFIG_VE + list_del_rcu(&p->ve_task_info.vetask_list); +#endif __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); @@ -153,6 +163,8 @@ repeat: ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); + nr_zombie--; + atomic_inc(&nr_dead); /* * If we are the last non-leader member of the thread @@ -174,9 +186,12 @@ repeat: */ zap_leader = (leader->exit_signal == -1); } + put_task_fairsched_node(p); write_unlock_irq(&tasklist_lock); release_thread(p); + ub_task_uncharge(p); + pput_ve(p->ve_task_info.owner_env); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; @@ -295,28 +310,30 @@ static void reparent_to_kthreadd(void) switch_uid(INIT_USER); } -void __set_special_pids(pid_t session, pid_t pgrp) +void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; + pid_t nr = pid_nr(pid); - if (task_session_nr(curr) != session) { + if (task_session(curr) != pid) { detach_pid(curr, PIDTYPE_SID); - set_task_session(curr, session); - attach_pid(curr, PIDTYPE_SID, find_pid(session)); + attach_pid(curr, PIDTYPE_SID, pid); + set_task_session(curr, nr); } - if (task_pgrp_nr(curr) != pgrp) { + if (task_pgrp(curr) != pid) { detach_pid(curr, PIDTYPE_PGID); - set_task_pgrp(curr, pgrp); - attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); + attach_pid(curr, PIDTYPE_PGID, pid); + set_task_pgrp(curr, nr); } } -static void set_special_pids(pid_t session, pid_t pgrp) +void set_special_pids(struct pid *pid) { write_lock_irq(&tasklist_lock); - __set_special_pids(session, pgrp); + __set_special_pids(pid); write_unlock_irq(&tasklist_lock); } +EXPORT_SYMBOL(set_special_pids); /* * Let kernel threads use this to say that they @@ -385,7 +402,7 @@ void daemonize(const char *name, ...) */ current->flags |= PF_NOFREEZE; - set_special_pids(1, 1); + set_special_pids(&init_struct_pid); proc_clear_tty(current); /* Block and flush all signals */ @@ -554,13 +571,17 @@ EXPORT_SYMBOL_GPL(exit_fs); * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; mm_release(tsk, mm); if (!mm) return; + + if (test_tsk_thread_flag(tsk, TIF_MEMDIE)) + mm->oom_killed = 1; + /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_waiters @@ -591,6 +612,7 @@ static void exit_mm(struct task_struct * task_unlock(tsk); mmput(mm); } +EXPORT_SYMBOL_GPL(exit_mm); static void reparent_thread(struct task_struct *p, struct task_struct *father, int traced) @@ -817,6 +839,9 @@ static void exit_notify(struct task_stru && !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; + if (tsk->exit_signal != -1 && t == init_pid_ns.child_reaper) + /* We dont want people slaying init. */ + tsk->exit_signal = SIGCHLD; /* If something other than our normal parent is ptracing us, then * send it a SIGCHLD instead of honoring exit_signal. exit_signal @@ -833,6 +858,7 @@ static void exit_notify(struct task_stru if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) state = EXIT_DEAD; tsk->exit_state = state; + nr_zombie++; if (thread_group_leader(tsk) && tsk->signal->notify_count < 0 && @@ -881,7 +907,6 @@ static inline void exit_child_reaper(str if (tsk->nsproxy->pid_ns == &init_pid_ns) panic("Attempted to kill init!"); - /* * @tsk is the last thread in the 'cgroup-init' and is exiting. 
* Terminate all remaining processes in the namespace and reap them @@ -905,6 +930,7 @@ static inline void exit_child_reaper(str * perform the role of the child_reaper. */ zap_pid_ns_processes(tsk->nsproxy->pid_ns); + (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); } fastcall NORET_TYPE void do_exit(long code) @@ -975,12 +1001,14 @@ fastcall NORET_TYPE void do_exit(long co } acct_collect(code, group_dead); #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); + if (!(tsk->flags & PF_EXIT_RESTART)) { + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); #endif + } #endif if (group_dead) tty_audit_exit(); @@ -1009,8 +1037,16 @@ fastcall NORET_TYPE void do_exit(long co if (tsk->binfmt) module_put(tsk->binfmt->module); - proc_exit_connector(tsk); - exit_notify(tsk); + if (!(tsk->flags & PF_EXIT_RESTART)) { + proc_exit_connector(tsk); + exit_notify(tsk); + } else { + write_lock_irq(&tasklist_lock); + tsk->exit_state = EXIT_ZOMBIE; + nr_zombie++; + write_unlock_irq(&tasklist_lock); + exit_task_namespaces(tsk); + } #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; @@ -1740,6 +1776,7 @@ asmlinkage long sys_wait4(pid_t pid, int prevent_tail_call(ret); return ret; } +EXPORT_SYMBOL_GPL(sys_wait4); #ifdef __ARCH_WANT_SYS_WAITPID diff -uprN linux-2.6.24/kernel/fork.c linux-2.6.24.ovz/kernel/fork.c --- linux-2.6.24/kernel/fork.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/fork.c 2008-03-25 18:53:59.000000000 -0500 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +53,7 @@ #include #include #include +#include #include #include @@ -59,11 +62,16 @@ #include #include +#include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. 
*/ +EXPORT_SYMBOL(nr_threads); int max_threads; /* tunable limit on nr_threads */ @@ -71,6 +79,8 @@ DEFINE_PER_CPU(unsigned long, process_co __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL(tasklist_lock); + int nr_processes(void) { int cpu; @@ -121,15 +131,22 @@ void __put_task_struct(struct task_struc WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + ub_task_put(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); delayacct_tsk_free(tsk); +#ifdef CONFIG_VE + put_ve(VE_TASK_INFO(tsk)->owner_env); + atomic_dec(&nr_dead); +#endif if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); + void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR @@ -139,7 +156,7 @@ void __init fork_init(unsigned long memp /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC|SLAB_UBC, NULL); #endif /* @@ -243,7 +260,12 @@ static int dup_mmap(struct mm_struct *mm -pages); continue; } + charge = 0; + if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, + mpnt->vm_file, UB_HARD)) + goto fail_noch; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) @@ -290,7 +312,7 @@ static int dup_mmap(struct mm_struct *mm rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -309,6 +331,9 @@ out: fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: + ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, + mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); +fail_noch: retval = -ENOMEM; vm_unacct_memory(charge); goto out; @@ -339,7 +364,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLO #include -static struct mm_struct * mm_init(struct mm_struct * mm) +static struct mm_struct * mm_init(struct mm_struct * mm, + struct task_struct *tsk) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -356,11 +382,14 @@ static struct mm_struct * mm_init(struct mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + set_mm_ub(mm, tsk); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } + + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -375,10 +404,11 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm); + mm = mm_init(mm, NULL); } return mm; } +EXPORT_SYMBOL_GPL(mm_alloc); /* * Called when the last reference to the mm @@ -390,8 +420,10 @@ void fastcall __mmdrop(struct mm_struct BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + put_mm_ub(mm); free_mm(mm); } +EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. 
@@ -409,6 +441,9 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + (void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm); + if (mm->oom_killed) + ub_oom_task_dead(current); mmdrop(mm); } } @@ -509,7 +544,7 @@ static struct mm_struct *dup_mm(struct t mm->token_priority = 0; mm->last_interval = 0; - if (!mm_init(mm)) + if (!mm_init(mm, tsk)) goto fail_nomem; if (init_new_context(tsk, mm)) @@ -536,6 +571,7 @@ fail_nocontext: * because it calls destroy_context() */ mm_free_pgd(mm); + put_mm_ub(mm); free_mm(mm); return NULL; } @@ -975,14 +1011,19 @@ static struct task_struct *copy_process( struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, pid_t vpid) { int retval; struct task_struct *p; int cgroup_callbacks_done = 0; +#ifdef CONFIG_VE + if (clone_flags & CLONE_NAMESPACES_MASK) + return ERR_PTR(-EINVAL); +#else if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); +#endif /* * Thread groups must share signals as well, and detached threads @@ -1010,6 +1051,9 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); + if (ub_task_charge(current, p)) + goto bad_fork_charge; + #ifdef CONFIG_TRACE_IRQFLAGS DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1153,7 +1197,7 @@ static struct task_struct *copy_process( if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(task_active_pid_ns(p), vpid); if (!pid) goto bad_fork_cleanup_namespaces; @@ -1161,6 +1205,8 @@ static struct task_struct *copy_process( retval = pid_ns_prepare_proc(task_active_pid_ns(p)); if (retval < 0) goto bad_fork_free_pid; + if (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD) + task_active_pid_ns(p)->flags |= PID_NS_HIDDEN; } } @@ -1259,7 +1305,7 @@ static struct task_struct *copy_process( * thread can't slip out of an OOM kill (or normal SIGKILL). 
*/ recalc_sigpending(); - if (signal_pending(current)) { + if (signal_pending(current) && !vpid) { spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; @@ -1300,15 +1346,26 @@ static struct task_struct *copy_process( set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail_rcu(&p->tasks, &init_task.tasks); +#ifdef CONFIG_VE + list_add_tail_rcu(&p->ve_task_info.vetask_list, + &p->ve_task_info.owner_env->vetask_lh); +#endif __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } + (void)get_ve(p->ve_task_info.owner_env); + pget_ve(p->ve_task_info.owner_env); +#ifdef CONFIG_VE + seqcount_init(&p->ve_task_info.wakeup_lock); +#endif total_forks++; spin_unlock(&current->sighand->siglock); + get_task_fairsched_node(p); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); @@ -1354,6 +1411,9 @@ bad_fork_cleanup_count: atomic_dec(&p->user->processes); free_uid(p->user); bad_fork_free: + ub_task_uncharge(p); + ub_task_put(p); +bad_fork_charge: free_task(p); fork_out: return ERR_PTR(retval); @@ -1371,7 +1431,7 @@ struct task_struct * __cpuinit fork_idle struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, - &init_struct_pid); + &init_struct_pid, 0); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1400,17 +1460,22 @@ static int fork_traceflag(unsigned clone * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long do_fork(unsigned long clone_flags, +long do_fork_pid(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + long vpid) { struct task_struct *p; int trace = 0; long nr; + nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags); + if (nr) + return nr; + if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); if (trace) @@ -1418,7 +1483,7 @@ long do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL); + child_tidptr, NULL, vpid); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly.
@@ -1450,6 +1515,8 @@ long do_fork(unsigned long clone_flags, set_tsk_thread_flag(p, TIF_SIGPENDING); } + (void)virtinfo_gencall(VIRTINFO_DOFORKRET, p); + if (!(clone_flags & CLONE_STOPPED)) wake_up_new_task(p, clone_flags); else @@ -1472,6 +1539,8 @@ long do_fork(unsigned long clone_flags, } else { nr = PTR_ERR(p); } + + (void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)nr); return nr; } @@ -1487,27 +1556,40 @@ static void sighand_ctor(struct kmem_cac init_waitqueue_head(&sighand->signalfd_wqh); } +EXPORT_SYMBOL(do_fork_pid); + +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return do_fork_pid(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr, 0); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|SLAB_UBC, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + SLAB_PANIC|SLAB_UBC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); } /* @@ -1657,6 +1739,10 @@ asmlinkage long sys_unshare(unsigned lon CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| CLONE_NEWNET)) goto bad_unshare_out; +#ifdef CONFIG_VE + if (unshare_flags & CLONE_NAMESPACES_MASK) + goto bad_unshare_out; +#endif if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; @@ -1670,9 +1756,11 @@ asmlinkage long sys_unshare(unsigned lon goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; +#ifndef CONFIG_VE if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_semundo; +#endif if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { @@ -1710,7 +1798,9 @@ asmlinkage long sys_unshare(unsigned lon if (new_nsproxy) put_nsproxy(new_nsproxy); +#ifndef CONFIG_VE bad_unshare_cleanup_semundo: +#endif bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff -uprN linux-2.6.24/kernel/futex.c linux-2.6.24.ovz/kernel/futex.c --- linux-2.6.24/kernel/futex.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/futex.c 2008-03-25 18:53:59.000000000 -0500 @@ -1153,8 +1153,6 @@ static int fixup_pi_state_owner(u32 __us */ #define FLAGS_SHARED 1 -static long futex_wait_restart(struct restart_block *restart); - static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, u32 val, ktime_t *abs_time) { @@ -1309,7 +1307,7 @@ static int futex_wait(u32 __user *uaddr, } -static long futex_wait_restart(struct restart_block *restart) +long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; struct rw_semaphore *fshared = 
NULL; @@ -1321,6 +1319,7 @@ static long futex_wait_restart(struct re fshared = &current->mm->mmap_sem; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t); } +EXPORT_SYMBOL_GPL(futex_wait_restart); /* diff -uprN linux-2.6.24/kernel/futex_compat.c linux-2.6.24.ovz/kernel/futex_compat.c --- linux-2.6.24/kernel/futex_compat.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/futex_compat.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include diff -uprN linux-2.6.24/kernel/hrtimer.c linux-2.6.24.ovz/kernel/hrtimer.c --- linux-2.6.24/kernel/hrtimer.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/hrtimer.c 2008-03-25 18:53:59.000000000 -0500 @@ -1291,11 +1291,26 @@ static int __sched do_nanosleep(struct h return t->task == NULL; } +static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(timer->expires, timer->base->get_time()); + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; +} + long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; - struct timespec *rmtp; - ktime_t time; + struct timespec __user *rmtp; restart->fn = do_no_restart_syscall; @@ -1305,12 +1320,11 @@ long __sched hrtimer_nanosleep_restart(s if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; - rmtp = (struct timespec *)restart->arg1; + rmtp = (struct timespec __user *)restart->arg1; if (rmtp) { - time = ktime_sub(t.timer.expires, t.timer.base->get_time()); - if (time.tv64 <= 0) - return 0; - *rmtp = ktime_to_timespec(time); + int ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + return ret; } restart->fn = hrtimer_nanosleep_restart; @@ -1318,13 +1332,13 @@ long __sched hrtimer_nanosleep_restart(s /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; } +EXPORT_SYMBOL_GPL(hrtimer_nanosleep_restart); -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; - ktime_t rem; hrtimer_init(&t.timer, clockid, mode); t.timer.expires = timespec_to_ktime(*rqtp); @@ -1336,10 +1350,9 @@ long hrtimer_nanosleep(struct timespec * return -ERESTARTNOHAND; if (rmtp) { - rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); - if (rem.tv64 <= 0) - return 0; - *rmtp = ktime_to_timespec(rem); + int ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + return ret; } restart = &current_thread_info()->restart_block; @@ -1355,8 +1368,7 @@ long hrtimer_nanosleep(struct timespec * asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) { - struct timespec tu, rmt; - int ret; + struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; @@ -1364,15 +1376,7 @@ sys_nanosleep(struct timespec __user *rq if (!timespec_valid(&tu)) return -EINVAL; - ret = hrtimer_nanosleep(&tu, rmtp ? 
&rmt : NULL, HRTIMER_MODE_REL, - CLOCK_MONOTONIC); - - if (ret && rmtp) { - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - } - - return ret; + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff -uprN linux-2.6.24/kernel/kmod.c linux-2.6.24.ovz/kernel/kmod.c --- linux-2.6.24/kernel/kmod.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/kmod.c 2008-03-25 18:53:59.000000000 -0500 @@ -77,6 +77,10 @@ int request_module(const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* Don't allow request_module() inside VE. */ + if (!ve_is_super(get_exec_env())) + return -EPERM; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -453,6 +457,9 @@ int call_usermodehelper_exec(struct subp DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + if (!ve_is_super(get_exec_env())) + return -EPERM; + helper_lock(); if (sub_info->path[0] == '\0') goto out; diff -uprN linux-2.6.24/kernel/kprobes.c linux-2.6.24.ovz/kernel/kprobes.c --- linux-2.6.24/kernel/kprobes.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/kprobes.c 2008-03-25 18:53:59.000000000 -0500 @@ -106,14 +106,14 @@ static int __kprobes check_safety(void) ret = freeze_processes(); if (ret == 0) { struct task_struct *p, *q; - do_each_thread(p, q) { + do_each_thread_all(p, q) { if (p != current && p->state == TASK_RUNNING && p->pid != 0) { printk("Check failed: %s is running\n",p->comm); ret = -1; goto loop_end; } - } while_each_thread(p, q); + } while_each_thread_all(p, q); } loop_end: thaw_processes(); diff -uprN linux-2.6.24/kernel/lockdep.c linux-2.6.24.ovz/kernel/lockdep.c --- linux-2.6.24/kernel/lockdep.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/lockdep.c 2008-03-25 18:53:59.000000000 -0500 @@ -3182,7 +3182,7 @@ retry: if (count != 10) printk(" locked it.\n"); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * It's not reliable to print a task's held locks * if it's not sleeping (or if it's not the current @@ -3195,7 +3195,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; - } while_each_thread(g, p); + } while_each_thread_all(g, p); printk("\n"); printk("=============================================\n\n"); diff -uprN linux-2.6.24/kernel/module.c linux-2.6.24.ovz/kernel/module.c --- linux-2.6.24/kernel/module.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/module.c 2008-03-25 18:53:59.000000000 -0500 @@ -2135,6 +2135,14 @@ sys_init_module(void __user *umod, mutex_unlock(&module_mutex); return ret; } + if (ret > 0) { + printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " + "it should follow 0/-E convention\n" + KERN_WARNING "%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } /* Now it's a first class citizen! 
*/ mutex_lock(&module_mutex); @@ -2349,6 +2357,8 @@ unsigned long module_kallsyms_lookup_nam static void *m_start(struct seq_file *m, loff_t *pos) { mutex_lock(&module_mutex); + if (!ve_is_super(get_exec_env())) + return NULL; return seq_list_start(&modules, *pos); } diff -uprN linux-2.6.24/kernel/nsproxy.c linux-2.6.24.ovz/kernel/nsproxy.c --- linux-2.6.24/kernel/nsproxy.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/nsproxy.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,14 @@ static struct kmem_cache *nsproxy_cachep struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + /* * creates a copy of "orig" with refcount 1. */ @@ -132,12 +140,12 @@ int copy_namespaces(unsigned long flags, if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) return 0; - +#ifndef CONFIG_VE if (!capable(CAP_SYS_ADMIN)) { err = -EPERM; goto out; } - +#endif new_ns = create_new_namespaces(flags, tsk, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); @@ -156,6 +164,7 @@ out: put_nsproxy(old_ns); return err; } +EXPORT_SYMBOL_GPL(copy_namespaces); void free_nsproxy(struct nsproxy *ns) { @@ -172,6 +181,22 @@ void free_nsproxy(struct nsproxy *ns) put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } +EXPORT_SYMBOL_GPL(free_nsproxy); + +struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk) +{ + struct mnt_namespace *mnt_ns = NULL; + + task_lock(tsk); + if (tsk->nsproxy) + mnt_ns = tsk->nsproxy->mnt_ns; + if (mnt_ns) + get_mnt_ns(mnt_ns); + task_unlock(tsk); + + return mnt_ns; +} +EXPORT_SYMBOL(get_task_mnt_ns); /* * Called from unshare. Unshare all the namespaces part of nsproxy. diff -uprN linux-2.6.24/kernel/panic.c linux-2.6.24.ovz/kernel/panic.c --- linux-2.6.24/kernel/panic.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/panic.c 2008-03-25 18:53:59.000000000 -0500 @@ -28,6 +28,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +int kernel_text_csum_broken; +EXPORT_SYMBOL(kernel_text_csum_broken); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -160,7 +162,8 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", + kernel_text_csum_broken ? 'B' : ' ', tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 
'S' : ' ', diff -uprN linux-2.6.24/kernel/pid.c linux-2.6.24.ovz/kernel/pid.c --- linux-2.6.24/kernel/pid.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/pid.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; + static int pidhash_shift; struct pid init_struct_pid = INIT_STRUCT_PID; static struct kmem_cache *pid_ns_cachep; @@ -120,6 +122,7 @@ static fastcall void free_pidmap(struct clear_bit(offset, map->page); atomic_inc(&map->nr_free); } +EXPORT_SYMBOL_GPL(free_pidmap); static int alloc_pidmap(struct pid_namespace *pid_ns) { @@ -181,6 +184,36 @@ static int alloc_pidmap(struct pid_names return -1; } +static int set_pidmap(struct pid_namespace *pid_ns, pid_t pid) +{ + int offset; + struct pidmap *map; + + offset = pid & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; + if (unlikely(!map->page)) { + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock_irq(&pidmap_lock); + if (map->page) + kfree(page); + else + map->page = page; + spin_unlock_irq(&pidmap_lock); + if (unlikely(!map->page)) + return -ENOMEM; + } + + if (test_and_set_bit(offset, map->page)) + return -EBUSY; + + atomic_dec(&map->nr_free); + return pid; +} + static int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; @@ -198,6 +231,7 @@ static int next_pidmap(struct pid_namesp } return -1; } +EXPORT_SYMBOL_GPL(alloc_pidmap); fastcall void put_pid(struct pid *pid) { @@ -226,25 +260,33 @@ fastcall void free_pid(struct pid *pid) /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; + struct upid *upid; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); - spin_unlock_irqrestore(&pidmap_lock, flags); + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + if (!hlist_unhashed(&upid->pid_chain)) + hlist_del_rcu(&upid->pid_chain); + } + spin_unlock(&pidmap_lock); + ub_kmemsize_uncharge(pid->ub, pid->numbers[pid->level].ns->pid_cachep->objuse); + local_irq_restore(flags); for (i = 0; i <= pid->level; i++) free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); - + put_beancounter(pid->ub); call_rcu(&pid->rcu, delayed_put_pid); } +EXPORT_SYMBOL_GPL(free_pid); -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; + struct user_beancounter *ub; pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) @@ -252,7 +294,10 @@ struct pid *alloc_pid(struct pid_namespa tmp = ns; for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + if (vpid != 0 && i == ns->level) + nr = set_pidmap(tmp, vpid); + else + nr = alloc_pidmap(tmp); if (nr < 0) goto out_free; @@ -267,17 +312,32 @@ struct pid *alloc_pid(struct pid_namespa for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); +#ifdef CONFIG_BEANCOUNTERS + ub = get_exec_ub(); + local_irq_disable(); + if (ub_kmemsize_charge(ub, ns->pid_cachep->objuse, UB_HARD)) + goto out_enable; + pid->ub = get_beancounter(ub); + spin_lock(&pidmap_lock); +#else spin_lock_irq(&pidmap_lock); +#endif for (i = ns->level; i >= 0; i--) { upid = &pid->numbers[i]; 
hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + if (upid->ns->flags & PID_NS_HIDDEN) + while (i--) + INIT_HLIST_NODE(&pid->numbers[i].pid_chain); } spin_unlock_irq(&pidmap_lock); out: return pid; +out_enable: + local_irq_enable(); + put_pid_ns(ns); out_free: for (i++; i <= ns->level; i++) free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); @@ -286,6 +346,7 @@ out_free: pid = NULL; goto out; } +EXPORT_SYMBOL_GPL(alloc_pid); struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns) { @@ -328,6 +389,7 @@ int fastcall attach_pid(struct task_stru return 0; } +EXPORT_SYMBOL_GPL(attach_pid); void fastcall detach_pid(struct task_struct *task, enum pid_type type) { @@ -347,6 +409,7 @@ void fastcall detach_pid(struct task_str free_pid(pid); } +EXPORT_SYMBOL_GPL(detach_pid); /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, @@ -368,6 +431,7 @@ struct task_struct * fastcall pid_task(s } return result; } +EXPORT_SYMBOL_GPL(pid_task); /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. @@ -559,6 +623,7 @@ static struct pid_namespace *create_pid_ ns->last_pid = 0; ns->child_reaper = NULL; ns->level = level; + ns->flags = 0; set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -624,6 +689,197 @@ void free_pid_ns(struct kref *kref) } #endif /* CONFIG_PID_NS */ +/* + * this is a dirty ugly hack. + */ + +static void reattach_pid(struct task_struct *tsk, enum pid_type type, + struct pid *pid) +{ + int i; + struct pid *old_pid; + struct pid_link *link; + struct upid *upid; + + link = &tsk->pids[type]; + old_pid = link->pid; + + hlist_del_rcu(&link->node); + link->pid = pid; + hlist_add_head_rcu(&link->node, &pid->tasks[type]); + + if (type != PIDTYPE_PID) { + for (i = PIDTYPE_MAX; --i >= 0; ) + if (!hlist_empty(&old_pid->tasks[i])) + return; + + for (i = 0; i < pid->level; i++) + hlist_del_rcu(&old_pid->numbers[i].pid_chain); + } else { + for (i = PIDTYPE_MAX; --i >= 0; ) + if (!hlist_empty(&old_pid->tasks[i])) + BUG(); + + for (i = 0; i < pid->level; i++) + hlist_replace_rcu(&old_pid->numbers[i].pid_chain, + &pid->numbers[i].pid_chain); + + upid = &pid->numbers[pid->level]; + hlist_add_head_rcu(&upid->pid_chain, + &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + } + + call_rcu(&old_pid->rcu, delayed_put_pid); +} + +static int __pid_ns_attach_task(struct pid_namespace *ns, + struct task_struct *tsk, pid_t nr) +{ + struct pid *pid; + enum pid_type type; + unsigned long old_size, new_size; + + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); + if (!pid) + goto out; + + if (nr == 0) + nr = alloc_pidmap(ns); + else + nr = set_pidmap(ns, nr); + + if (nr < 0) + goto out_free; + + memcpy(pid, task_pid(tsk), + sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid)); + get_pid_ns(ns); + pid->level++; + BUG_ON(pid->level != ns->level); + pid->numbers[pid->level].nr = nr; + pid->numbers[pid->level].ns = ns; + atomic_set(&pid->count, 1); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + + old_size = pid->numbers[pid->level - 1].ns->pid_cachep->objuse; + new_size = pid->numbers[pid->level].ns->pid_cachep->objuse; + local_irq_disable(); + /* + * Depending on sizeof(struct foo), cache flags (redzoning, etc) + * and actual CPU (cacheline_size() jump from 64 to 128 bytes after + * CPU detection) new size can very well be smaller than old size. 
+ */ + if (new_size > old_size) { + if (ub_kmemsize_charge(pid->ub, new_size - old_size, UB_HARD) < 0) + goto out_enable; + } else + ub_kmemsize_uncharge(pid->ub, old_size - new_size); + + write_lock(&tasklist_lock); + + spin_lock(&pidmap_lock); + reattach_pid(tsk, PIDTYPE_SID, pid); + set_task_session(tsk, pid_nr(pid)); + reattach_pid(tsk, PIDTYPE_PGID, pid); + tsk->signal->__pgrp = pid_nr(pid); + current->signal->tty_old_pgrp = NULL; + + reattach_pid(tsk, PIDTYPE_PID, pid); + spin_unlock(&pidmap_lock); + + write_unlock_irq(&tasklist_lock); + + return 0; + +out_enable: + local_irq_enable(); + put_pid_ns(ns); +out_free: + kmem_cache_free(ns->pid_cachep, pid); +out: + return -ENOMEM; +} + +int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk) +{ + return __pid_ns_attach_task(ns, tsk, 0); +} +EXPORT_SYMBOL_GPL(pid_ns_attach_task); + +int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk) +{ + int err; + + err = __pid_ns_attach_task(ns, tsk, 1); + if (err < 0) + return err; + + ns->child_reaper = tsk; + return 0; +} +EXPORT_SYMBOL_GPL(pid_ns_attach_init); + +#ifdef CONFIG_VE +static noinline void show_lost_task(struct task_struct *p) +{ + extern char * task_sig(struct task_struct *p, char *buffer); + char buf[512]; + + task_sig(p, buf); + printk("Lost task: %d/%s/%p\nSignals:%s\n", p->pid, p->comm, p, buf); +} + +static void zap_ve_processes(struct ve_struct *env) +{ + /* + * Here the VE changes its state into "not running". + * op_sem taken for write is a barrier to all VE manipulations from + * ioctl: it waits for operations currently in progress and blocks all + * subsequent operations until is_running is set to 0 and op_sem is + * released. + */ + down_write(&env->op_sem); + env->is_running = 0; + up_write(&env->op_sem); + + /* wait for all init childs exit */ + while (atomic_read(&env->pcounter) > 1) { + struct task_struct *g, *p; + long delay = 1; + + if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) + continue; + /* it was ENOCHLD or no more children somehow */ + if (atomic_read(&env->pcounter) == 1) + break; + + /* clear all signals to avoid wakeups */ + if (signal_pending(current)) + flush_signals(current); + /* we have child without signal sent */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + delay = (delay < HZ) ? (delay << 1) : HZ; + read_lock(&tasklist_lock); + do_each_thread_ve(g, p) { + if (p != current) { + /* + * by that time no processes other then entered + * may exist in the VE. if some were missed by + * zap_pid_ns_processes() this was a BUG + */ + if (!p->did_ve_enter) + show_lost_task(p); + + force_sig_specific(SIGKILL, p); + } + } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + } +} +#endif + void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; @@ -655,12 +911,25 @@ void zap_pid_ns_processes(struct pid_nam rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - +#ifdef CONFIG_VE + zap_ve_processes(get_exec_env()); +#endif /* Child reaper for the pid namespace is going away */ pid_ns->child_reaper = NULL; return; } +pid_t pid_to_vpid(pid_t nr) +{ + struct pid *pid; + + pid = find_pid(nr); + if (pid) + return pid->numbers[pid->level].nr; + return -1; +} +EXPORT_SYMBOL_GPL(pid_to_vpid); + /* * The pid hash table is scaled according to the amount of memory in the * machine. 
From a minimum of 16 slots up to 4096 slots at one gigabyte or diff -uprN linux-2.6.24/kernel/posix-cpu-timers.c linux-2.6.24.ovz/kernel/posix-cpu-timers.c --- linux-2.6.24/kernel/posix-cpu-timers.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/posix-cpu-timers.c 2008-03-25 18:53:59.000000000 -0500 @@ -6,6 +6,7 @@ #include #include #include +#include static int check_clock(const clockid_t which_clock) { diff -uprN linux-2.6.24/kernel/posix-timers.c linux-2.6.24.ovz/kernel/posix-timers.c --- linux-2.6.24/kernel/posix-timers.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/posix-timers.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,6 +31,8 @@ * POSIX clocks & timers */ #include +#include +#include #include #include #include @@ -47,6 +49,9 @@ #include #include #include +#include + +#include /* * Management arrays for POSIX timers. Timers are kept in slab memory @@ -241,8 +246,8 @@ static __init int init_posix_timers(void register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof (struct k_itimer), 0, + SLAB_PANIC|SLAB_UBC, NULL); idr_init(&posix_timers_id); return 0; } @@ -298,6 +303,13 @@ void do_schedule_next_timer(struct sigin int posix_timer_event(struct k_itimer *timr,int si_private) { + int ret; + struct ve_struct *ve; + struct user_beancounter *ub; + + ve = set_exec_env(timr->it_process->ve_task_info.owner_env); + ub = set_exec_ub(timr->it_process->task_bc.task_ub); + memset(&timr->sigq->info, 0, sizeof(siginfo_t)); timr->sigq->info.si_sys_private = si_private; /* Send signal to the process that owns this timer.*/ @@ -310,11 +322,11 @@ int posix_timer_event(struct k_itimer *t if (timr->it_sigev_notify & SIGEV_THREAD_ID) { struct task_struct *leader; - int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, timr->it_process); if (likely(ret >= 0)) - return ret; + goto out; timr->it_sigev_notify = SIGEV_SIGNAL; leader = timr->it_process->group_leader; @@ -322,8 +334,12 @@ int posix_timer_event(struct k_itimer *t timr->it_process = leader; } - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq, timr->it_process); +out: + (void)set_exec_ub(ub); + (void)set_exec_env(ve); + return ret; } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -981,20 +997,9 @@ sys_clock_getres(const clockid_t which_c static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { - struct timespec rmt; - int ret; - - ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL, - flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); - - if (ret && rmtp) { - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - } - - return ret; + return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } asmlinkage long diff -uprN linux-2.6.24/kernel/power/process.c linux-2.6.24.ovz/kernel/power/process.c --- linux-2.6.24/kernel/power/process.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/power/process.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,8 @@ #include #include +static atomic_t global_suspend = ATOMIC_INIT(0); + /* * Timeout for stopping processes */ @@ -26,7 +28,9 @@ static inline int freezeable(struct task { if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) + (p->exit_state != 0) || + (p->state == TASK_STOPPED) || + (p->state == TASK_TRACED)) return 0; return 1; } @@ -50,6 +54,24 @@ void refrigerator(void) processes around? */ long save; +#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) + save = current->state; + current->state = TASK_UNINTERRUPTIBLE; + + spin_lock_irq(¤t->sighand->siglock); + if (test_and_clear_thread_flag(TIF_FREEZE)) { + recalc_sigpending(); /* We sent fake signal, clean it up */ + current->flags |= PF_FROZEN; + } else { + /* Freeze request could be canceled before we entered + * refrigerator(). In this case we do nothing. */ + current->state = save; + } + spin_unlock_irq(¤t->sighand->siglock); + + while (current->flags & PF_FROZEN) + schedule(); +#else task_lock(current); if (freezing(current)) { frozen_process(); @@ -71,6 +93,7 @@ void refrigerator(void) break; schedule(); } +#endif pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } @@ -178,7 +201,7 @@ static int try_to_freeze_tasks(int freez do { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (frozen(p) || !freezeable(p)) continue; @@ -192,7 +215,7 @@ static int try_to_freeze_tasks(int freez if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ if (time_after(jiffies, end_time)) @@ -216,13 +239,13 @@ static int try_to_freeze_tasks(int freez elapsed_csecs / 100, elapsed_csecs % 100, todo); show_state(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, @@ -239,6 +262,7 @@ int freeze_processes(void) { int error; + atomic_inc(&global_suspend); printk("Freezing user space processes ... 
"); error = try_to_freeze_tasks(FREEZER_USER_SPACE); if (error) @@ -253,6 +277,7 @@ int freeze_processes(void) Exit: BUG_ON(in_atomic()); printk("\n"); + atomic_dec(&global_suspend); return error; } @@ -261,15 +286,17 @@ static void thaw_tasks(int thaw_user_spa struct task_struct *g, *p; read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!freezeable(p)) continue; if (!p->mm == thaw_user_space) continue; - thaw_process(p); - } while_each_thread(g, p); + if (!thaw_process(p)) + printk(KERN_WARNING " Strange, %s not stopped\n", + p->comm ); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } diff -uprN linux-2.6.24/kernel/printk.c linux-2.6.24.ovz/kernel/printk.c --- linux-2.6.24/kernel/printk.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/printk.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,7 +31,9 @@ #include #include #include +#include #include +#include #include #include @@ -54,6 +56,9 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +struct printk_aligned printk_no_wake_var[NR_CPUS]; +EXPORT_SYMBOL(printk_no_wake_var); + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -84,7 +89,7 @@ static int console_locked, console_suspe * It is also used in interesting ways to provide interlocking in * release_console_sem(). */ -static DEFINE_SPINLOCK(logbuf_lock); +DEFINE_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -115,6 +120,7 @@ static int preferred_console = -1; /* Flag: console code may call schedule() */ static int console_may_schedule; +int console_silence_loglevel; #ifdef CONFIG_PRINTK @@ -123,6 +129,19 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ +static int __init setup_console_silencelevel(char *str) +{ + int level; + + if (get_option(&str, &level) != 1) + return 0; + + console_silence_loglevel = level; + return 1; +} + +__setup("silencelevel=", setup_console_silencelevel); + static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -293,6 +312,9 @@ int do_syslog(int type, char __user *buf char c; int error = 0; + if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7)) + goto out; + error = security_syslog(type); if (error) return error; @@ -313,15 +335,15 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, - (log_start - log_end)); + error = wait_event_interruptible(ve_log_wait, + (ve_log_start - ve_log_end)); if (error) goto out; i = 0; spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; + while (!error && (ve_log_start != ve_log_end) && i < len) { + c = VE_LOG_BUF(ve_log_start); + ve_log_start++; spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; @@ -347,15 +369,17 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } + if (ve_log_buf == NULL) + goto out; count = len; - if (count > log_buf_len) - count = log_buf_len; + if (count > ve_log_buf_len) + count = ve_log_buf_len; spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; + if (count > ve_logged_chars) + count = ve_logged_chars; if (do_clear) - logged_chars = 0; - limit = log_end; + ve_logged_chars = 0; + limit = 
ve_log_end; /* * __put_user() could sleep, and while we sleep * printk() could overwrite the messages @@ -364,9 +388,9 @@ int do_syslog(int type, char __user *buf */ for (i = 0; i < count && !error; i++) { j = limit-1-i; - if (j + log_buf_len < log_end) + if (j + ve_log_buf_len < ve_log_end) break; - c = LOG_BUF(j); + c = VE_LOG_BUF(j); spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); @@ -390,7 +414,7 @@ int do_syslog(int type, char __user *buf } break; case 5: /* Clear ring buffer */ - logged_chars = 0; + ve_logged_chars = 0; break; case 6: /* Disable logging to console */ console_loglevel = minimum_console_loglevel; @@ -402,16 +426,19 @@ int do_syslog(int type, char __user *buf error = -EINVAL; if (len < 1 || len > 8) goto out; + error = 0; + /* VE has no console, so return success */ + if (!ve_is_super(get_exec_env())) + goto out; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; - error = 0; break; case 9: /* Number of chars in the log buffer */ - error = log_end - log_start; + error = ve_log_end - ve_log_start; break; case 10: /* Size of the log buffer */ - error = log_buf_len; + error = ve_log_buf_len; break; default: error = -EINVAL; @@ -522,16 +549,18 @@ static void call_console_drivers(unsigne static void emit_log_char(char c) { - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; + VE_LOG_BUF(ve_log_end) = c; + ve_log_end++; + if (ve_log_end - ve_log_start > ve_log_buf_len) + ve_log_start = ve_log_end - ve_log_buf_len; + if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) + con_start = ve_log_end - ve_log_buf_len; + if (ve_logged_chars < ve_log_buf_len) + ve_logged_chars++; } +static unsigned long do_release_console_sem(unsigned long *flags); + /* * Zap console related locks when oopsing. Only zap at most once * every 10 seconds, to leave time for slow consoles to print a @@ -613,6 +642,30 @@ static int have_callable_console(void) * printf(3) */ +static inline int ve_log_init(void) +{ +#ifdef CONFIG_VE + if (ve_log_buf != NULL) + return 0; + + if (ve_is_super(get_exec_env())) { + ve0._log_wait = &log_wait; + ve0._log_start = &log_start; + ve0._log_end = &log_end; + ve0._logged_chars = &logged_chars; + ve0.log_buf = log_buf; + return 0; + } + + ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); + if (!ve_log_buf) + return -ENOMEM; + + memset(ve_log_buf, 0, ve_log_buf_len); +#endif + return 0; +} + asmlinkage int printk(const char *fmt, ...) { va_list args; @@ -628,13 +681,14 @@ asmlinkage int printk(const char *fmt, . 
/* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int __vprintk(const char *fmt, va_list args) { unsigned long flags; int printed_len; char *p; static char printk_buf[1024]; static int log_level_unknown = 1; + int err, need_wake; boot_delay_msec(); @@ -650,6 +704,12 @@ asmlinkage int vprintk(const char *fmt, spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); + err = ve_log_init(); + if (err) { + spin_unlock_irqrestore(&logbuf_lock, flags); + return err; + } + /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -710,7 +770,26 @@ asmlinkage int vprintk(const char *fmt, log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { + if (!ve_is_super(get_exec_env())) { + need_wake = (ve_log_start != ve_log_end); + spin_unlock_irqrestore(&logbuf_lock, flags); + if (!oops_in_progress && need_wake) + wake_up_interruptible(&ve_log_wait); + } else if (__printk_no_wake) { + /* + * A difficult case, created by the console semaphore mess... + * All wakeups are omitted. + */ + if (!atomic_add_negative(-1, &console_sem.count)) { + console_locked = 1; + console_may_schedule = 0; + do_release_console_sem(&flags); + console_locked = 0; + console_may_schedule = 0; + } + atomic_inc(&console_sem.count); + spin_unlock_irqrestore(&logbuf_lock, flags); + } else if (!down_trylock(&console_sem)) { /* * We own the drivers. We can drop the spinlock and * let release_console_sem() print the text, maybe ... @@ -753,6 +832,63 @@ asmlinkage int vprintk(const char *fmt, EXPORT_SYMBOL(printk); EXPORT_SYMBOL(vprintk); +static struct timer_list conswakeup_timer; +static void conswakeup_timer_call(unsigned long dumy) +{ + if (!down_trylock(&console_sem)) { + console_locked = 1; + console_may_schedule = 0; + release_console_sem(); + } + mod_timer(&conswakeup_timer, jiffies + 5 * HZ); +} + +static int __init conswakeup_init(void) +{ + init_timer(&conswakeup_timer); + conswakeup_timer.function = &conswakeup_timer_call; + conswakeup_timer.expires = jiffies + 5 * HZ; + add_timer(&conswakeup_timer); + return 0; +} +console_initcall(conswakeup_init); + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + int i; + struct ve_struct *env; + + env = set_exec_env(get_ve0()); + i = __vprintk(fmt, args); + (void)set_exec_env(env); + return i; +} + +asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) +{ + int printed_len; + + printed_len = 0; + if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) + printed_len = vprintk(fmt, args); + if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) + printed_len = __vprintk(fmt, args); + return printed_len; +} + +asmlinkage int ve_printk(int dst, const char *fmt, ...) +{ + va_list args; + int printed_len; + + va_start(args, fmt); + printed_len = ve_vprintk(dst, fmt, args); + va_end(args); + return printed_len; +} +EXPORT_SYMBOL(ve_printk); + + #else asmlinkage long sys_syslog(int type, char __user *buf, int len) @@ -950,31 +1086,40 @@ void wake_up_klogd(void) * * release_console_sem() may be called from any context. 
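The emit_log_char() and ve_log_init() hunks above give each VE its own syslog ring buffer: characters are written at a free-running end index, and the start index is pushed forward once the buffer is full so the newest output always survives. A minimal userspace sketch of that wrap-around behaviour follows; the buffer size and names are invented for the example.

#include <stdio.h>
#include <string.h>

#define LOG_BUF_LEN 16			/* tiny power-of-two buffer so wrap-around is visible */
#define LOG_BUF(buf, idx) ((buf)->data[(idx) & (LOG_BUF_LEN - 1)])

struct ve_log {
	char data[LOG_BUF_LEN];
	unsigned long start, end;	/* free-running indices, masked on access */
	unsigned long logged_chars;
};

/* Mirrors the emit_log_char() logic: overwrite the oldest data on overflow. */
static void emit_log_char_model(struct ve_log *log, char c)
{
	LOG_BUF(log, log->end) = c;
	log->end++;
	if (log->end - log->start > LOG_BUF_LEN)
		log->start = log->end - LOG_BUF_LEN;
	if (log->logged_chars < LOG_BUF_LEN)
		log->logged_chars++;
}

int main(void)
{
	struct ve_log log = { .start = 0, .end = 0, .logged_chars = 0 };
	const char *msg = "log line from a container that overflows the ring";
	unsigned long i;

	for (i = 0; msg[i]; i++)
		emit_log_char_model(&log, msg[i]);

	printf("kept %lu of %zu chars: \"", log.logged_chars, strlen(msg));
	for (i = log.start; i != log.end; i++)
		putchar(LOG_BUF(&log, i));
	printf("\"\n");
	return 0;
}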
*/ -void release_console_sem(void) +static unsigned long do_release_console_sem(unsigned long *flags) { - unsigned long flags; unsigned long _con_start, _log_end; unsigned long wake_klogd = 0; if (console_suspended) { up(&secondary_console_sem); - return; + goto out; } console_may_schedule = 0; for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + spin_unlock_irqrestore(&logbuf_lock, *flags); call_console_drivers(_con_start, _log_end); - local_irq_restore(flags); + spin_lock_irqsave(&logbuf_lock, *flags); } +out: + return wake_klogd; +} + +void release_console_sem(void) +{ + unsigned long flags; + unsigned long wake_klogd; + + spin_lock_irqsave(&logbuf_lock, flags); + wake_klogd = do_release_console_sem(&flags); console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); @@ -1285,6 +1430,36 @@ int printk_ratelimit(void) } EXPORT_SYMBOL(printk_ratelimit); +/* + * Rate limiting stuff. + */ +int vz_ratelimit(struct vz_rate_info *p) +{ + unsigned long cjif, djif; + unsigned long flags; + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + long new_bucket; + + spin_lock_irqsave(&ratelimit_lock, flags); + cjif = jiffies; + djif = cjif - p->last; + if (djif < p->interval) { + if (p->bucket >= p->burst) { + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 0; + } + p->bucket++; + } else { + new_bucket = p->bucket - (djif / (unsigned)p->interval); + if (new_bucket < 0) + new_bucket = 0; + p->bucket = new_bucket + 1; + } + p->last = cjif; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; +} + /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state diff -uprN linux-2.6.24/kernel/ptrace.c linux-2.6.24.ovz/kernel/ptrace.c --- linux-2.6.24/kernel/ptrace.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ptrace.c 2008-03-25 18:53:59.000000000 -0500 @@ -131,6 +131,8 @@ int __ptrace_may_attach(struct task_stru * or halting the specified task is impossible. 
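vz_ratelimit() above is a plain token bucket over jiffies: up to 'burst' events pass per 'interval', and fully elapsed intervals refill the bucket. The following userspace model reproduces the same decision logic; HZ and the sample parameters are assumptions, and the spinlock is omitted.

#include <stdio.h>

#define HZ 100			/* assumed tick rate for the example */

struct vz_rate_info_model {
	unsigned long last;	/* tick of the last accepted event */
	long burst;		/* how many events may pass per interval */
	long interval;		/* ticks per accounting interval */
	long bucket;		/* events consumed in the current interval */
};

/* Same token-bucket decision as vz_ratelimit(), minus the locking. */
static int vz_ratelimit_model(struct vz_rate_info_model *p, unsigned long now)
{
	unsigned long djif = now - p->last;
	long new_bucket;

	if (djif < (unsigned long)p->interval) {
		if (p->bucket >= p->burst)
			return 0;	/* over budget: suppress */
		p->bucket++;
	} else {
		/* Refill: credit back one slot per fully elapsed interval. */
		new_bucket = p->bucket - (long)(djif / (unsigned long)p->interval);
		if (new_bucket < 0)
			new_bucket = 0;
		p->bucket = new_bucket + 1;
	}
	p->last = now;
	return 1;		/* allowed */
}

int main(void)
{
	struct vz_rate_info_model ri = { .last = 0, .burst = 3, .interval = HZ, .bucket = 0 };
	unsigned long tick;

	/* Ten back-to-back events: only the first three pass. */
	for (tick = 1; tick <= 10; tick++)
		printf("tick %2lu: %s\n", tick,
		       vz_ratelimit_model(&ri, tick) ? "printed" : "suppressed");
	return 0;
}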
*/ int dumpable = 0; + int vps_dumpable = 0; + /* Don't let security modules deny introspection */ if (task == current) return 0; @@ -142,11 +144,17 @@ int __ptrace_may_attach(struct task_stru (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (task->mm) + if (task->mm) { dumpable = get_dumpable(task->mm); + vps_dumpable = (task->mm->vps_dumpable == 1); + } + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - + if (!vps_dumpable && !ve_is_super(get_exec_env())) + return -EPERM; + if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) + return -EPERM; return security_ptrace(current, task); } @@ -199,6 +207,8 @@ repeat: retval = __ptrace_may_attach(task); if (retval) goto bad; + if (task->mm->vps_dumpable == 2) + goto bad; /* Go */ task->ptrace |= PT_PTRACED | ((task->real_parent != current) @@ -294,6 +304,7 @@ int ptrace_writedata(struct task_struct } return copied; } +EXPORT_SYMBOL_GPL(access_process_vm); static int ptrace_setoptions(struct task_struct *child, long data) { diff -uprN linux-2.6.24/kernel/relay.c linux-2.6.24.ovz/kernel/relay.c --- linux-2.6.24/kernel/relay.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/relay.c 2008-03-25 18:53:59.000000000 -0500 @@ -92,6 +92,7 @@ static int relay_mmap_buf(struct rchan_b return -EINVAL; vma->vm_ops = &relay_file_mmap_ops; + vma->vm_flags |= VM_DONTEXPAND; vma->vm_private_data = buf; buf->chan->cb->buf_mapped(buf, filp); diff -uprN linux-2.6.24/kernel/sched.c linux-2.6.24.ovz/kernel/sched.c --- linux-2.6.24/kernel/sched.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sched.c 2008-03-25 18:53:59.000000000 -0500 @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -208,6 +209,8 @@ static inline struct task_group *task_gr #elif defined(CONFIG_FAIR_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); +#elif defined(CONFIG_VZ_FAIRSCHED) + tg = p->fsched_node->tg; #else tg = &init_task_group; #endif @@ -310,6 +313,9 @@ struct rq { */ unsigned long nr_uninterruptible; + unsigned long nr_sleeping; + unsigned long nr_stopped; + struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; @@ -379,6 +385,11 @@ static inline int cpu_of(struct rq *rq) #endif } +struct kernel_stat_glob kstat_glob; +DEFINE_SPINLOCK(kstat_glb_lock); +EXPORT_SYMBOL(kstat_glob); +EXPORT_SYMBOL(kstat_glb_lock); + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -631,6 +642,217 @@ static inline void task_rq_unlock(struct spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_VE +static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_iowait++; +} + +static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_iowait--; +} + +static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_unint++; +} + +static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) +{ + VE_CPU_STATS(ve, cpu)->nr_unint--; +} + +#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) + +cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->idle_time; + strt = ve_stat->strt_idle_time; + if (strt && 
nr_uninterruptible_ve(ve) == 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(ve_sched_get_idle_time); + +cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) +{ + struct ve_cpu_stats *ve_stat; + unsigned v; + cycles_t strt, ret, cycles; + + ve_stat = VE_CPU_STATS(ve, cpu); + do { + v = read_seqcount_begin(&ve_stat->stat_lock); + ret = ve_stat->iowait_time; + strt = ve_stat->strt_idle_time; + if (strt && nr_iowait_ve(ve) > 0) { + cycles = get_cycles(); + if (cycles_after(cycles, strt)) + ret += cycles - strt; + } + } while (read_seqcount_retry(&ve_stat->stat_lock, v)); + return ret; +} +EXPORT_SYMBOL(ve_sched_get_iowait_time); + +static void ve_stop_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + if (ve_stat->strt_idle_time) { + if (cycles_after(cycles, ve_stat->strt_idle_time)) { + if (nr_iowait_ve(ve) == 0) + ve_stat->idle_time += + cycles - ve_stat->strt_idle_time; + else + ve_stat->iowait_time += + cycles - ve_stat->strt_idle_time; + } + ve_stat->strt_idle_time = 0; + } + write_seqcount_end(&ve_stat->stat_lock); +} + +static void ve_strt_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) +{ + struct ve_cpu_stats *ve_stat; + + ve_stat = VE_CPU_STATS(ve, cpu); + + write_seqcount_begin(&ve_stat->stat_lock); + ve_stat->strt_idle_time = cycles; + write_seqcount_end(&ve_stat->stat_lock); +} + +static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles) +{ + if (++VE_CPU_STATS(ve, cpu)->nr_running == 1) + ve_stop_idle(ve, cpu, cycles); +} + +static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles) +{ + if (--VE_CPU_STATS(ve, cpu)->nr_running == 0) + ve_strt_idle(ve, cpu, cycles); +} + +void ve_sched_attach(struct ve_struct *target_ve) +{ + struct task_struct *tsk; + unsigned int cpu; + cycles_t cycles; + + tsk = current; + preempt_disable(); + cycles = get_cycles(); + cpu = task_cpu(tsk); + ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); + ve_nr_running_inc(target_ve, cpu, cycles); + preempt_enable(); +} +EXPORT_SYMBOL(ve_sched_attach); + +static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) +{ + struct ve_task_info *ti; + + ti = VE_TASK_INFO(p); + write_seqcount_begin(&ti->wakeup_lock); + ti->wakeup_stamp = cyc; + write_seqcount_end(&ti->wakeup_lock); +} + +static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) +{ + int cpu; + cycles_t ve_wstamp; + + /* safe due to runqueue lock */ + cpu = smp_processor_id(); + ve_wstamp = t->ve_task_info.wakeup_stamp; + + if (ve_wstamp && cycles > ve_wstamp) { + KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, + cpu, cycles - ve_wstamp); + KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, + cpu, cycles - ve_wstamp); + } +} + +static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) +{ +#ifdef CONFIG_FAIRSCHED + if (prev != this_pcpu()->idle) { +#else + if (prev != this_rq()->idle) { +#endif + VE_CPU_STATS(prev->ve_task_info.owner_env, + smp_processor_id())->used_time += + cycles - prev->ve_task_info.sched_time; + + prev->ve_task_info.sched_time = cycles; + } +} +#else +static inline void ve_nr_running_inc(struct ve_struct, int cpu, cycles_t cycles) +{ +} + +static inline void ve_nr_running_dec(struct ve_struct, int cpu, cycles_t cycles) 
+{ +} + +static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) +{ +} + +static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) +{ +} + +static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) +{ +} +#endif + +struct task_nrs_struct { + long nr_running; + long nr_unint; + long nr_stopped; + long nr_sleeping; + long nr_iowait; + long long nr_switches; +} ____cacheline_aligned_in_smp; + +unsigned long nr_zombie = 0; /* protected by tasklist_lock */ +EXPORT_SYMBOL(nr_zombie); + +atomic_t nr_dead = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_dead); + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -1003,11 +1225,21 @@ static int effective_prio(struct task_st */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - if (p->state == TASK_UNINTERRUPTIBLE) + cycles_t cycles; + +#ifdef CONFIG_VE + cycles = get_cycles(); + write_wakeup_stamp(p, cycles); + p->ve_task_info.sleep_time += cycles; +#endif + if (p->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible--; + ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p)); + } enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); + ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); } /* @@ -1015,6 +1247,30 @@ static void activate_task(struct rq *rq, */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { + cycles_t cycles; +#ifdef CONFIG_VE + unsigned int cpu, pcpu; + struct ve_struct *ve; + + cycles = get_cycles(); + cpu = task_cpu(p); + pcpu = smp_processor_id(); + ve = p->ve_task_info.owner_env; + + p->ve_task_info.sleep_time -= cycles; +#endif + if (p->state == TASK_UNINTERRUPTIBLE) { + ve_nr_unint_inc(ve, cpu); + } + if (p->state == TASK_INTERRUPTIBLE) { + rq->nr_sleeping++; + } + if (p->state == TASK_STOPPED) { + rq->nr_stopped++; + } + + ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles); + if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; @@ -1220,6 +1476,7 @@ void wait_task_inactive(struct task_stru break; } } +EXPORT_SYMBOL_GPL(wait_task_inactive); /*** * kick_process - kick a running thread to enter/exit the kernel @@ -1739,6 +1996,10 @@ void sched_fork(struct task_struct *p, i /* Want to start with kernel preemption disabled. 
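The ve_nr_running_inc()/ve_nr_running_dec() and ve_stop_idle()/ve_strt_idle() helpers above accrue per-VE idle time by opening an idle interval when the VE's last runnable task leaves a cpu and closing it when the first one arrives. Below is a simplified userspace model of that bookkeeping, limited to a single cpu and omitting the iowait split and the seqcount protection used in the patch.

#include <stdio.h>

typedef unsigned long long cycles_t;

struct ve_cpu_stats_model {
	int nr_running;
	cycles_t strt_idle_time;	/* when the open idle period began, 0 if busy */
	cycles_t idle_time;		/* accumulated idle cycles */
};

/* First runnable task appears: close the open idle interval, if any. */
static void ve_nr_running_inc_model(struct ve_cpu_stats_model *s, cycles_t now)
{
	if (++s->nr_running == 1 && s->strt_idle_time) {
		if (now > s->strt_idle_time)
			s->idle_time += now - s->strt_idle_time;
		s->strt_idle_time = 0;
	}
}

/* Last runnable task leaves: start a new idle interval. */
static void ve_nr_running_dec_model(struct ve_cpu_stats_model *s, cycles_t now)
{
	if (--s->nr_running == 0)
		s->strt_idle_time = now;
}

int main(void)
{
	struct ve_cpu_stats_model s = { 0, 0, 0 };

	ve_nr_running_inc_model(&s, 100);	/* busy from cycle 100 */
	ve_nr_running_dec_model(&s, 250);	/* idle from cycle 250 */
	ve_nr_running_inc_model(&s, 400);	/* busy again: 150 idle cycles accrued */
	printf("idle cycles: %llu\n", s.idle_time);
	return 0;
}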
*/ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_VE + /* cosmetic: sleep till wakeup below */ + p->ve_task_info.sleep_time -= get_cycles(); +#endif put_cpu(); } @@ -1769,6 +2030,8 @@ void fastcall wake_up_new_task(struct ta */ p->sched_class->task_new(rq, p); inc_nr_running(p, rq); + ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), + get_cycles()); } check_preempt_curr(rq, p); task_rq_unlock(rq, &flags); @@ -1921,6 +2184,7 @@ asmlinkage void schedule_tail(struct tas if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } +EXPORT_SYMBOL_GPL(schedule_tail); /* * context_switch - switch to the new MM and the new @@ -1991,6 +2255,7 @@ unsigned long nr_running(void) return sum; } +EXPORT_SYMBOL(nr_running); unsigned long nr_uninterruptible(void) { @@ -2008,6 +2273,7 @@ unsigned long nr_uninterruptible(void) return sum; } +EXPORT_SYMBOL(nr_uninterruptible); unsigned long long nr_context_switches(void) { @@ -2019,6 +2285,7 @@ unsigned long long nr_context_switches(v return sum; } +EXPORT_SYMBOL(nr_context_switches); unsigned long nr_iowait(void) { @@ -2029,6 +2296,7 @@ unsigned long nr_iowait(void) return sum; } +EXPORT_SYMBOL(nr_iowait); unsigned long nr_active(void) { @@ -2045,6 +2313,72 @@ unsigned long nr_active(void) return running + uninterruptible; } +unsigned long nr_stopped(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_stopped; + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} +EXPORT_SYMBOL(nr_stopped); + +unsigned long nr_sleeping(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_sleeping; + if (unlikely((long)sum < 0)) + sum = 0; + return sum; +} +EXPORT_SYMBOL(nr_sleeping); + +#ifdef CONFIG_VE +unsigned long nr_running_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_running; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_running_ve); + +unsigned long nr_uninterruptible_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + sum = 0; + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_unint; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_uninterruptible_ve); + +unsigned long nr_iowait_ve(struct ve_struct *ve) +{ + int i; + long sum = 0; + cpumask_t ve_cpus; + + ve_cpu_online_map(ve, &ve_cpus); + for_each_cpu_mask(i, ve_cpus) + sum += VE_CPU_STATS(ve, i)->nr_iowait; + return (unsigned long)(sum < 0 ? 0 : sum); +} +EXPORT_SYMBOL(nr_iowait_ve); +#endif + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). 
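nr_running_ve(), nr_uninterruptible_ve() and nr_iowait_ve() above sum per-cpu VE counters and clamp the total at zero, since an individual counter may be transiently negative when the increment and the matching decrement land on different cpus. A small sketch of that aggregation; the cpu count and sample values are invented.

#include <stdio.h>

#define NR_CPUS 4

struct ve_cpu_counter {
	long nr_running;	/* may transiently go negative on one cpu */
};

/* Sum per-cpu counters the way nr_running_ve() does, clamping at zero. */
static unsigned long nr_running_ve_model(const struct ve_cpu_counter stats[NR_CPUS])
{
	long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += stats[cpu].nr_running;
	return (unsigned long)(sum < 0 ? 0 : sum);
}

int main(void)
{
	/* A task migrated between cpus mid-update: cpu2 currently reads -1. */
	struct ve_cpu_counter stats[NR_CPUS] = { { 2 }, { 1 }, { -1 }, { 0 } };

	printf("runnable tasks in this VE: %lu\n", nr_running_ve_model(stats));
	return 0;
}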
@@ -2075,6 +2409,16 @@ static void update_cpu_load(struct rq *t } } +#ifdef CONFIG_VE +#define update_ve_cpu_time(p, time, tick) \ + do { \ + VE_CPU_STATS((p)->ve_task_info.owner_env, \ + task_cpu(p))->time += tick; \ + } while (0) +#else +#define update_ve_cpu_time(p, time, tick) do { } while (0) +#endif + #ifdef CONFIG_SMP /* @@ -2198,8 +2542,15 @@ void sched_exec(void) static void pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq, int this_cpu) { + struct ve_struct *ve; + cycles_t cycles = get_cycles(); + + ve = VE_TASK_INFO(p)->owner_env; + deactivate_task(src_rq, p, 0); + ve_nr_running_dec(ve, task_cpu(p), cycles); set_task_cpu(p, this_cpu); + ve_nr_running_inc(ve, task_cpu(p), cycles); activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test @@ -3365,10 +3716,13 @@ void account_user_time(struct task_struc /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (TASK_NICE(p) > 0) { cpustat->nice = cputime64_add(cpustat->nice, tmp); - else + update_ve_cpu_time(p, nice, tmp); + } else { cpustat->user = cputime64_add(cpustat->user, tmp); + update_ve_cpu_time(p, user, tmp); + } } /* @@ -3420,6 +3774,7 @@ void account_system_time(struct task_str /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); + update_ve_cpu_time(p, system, tmp); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) @@ -3663,13 +4018,33 @@ need_resched_nonpreemptible: sched_info_switch(prev, next); if (likely(prev != next)) { + cycles_t cycles = get_cycles(); + rq->nr_switches++; rq->curr = next; ++*switch_count; +#ifdef CONFIG_VE + prev->ve_task_info.sleep_stamp = cycles; + if (prev->state == TASK_RUNNING && prev != this_rq()->idle) + write_wakeup_stamp(prev, cycles); + update_sched_lat(next, cycles); + + /* because next & prev are protected with + * runqueue lock we may not worry about + * wakeup_stamp and sched_time protection + * (same thing in 'else' branch below) + */ + update_ve_task_info(prev, cycles); + next->ve_task_info.sched_time = cycles; + write_wakeup_stamp(next, 0); +#endif + context_switch(rq, prev, next); /* unlocks the rq */ - } else + } else { + update_ve_task_info(prev, get_cycles()); spin_unlock_irq(&rq->lock); + } if (unlikely(reacquire_kernel_lock(current) < 0)) { cpu = smp_processor_id(); @@ -4285,7 +4660,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (!capable(CAP_SYS_ADMIN)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -4761,10 +5136,15 @@ EXPORT_SYMBOL(yield); void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); +#ifdef CONFIG_VE + struct ve_struct *ve = current->ve_task_info.owner_env; +#endif delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + ve_nr_iowait_inc(ve, task_cpu(current)); schedule(); + ve_nr_iowait_dec(ve, task_cpu(current)); atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -4774,10 +5154,15 @@ long __sched io_schedule_timeout(long ti { struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; +#ifdef CONFIG_VE + struct ve_struct *ve = current->ve_task_info.owner_env; +#endif delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + ve_nr_iowait_inc(ve, task_cpu(current)); ret = schedule_timeout(timeout); + ve_nr_iowait_dec(ve, task_cpu(current)); atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; @@ -4898,17 +5283,7 @@ static void show_task(struct task_struct state = p->state ? 
__ffs(p->state) + 1 : 0; printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif + printk(KERN_CONT " %p ", p); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long *n = end_of_stack(p); @@ -4930,13 +5305,13 @@ void show_state_filter(unsigned long sta #if BITS_PER_LONG == 32 printk(KERN_INFO - " task PC stack pid father\n"); + " task taskaddr stack pid father\n"); #else printk(KERN_INFO - " task PC stack pid father\n"); + " task taskaddr stack pid father\n"); #endif read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: @@ -4944,7 +5319,7 @@ void show_state_filter(unsigned long sta touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) show_task(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); touch_all_softlockup_watchdogs(); @@ -5292,13 +5667,13 @@ static void migrate_live_tasks(int src_c read_lock(&tasklist_lock); - do_each_thread(t, p) { + do_each_thread_all(t, p) { if (p == current) continue; if (task_cpu(p) == src_cpu) move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); + } while_each_thread_all(t, p); read_unlock(&tasklist_lock); } @@ -6841,6 +7216,7 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + fairsched_init_early(); } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6890,7 +7266,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { /* * Only normalize user tasks: */ @@ -6922,7 +7298,7 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock_irq(&tasklist_lock); } diff -uprN linux-2.6.24/kernel/sched_debug.c linux-2.6.24.ovz/kernel/sched_debug.c --- linux-2.6.24/kernel/sched_debug.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sched_debug.c 2008-03-25 18:53:59.000000000 -0500 @@ -91,12 +91,12 @@ static void print_rq(struct seq_file *m, read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + do_each_thread_all(g, p) { if (!p->se.on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock_irqrestore(&tasklist_lock, flags); } diff -uprN linux-2.6.24/kernel/sched_fair.c linux-2.6.24.ovz/kernel/sched_fair.c --- linux-2.6.24/kernel/sched_fair.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sched_fair.c 2008-03-25 18:53:59.000000000 -0500 @@ -511,7 +511,7 @@ place_entity(struct cfs_rq *cfs_rq, stru if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se)) + if (sched_feat(NEW_FAIR_SLEEPERS)) vruntime -= sysctl_sched_latency; /* ensure we never gain time by being placed backwards. */ @@ -867,7 +867,11 @@ static void check_preempt_wakeup(struct } gran = sysctl_sched_wakeup_granularity; - if (unlikely(se->load.weight != NICE_0_LOAD)) + /* + * More easily preempt - nice tasks, while not making + * it harder for + nice tasks. 
+ */ + if (unlikely(se->load.weight > NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); if (pse->vruntime + gran < se->vruntime) diff -uprN linux-2.6.24/kernel/signal.c linux-2.6.24.ovz/kernel/signal.c --- linux-2.6.24/kernel/signal.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/signal.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,15 +31,34 @@ #include #include #include +#include #include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. */ -static struct kmem_cache *sigqueue_cachep; +struct kmem_cache *sigqueue_cachep; +EXPORT_SYMBOL(sigqueue_cachep); +static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t) +{ + struct ve_struct *ve; + + /* always allow signals from the kernel */ + if (info == SEND_SIG_FORCED || + (!is_si_special(info) && SI_FROMKERNEL(info))) + return 0; + + ve = current->ve_task_info.owner_env; + if (ve->ve_ns->pid_ns->child_reaper != t) + return 0; + if (ve_is_super(get_exec_env())) + return 0; + return !sig_user_defined(t, sig) || sig_kernel_only(sig); +} + static int sig_ignored(struct task_struct *t, int sig) { void __user * handler; @@ -96,7 +115,7 @@ static inline int has_pending_signals(si #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +int recalc_sigpending_tsk(struct task_struct *t) { if (t->signal->group_stop_count > 0 || PENDING(&t->pending, &t->blocked) || @@ -121,6 +140,7 @@ void recalc_sigpending_and_wake(struct t if (recalc_sigpending_tsk(t)) signal_wake_up(t, 0); } +EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); void recalc_sigpending(void) { @@ -179,8 +199,13 @@ static struct sigqueue *__sigqueue_alloc atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + if (q && ub_siginfo_charge(q, get_task_ub(t))) { + kmem_cache_free(sigqueue_cachep, q); + q = NULL; + } + } if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); } else { @@ -197,6 +222,7 @@ static void __sigqueue_free(struct sigqu return; atomic_dec(&q->user->sigpending); free_uid(q->user); + ub_siginfo_uncharge(q); kmem_cache_free(sigqueue_cachep, q); } @@ -345,7 +371,18 @@ static int collect_signal(int sig, struc static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) { - int sig = next_signal(pending, mask); + int sig = 0; + + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { @@ -468,6 +505,7 @@ void signal_wake_up(struct task_struct * if (!wake_up_state(t, mask)) kick_process(t); } +EXPORT_SYMBOL_GPL(signal_wake_up); /* * Remove signals in mask from the pending set and queue. 
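The __dequeue_signal() change above makes SIGKILL jump the queue: if it is pending and not blocked it is delivered before any lower-numbered signal, so a flood of low-numbered signals cannot starve it. An illustrative userspace model using a simple 64-bit pending mask follows.

#include <stdio.h>

#define SIGKILL 9
#define SIGBIT(sig) (1ULL << ((sig) - 1))

/* Lowest-numbered pending signal that is not blocked, 0 if none. */
static int next_signal_model(unsigned long long pending, unsigned long long blocked)
{
	unsigned long long ready = pending & ~blocked;
	int sig;

	for (sig = 1; sig <= 64; sig++)
		if (ready & SIGBIT(sig))
			return sig;
	return 0;
}

/* Mirrors the patched __dequeue_signal(): SIGKILL is checked first. */
static int dequeue_signal_model(unsigned long long pending, unsigned long long blocked)
{
	if ((pending & SIGBIT(SIGKILL)) && !(blocked & SIGBIT(SIGKILL)))
		return SIGKILL;
	return next_signal_model(pending, blocked);
}

int main(void)
{
	/* SIGHUP (1) and SIGKILL (9) both pending: SIGKILL wins despite its number. */
	unsigned long long pending = SIGBIT(1) | SIGBIT(SIGKILL);

	printf("plain numeric order would deliver %d first\n",
	       next_signal_model(pending, 0));
	printf("patched order delivers %d first\n",
	       dequeue_signal_model(pending, 0));
	return 0;
}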
@@ -908,7 +946,7 @@ __group_complete_signal(int sig, struct do { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - } while_each_thread(p, t); + } while_each_thread_all(p, t); return; } @@ -930,7 +968,7 @@ __group_complete_signal(int sig, struct do { p->signal->group_stop_count++; signal_wake_up(t, t == p); - } while_each_thread(p, t); + } while_each_thread_all(p, t); return; } @@ -1025,7 +1063,8 @@ int group_send_sig_info(int sig, struct if (!ret && sig) { ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); + ret = sig_ve_ignored(sig, info, p) ? 0 : + __group_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -1149,7 +1188,7 @@ static int kill_something_info(int sig, struct task_struct * p; read_lock(&tasklist_lock); - for_each_process(p) { + for_each_process_ve(p) { if (p->pid > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); ++count; @@ -1446,6 +1485,14 @@ void do_notify_parent(struct task_struct BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); +#ifdef CONFIG_VE + /* Allow to send only SIGCHLD from VE */ + if (sig != SIGCHLD && + tsk->ve_task_info.owner_env != + tsk->parent->ve_task_info.owner_env) + sig = SIGCHLD; +#endif + info.si_signo = sig; info.si_errno = 0; /* @@ -1684,7 +1731,9 @@ finish_stop(int stop_count) } do { + set_stop_state(current); schedule(); + clear_stop_state(current); } while (try_to_freeze()); /* * Now we don't run again until continued. @@ -1793,8 +1842,6 @@ int get_signal_to_deliver(siginfo_t *inf sigset_t *mask = ¤t->blocked; int signr = 0; - try_to_freeze(); - relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { @@ -2246,8 +2293,10 @@ static int do_tkill(int tgid, int pid, i */ if (!error && sig && p->sighand) { spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); + if (!sig_ve_ignored(sig, &info, p)) { + handle_stop_signal(sig, p); + error = specific_send_sig_info(sig, &info, p); + } spin_unlock_irq(&p->sighand->siglock); } } @@ -2604,5 +2653,5 @@ __attribute__((weak)) const char *arch_v void __init signals_init(void) { - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC|SLAB_UBC); } diff -uprN linux-2.6.24/kernel/softirq.c linux-2.6.24.ovz/kernel/softirq.c --- linux-2.6.24/kernel/softirq.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/softirq.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,8 @@ #include #include +#include + #include /* - No shared variables, all the data are CPU local. 
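The do_notify_parent() hunk above only lets SIGCHLD cross a VE boundary; any other parent-death signal is downgraded when the child and its parent live in different VEs. A minimal sketch of that rule; the task layout and VE ids are invented for the example.

#include <stdio.h>

#define SIGCHLD 17
#define SIGUSR1 10

struct task_model {
	const char *comm;
	int ve_id;		/* container the task belongs to */
};

/*
 * Mirrors the do_notify_parent() check: a dying task may only deliver
 * SIGCHLD across a VE boundary; anything else is forced back to SIGCHLD.
 */
static int parent_death_signal(const struct task_model *child,
			       const struct task_model *parent, int sig)
{
	if (sig != SIGCHLD && child->ve_id != parent->ve_id)
		return SIGCHLD;
	return sig;
}

int main(void)
{
	struct task_model init_host = { "init",       0 };
	struct task_model ve_init   = { "init-ve101", 101 };
	struct task_model ve_child  = { "worker",     101 };

	/* Same VE: the configured death signal is preserved. */
	printf("inside VE:  signal %d\n",
	       parent_death_signal(&ve_child, &ve_init, SIGUSR1));
	/* Crossing the VE boundary: forced back to SIGCHLD. */
	printf("across VEs: signal %d\n",
	       parent_death_signal(&ve_init, &init_host, SIGUSR1));
	return 0;
}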
@@ -207,10 +209,14 @@ EXPORT_SYMBOL(local_bh_enable_ip); asmlinkage void __do_softirq(void) { + struct user_beancounter *ub; struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + struct ve_struct *envid; + + envid = set_exec_env(get_ve0()); pending = local_softirq_pending(); account_system_vtime(current); @@ -227,6 +233,7 @@ restart: h = softirq_vec; + ub = set_exec_ub(get_ub0()); do { if (pending & 1) { h->action(h); @@ -235,6 +242,7 @@ restart: h++; pending >>= 1; } while (pending); + (void)set_exec_ub(ub); local_irq_disable(); @@ -248,6 +256,7 @@ restart: trace_softirq_exit(); account_system_vtime(current); + (void)set_exec_env(envid); _local_bh_enable(); } @@ -298,6 +307,7 @@ void irq_exit(void) { account_system_vtime(current); trace_hardirq_exit(); + restore_context(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); diff -uprN linux-2.6.24/kernel/stop_machine.c linux-2.6.24.ovz/kernel/stop_machine.c --- linux-2.6.24/kernel/stop_machine.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/stop_machine.c 2008-03-25 18:53:59.000000000 -0500 @@ -9,7 +9,7 @@ #include #include #include - +#include #include #include #include @@ -63,7 +63,7 @@ static int stopmachine(void *cpu) /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) - yield(); + msleep(10); else cpu_relax(); } @@ -109,7 +109,7 @@ static int stop_machine(void) /* Wait for them all to come to life. */ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - yield(); + msleep(10); /* If some failed, kill them all. */ if (ret < 0) { diff -uprN linux-2.6.24/kernel/sys.c linux-2.6.24.ovz/kernel/sys.c --- linux-2.6.24/kernel/sys.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sys.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include @@ -106,6 +108,102 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +DECLARE_MUTEX(virtinfo_sem); +EXPORT_SYMBOL(virtinfo_sem); +static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; + +void __virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + + for (p = &virtinfo_chain[type]; + *p != NULL && nb->priority < (*p)->priority; + p = &(*p)->next); + nb->next = *p; + smp_wmb(); + *p = nb; +} + +EXPORT_SYMBOL(__virtinfo_notifier_register); + +void virtinfo_notifier_register(int type, struct vnotifier_block *nb) +{ + down(&virtinfo_sem); + __virtinfo_notifier_register(type, nb); + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_register); + +struct virtinfo_cnt_struct { + volatile unsigned long exit[NR_CPUS]; + volatile unsigned long entry; +}; +static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); + +void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) +{ + struct vnotifier_block **p; + int entry_cpu, exit_cpu; + unsigned long cnt, ent; + + down(&virtinfo_sem); + for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); + *p = nb->next; + smp_mb(); + + for_each_cpu_mask(entry_cpu, cpu_possible_map) { + while (1) { + cnt = 0; + for_each_cpu_mask(exit_cpu, cpu_possible_map) + cnt += + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; + smp_rmb(); + ent = per_cpu(virtcnt, entry_cpu).entry; + if (cnt == ent) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + 
schedule_timeout(HZ / 100); + } + } + up(&virtinfo_sem); +} + +EXPORT_SYMBOL(virtinfo_notifier_unregister); + +int virtinfo_notifier_call(int type, unsigned long n, void *data) +{ + int ret; + int entry_cpu, exit_cpu; + struct vnotifier_block *nb; + + entry_cpu = get_cpu(); + per_cpu(virtcnt, entry_cpu).entry++; + smp_wmb(); + put_cpu(); + + nb = virtinfo_chain[type]; + ret = NOTIFY_DONE; + while (nb) + { + ret = nb->notifier_call(nb, n, data, ret); + if(ret & NOTIFY_STOP_MASK) { + ret &= ~NOTIFY_STOP_MASK; + break; + } + nb = nb->next; + } + + exit_cpu = get_cpu(); + smp_wmb(); + per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; + put_cpu(); + + return ret; +} + +EXPORT_SYMBOL(virtinfo_notifier_call); + static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; @@ -165,7 +263,7 @@ asmlinkage long sys_setpriority(int whic pgrp = task_pgrp(current); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(who, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -175,10 +273,10 @@ asmlinkage long sys_setpriority(int whic if ((who != current->uid) && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) if (p->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + while_each_thread_ve(g, p); if (who != current->uid) free_uid(user); /* For find_user() */ break; @@ -227,7 +325,7 @@ asmlinkage long sys_getpriority(int whic niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(who, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -237,13 +335,13 @@ asmlinkage long sys_getpriority(int whic if ((who != current->uid) && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread_ve(g, p) if (p->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + while_each_thread_ve(g, p); if (who != current->uid) free_uid(user); /* for find_user() */ break; @@ -377,6 +475,25 @@ asmlinkage long sys_reboot(int magic1, i magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; +#ifdef CONFIG_VE + if (!ve_is_super(get_exec_env())) + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + case LINUX_REBOOT_CMD_HALT: + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_RESTART2: + force_sig(SIGKILL, + get_exec_env()->ve_ns->pid_ns->child_reaper); + + case LINUX_REBOOT_CMD_CAD_ON: + case LINUX_REBOOT_CMD_CAD_OFF: + return 0; + + default: + return -EINVAL; + } +#endif + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. 
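__virtinfo_notifier_register() above keeps each notifier chain sorted by descending priority, walking the links until it finds the first block with a lower priority. The userspace sketch below reproduces that insertion and then fires the chain in order; the callback signature is simplified relative to the real vnotifier_block.

#include <stdio.h>

struct vnotifier_model {
	int (*notifier_call)(void *data);
	struct vnotifier_model *next;
	int priority;
};

/* Insert so the chain stays sorted by descending priority, as the patch does. */
static void notifier_register_model(struct vnotifier_model **chain,
				    struct vnotifier_model *nb)
{
	struct vnotifier_model **p;

	for (p = chain; *p != NULL && nb->priority < (*p)->priority; p = &(*p)->next)
		;
	nb->next = *p;
	*p = nb;
}

static int say_a(void *data) { (void)data; printf("A (prio 10)\n"); return 0; }
static int say_b(void *data) { (void)data; printf("B (prio 50)\n"); return 0; }
static int say_c(void *data) { (void)data; printf("C (prio 30)\n"); return 0; }

int main(void)
{
	struct vnotifier_model *chain = NULL, *nb;
	struct vnotifier_model a = { say_a, NULL, 10 };
	struct vnotifier_model b = { say_b, NULL, 50 };
	struct vnotifier_model c = { say_c, NULL, 30 };

	notifier_register_model(&chain, &a);
	notifier_register_model(&chain, &b);
	notifier_register_model(&chain, &c);

	/* Callbacks fire highest priority first: B, C, A. */
	for (nb = chain; nb != NULL; nb = nb->next)
		nb->notifier_call(NULL);
	return 0;
}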
*/ @@ -558,7 +675,7 @@ asmlinkage long sys_setgid(gid_t gid) return 0; } -static int set_user(uid_t new_ruid, int dumpclear) +int set_user(uid_t new_ruid, int dumpclear) { struct user_struct *new_user; @@ -582,6 +699,7 @@ static int set_user(uid_t new_ruid, int current->uid = new_ruid; return 0; } +EXPORT_SYMBOL(set_user); /* * Unprivileged users may change the real uid to the effective uid @@ -862,8 +980,27 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +#ifdef CONFIG_VE +unsigned long long ve_relative_clock(struct timespec * ts) +{ + unsigned long long offset = 0; + + if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec || + (ts->tv_sec == get_exec_env()->start_timespec.tv_sec && + ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec)) + offset = (unsigned long long)(ts->tv_sec - + get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC + + ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec; + return nsec_to_clock_t(offset); +} +#endif + asmlinkage long sys_times(struct tms __user * tbuf) { +#ifdef CONFIG_VE + struct timespec now; +#endif + /* * In the SMP world we might just be unlucky and have one of * the times increment as we use it. Since the value is an @@ -897,7 +1034,13 @@ asmlinkage long sys_times(struct tms __u if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } +#ifndef CONFIG_VE return (long) jiffies_64_to_clock_t(get_jiffies_64()); +#else + /* Compare to calculation in fs/proc/array.c */ + do_posix_clock_monotonic_gettime(&now); + return ve_relative_clock(&now); +#endif } /* @@ -1049,6 +1192,7 @@ asmlinkage long sys_setsid(void) { struct task_struct *group_leader = current->group_leader; pid_t session; + struct pid *sid; int err = -EPERM; write_lock_irq(&tasklist_lock); @@ -1057,7 +1201,8 @@ asmlinkage long sys_setsid(void) if (group_leader->signal->leader) goto out; - session = group_leader->pid; + sid = task_pid(group_leader); + session = pid_vnr(sid); /* Fail if a process group id already exists that equals the * proposed session id. * @@ -1065,18 +1210,18 @@ asmlinkage long sys_setsid(void) * session id and so the check will always fail and make it so * init cannot successfully call setsid. 
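ve_relative_clock() above reports time relative to the VE's start_timespec: the offset is clamped at zero and converted to clock_t, so sys_times() inside a container counts from container start rather than from boot. A userspace approximation follows; USER_HZ and the sample timestamps are assumptions.

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL
#define USER_HZ 100ULL		/* clock_t granularity assumed for the example */

/* Nanoseconds elapsed since 'start', clamped at zero, returned as clock ticks. */
static unsigned long long ve_relative_clock_model(const struct timespec *start,
						  const struct timespec *now)
{
	unsigned long long offset = 0;

	if (now->tv_sec > start->tv_sec ||
	    (now->tv_sec == start->tv_sec && now->tv_nsec >= start->tv_nsec))
		offset = (unsigned long long)(now->tv_sec - start->tv_sec) *
			 NSEC_PER_SEC + now->tv_nsec - start->tv_nsec;
	return offset / (NSEC_PER_SEC / USER_HZ);	/* ns -> USER_HZ ticks */
}

int main(void)
{
	/* Container started 90.5 s (monotonic) after boot; it is now 215.25 s. */
	struct timespec start = { .tv_sec = 90,  .tv_nsec = 500000000 };
	struct timespec now   = { .tv_sec = 215, .tv_nsec = 250000000 };

	printf("container uptime: %llu ticks\n",
	       ve_relative_clock_model(&start, &now));
	return 0;
}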
*/ - if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID, - session, &init_pid_ns)) + if (session > 1 && pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; - __set_special_pids(session, session); + __set_special_pids(sid); spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; spin_unlock(&group_leader->sighand->siglock); - err = task_pgrp_vnr(group_leader); + err = session; out: write_unlock_irq(&tasklist_lock); return err; @@ -1358,7 +1503,7 @@ asmlinkage long sys_sethostname(char __u int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1403,7 +1548,7 @@ asmlinkage long sys_setdomainname(char _ int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_VE_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff -uprN linux-2.6.24/kernel/sys_ni.c linux-2.6.24.ovz/kernel/sys_ni.c --- linux-2.6.24/kernel/sys_ni.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sys_ni.c 2008-03-25 18:53:59.000000000 -0500 @@ -157,3 +157,15 @@ cond_syscall(sys_timerfd); cond_syscall(compat_sys_signalfd); cond_syscall(compat_sys_timerfd); cond_syscall(sys_eventfd); +cond_syscall(sys_getluid); +cond_syscall(sys_setluid); +cond_syscall(sys_setublimit); +cond_syscall(sys_ubstat); + +/* fairsched compat */ +cond_syscall(sys_fairsched_mknod); +cond_syscall(sys_fairsched_rmnod); +cond_syscall(sys_fairsched_mvpr); +cond_syscall(sys_fairsched_vcpus); +cond_syscall(sys_fairsched_chwt); +cond_syscall(sys_fairsched_rate); diff -uprN linux-2.6.24/kernel/sysctl.c linux-2.6.24.ovz/kernel/sysctl.c --- linux-2.6.24/kernel/sysctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sysctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -58,6 +61,8 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args); #if defined(CONFIG_SYSCTL) +extern int gr_handle_sysctl_mod(const char *dirname, const char *name, + const int op); /* External variables not in a header file. 
*/ extern int C_A_D; @@ -69,6 +74,7 @@ extern int sysctl_oom_kill_allocating_ta extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; +extern int sysctl_at_vsyscall; extern char core_pattern[]; extern int pid_max; extern int min_free_kbytes; @@ -80,6 +86,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int ve_area_access_check; /* fs/namei.c */ extern int audit_argv_kb; /* Constants used for minimum and maximum */ @@ -102,6 +109,8 @@ static int min_percpu_pagelist_fract = 8 static int ngroups_max = NGROUPS_MAX; +int ve_allow_kthreads = 1; +EXPORT_SYMBOL(ve_allow_kthreads); #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif @@ -115,6 +124,8 @@ extern int stop_a_enabled; extern int scons_pwroff; #endif +extern int alloc_fail_warn; + #ifdef __hppa__ extern int pwrsw_enabled; extern int unaligned_enabled; @@ -129,6 +140,7 @@ extern int spin_retry; #endif extern int sysctl_hz_timer; +int decode_call_traces = 1; #ifdef CONFIG_BSD_PROCESS_ACCT extern int acct_parm[]; @@ -137,6 +149,10 @@ extern int acct_parm[]; #ifdef CONFIG_IA64 extern int no_unaligned_warning; #endif +#ifdef CONFIG_VE +int glob_ve_meminfo = 0; +EXPORT_SYMBOL(glob_ve_meminfo); +#endif #ifdef CONFIG_RT_MUTEXES extern int max_lock_depth; @@ -172,6 +188,7 @@ extern struct ctl_table inotify_table[]; #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; #endif +extern ctl_table grsecurity_table[]; extern int prove_locking; extern int lock_stat; @@ -385,7 +402,7 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SECURITY_CAPABILITIES { .procname = "cap-bound", - .data = &cap_bset, + .data = NULL, .maxlen = sizeof(kernel_cap_t), .mode = 0600, .proc_handler = &proc_dointvec_bset, @@ -436,6 +453,20 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .procname = "silence-level", + .data = &console_silence_loglevel, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "alloc_fail_warn", + .data = &alloc_fail_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef __hppa__ { .ctl_name = KERN_HPPA_PWRSW, @@ -608,6 +639,15 @@ static struct ctl_table kern_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_VE + { + .procname = "ve_meminfo", + .data = &glob_ve_meminfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", @@ -775,6 +815,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, +#ifdef CONFIG_GRKERNSEC_SYSCTL + { + .ctl_name = KERN_GRSECURITY, + .procname = "grsecurity", + .mode = 0500, + .child = grsecurity_table, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1225,6 +1273,20 @@ static struct ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif + { + .procname = "vsyscall", + .data = &sysctl_at_vsyscall, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "odirect_enable", + .data = &odirect_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1233,6 +1295,13 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table 
debug_table[] = { + { + .procname = "decode_call_traces", + .data = &decode_call_traces, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, @@ -1304,14 +1373,26 @@ struct ctl_table_header *sysctl_head_nex { struct ctl_table_header *head; struct list_head *tmp; + struct ve_struct *ve; + + ve = get_exec_env(); spin_lock(&sysctl_lock); if (prev) { tmp = &prev->ctl_entry; unuse_table(prev); goto next; } +#ifdef CONFIG_VE + tmp = ve->sysctl_lh.next; +#else tmp = &root_table_header.ctl_entry; +#endif for (;;) { +#ifdef CONFIG_VE + if (tmp == &ve->sysctl_lh) + /* second pass over global variables */ + tmp = &root_table_header.ctl_entry; +#endif head = list_entry(tmp, struct ctl_table_header, ctl_entry); if (!use_table(head)) @@ -1394,10 +1475,17 @@ static int test_perm(int mode, int op) int sysctl_perm(struct ctl_table *table, int op) { int error; + int mode = table->mode; + + if (table->procname && table->parent && gr_handle_sysctl_mod(table->parent->procname, table->procname, op)) + return -EACCES; error = security_sysctl(table, op); if (error) return error; - return test_perm(table->mode, op); + if (!ve_accessible(table->owner_env, get_exec_env()) && + !table->virt_handler) + mode &= ~0222; /* disable write access */ + return test_perm(mode, op); } #ifdef CONFIG_SYSCTL_SYSCALL @@ -1434,6 +1522,36 @@ repeat: return -ENOTDIR; } +int __do_sysctl_strategy (void *data, ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { + size_t len; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, data, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if (copy_from_user(data, newval, len)) + return -EFAULT; + } + + return 0; +} + /* Perform the actual read/write of a sysctl table entry. */ int do_sysctl_strategy (struct ctl_table *table, int __user *name, int nlen, @@ -1557,9 +1675,11 @@ core_initcall(sysctl_init); * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. 
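sysctl_perm() above drops the write bits for ctl_table entries that are not owned by the calling VE and have no virtualization handler, so containers see global sysctls read-only. The sketch below is a loose userspace approximation: ve_accessible() is reduced to "the host reaches everything, other VEs only their own entries", test_perm() to an owner-class check, and the example table entries are invented.

#include <stdio.h>

#define MAY_WRITE 2

struct ctl_entry_model {
	const char *procname;
	int mode;		/* octal permission bits, e.g. 0644 */
	int owner_ve;		/* id of the VE that registered the entry */
	int virt_handler;	/* entry knows how to virtualize writes */
};

/* Strip write bits when a foreign, non-virtualized entry is accessed. */
static int effective_mode(const struct ctl_entry_model *e, int caller_ve)
{
	int mode = e->mode;

	/* ve_accessible() approximated: VE0 (the host) reaches everything. */
	if (caller_ve != 0 && e->owner_ve != caller_ve && !e->virt_handler)
		mode &= ~0222;
	return mode;
}

/* Crude stand-in for test_perm(): treat the caller as the owner class. */
static int may_write(const struct ctl_entry_model *e, int caller_ve)
{
	return ((effective_mode(e, caller_ve) >> 6) & MAY_WRITE) != 0;
}

int main(void)
{
	struct ctl_entry_model host_only = { "kernel/pid_max",  0644, 0, 0 };
	struct ctl_entry_model virtual   = { "kernel/hostname", 0644, 0, 1 };

	printf("VE 101 may write pid_max:  %s\n", may_write(&host_only, 101) ? "yes" : "no");
	printf("VE 101 may write hostname: %s\n", may_write(&virtual,   101) ? "yes" : "no");
	printf("VE 0   may write pid_max:  %s\n", may_write(&host_only, 0)   ? "yes" : "no");
	return 0;
}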
*/ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +static struct ctl_table_header *__register_sysctl_table(struct list_head *lh, + struct ctl_table * table) { struct ctl_table_header *tmp; + tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); if (!tmp) return NULL; @@ -1573,11 +1693,76 @@ struct ctl_table_header *register_sysctl return NULL; } spin_lock(&sysctl_lock); - list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); + list_add_tail(&tmp->ctl_entry, lh); spin_unlock(&sysctl_lock); return tmp; } +struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +{ + struct list_head *lh; + +#ifdef CONFIG_VE + lh = &get_exec_env()->sysctl_lh; +#else + lh = &root_table_header.ctl_entry; +#endif + return __register_sysctl_table(lh, table); +} + +#ifdef CONFIG_VE +struct ctl_table_header *register_glob_sysctl_table(struct ctl_table *table) +{ + return __register_sysctl_table(&root_table_header.ctl_entry, table); +} +EXPORT_SYMBOL(register_glob_sysctl_table); +#endif + +void free_sysctl_clone(ctl_table *clone) +{ + int i; + + for (i = 0; clone[i].ctl_name != 0; i++) + if (clone[i].child != NULL) + free_sysctl_clone(clone[i].child); + + kfree(clone); +} + +ctl_table *clone_sysctl_template(ctl_table *tmpl) +{ + int i, nr; + ctl_table *clone; + + nr = 0; + while (tmpl[nr].ctl_name != 0 || tmpl[nr].procname) + nr++; + nr++; + + clone = kmemdup(tmpl, nr * sizeof(ctl_table), GFP_KERNEL); + if (clone == NULL) + return NULL; + + for (i = 0; i < nr; i++) { + clone[i].owner_env = get_exec_env(); + if (tmpl[i].child == NULL) + continue; + + clone[i].child = clone_sysctl_template(tmpl[i].child); + if (clone[i].child == NULL) + goto unroll; + } + return clone; + +unroll: + for (i--; i >= 0; i--) + if (clone[i].child != NULL) + free_sysctl_clone(clone[i].child); + + kfree(clone); + return NULL; +} + /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl_table @@ -1608,6 +1793,14 @@ void unregister_sysctl_table(struct ctl_ { } +ctl_table * clone_sysctl_template(ctl_table *tmpl, int nr) +{ + return NULL; +} + +void free_sysctl_clone(ctl_table *tmpl) +{ +} #endif /* CONFIG_SYSCTL */ /* @@ -1899,13 +2092,19 @@ int proc_dointvec_bset(struct ctl_table { int op; - if (write && !capable(CAP_SYS_MODULE)) { + struct ve_struct *ve; + + ve = get_exec_env(); + + /* For VE's root writing to VE's cap-bound is prohibited */ + if ((ve_is_super(ve) && write && !capable(CAP_SYS_MODULE)) || + (!ve_is_super(ve) && (!capable(CAP_VE_ADMIN) || write))) { return -EPERM; } op = is_global_init(current) ? OP_SET : OP_AND; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + return __do_proc_dointvec(&cap_bset, table, write, filp, + buffer, lenp, ppos, do_proc_dointvec_bset_conv, &op); } #endif /* def CONFIG_SECURITY_CAPABILITIES */ @@ -2649,6 +2848,57 @@ static int deprecated_sysctl_warning(str return 0; } +#ifdef CONFIG_PID_NS +#include + +static int proc_pid_ns_hide_child(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int tmp, res; + + tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 
1 : 0; + + res = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (res || !write) + return res; + + if (tmp) + current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD; + else + current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD; + return 0; +} + +static struct ctl_table pid_ns_kern_table[] = { + { + .procname = "pid_ns_hide_child", + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_pid_ns_hide_child, + }, + {} +}; + +static struct ctl_table pid_ns_root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = pid_ns_kern_table, + }, + {} +}; + +static __init int pid_ns_sysctl_init(void) +{ + register_sysctl_table(pid_ns_root_table); + return 0; +} +postcore_initcall(pid_ns_sysctl_init); +#endif /* CONFIG_PID_NS */ + /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -2668,3 +2918,5 @@ EXPORT_SYMBOL(sysctl_ms_jiffies); EXPORT_SYMBOL(sysctl_string); EXPORT_SYMBOL(sysctl_data); EXPORT_SYMBOL(unregister_sysctl_table); +EXPORT_SYMBOL(clone_sysctl_template); +EXPORT_SYMBOL(free_sysctl_clone); diff -uprN linux-2.6.24/kernel/sysctl_check.c linux-2.6.24.ovz/kernel/sysctl_check.c --- linux-2.6.24/kernel/sysctl_check.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/sysctl_check.c 2008-03-25 18:53:59.000000000 -0500 @@ -1459,6 +1459,7 @@ static void sysctl_check_bin_path(struct int sysctl_check_table(struct ctl_table *table) { int error = 0; + return 0; for (; table->ctl_name || table->procname; table++) { const char *fail = NULL; diff -uprN linux-2.6.24/kernel/taskstats.c linux-2.6.24.ovz/kernel/taskstats.c --- linux-2.6.24/kernel/taskstats.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/taskstats.c 2008-03-25 18:53:59.000000000 -0500 @@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct ta if (!tsk) { rcu_read_lock(); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (tsk) get_task_struct(tsk); rcu_read_unlock(); @@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct */ rcu_read_lock(); if (!first) - first = find_task_by_pid(tgid); + first = find_task_by_vpid(tgid); if (!first || !lock_task_sighand(first, &flags)) goto out; @@ -254,7 +254,7 @@ static int fill_tgid(pid_t tgid, struct stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); + } while_each_thread_all(first, tsk); unlock_task_sighand(first, &flags); rc = 0; diff -uprN linux-2.6.24/kernel/time/timekeeping.c linux-2.6.24.ovz/kernel/time/timekeeping.c --- linux-2.6.24/kernel/time/timekeeping.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/time/timekeeping.c 2008-03-25 18:53:59.000000000 -0500 @@ -43,6 +43,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOC * used instead. */ struct timespec xtime __attribute__ ((aligned (16))); +EXPORT_SYMBOL_GPL(xtime); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ diff -uprN linux-2.6.24/kernel/time.c linux-2.6.24.ovz/kernel/time.c --- linux-2.6.24/kernel/time.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/time.c 2008-03-25 18:53:59.000000000 -0500 @@ -577,12 +577,14 @@ EXPORT_SYMBOL(jiffies_to_clock_t); unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 + WARN_ON((long)x < 0); if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else u64 jif; + WARN_ON((long)x < 0); /* Don't worry about loss of precision here .. 
*/ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; @@ -597,6 +599,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies); u64 jiffies_64_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 do_div(x, HZ / USER_HZ); #else @@ -615,6 +618,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) { + WARN_ON((s64)x < 0); #if (NSEC_PER_SEC % USER_HZ) == 0 do_div(x, (NSEC_PER_SEC / USER_HZ)); #elif (USER_HZ % 512) == 0 diff -uprN linux-2.6.24/kernel/timer.c linux-2.6.24.ovz/kernel/timer.c --- linux-2.6.24/kernel/timer.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/timer.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -655,7 +657,11 @@ static inline void __run_timers(tvec_bas spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); + struct ve_struct *ve; + + ve = set_exec_env(get_ve0()); fn(data); + (void)set_exec_env(ve); if (preempt_count != preempt_count()) { printk(KERN_WARNING "huh, entered %p " "with preempt_count %08x, exited" @@ -872,6 +878,37 @@ EXPORT_SYMBOL(avenrun); * calc_load - given tick count, update the avenrun load estimates. * This is called while holding a write_lock on xtime_lock. */ + + +#ifdef CONFIG_VE +static void calc_load_ve(void) +{ + unsigned long flags, nr_unint, nr_active; + struct ve_struct *ve; + + read_lock(&ve_list_lock); + for_each_ve(ve) { + nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); + nr_active *= FIXED_1; + + CALC_LOAD(ve->avenrun[0], EXP_1, nr_active); + CALC_LOAD(ve->avenrun[1], EXP_5, nr_active); + CALC_LOAD(ve->avenrun[2], EXP_15, nr_active); + } + read_unlock(&ve_list_lock); + + nr_unint = nr_uninterruptible() * FIXED_1; + spin_lock_irqsave(&kstat_glb_lock, flags); + CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); + CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); + CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); + spin_unlock_irqrestore(&kstat_glb_lock, flags); + +} +#else +#define calc_load_ve() do { } while (0) +#endif + static inline void calc_load(unsigned long ticks) { unsigned long active_tasks; /* fixed-point */ @@ -884,6 +921,7 @@ static inline void calc_load(unsigned lo CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); + calc_load_ve(); count += LOAD_FREQ; } while (count < 0); } @@ -1122,11 +1160,12 @@ int do_sysinfo(struct sysinfo *info) unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; unsigned long seq; + unsigned long *__avenrun; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); do { - struct timespec tp; seq = read_seqbegin(&xtime_lock); /* @@ -1144,18 +1183,34 @@ int do_sysinfo(struct sysinfo *info) tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + } while (read_seqretry(&xtime_lock, seq)); + if (ve_is_super(get_exec_env())) { + info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); + __avenrun = &avenrun[0]; info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + } +#ifdef CONFIG_VE + else { + struct ve_struct *ve; + ve = get_exec_env(); + __avenrun = &ve->avenrun[0]; + info->procs = atomic_read(&ve->pcounter); + info->uptime = tp.tv_sec - ve->start_timespec.tv_sec; + } +#endif + info->loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); si_meminfo(info); si_swapinfo(info); +#ifdef CONFIG_BEANCOUNTERS + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info) + & NOTIFY_FAIL) + return -ENOMSG; +#endif /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then diff -uprN linux-2.6.24/kernel/user.c linux-2.6.24.ovz/kernel/user.c --- linux-2.6.24/kernel/user.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/user.c 2008-03-25 18:53:59.000000000 -0500 @@ -318,6 +318,7 @@ void free_uid(struct user_struct *up) else local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(free_uid); struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { @@ -406,6 +407,7 @@ struct user_struct * alloc_uid(struct us return up; } +EXPORT_SYMBOL_GPL(alloc_uid); void switch_uid(struct user_struct *new_user) { @@ -436,6 +438,7 @@ void switch_uid(struct user_struct *new_ free_uid(old_user); suid_keys(current); } +EXPORT_SYMBOL_GPL(switch_uid); void release_uids(struct user_namespace *ns) { @@ -467,7 +470,7 @@ static int __init uid_cache_init(void) int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); diff -uprN linux-2.6.24/kernel/utsname_sysctl.c linux-2.6.24.ovz/kernel/utsname_sysctl.c --- linux-2.6.24/kernel/utsname_sysctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/kernel/utsname_sysctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -27,6 +27,10 @@ static void *get_uts(ctl_table *table, i down_read(&uts_sem); else down_write(&uts_sem); + + if (strcmp(table->procname, "virt_osrelease") == 0) + return virt_utsname.release; + return which; } @@ -115,6 +119,7 @@ static struct ctl_table uts_kern_table[] .mode = 0644, .proc_handler = proc_do_uts_string, .strategy = sysctl_uts_string, + .virt_handler = 1, }, { .ctl_name = KERN_DOMAINNAME, @@ -124,6 +129,14 @@ static struct ctl_table uts_kern_table[] .mode = 0644, .proc_handler = proc_do_uts_string, .strategy = sysctl_uts_string, + .virt_handler = 1, + }, { + .procname = "virt_osrelease", + .data = virt_utsname.release, + .maxlen = sizeof(virt_utsname.release), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = sysctl_uts_string, }, {} }; @@ -140,7 +153,7 @@ static struct ctl_table uts_root_table[] static int __init utsname_sysctl_init(void) { - register_sysctl_table(uts_root_table); + register_glob_sysctl_table(uts_root_table); return 0; } diff -uprN linux-2.6.24/kernel/ve/Makefile linux-2.6.24.ovz/kernel/ve/Makefile --- linux-2.6.24/kernel/ve/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,16 @@ +# +# +# kernel/ve/Makefile +# +# Copyright (C) 2000-2005 SWsoft +# All rights reserved. +# +# Licensing governed by "linux/COPYING.SWsoft" file. 
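
[Editor's note -- illustrative example, not part of the patch.] The do_sysinfo() and utsname-sysctl hunks above are the part of this patch that is directly visible from inside a container: sysinfo() returns the VE's own load averages, process count and uptime, and the new kernel.virt_osrelease entry (registered globally via register_glob_sysctl_table) lets the host override the release string a VE reports. A minimal userspace sketch using only standard glibc calls; that the printed values are per-VE is an assumption that the patched kernel is running and the program is executed inside a container:

/* sysinfo_ve.c -- print the values the hunks above virtualize per-VE.
 * Build: gcc -o sysinfo_ve sysinfo_ve.c
 */
#include <stdio.h>
#include <sys/sysinfo.h>
#include <sys/utsname.h>

int main(void)
{
	struct sysinfo si;
	struct utsname un;

	if (sysinfo(&si) != 0) {
		perror("sysinfo");
		return 1;
	}
	uname(&un);

	/* loads[] is fixed-point, scaled by 1 << SI_LOAD_SHIFT (65536) */
	printf("uptime:  %ld s\n", (long)si.uptime);
	printf("load:    %.2f %.2f %.2f\n",
	       si.loads[0] / 65536.0,
	       si.loads[1] / 65536.0,
	       si.loads[2] / 65536.0);
	printf("procs:   %u\n", si.procs);
	printf("release: %s\n", un.release);
	return 0;
}

Run inside a VE, the load figures come from ve->avenrun and the uptime from ve->start_timespec per the do_sysinfo() hunk; on the host (VE0) the unchanged global values are reported.
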
+ +obj-$(CONFIG_VE) = ve.o veowner.o hooks.o devperms.o +obj-$(CONFIG_VZ_WDOG) += vzwdog.o +obj-$(CONFIG_VE_CALLS) += vzmon.o + +vzmon-objs = vecalls.o + +obj-$(CONFIG_VZ_DEV) += vzdev.o diff -uprN linux-2.6.24/kernel/ve/devperms.c linux-2.6.24.ovz/kernel/ve/devperms.c --- linux-2.6.24/kernel/ve/devperms.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/devperms.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,418 @@ +/* + * linux/kernel/ve/devperms.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + * Devices permissions routines, + * character and block devices separately + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Rules applied in the following order: + * MAJOR!=0, MINOR!=0 + * MAJOR!=0, MINOR==0 + * MAJOR==0, MINOR==0 + */ + +struct devperms_struct { + dev_t dev; /* device id */ + unsigned char mask; + unsigned type; + envid_t veid; + + struct hlist_node hash; + struct rcu_head rcu; +}; + +static struct devperms_struct default_major_perms[] = { + { + MKDEV(UNIX98_PTY_MASTER_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(UNIX98_PTY_SLAVE_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(PTY_MASTER_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(PTY_SLAVE_MAJOR, 0), + S_IROTH | S_IWOTH, + S_IFCHR, + }, +}; + +static struct devperms_struct default_minor_perms[] = { + { + MKDEV(MEM_MAJOR, 3), /* null */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 5), /* zero */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 7), /* full */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(TTYAUX_MAJOR, 0), /* tty */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(TTYAUX_MAJOR, 2), /* ptmx */ + S_IROTH | S_IWOTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 8), /* random */ + S_IROTH, + S_IFCHR, + }, + { + MKDEV(MEM_MAJOR, 9), /* urandom */ + S_IROTH, + S_IFCHR + }, +}; + +static struct devperms_struct default_deny_perms = { + MKDEV(0, 0), + 0, + S_IFCHR, +}; + +static inline struct devperms_struct *find_default_devperms(int type, dev_t dev) +{ + int i; + + /* XXX all defaults perms are S_IFCHR */ + if (type != S_IFCHR) + return &default_deny_perms; + + for (i = 0; i < ARRAY_SIZE(default_minor_perms); i++) + if (MAJOR(dev) == MAJOR(default_minor_perms[i].dev) && + MINOR(dev) == MINOR(default_minor_perms[i].dev)) + return &default_minor_perms[i]; + + for (i = 0; i < ARRAY_SIZE(default_major_perms); i++) + if (MAJOR(dev) == MAJOR(default_major_perms[i].dev)) + return &default_major_perms[i]; + + return &default_deny_perms; +} + +#define DEVPERMS_HASH_SZ 512 +#define devperms_hashfn(id, dev) \ + ( (id << 5) ^ (id >> 5) ^ (MAJOR(dev)) ^ MINOR(dev) ) & \ + (DEVPERMS_HASH_SZ - 1) + +static DEFINE_SPINLOCK(devperms_hash_lock); +static struct hlist_head devperms_hash[DEVPERMS_HASH_SZ]; + +static inline struct devperms_struct *find_devperms(envid_t veid, + int type, + dev_t dev) +{ + struct hlist_head *table; + struct devperms_struct *perms; + struct hlist_node *h; + + table = &devperms_hash[devperms_hashfn(veid, dev)]; + hlist_for_each_entry_rcu (perms, h, table, hash) + if (perms->type == type && perms->veid == veid && + MAJOR(perms->dev) == MAJOR(dev) && + MINOR(perms->dev) == MINOR(dev)) + return perms; + + return NULL; +} + +static void free_devperms(struct rcu_head *rcu) +{ + struct devperms_struct *perms; + + perms = container_of(rcu, struct devperms_struct, 
rcu); + kfree(perms); +} + +/* API calls */ + +void clean_device_perms_ve(envid_t veid) +{ + int i; + struct devperms_struct *p; + struct hlist_node *n, *tmp; + + spin_lock(&devperms_hash_lock); + for (i = 0; i < DEVPERMS_HASH_SZ; i++) + hlist_for_each_entry_safe (p, n, tmp, &devperms_hash[i], hash) + if (p->veid == veid) { + hlist_del_rcu(&p->hash); + call_rcu(&p->rcu, free_devperms); + } + spin_unlock(&devperms_hash_lock); +} + +EXPORT_SYMBOL(clean_device_perms_ve); + +/* + * Mode is a mask of + * FMODE_READ for read access (configurable by S_IROTH) + * FMODE_WRITE for write access (configurable by S_IWOTH) + * FMODE_QUOTACTL for quotactl access (configurable by S_IXGRP) + */ + +int get_device_perms_ve(int dev_type, dev_t dev, int access_mode) +{ + struct devperms_struct *p; + struct ve_struct *ve; + envid_t veid; + char mask; + + ve = get_exec_env(); + veid = ve->veid; + rcu_read_lock(); + + p = find_devperms(veid, dev_type | VE_USE_MINOR, dev); + if (p != NULL) + goto end; + + p = find_devperms(veid, dev_type | VE_USE_MAJOR, MKDEV(MAJOR(dev),0)); + if (p != NULL) + goto end; + + p = find_devperms(veid, dev_type, MKDEV(0,0)); + if (p != NULL) + goto end; + + if (ve->features & VE_FEATURE_DEF_PERMS) { + p = find_default_devperms(dev_type, dev); + if (p != NULL) + goto end; + } + + rcu_read_unlock(); + return -ENODEV; + +end: + mask = p->mask; + rcu_read_unlock(); + + access_mode = "\000\004\002\006\010\014\012\016"[access_mode]; + return ((mask & access_mode) == access_mode) ? 0 : -EACCES; +} + +EXPORT_SYMBOL(get_device_perms_ve); + +int set_device_perms_ve(envid_t veid, unsigned type, dev_t dev, unsigned mask) +{ + struct devperms_struct *perms, *new_perms; + struct hlist_head *htable; + + new_perms = kmalloc(sizeof(struct devperms_struct), GFP_KERNEL); + + spin_lock(&devperms_hash_lock); + perms = find_devperms(veid, type, dev); + if (perms != NULL) { + kfree(new_perms); + perms->mask = mask & S_IALLUGO; + } else { + switch (type & VE_USE_MASK) { + case 0: + dev = 0; + break; + case VE_USE_MAJOR: + dev = MKDEV(MAJOR(dev),0); + break; + } + + new_perms->veid = veid; + new_perms->dev = dev; + new_perms->type = type; + new_perms->mask = mask & S_IALLUGO; + + htable = &devperms_hash[devperms_hashfn(new_perms->veid, + new_perms->dev)]; + hlist_add_head_rcu(&new_perms->hash, htable); + } + spin_unlock(&devperms_hash_lock); + return 0; +} + +EXPORT_SYMBOL(set_device_perms_ve); + +#ifdef CONFIG_PROC_FS +static int devperms_seq_show(struct seq_file *m, void *v) +{ + struct devperms_struct *dp; + char dev_s[32], type_c; + unsigned use, type; + dev_t dev; + + dp = (struct devperms_struct *)v; + if (dp == (struct devperms_struct *)1L) { + seq_printf(m, "Version: 2.7\n"); + return 0; + } + + use = dp->type & VE_USE_MASK; + type = dp->type & S_IFMT; + dev = dp->dev; + + if ((use | VE_USE_MINOR) == use) + snprintf(dev_s, sizeof(dev_s), "%d:%d", MAJOR(dev), MINOR(dev)); + else if ((use | VE_USE_MAJOR) == use) + snprintf(dev_s, sizeof(dev_s), "%d:*", MAJOR(dp->dev)); + else + snprintf(dev_s, sizeof(dev_s), "*:*"); + + if (type == S_IFCHR) + type_c = 'c'; + else if (type == S_IFBLK) + type_c = 'b'; + else + type_c = '?'; + + seq_printf(m, "%10u %c %03o %s\n", dp->veid, type_c, dp->mask, dev_s); + return 0; +} + +static void *devperms_seq_start(struct seq_file *m, loff_t *pos) +{ + loff_t cpos; + long slot; + struct devperms_struct *dp; + struct hlist_node *h; + + cpos = *pos; + rcu_read_lock(); + + if (cpos-- == 0) + return (void *)1L; + + for (slot = 0; slot < DEVPERMS_HASH_SZ; slot++) + 
hlist_for_each_entry_rcu (dp, h, &devperms_hash[slot], hash) + if (cpos-- == 0) { + m->private = (void *)slot; + return dp; + } + return NULL; +} + +static void *devperms_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + long slot; + struct hlist_node *next; + struct devperms_struct *dp; + + dp = (struct devperms_struct *)v; + + if (unlikely(dp == (struct devperms_struct *)1L)) + slot = 0; + else { + next = rcu_dereference(dp->hash.next); + if (next != NULL) + goto out; + + slot = (long)m->private + 1; + } + + for (; slot < DEVPERMS_HASH_SZ; slot++) { + next = rcu_dereference(devperms_hash[slot].first); + if (next == NULL) + continue; + + m->private = (void *)slot; + goto out; + } + return NULL; + +out: + (*pos)++; + return hlist_entry(next, struct devperms_struct, hash); +} + +static void devperms_seq_stop(struct seq_file *m, void *v) +{ + rcu_read_unlock(); +} + +static struct seq_operations devperms_seq_op = { + .start = devperms_seq_start, + .next = devperms_seq_next, + .stop = devperms_seq_stop, + .show = devperms_seq_show, +}; + +static int devperms_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &devperms_seq_op); +} + +struct file_operations proc_devperms_ops = { + .open = devperms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +EXPORT_SYMBOL(proc_devperms_ops); +#endif + +/* Initialisation */ + +static struct devperms_struct original_perms[] = +{ + { + MKDEV(0,0), + S_IROTH | S_IWOTH, + S_IFCHR, + 0, + }, + { + MKDEV(0,0), + S_IXGRP | S_IROTH | S_IWOTH, + S_IFBLK, + 0, + }, +}; + +static int __init init_devperms_hash(void) +{ + hlist_add_head(&original_perms[0].hash, + &devperms_hash[devperms_hashfn(0, + original_perms[0].dev)]); + hlist_add_head(&original_perms[1].hash, + &devperms_hash[devperms_hashfn(0, + original_perms[1].dev)]); + return 0; +} + +core_initcall(init_devperms_hash); diff -uprN linux-2.6.24/kernel/ve/hooks.c linux-2.6.24.ovz/kernel/ve/hooks.c --- linux-2.6.24/kernel/ve/hooks.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/hooks.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,114 @@ +/* + * linux/kernel/ve/hooks.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +static struct list_head ve_hooks[VE_MAX_CHAINS]; +static DECLARE_RWSEM(ve_hook_sem); + +void ve_hook_register(int chain, struct ve_hook *vh) +{ + struct list_head *lh; + struct ve_hook *tmp; + + BUG_ON(chain > VE_MAX_CHAINS); + + down_write(&ve_hook_sem); + list_for_each(lh, &ve_hooks[chain]) { + tmp = list_entry(lh, struct ve_hook, list); + if (vh->priority < tmp->priority) + break; + } + + list_add_tail(&vh->list, lh); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_register); + +void ve_hook_unregister(struct ve_hook *vh) +{ + down_write(&ve_hook_sem); + list_del(&vh->list); + up_write(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_unregister); + +static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve) +{ + int err; + + err = 0; + if (try_module_get(vh->owner)) { + err = vh->init(ve); + module_put(vh->owner); + } + return err; +} + +static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve) +{ + if (vh->fini != NULL && try_module_get(vh->owner)) { + vh->fini(ve); + module_put(vh->owner); + } +} + +int ve_hook_iterate_init(int chain, void *ve) +{ + struct ve_hook *vh; + int err; + + err = 0; + + down_read(&ve_hook_sem); + list_for_each_entry(vh, &ve_hooks[chain], list) + if ((err = ve_hook_init(vh, ve)) < 0) + break; + + if (err) + list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + + up_read(&ve_hook_sem); + return err; +} + +EXPORT_SYMBOL(ve_hook_iterate_init); + +void ve_hook_iterate_fini(int chain, void *ve) +{ + struct ve_hook *vh; + + down_read(&ve_hook_sem); + list_for_each_entry_reverse(vh, &ve_hooks[chain], list) + ve_hook_fini(vh, ve); + up_read(&ve_hook_sem); +} + +EXPORT_SYMBOL(ve_hook_iterate_fini); + +static int __init ve_hooks_init(void) +{ + int i; + + for (i = 0; i < VE_MAX_CHAINS; i++) + INIT_LIST_HEAD(&ve_hooks[i]); + return 0; +} + +core_initcall(ve_hooks_init); + diff -uprN linux-2.6.24/kernel/ve/ve.c linux-2.6.24.ovz/kernel/ve/ve.c --- linux-2.6.24/kernel/ve/ve.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/ve.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,161 @@ +/* + * linux/kernel/ve/ve.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +/* + * 've.c' helper file performing VE sub-system initialization + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned long vz_rstamp = 0x37e0f59d; + +#ifdef CONFIG_MODULES +struct module no_module = { .state = MODULE_STATE_GOING }; +EXPORT_SYMBOL(no_module); +#endif + +INIT_KSYM_MODULE(ip_tables); +INIT_KSYM_MODULE(ip6_tables); +INIT_KSYM_MODULE(iptable_filter); +INIT_KSYM_MODULE(ip6table_filter); +INIT_KSYM_MODULE(iptable_mangle); +INIT_KSYM_MODULE(ip6table_mangle); +INIT_KSYM_MODULE(ip_conntrack); +INIT_KSYM_MODULE(nf_conntrack); +INIT_KSYM_MODULE(nf_conntrack_ipv4); +INIT_KSYM_MODULE(nf_conntrack_ipv6); +INIT_KSYM_MODULE(ip_nat); +INIT_KSYM_MODULE(nf_nat); +INIT_KSYM_MODULE(iptable_nat); + +INIT_KSYM_CALL(int, init_netfilter, (void)); +INIT_KSYM_CALL(int, init_iptables, (void)); +INIT_KSYM_CALL(int, init_ip6tables, (void)); +INIT_KSYM_CALL(int, init_iptable_filter, (void)); +INIT_KSYM_CALL(int, init_ip6table_filter, (void)); +INIT_KSYM_CALL(int, init_iptable_mangle, (void)); +INIT_KSYM_CALL(int, init_ip6table_mangle, (void)); +INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); +INIT_KSYM_CALL(int, nf_conntrack_init_ve, (void)); +INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void)); +INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void)); +INIT_KSYM_CALL(int, nf_nat_init, (void)); +INIT_KSYM_CALL(int, init_iptable_nat, (void)); +INIT_KSYM_CALL(void, fini_iptable_nat, (void)); +INIT_KSYM_CALL(int, init_nftable_nat, (void)); +INIT_KSYM_CALL(void, fini_nftable_nat, (void)); +INIT_KSYM_CALL(void, nf_nat_cleanup, (void)); +INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); +INIT_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void)); +INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void)); +INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void)); +INIT_KSYM_CALL(void, fini_ip6table_filter, (void)); +INIT_KSYM_CALL(void, fini_iptable_filter, (void)); +INIT_KSYM_CALL(void, fini_ip6table_mangle, (void)); +INIT_KSYM_CALL(void, fini_iptable_mangle, (void)); +INIT_KSYM_CALL(void, fini_ip6tables, (void)); +INIT_KSYM_CALL(void, fini_iptables, (void)); +INIT_KSYM_CALL(void, fini_netfilter, (void)); + +#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) +INIT_KSYM_MODULE(vzmon); +INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); + +void do_env_free(struct ve_struct *env) +{ + KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)); +} +EXPORT_SYMBOL(do_env_free); +#endif + +#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) +INIT_KSYM_MODULE(vzethdev); +INIT_KSYM_CALL(int, veth_open, (struct net_device *dev)); +#endif + +struct ve_struct ve0 = { + .ve_list = LIST_HEAD_INIT(ve0.ve_list), + .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), + .start_jiffies = INITIAL_JIFFIES, +#ifdef CONFIG_UNIX98_PTYS + .devpts_config = &devpts_config, +#endif + .ve_ns = &init_nsproxy, + .is_running = 1, + .op_sem = __RWSEM_INITIALIZER(ve0.op_sem), +}; + +EXPORT_SYMBOL(ve0); + +#ifdef CONFIG_SMP +static struct percpu_data ve0_cpu_stats; +#endif +static struct ve_cpu_stats ve0_cpu_stats_data[NR_CPUS]; + +LIST_HEAD(ve_list_head); +rwlock_t ve_list_lock = RW_LOCK_UNLOCKED; + +LIST_HEAD(ve_cleanup_list); +DEFINE_SPINLOCK(ve_cleanup_lock); +struct task_struct *ve_cleanup_thread; + +EXPORT_SYMBOL(ve_list_lock); +EXPORT_SYMBOL(ve_list_head); +EXPORT_SYMBOL(ve_cleanup_lock); 
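
[Editor's note -- illustrative example, not part of the patch.] kernel/ve/hooks.c above defines the per-VE start/stop notifier chain: ve_hook_register() keeps each chain sorted by ->priority, and ve_hook_iterate_init()/ve_hook_iterate_fini() walk it when a container starts or stops, unwinding the already-initialized hooks if one init fails. A minimal sketch of a client module, assuming only the struct ve_hook fields actually used above (init, fini, priority, owner) and the VE_SS_CHAIN id referenced later in vecalls.c; the header name is an assumption:

/* Illustrative only. Field names and calling convention follow the
 * ve_hook_register()/ve_hook_iterate_init() code above; the header
 * providing struct ve_hook, struct ve_struct and VE_SS_CHAIN is assumed.
 */
#include <linux/module.h>
#include <linux/ve_proto.h>	/* assumed location of the ve_hook API */

static int my_ve_init(struct ve_struct *ve)
{
	/* allocate per-VE state here; a negative return aborts container
	 * start and makes ve_hook_iterate_init() unwind earlier hooks */
	return 0;
}

static void my_ve_fini(struct ve_struct *ve)
{
	/* release per-VE state; called in reverse registration order */
}

static struct ve_hook my_hook = {
	.init		= my_ve_init,
	.fini		= my_ve_fini,
	.priority	= 0,		/* chain is kept in ascending priority */
	.owner		= THIS_MODULE,	/* pinned via try_module_get() around calls */
};

static int __init my_mod_init(void)
{
	ve_hook_register(VE_SS_CHAIN, &my_hook);
	return 0;
}

static void __exit my_mod_exit(void)
{
	ve_hook_unregister(&my_hook);
}

module_init(my_mod_init);
module_exit(my_mod_exit);
MODULE_LICENSE("GPL");
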
+EXPORT_SYMBOL(ve_cleanup_list); +EXPORT_SYMBOL(ve_cleanup_thread); + +void init_ve0(void) +{ + struct ve_struct *ve; + + ve = get_ve0(); + (void)get_ve(ve); + atomic_set(&ve->pcounter, 1); + + ve->cpu_stats = static_percpu_ptr(&ve0_cpu_stats, + ve0_cpu_stats_data); + + list_add(&ve->ve_list, &ve_list_head); +} + +void ve_cleanup_schedule(struct ve_struct *ve) +{ + BUG_ON(ve_cleanup_thread == NULL); + + spin_lock(&ve_cleanup_lock); + list_add_tail(&ve->cleanup_list, &ve_cleanup_list); + spin_unlock(&ve_cleanup_lock); + + wake_up_process(ve_cleanup_thread); +} diff -uprN linux-2.6.24/kernel/ve/vecalls.c linux-2.6.24.ovz/kernel/ve/vecalls.c --- linux-2.6.24/kernel/ve/vecalls.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/vecalls.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,2799 @@ +/* + * linux/kernel/ve/vecalls.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + */ + +/* + * 'vecalls.c' is file with basic VE support. It provides basic primities + * along with initialization script + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef CONFIG_VZ_FAIRSCHED +#include +#endif + +#include +#include +#include +#include + +int nr_ve = 1; /* One VE always exists. Compatibility with vestat */ +EXPORT_SYMBOL(nr_ve); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags); +static int alloc_ve_tty_drivers(struct ve_struct* ve); +static void free_ve_tty_drivers(struct ve_struct* ve); +static int register_ve_tty_drivers(struct ve_struct* ve); +static void unregister_ve_tty_drivers(struct ve_struct* ve); +static int init_ve_tty_drivers(struct ve_struct *); +static void fini_ve_tty_drivers(struct ve_struct *); +static void clear_termios(struct tty_driver* driver ); +#ifdef CONFIG_INET +static void ve_mapped_devs_cleanup(struct ve_struct *ve); +#endif + +static void vecalls_exit(void); +extern void grsecurity_setup(void); + +struct ve_struct *__find_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + + for_each_ve(ve) { + if (ve->veid == veid) + return ve; + } + return NULL; +} +EXPORT_SYMBOL(__find_ve_by_id); + +struct ve_struct *get_ve_by_id(envid_t veid) +{ + struct ve_struct *ve; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + get_ve(ve); + read_unlock(&ve_list_lock); + return ve; +} +EXPORT_SYMBOL(get_ve_by_id); + +/* + * real_put_ve() MUST be used instead of put_ve() inside vecalls. 
+ */ +void real_do_env_free(struct ve_struct *ve); +static inline void real_put_ve(struct ve_struct *ve) +{ + if (ve && atomic_dec_and_test(&ve->counter)) { + if (atomic_read(&ve->pcounter) > 0) + BUG(); + if (ve->is_running) + BUG(); + real_do_env_free(ve); + } +} + +static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf) +{ + struct ve_struct *ve; + struct vz_cpu_stat *vstat; + int retval; + int i, cpu; + unsigned long tmp; + + if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) + return -EPERM; + if (veid == 0) + return -ESRCH; + + vstat = kzalloc(sizeof(*vstat), GFP_KERNEL); + if (!vstat) + return -ENOMEM; + + retval = -ESRCH; + read_lock(&ve_list_lock); + ve = __find_ve_by_id(veid); + if (ve == NULL) + goto out_unlock; + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user); + vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice); + vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system); + vstat->idle_clk += ve_sched_get_idle_time(ve, cpu); + } + vstat->uptime_clk = get_cycles() - ve->start_cycles; + vstat->uptime_jif = (unsigned long)cputime64_to_clock_t( + get_jiffies_64() - ve->start_jiffies); + for (i = 0; i < 3; i++) { + tmp = ve->avenrun[i] + (FIXED_1/200); + vstat->avenrun[i].val_int = LOAD_INT(tmp); + vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); + } + read_unlock(&ve_list_lock); + + retval = 0; + if (copy_to_user(buf, vstat, sizeof(*vstat))) + retval = -EFAULT; +out_free: + kfree(vstat); + return retval; + +out_unlock: + read_unlock(&ve_list_lock); + goto out_free; +} + +static int real_setdevperms(envid_t veid, unsigned type, + dev_t dev, unsigned mask) +{ + struct ve_struct *ve; + int err; + + if (!capable(CAP_SETVEID) || veid == 0) + return -EPERM; + + if ((ve = get_ve_by_id(veid)) == NULL) + return -ESRCH; + + down_read(&ve->op_sem); + err = -ESRCH; + if (ve->is_running) + err = set_device_perms_ve(veid, type, dev, mask); + up_read(&ve->op_sem); + real_put_ve(ve); + return err; +} + +/********************************************************************** + ********************************************************************** + * + * VE start: subsystems + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_INET +#include +#include +#include +#include + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int init_fini_ve_mibs6(struct ve_struct *ve, int fini) +{ + if (fini) + goto fini; + + if (!(ve->_ipv6_statistics[0] = alloc_percpu(struct ipstats_mib))) + goto out1; + if (!(ve->_ipv6_statistics[1] = alloc_percpu(struct ipstats_mib))) + goto out2; + if (!(ve->_icmpv6_statistics[0] = alloc_percpu(struct icmpv6_mib))) + goto out3; + if (!(ve->_icmpv6_statistics[1] = alloc_percpu(struct icmpv6_mib))) + goto out4; + if (!(ve->_icmpv6msg_statistics[0] = alloc_percpu(struct icmpv6msg_mib))) + goto out5; + if (!(ve->_icmpv6msg_statistics[1] = alloc_percpu(struct icmpv6msg_mib))) + goto out6; + if (!(ve->_udp_stats_in6[0] = alloc_percpu(struct udp_mib))) + goto out7; + if (!(ve->_udp_stats_in6[1] = alloc_percpu(struct udp_mib))) + goto out8; + if (!(ve->_udplite_stats_in6[0] = alloc_percpu(struct udp_mib))) + goto out9; + if (!(ve->_udplite_stats_in6[1] = alloc_percpu(struct udp_mib))) + goto out10; + return 0; + +fini: + free_percpu(ve->_udplite_stats_in6[1]); +out10: + 
free_percpu(ve->_udplite_stats_in6[0]); +out9: + free_percpu(ve->_udp_stats_in6[1]); +out8: + free_percpu(ve->_udp_stats_in6[0]); +out7: + free_percpu(ve->_icmpv6msg_statistics[1]); +out6: + free_percpu(ve->_icmpv6msg_statistics[0]); +out5: + free_percpu(ve->_icmpv6_statistics[1]); +out4: + free_percpu(ve->_icmpv6_statistics[0]); +out3: + free_percpu(ve->_ipv6_statistics[1]); +out2: + free_percpu(ve->_ipv6_statistics[0]); +out1: + return -ENOMEM; +} +#else +static int init_fini_ve_mibs6(struct ve_struct *ve, int fini) { return 0; } +#endif + +static int init_fini_ve_mibs(struct ve_struct *ve, int fini) +{ + if (fini) + goto fini; + if (!(ve->_net_statistics[0] = alloc_percpu(struct linux_mib))) + goto out1; + if (!(ve->_net_statistics[1] = alloc_percpu(struct linux_mib))) + goto out2; + if (!(ve->_ip_statistics[0] = alloc_percpu(struct ipstats_mib))) + goto out3; + if (!(ve->_ip_statistics[1] = alloc_percpu(struct ipstats_mib))) + goto out4; + if (!(ve->_icmp_statistics[0] = alloc_percpu(struct icmp_mib))) + goto out5; + if (!(ve->_icmp_statistics[1] = alloc_percpu(struct icmp_mib))) + goto out6; + if (!(ve->_icmpmsg_statistics[0] = alloc_percpu(struct icmpmsg_mib))) + goto out7; + if (!(ve->_icmpmsg_statistics[1] = alloc_percpu(struct icmpmsg_mib))) + goto out8; + if (!(ve->_tcp_statistics[0] = alloc_percpu(struct tcp_mib))) + goto out9; + if (!(ve->_tcp_statistics[1] = alloc_percpu(struct tcp_mib))) + goto out10; + if (!(ve->_udp_statistics[0] = alloc_percpu(struct udp_mib))) + goto out11; + if (!(ve->_udp_statistics[1] = alloc_percpu(struct udp_mib))) + goto out12; + if (!(ve->_udplite_statistics[0] = alloc_percpu(struct udp_mib))) + goto out13; + if (!(ve->_udplite_statistics[1] = alloc_percpu(struct udp_mib))) + goto out14; + if (init_fini_ve_mibs6(ve, fini)) + goto out15; + return 0; + +fini: + init_fini_ve_mibs6(ve, fini); +out15: + free_percpu(ve->_udplite_statistics[1]); +out14: + free_percpu(ve->_udplite_statistics[0]); +out13: + free_percpu(ve->_udp_statistics[1]); +out12: + free_percpu(ve->_udp_statistics[0]); +out11: + free_percpu(ve->_tcp_statistics[1]); +out10: + free_percpu(ve->_tcp_statistics[0]); +out9: + free_percpu(ve->_icmpmsg_statistics[1]); +out8: + free_percpu(ve->_icmpmsg_statistics[0]); +out7: + free_percpu(ve->_icmp_statistics[1]); +out6: + free_percpu(ve->_icmp_statistics[0]); +out5: + free_percpu(ve->_ip_statistics[1]); +out4: + free_percpu(ve->_ip_statistics[0]); +out3: + free_percpu(ve->_net_statistics[1]); +out2: + free_percpu(ve->_net_statistics[0]); +out1: + return -ENOMEM; +} + +static inline int init_ve_mibs(struct ve_struct *ve) +{ + return init_fini_ve_mibs(ve, 0); +} + +static inline void fini_ve_mibs(struct ve_struct *ve) +{ + (void)init_fini_ve_mibs(ve, 1); +} +#else +#define init_ve_mibs(ve) (0) +#define fini_ve_mibs(ve) do { } while (0) +#endif + +static int prepare_proc_root(struct ve_struct *ve) +{ + struct proc_dir_entry *de; + + de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); + if (de == NULL) + return -ENOMEM; + + memcpy(de + 1, "/proc", 6); + de->name = (char *)(de + 1); + de->namelen = 5; + de->mode = S_IFDIR | S_IRUGO | S_IXUGO; + de->nlink = 2; + atomic_set(&de->count, 1); + + ve->proc_root = de; + return 0; +} + +#ifdef CONFIG_PROC_FS +static int init_ve_proc(struct ve_struct *ve) +{ + int err; + struct proc_dir_entry *de; + + err = prepare_proc_root(ve); + if (err) + goto out_root; + + err = register_ve_fs_type(ve, &proc_fs_type, + &ve->proc_fstype, &ve->proc_mnt); + if (err) + goto out_reg; + + err = -ENOMEM; + de = 
create_proc_entry("kmsg", S_IRUSR, NULL); + if (!de) + goto out_kmsg; + de->proc_fops = &proc_kmsg_operations; + + /* create necessary /proc subdirs in VE local proc tree */ + err = -ENOMEM; + de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); + if (!de) + goto out_vz; + + ve->_proc_net = proc_mkdir("net", NULL); + if (!ve->_proc_net) + goto out_net; + ve->_proc_net_stat = proc_mkdir("stat", ve->_proc_net); + if (!ve->_proc_net_stat) + goto out_net_stat; + + if (ve_snmp_proc_init(ve)) + goto out_snmp; + + ve->ve_ns->pid_ns->proc_mnt = mntget(ve->proc_mnt); + return 0; + +out_snmp: + remove_proc_entry("stat", ve->_proc_net); +out_net_stat: + remove_proc_entry("net", NULL); +out_net: + remove_proc_entry("vz", NULL); +out_vz: + remove_proc_entry("kmsg", NULL); +out_kmsg: + unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); + ve->proc_mnt = NULL; +out_reg: + /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ + ; +out_root: + return err; +} + +static void fini_ve_proc(struct ve_struct *ve) +{ + ve_snmp_proc_fini(ve); + remove_proc_entry("stat", ve->_proc_net); + remove_proc_entry("net", NULL); + remove_proc_entry("vz", NULL); + remove_proc_entry("kmsg", NULL); + unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); + ve->proc_mnt = NULL; +} + +static void free_ve_proc(struct ve_struct *ve) +{ + /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, + so we check that everything was removed and not lost */ + if (ve->proc_root && ve->proc_root->subdir) { + struct proc_dir_entry *p = ve->proc_root; + printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid); + while ((p = p->subdir) != NULL) + printk("/%s", p->name); + printk(" is not removed!\n"); + } + + kfree(ve->proc_root); + kfree(ve->proc_fstype); + + ve->proc_fstype = NULL; + ve->proc_root = NULL; +} +#else +#define init_ve_proc(ve) (0) +#define fini_ve_proc(ve) do { } while (0) +#define free_ve_proc(ve) do { } while (0) +#endif + +extern const struct file_operations proc_sys_file_operations; +extern struct inode_operations proc_sys_inode_operations; + +#ifdef CONFIG_SYSCTL +static int init_ve_sysctl(struct ve_struct *ve) +{ + int err; + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + ve->proc_sys_root = proc_mkdir("sys", NULL); + if (ve->proc_sys_root == NULL) + goto out_proc; + ve->proc_sys_root->proc_iops = &proc_sys_inode_operations; + ve->proc_sys_root->proc_fops = &proc_sys_file_operations; + ve->proc_sys_root->nlink = 0; +#endif + INIT_LIST_HEAD(&ve->sysctl_lh); + + err = devinet_sysctl_init(ve); + if (err) + goto out_dev; + + err = addrconf_sysctl_init(ve); + if (err) + goto out_dev6; + + return 0; + +out_dev6: + devinet_sysctl_fini(ve); +out_dev: +#ifdef CONFIG_PROC_FS + remove_proc_entry("sys", NULL); +out_proc: +#endif + return err; +} + +static void fini_ve_sysctl(struct ve_struct *ve) +{ + addrconf_sysctl_fini(ve); + devinet_sysctl_fini(ve); + remove_proc_entry("sys", NULL); +} + +static void free_ve_sysctl(struct ve_struct *ve) +{ + addrconf_sysctl_free(ve); + devinet_sysctl_free(ve); +} +#else +#define init_ve_sysctl(ve) (0) +#define fini_ve_sysctl(ve) do { } while (0) +#define free_ve_sysctl(ve) do { } while (0) +#endif + +#ifdef CONFIG_UNIX98_PTYS +#include + +/* + * DEVPTS needs a virtualization: each environment should see each own list of + * pseudo-terminals. + * To implement it we need to have separate devpts superblocks for each + * VE, and each VE should mount its own one. + * Thus, separate vfsmount structures are required. 
+ * To minimize intrusion into vfsmount lookup code, separate file_system_type + * structures are created. + * + * In addition to this, patch fo character device itself is required, as file + * system itself is used only for MINOR/MAJOR lookup. + */ + +static int init_ve_devpts(struct ve_struct *ve) +{ + int err; + + err = -ENOMEM; + ve->devpts_config = kzalloc(sizeof(struct devpts_config), GFP_KERNEL); + if (ve->devpts_config == NULL) + goto out; + + ve->devpts_config->mode = 0600; + err = register_ve_fs_type(ve, &devpts_fs_type, + &ve->devpts_fstype, &ve->devpts_mnt); + if (err) { + kfree(ve->devpts_config); + ve->devpts_config = NULL; + } +out: + return err; +} + +static void fini_ve_devpts(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); + /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->devpts_mnt = NULL; + kfree(ve->devpts_config); + ve->devpts_config = NULL; +} +#else +#define init_ve_devpts(ve) (0) +#define fini_ve_devpts(ve) do { } while (0) +#endif + +static int init_ve_shmem(struct ve_struct *ve) +{ + return register_ve_fs_type(ve, + &tmpfs_fs_type, + &ve->shmem_fstype, + &ve->shmem_mnt); +} + +static void fini_ve_shmem(struct ve_struct *ve) +{ + unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); + /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ + ve->shmem_mnt = NULL; +} + +#ifdef CONFIG_SYSFS +static int init_ve_sysfs_root(struct ve_struct *ve) +{ + struct sysfs_dirent *sysfs_root; + + sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); + if (sysfs_root == NULL) + return -ENOMEM; + sysfs_root->s_name = ""; + atomic_set(&sysfs_root->s_count, 1); + sysfs_root->s_flags = SYSFS_DIR; + sysfs_root->s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + sysfs_root->s_ino = 1; + + ve->_sysfs_root = sysfs_root; + return 0; +} +#endif + +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) +extern struct device_attribute ve_net_class_attributes[]; +static inline int init_ve_netclass(struct ve_struct *ve) +{ + struct class *nc; + int err; + + nc = kzalloc(sizeof(*nc), GFP_KERNEL); + if (!nc) + return -ENOMEM; + + nc->name = net_class.name; + nc->dev_release = net_class.dev_release; + nc->uevent = net_class.uevent; + nc->dev_attrs = ve_net_class_attributes; + + err = class_register(nc); + if (!err) { + ve->net_class = nc; + return 0; + } + kfree(nc); + return err; +} + +static inline void fini_ve_netclass(struct ve_struct *ve) +{ + class_unregister(ve->net_class); + kfree(ve->net_class); + ve->net_class = NULL; +} +#else +static inline int init_ve_netclass(struct ve_struct *ve) { return 0; } +static inline void fini_ve_netclass(struct ve_struct *ve) { ; } +#endif + +extern struct kset devices_subsys; + +static const struct { + unsigned minor; + char *name; +} mem_class_devices [] = { + {3, "null"}, + {5, "zero"}, + {7, "full"}, + {8, "random"}, + {9, "urandom"}, + {0, NULL}, +}; + +static struct class *init_ve_mem_class(void) +{ + int i; + struct class *ve_mem_class; + + ve_mem_class = class_create(THIS_MODULE, "mem"); + if (IS_ERR(ve_mem_class)) + return ve_mem_class; + for (i = 0; mem_class_devices[i].name; i++) + class_device_create(ve_mem_class, NULL, + MKDEV(MEM_MAJOR, mem_class_devices[i].minor), + NULL, mem_class_devices[i].name); + return ve_mem_class; +} + + +void fini_ve_mem_class(struct class *ve_mem_class) +{ + int i; + + for (i = 0; mem_class_devices[i].name; i++) + class_device_destroy(ve_mem_class, + MKDEV(MEM_MAJOR, mem_class_devices[i].minor)); + class_destroy(ve_mem_class); +} + +static 
int init_ve_sysfs(struct ve_struct *ve) +{ + struct kset *subsys; + int err; + +#ifdef CONFIG_SYSFS + err = 0; + if (ve->features & VE_FEATURE_SYSFS) { + err = init_ve_sysfs_root(ve); + if (err != 0) + goto out; + err = register_ve_fs_type(ve, + &sysfs_fs_type, + &ve->sysfs_fstype, + &ve->sysfs_mnt); + } + if (err != 0) + goto out_fs_type; +#endif + err = -ENOMEM; + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + ve->class_obj_subsys = subsys; + if (subsys == NULL) + goto out_class_obj; + /* ick, this is ugly, the things we go through to keep from showing up + * in sysfs... */ + subsys->kobj.k_name = kstrdup(class_obj_subsys.kobj.k_name, GFP_KERNEL); + if (!subsys->kobj.k_name) + goto out_subsys1; + subsys->ktype = class_obj_subsys.ktype; + subsys->uevent_ops = class_obj_subsys.uevent_ops; + kset_init(subsys); + if (!subsys->kobj.parent) + subsys->kobj.parent = &subsys->kobj; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + ve->class_subsys = subsys; + if (subsys == NULL) + goto out_class_subsys; + /* ick, this is ugly, the things we go through to keep from showing up + * in sysfs... */ + subsys->kobj.k_name = kstrdup(class_subsys.kobj.k_name, GFP_KERNEL); + if (!subsys->kobj.k_name) + goto out_subsys2; + subsys->ktype = class_subsys.ktype; + subsys->uevent_ops = class_subsys.uevent_ops; + + err = subsystem_register(subsys); + if (err != 0) + goto out_register; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + ve->devices_subsys = subsys; + if (!subsys) + goto out_subsys3; + subsys->kobj.k_name = kstrdup(devices_subsys.kobj.k_name, GFP_KERNEL); + if (!subsys->kobj.k_name) + goto out_subsys4; + subsys->ktype = devices_subsys.ktype; + subsys->uevent_ops = devices_subsys.uevent_ops; + + err = subsystem_register(subsys); + if (err < 0) + goto out_register2; + + err = init_ve_netclass(ve); + if (err) + goto out_nc; + + ve->tty_class = init_ve_tty_class(); + if (IS_ERR(ve->tty_class)) { + err = PTR_ERR(ve->tty_class); + ve->tty_class = NULL; + goto out_tty_class_register; + } + + ve->mem_class = init_ve_mem_class(); + if (IS_ERR(ve->mem_class)) { + err = PTR_ERR(ve->mem_class); + ve->mem_class = NULL; + goto out_mem_class_register; + } + + return err; + +out_mem_class_register: + fini_ve_tty_class(ve->tty_class); +out_tty_class_register: + fini_ve_netclass(ve); +out_nc: + subsystem_unregister(ve->devices_subsys); +out_register2: + kfree(ve->devices_subsys->kobj.k_name); +out_subsys4: + kfree(ve->devices_subsys); +out_subsys3: + subsystem_unregister(ve->class_subsys); +out_register: + kfree(ve->class_subsys->kobj.k_name); +out_subsys2: + kfree(ve->class_subsys); +out_class_subsys: + kfree(ve->class_obj_subsys->kobj.k_name); +out_subsys1: + kfree(ve->class_obj_subsys); +out_class_obj: +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +out_fs_type: + kfree(ve->_sysfs_root); + ve->_sysfs_root = NULL; +#endif + ve->class_subsys = NULL; + ve->class_obj_subsys = NULL; +#ifdef CONFIG_SYSFS +out: +#endif + return err; +} + +static void fini_ve_sysfs(struct ve_struct *ve) +{ + fini_ve_mem_class(ve->mem_class); + fini_ve_tty_class(ve->tty_class); + fini_ve_netclass(ve); + subsystem_unregister(ve->devices_subsys); + subsystem_unregister(ve->class_subsys); + kfree(ve->devices_subsys->kobj.k_name); + kfree(ve->class_subsys->kobj.k_name); + kfree(ve->class_obj_subsys->kobj.k_name); + kfree(ve->devices_subsys); + kfree(ve->class_subsys); + kfree(ve->class_obj_subsys); + + ve->class_subsys = NULL; + 
ve->class_obj_subsys = NULL; +#ifdef CONFIG_SYSFS + unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); + ve->sysfs_mnt = NULL; + kfree(ve->_sysfs_root); + ve->_sysfs_root = NULL; + /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ +#endif +} + +static void free_ve_filesystems(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSFS + kfree(ve->sysfs_fstype); + ve->sysfs_fstype = NULL; +#endif + kfree(ve->shmem_fstype); + ve->shmem_fstype = NULL; + + kfree(ve->devpts_fstype); + ve->devpts_fstype = NULL; + + free_ve_proc(ve); +} + +static int init_printk(struct ve_struct *ve) +{ + struct ve_prep_printk { + wait_queue_head_t log_wait; + unsigned long log_start; + unsigned long log_end; + unsigned long logged_chars; + } *tmp; + + tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + init_waitqueue_head(&tmp->log_wait); + ve->_log_wait = &tmp->log_wait; + ve->_log_start = &tmp->log_start; + ve->_log_end = &tmp->log_end; + ve->_logged_chars = &tmp->logged_chars; + /* ve->log_buf will be initialized later by ve_log_init() */ + return 0; +} + +static void fini_printk(struct ve_struct *ve) +{ + /* + * there is no spinlock protection here because nobody can use + * log_buf at the moments when this code is called. + */ + kfree(ve->log_buf); + kfree(ve->_log_wait); +} + +static void fini_venet(struct ve_struct *ve) +{ +#ifdef CONFIG_INET + tcp_v4_kill_ve_sockets(ve); + ve_mapped_devs_cleanup(ve); + synchronize_net(); +#endif +} + +static int init_ve_sched(struct ve_struct *ve) +{ +#ifdef CONFIG_VZ_FAIRSCHED + int err; + + /* + * We refuse to switch to an already existing node since nodes + * keep a pointer to their ve_struct... + */ + err = sys_fairsched_mknod(0, 1, ve->veid); + if (err < 0) { + printk(KERN_WARNING "Can't create fairsched node %d\n", + ve->veid); + return err; + } + err = sys_fairsched_mvpr(current->pid, ve->veid); + if (err) { + printk(KERN_WARNING "Can't switch to fairsched node %d\n", + ve->veid); + if (sys_fairsched_rmnod(ve->veid)) + printk(KERN_ERR "Can't clean fairsched node %d\n", + ve->veid); + return err; + } +#endif + ve_sched_attach(ve); + return 0; +} + +static void fini_ve_sched(struct ve_struct *ve) +{ +#ifdef CONFIG_VZ_FAIRSCHED + if (task_fairsched_node_id(current) == ve->veid) + if (sys_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID)) + printk(KERN_WARNING "Can't leave fairsched node %d\n", + ve->veid); + if (sys_fairsched_rmnod(ve->veid)) + printk(KERN_ERR "Can't remove fairsched node %d\n", + ve->veid); +#endif +} + +/* + * Namespaces + */ + +static inline int init_ve_namespaces(struct ve_struct *ve, + struct nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = tsk->nsproxy; + + err = copy_namespaces(CLONE_NAMESPACES_MASK & ~CLONE_NEWNET, tsk); + if (err < 0) + return err; + + ve->ve_ns = get_nsproxy(tsk->nsproxy); + memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release, + sizeof(virt_utsname.release)); + + if (cur->pid_ns->flags & PID_NS_HIDE_CHILD) + ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN; + + *old = cur; + return 0; +} + +static inline void fini_ve_namespaces(struct ve_struct *ve, + struct nsproxy *old) +{ + struct task_struct *tsk = current; + struct nsproxy *tmp; + + if (old) { + tmp = tsk->nsproxy; + tsk->nsproxy = get_nsproxy(old); + put_nsproxy(tmp); + tmp = ve->ve_ns; + ve->ve_ns = get_nsproxy(old); + put_nsproxy(tmp); + } else { + put_nsproxy(ve->ve_ns); + ve->ve_ns = NULL; + } +} + +static int init_ve_netns(struct ve_struct *ve, struct 
nsproxy **old) +{ + int err; + struct task_struct *tsk; + struct nsproxy *cur; + + tsk = current; + cur = tsk->nsproxy; + + err = copy_namespaces(CLONE_NEWNET, tsk); + if (err < 0) + return err; + + put_nsproxy(ve->ve_ns); + ve->ve_ns = get_nsproxy(tsk->nsproxy); + *old = cur; + return 0; +} + +static inline void switch_ve_namespaces(struct ve_struct *ve, + struct task_struct *tsk) +{ + struct nsproxy *old_ns; + struct nsproxy *new_ns; + + BUG_ON(tsk != current); + old_ns = tsk->nsproxy; + new_ns = ve->ve_ns; + + if (old_ns != new_ns) { + tsk->nsproxy = get_nsproxy(new_ns); + put_nsproxy(old_ns); + } +} + +static __u64 get_ve_features(env_create_param_t *data, int datalen) +{ + __u64 known_features; + + if (datalen < sizeof(struct env_create_param3)) + /* this version of vzctl is aware of VE_FEATURES_OLD only */ + known_features = VE_FEATURES_OLD; + else + known_features = data->known_features; + + /* + * known features are set as required + * yet unknown features are set as in VE_FEATURES_DEF + */ + return (data->feature_mask & known_features) | + (VE_FEATURES_DEF & ~known_features); +} + +static int init_ve_struct(struct ve_struct *ve, envid_t veid, + u32 class_id, env_create_param_t *data, int datalen) +{ + (void)get_ve(ve); + ve->veid = veid; + ve->class_id = class_id; + ve->features = get_ve_features(data, datalen); + INIT_LIST_HEAD(&ve->vetask_lh); + init_rwsem(&ve->op_sem); + + ve->start_timespec = current->start_time; + /* The value is wrong, but it is never compared to process + * start times */ + ve->start_jiffies = get_jiffies_64(); + ve->start_cycles = get_cycles(); + + return 0; +} + +/********************************************************************** + ********************************************************************** + * + * /proc/meminfo virtualization + * + ********************************************************************** + **********************************************************************/ +static int ve_set_meminfo(envid_t veid, unsigned long val) +{ +#ifdef CONFIG_BEANCOUNTERS + struct ve_struct *ve; + + ve = get_ve_by_id(veid); + if (!ve) + return -EINVAL; + + ve->meminfo_val = val; + real_put_ve(ve); + return 0; +#else + return -ENOTTY; +#endif +} + +static int init_ve_meminfo(struct ve_struct *ve) +{ + ve->meminfo_val = 0; + return 0; +} + +static inline void fini_ve_meminfo(struct ve_struct *ve) +{ +} + +static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) +{ + read_lock(&tsk->fs->lock); + ve->fs_rootmnt = tsk->fs->rootmnt; + ve->fs_root = tsk->fs->root; + read_unlock(&tsk->fs->lock); + mark_tree_virtual(ve->fs_rootmnt, ve->fs_root); +} + +static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) +{ + /* required for real_setdevperms from register_ve_ above */ + memcpy(&ve->ve_cap_bset, &tsk->cap_effective, sizeof(kernel_cap_t)); + cap_lower(ve->ve_cap_bset, CAP_SETVEID); +} + +static int ve_list_add(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + if (__find_ve_by_id(ve->veid) != NULL) + goto err_exists; + + list_add(&ve->ve_list, &ve_list_head); + nr_ve++; + write_unlock_irq(&ve_list_lock); + return 0; + +err_exists: + write_unlock_irq(&ve_list_lock); + return -EEXIST; +} + +static void ve_list_del(struct ve_struct *ve) +{ + write_lock_irq(&ve_list_lock); + list_del(&ve->ve_list); + nr_ve--; + write_unlock_irq(&ve_list_lock); +} + +static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) +{ + spin_lock(&task_capability_lock); + cap_mask(tsk->cap_effective, ve->ve_cap_bset); + 
cap_mask(tsk->cap_inheritable, ve->ve_cap_bset); + cap_mask(tsk->cap_permitted, ve->ve_cap_bset); + spin_unlock(&task_capability_lock); +} + +void ve_move_task(struct task_struct *tsk, struct ve_struct *new) +{ + struct ve_struct *old; + + might_sleep(); + BUG_ON(tsk != current); + BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk))); + + /* this probihibts ptracing of task entered to VE from host system */ + tsk->mm->vps_dumpable = 0; + /* setup capabilities before enter */ + set_task_ve_caps(tsk, new); + + old = tsk->ve_task_info.owner_env; + tsk->ve_task_info.owner_env = new; + tsk->ve_task_info.exec_env = new; + + write_lock_irq(&tasklist_lock); + list_del_rcu(&tsk->ve_task_info.vetask_list); + write_unlock_irq(&tasklist_lock); + + synchronize_rcu(); + + write_lock_irq(&tasklist_lock); + list_add_tail_rcu(&tsk->ve_task_info.vetask_list, + &new->vetask_lh); + write_unlock_irq(&tasklist_lock); + + atomic_dec(&old->pcounter); + real_put_ve(old); + + atomic_inc(&new->pcounter); + get_ve(new); +} + +EXPORT_SYMBOL(ve_move_task); + +#ifdef CONFIG_VE_IPTABLES +extern int init_netfilter(void); +extern void fini_netfilter(void); +#define init_ve_netfilter() init_netfilter() +#define fini_ve_netfilter() fini_netfilter() + +#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ +({ \ + int ret = 0; \ + if (VE_IPT_CMP(mask, full_mask) && \ + VE_IPT_CMP((ve)->_iptables_modules, \ + full_mask & ~(full_mask##_MOD))) { \ + ret = KSYMERRCALL(1, mod, name, args); \ + if (ret == 0) \ + (ve)->_iptables_modules |= \ + full_mask##_MOD; \ + if (ret == 1) \ + ret = 0; \ + } \ + ret; \ +}) + +#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ +({ \ + if (VE_IPT_CMP(mask, full_mask##_MOD)) \ + KSYMSAFECALL_VOID(mod, name, args); \ +}) + + +static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, + int init_or_cleanup) +{ + int err; + + /* Remove when userspace will start supplying IPv6-related bits. 
*/ + init_mask &= ~VE_IP_IPTABLES6; + init_mask &= ~VE_IP_FILTER6; + init_mask &= ~VE_IP_MANGLE6; + init_mask &= ~VE_IP_IPTABLE_NAT_MOD; + init_mask &= ~VE_NF_CONNTRACK_MOD; + if ((init_mask & VE_IP_IPTABLES) == VE_IP_IPTABLES) + init_mask |= VE_IP_IPTABLES6; + if ((init_mask & VE_IP_FILTER) == VE_IP_FILTER) + init_mask |= VE_IP_FILTER6; + if ((init_mask & VE_IP_MANGLE) == VE_IP_MANGLE) + init_mask |= VE_IP_MANGLE6; + if ((init_mask & VE_IP_NAT) == VE_IP_NAT) + init_mask |= VE_IP_IPTABLE_NAT; + + if ((init_mask & VE_IP_CONNTRACK) == VE_IP_CONNTRACK) + init_mask |= VE_NF_CONNTRACK; + + err = 0; + if (!init_or_cleanup) + goto cleanup; + + /* init part */ +#if defined(CONFIG_IP_NF_IPTABLES) || \ + defined(CONFIG_IP_NF_IPTABLES_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, + ip_tables, init_iptables, ()); + if (err < 0) + goto err_iptables; +#endif +#if defined(CONFIG_IP6_NF_IPTABLES) || \ + defined(CONFIG_IP6_NF_IPTABLES_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES6, + ip6_tables, init_ip6tables, ()); + if (err < 0) + goto err_ip6tables; +#endif +#if defined(CONFIG_NF_CONNTRACK_IPV4) || \ + defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_NF_CONNTRACK, + nf_conntrack, nf_conntrack_init_ve, ()); + if (err < 0) + goto err_nf_conntrack; + + err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, + nf_conntrack_ipv4, init_nf_ct_l3proto_ipv4, ()); + if (err < 0) + goto err_nf_conntrack_ipv4; +#endif +#if defined(CONFIG_NF_NAT) || \ + defined(CONFIG_NF_NAT_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, + nf_nat, nf_nat_init, ()); + if (err < 0) + goto err_nftable_nat; + err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLE_NAT, + iptable_nat, init_nftable_nat, ()); + if (err < 0) + goto err_nftable_nat2; +#endif +#if defined(CONFIG_IP_NF_FILTER) || \ + defined(CONFIG_IP_NF_FILTER_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER, + iptable_filter, init_iptable_filter, ()); + if (err < 0) + goto err_iptable_filter; +#endif +#if defined(CONFIG_IP6_NF_FILTER) || \ + defined(CONFIG_IP6_NF_FILTER_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER6, + ip6table_filter, init_ip6table_filter, ()); + if (err < 0) + goto err_ip6table_filter; +#endif +#if defined(CONFIG_IP_NF_MANGLE) || \ + defined(CONFIG_IP_NF_MANGLE_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE, + iptable_mangle, init_iptable_mangle, ()); + if (err < 0) + goto err_iptable_mangle; +#endif +#if defined(CONFIG_IP6_NF_MANGLE) || \ + defined(CONFIG_IP6_NF_MANGLE_MODULE) + err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE6, + ip6table_mangle, init_ip6table_mangle, ()); + if (err < 0) + goto err_ip6table_mangle; +#endif + return 0; + +/* ------------------------------------------------------------------------- */ + +cleanup: +#if defined(CONFIG_IP6_NF_MANGLE) || \ + defined(CONFIG_IP6_NF_MANGLE_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE6, + ip6table_mangle, fini_ip6table_mangle, ()); +err_ip6table_mangle: +#endif +#if defined(CONFIG_IP_NF_MANGLE) || \ + defined(CONFIG_IP_NF_MANGLE_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, + iptable_mangle, fini_iptable_mangle, ()); +err_iptable_mangle: +#endif +#if defined(CONFIG_IP6_NF_FILTER) || \ + defined(CONFIG_IP6_NF_FILTER_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER6, + ip6table_filter, fini_ip6table_filter, ()); +err_ip6table_filter: +#endif +#if defined(CONFIG_IP_NF_FILTER) || \ + defined(CONFIG_IP_NF_FILTER_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, + iptable_filter, 
fini_iptable_filter, ()); +err_iptable_filter: +#endif +#if defined(CONFIG_NF_NAT) || \ + defined(CONFIG_NF_NAT_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLE_NAT, + iptable_nat, fini_nftable_nat, ()); +err_nftable_nat2: + KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, + nf_nat, nf_nat_cleanup, ()); +err_nftable_nat: +#endif +#if defined(CONFIG_NF_CONNTRACK_IPV4) || \ + defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, + nf_conntrack_ipv4, fini_nf_ct_l3proto_ipv4, ()); +err_nf_conntrack_ipv4: + KSYMIPTFINI(ve->_iptables_modules, VE_NF_CONNTRACK, + nf_conntrack, nf_conntrack_cleanup_ve, ()); +err_nf_conntrack: +#endif +#if defined(CONFIG_IP6_NF_IPTABLES) || \ + defined(CONFIG_IP6_NF_IPTABLES_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES6, + ip6_tables, fini_ip6tables, ()); +err_ip6tables: +#endif +#if defined(CONFIG_IP_NF_IPTABLES) || \ + defined(CONFIG_IP_NF_IPTABLES_MODULE) + KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, + ip_tables, fini_iptables, ()); +err_iptables: +#endif + ve->_iptables_modules = 0; + + return err; +} + +static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) +{ + return do_ve_iptables(ve, init_mask, 1); +} + +static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) +{ + (void)do_ve_iptables(ve, init_mask, 0); +} + +#else +#define init_ve_iptables(x, y) (0) +#define fini_ve_iptables(x, y) do { } while (0) +#define init_ve_netfilter() (0) +#define fini_ve_netfilter() do { } while (0) +#endif + +static inline int init_ve_cpustats(struct ve_struct *ve) +{ + ve->cpu_stats = alloc_percpu(struct ve_cpu_stats); + return ve->cpu_stats == NULL ? -ENOMEM : 0; +} + +static inline void free_ve_cpustats(struct ve_struct *ve) +{ + free_percpu(ve->cpu_stats); + ve->cpu_stats = NULL; +} + +static int alone_in_pgrp(struct task_struct *tsk) +{ + struct task_struct *p; + int alone = 0; + + read_lock(&tasklist_lock); + do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) { + if (p != tsk) + goto out; + } while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p); + do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) { + if (p != tsk) + goto out; + } while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p); + alone = 1; +out: + read_unlock(&tasklist_lock); + return alone; +} + +static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + struct task_struct *tsk; + struct ve_struct *old; + struct ve_struct *old_exec; + struct ve_struct *ve; + __u64 init_mask; + int err; + struct nsproxy *old_ns, *old_ns_net; + DECLARE_COMPLETION_ONSTACK(sysfs_completion); + + tsk = current; + old = VE_TASK_INFO(tsk)->owner_env; + + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + return -EINVAL; + + if (tsk->signal->tty) { + printk("ERR: CT init has controlling terminal\n"); + return -EINVAL; + } + if (task_pgrp(tsk) != task_pid(tsk) || + task_session(tsk) != task_pid(tsk)) { + int may_setsid; + + read_lock(&tasklist_lock); + may_setsid = !tsk->signal->leader && + !find_task_by_pid_type_ns(PIDTYPE_PGID, task_pid_nr(tsk), &init_pid_ns); + read_unlock(&tasklist_lock); + + if (!may_setsid) { + printk("ERR: CT init is process group leader\n"); + return -EINVAL; + } + } + /* Check that the process is not a leader of non-empty group/session. + * If it is, we cannot virtualize its PID and must fail. 
*/ + if (!alone_in_pgrp(tsk)) { + printk("ERR: CT init is not alone in process group\n"); + return -EINVAL; + } + + + VZTRACE("%s: veid=%d classid=%d pid=%d\n", + __FUNCTION__, veid, class_id, current->pid); + + err = -ENOMEM; + ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL); + if (ve == NULL) + goto err_struct; + + init_ve_struct(ve, veid, class_id, data, datalen); + __module_get(THIS_MODULE); + down_write(&ve->op_sem); + if (flags & VE_LOCK) + ve->is_locked = 1; + + /* + * this should be done before adding to list + * because if calc_load_ve finds this ve in + * list it will be very surprised + */ + if ((err = init_ve_cpustats(ve)) < 0) + goto err_cpu_stats; + + if ((err = ve_list_add(ve)) < 0) + goto err_exist; + + /* this should be done before context switching */ + if ((err = init_printk(ve)) < 0) + goto err_log_wait; + + old_exec = set_exec_env(ve); + + if ((err = init_ve_sched(ve)) < 0) + goto err_sched; + + set_ve_root(ve, tsk); + + if ((err = init_ve_sysfs(ve))) + goto err_sysfs; + + if ((err = init_ve_mibs(ve))) + goto err_mibs; + + if ((err = init_ve_namespaces(ve, &old_ns))) + goto err_ns; + + if ((err = init_ve_proc(ve))) + goto err_proc; + + if ((err = init_ve_sysctl(ve))) + goto err_sysctl; + + if ((err = init_ve_route(ve)) < 0) + goto err_route; + + if ((err = init_ve_route6(ve)) < 0) + goto err_route6; + + if ((err = init_ve_netns(ve, &old_ns_net))) + goto err_netns; + + if ((err = init_ve_tty_drivers(ve)) < 0) + goto err_tty; + + if ((err = init_ve_shmem(ve))) + goto err_shmem; + + if ((err = init_ve_devpts(ve))) + goto err_devpts; + + if((err = init_ve_meminfo(ve))) + goto err_meminf; + + set_ve_caps(ve, tsk); + + /* It is safe to initialize netfilter here as routing initialization and + interface setup will be done below. This means that NO skb can be + passed inside. Den */ + /* iptables ve initialization for non ve0; + ve0 init is in module_init */ + if ((err = init_ve_netfilter()) < 0) + goto err_netfilter; + + init_mask = data ? data->iptables_mask : VE_IP_DEFAULT; + if ((err = init_ve_iptables(ve, init_mask)) < 0) + goto err_iptables; + + if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0) + goto err_vpid; + + if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0) + goto err_ve_hook; + + put_nsproxy(old_ns); + put_nsproxy(old_ns_net); + + /* finally: set vpids and move inside */ + ve_move_task(tsk, ve); + grsecurity_setup(); + + ve->is_running = 1; + up_write(&ve->op_sem); + + printk(KERN_INFO "CT: %d: started\n", veid); + return veid; + +err_ve_hook: + mntget(ve->proc_mnt); +err_vpid: + fini_venet(ve); + fini_ve_iptables(ve, init_mask); +err_iptables: + fini_ve_netfilter(); +err_netfilter: + fini_ve_meminfo(ve); +err_meminf: + fini_ve_devpts(ve); +err_devpts: + fini_ve_shmem(ve); +err_shmem: + fini_ve_tty_drivers(ve); +err_tty: + ve->ve_ns->net_ns->sysfs_completion = &sysfs_completion; + fini_ve_namespaces(ve, old_ns_net); + put_nsproxy(old_ns_net); + wait_for_completion(&sysfs_completion); +err_netns: + fini_ve_route6(ve); +err_route6: + fini_ve_route(ve); +err_route: + fini_ve_sysctl(ve); +err_sysctl: + /* + * If process hasn't become VE's init, proc_mnt won't be put during + * pidns death, so this mntput by hand is needed. If it has, we + * compensate with mntget above. 
+ */ + mntput(ve->proc_mnt); + fini_ve_proc(ve); +err_proc: + /* free_ve_utsname() is called inside real_put_ve() */ + fini_ve_namespaces(ve, old_ns); + put_nsproxy(old_ns); + /* + * We need to compensate, because fini_ve_namespaces() assumes + * ve->ve_ns will continue to be used after, but VE will be freed soon + * (in kfree() sense). + */ + put_nsproxy(ve->ve_ns); +err_ns: + clean_device_perms_ve(ve->veid); + fini_ve_mibs(ve); +err_mibs: + fini_ve_sysfs(ve); +err_sysfs: + /* It is safe to restore current->envid here because + * ve_fairsched_detach does not use current->envid. */ + /* Really fairsched code uses current->envid in sys_fairsched_mknod + * only. It is correct if sys_fairsched_mknod is called from + * userspace. If sys_fairsched_mknod is called from + * ve_fairsched_attach, then node->envid and node->parent_node->envid + * are explicitly set to valid value after the call. */ + /* FIXME */ + VE_TASK_INFO(tsk)->owner_env = old; + VE_TASK_INFO(tsk)->exec_env = old_exec; + + fini_ve_sched(ve); +err_sched: + (void)set_exec_env(old_exec); + + /* we can jump here having incorrect envid */ + VE_TASK_INFO(tsk)->owner_env = old; + fini_printk(ve); +err_log_wait: + /* cpustats will be freed in do_env_free */ + ve_list_del(ve); + up_write(&ve->op_sem); + + real_put_ve(ve); +err_struct: + printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err); + return err; + +err_exist: + free_ve_cpustats(ve); +err_cpu_stats: + kfree(ve); + goto err_struct; +} + + +/********************************************************************** + ********************************************************************** + * + * VE start/stop callbacks + * + ********************************************************************** + **********************************************************************/ + +int real_env_create(envid_t veid, unsigned flags, u32 class_id, + env_create_param_t *data, int datalen) +{ + int status; + struct ve_struct *ve; + + if (!flags) { + status = get_exec_env()->veid; + goto out; + } + + status = -EPERM; + if (!capable(CAP_SETVEID)) + goto out; + + status = -EINVAL; + if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) + goto out; + + status = -EINVAL; + ve = get_ve_by_id(veid); + if (ve) { + if (flags & VE_TEST) { + status = 0; + goto out_put; + } + if (flags & VE_EXCLUSIVE) { + status = -EACCES; + goto out_put; + } + if (flags & VE_CREATE) { + flags &= ~VE_CREATE; + flags |= VE_ENTER; + } + } else { + if (flags & (VE_TEST|VE_ENTER)) { + status = -ESRCH; + goto out; + } + } + + if (flags & VE_CREATE) { + status = do_env_create(veid, flags, class_id, data, datalen); + goto out; + } else if (flags & VE_ENTER) + status = do_env_enter(ve, flags); + + /* else: returning EINVAL */ + +out_put: + real_put_ve(ve); +out: + return status; +} +EXPORT_SYMBOL(real_env_create); + +static int do_env_enter(struct ve_struct *ve, unsigned int flags) +{ + struct task_struct *tsk = current; + int err; + + VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); + + err = -EBUSY; + down_read(&ve->op_sem); + if (!ve->is_running) + goto out_up; + if (ve->is_locked && !(flags & VE_SKIPLOCK)) + goto out_up; + err = -EINVAL; + if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) + goto out_up; + +#ifdef CONFIG_VZ_FAIRSCHED + err = sys_fairsched_mvpr(current->pid, ve->veid); + if (err) + goto out_up; +#endif + ve_sched_attach(ve); + switch_ve_namespaces(ve, tsk); + ve_move_task(current, ve); + + /* Check that the process is not a leader of non-empty group/session. 
+ * If it is, we cannot virtualize its PID. Do not fail, just leave + * it non-virtual. + */ + if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK)) + pid_ns_attach_task(ve->ve_ns->pid_ns, tsk); + + /* Unlike VE_CREATE, we do not setsid() in VE_ENTER. + * Process is allowed to be in an external group/session. + * If user space callers wants, it will do setsid() after + * VE_ENTER. + */ + err = VE_TASK_INFO(tsk)->owner_env->veid; + tsk->did_ve_enter = 1; + +out_up: + up_read(&ve->op_sem); + return err; +} + +static void env_cleanup(struct ve_struct *ve) +{ + struct ve_struct *old_ve; + DECLARE_COMPLETION_ONSTACK(sysfs_completion); + + VZTRACE("real_do_env_cleanup\n"); + + down_read(&ve->op_sem); + old_ve = set_exec_env(ve); + + ve_hook_iterate_fini(VE_SS_CHAIN, ve); + + fini_venet(ve); + + /* no new packets in flight beyond this point */ + /* skb hold dst_entry, and in turn lies in the ip fragment queue */ + ip_fragment_cleanup(ve); + + /* kill iptables */ + /* No skb belonging to VE can exist at this point as unregister_netdev + is an operation awaiting until ALL skb's gone */ + fini_ve_iptables(ve, ve->_iptables_modules); + fini_ve_netfilter(); + + fini_ve_sched(ve); + clean_device_perms_ve(ve->veid); + + fini_ve_devpts(ve); + fini_ve_shmem(ve); + unregister_ve_tty_drivers(ve); + fini_ve_meminfo(ve); + + ve->ve_ns->net_ns->sysfs_completion = &sysfs_completion; + fini_ve_namespaces(ve, NULL); + wait_for_completion(&sysfs_completion); + fini_ve_route(ve); + fini_ve_route6(ve); + fini_ve_mibs(ve); + fini_ve_sysctl(ve); + fini_ve_proc(ve); + fini_ve_sysfs(ve); + + (void)set_exec_env(old_ve); + fini_printk(ve); /* no printk can happen in ve context anymore */ + + ve_list_del(ve); + up_read(&ve->op_sem); + + real_put_ve(ve); +} + +static DECLARE_COMPLETION(vzmond_complete); +static volatile int stop_vzmond; + +static int vzmond_helper(void *arg) +{ + char name[18]; + struct ve_struct *ve; + + ve = (struct ve_struct *)arg; + snprintf(name, sizeof(name), "vzmond/%d", ve->veid); + daemonize(name); + env_cleanup(ve); + module_put_and_exit(0); +} + +static void do_pending_env_cleanups(void) +{ + int err; + struct ve_struct *ve; + + spin_lock(&ve_cleanup_lock); + while (1) { + if (list_empty(&ve_cleanup_list) || need_resched()) + break; + + ve = list_first_entry(&ve_cleanup_list, + struct ve_struct, cleanup_list); + list_del(&ve->cleanup_list); + spin_unlock(&ve_cleanup_lock); + + __module_get(THIS_MODULE); + err = kernel_thread(vzmond_helper, (void *)ve, 0); + if (err < 0) { + env_cleanup(ve); + module_put(THIS_MODULE); + } + + spin_lock(&ve_cleanup_lock); + } + spin_unlock(&ve_cleanup_lock); +} + +static inline int have_pending_cleanups(void) +{ + return !list_empty(&ve_cleanup_list); +} + +static int vzmond(void *arg) +{ + daemonize("vzmond"); + set_current_state(TASK_INTERRUPTIBLE); + + while (!stop_vzmond || have_pending_cleanups()) { + schedule(); + try_to_freeze(); + if (signal_pending(current)) + flush_signals(current); + + do_pending_env_cleanups(); + set_current_state(TASK_INTERRUPTIBLE); + if (have_pending_cleanups()) + __set_current_state(TASK_RUNNING); + } + + __set_task_state(current, TASK_RUNNING); + complete_and_exit(&vzmond_complete, 0); +} + +static int __init init_vzmond(void) +{ + int pid; + struct task_struct *tsk; + + pid = kernel_thread(vzmond, NULL, 0); + if (pid > 0) { + tsk = find_task_by_pid(pid); + BUG_ON(tsk == NULL); + ve_cleanup_thread = tsk; + } + return pid; +} + +static void fini_vzmond(void) +{ + stop_vzmond = 1; + wake_up_process(ve_cleanup_thread); + 
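+	/* vzmond keeps running until ve_cleanup_list is drained (see the loop
+	 * condition in vzmond() above), so no pending cleanup is lost here */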
wait_for_completion(&vzmond_complete); + ve_cleanup_thread = NULL; + WARN_ON(!list_empty(&ve_cleanup_list)); +} + +void real_do_env_free(struct ve_struct *ve) +{ + VZTRACE("real_do_env_free\n"); + + free_ve_tty_drivers(ve); + free_ve_sysctl(ve); /* free per ve sysctl data */ + free_ve_filesystems(ve); + free_ve_cpustats(ve); + printk(KERN_INFO "CT: %d: stopped\n", VEID(ve)); + kfree(ve); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(real_do_env_free); + + +/********************************************************************** + ********************************************************************** + * + * VE TTY handling + * + ********************************************************************** + **********************************************************************/ + +static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, + struct ve_struct *ve) +{ + size_t size; + struct tty_driver *driver; + + driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC); + if (!driver) + goto out; + + memcpy(driver, base, sizeof(struct tty_driver)); + + driver->driver_state = NULL; + + size = base->num * 3 * sizeof(void *); + if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { + void **p; + p = kzalloc(size, GFP_KERNEL_UBC); + if (!p) + goto out_free; + + driver->ttys = (struct tty_struct **)p; + driver->termios = (struct ktermios **)(p + driver->num); + driver->termios_locked = (struct ktermios **) + (p + driver->num * 2); + } else { + driver->ttys = NULL; + driver->termios = NULL; + driver->termios_locked = NULL; + } + + driver->owner_env = ve; + driver->flags |= TTY_DRIVER_INSTALLED; + driver->refcount = 0; + + return driver; + +out_free: + kfree(driver); +out: + return NULL; +} + +static void free_ve_tty_driver(struct tty_driver *driver) +{ + if (!driver) + return; + + clear_termios(driver); + kfree(driver->ttys); + kfree(driver); +} + +static int alloc_ve_tty_drivers(struct ve_struct* ve) +{ +#ifdef CONFIG_LEGACY_PTYS + /* Traditional BSD devices */ + ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); + if (!ve->pty_driver) + goto out_mem; + + ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); + if (!ve->pty_slave_driver) + goto out_mem; + + ve->pty_driver->other = ve->pty_slave_driver; + ve->pty_slave_driver->other = ve->pty_driver; +#endif + +#ifdef CONFIG_UNIX98_PTYS + ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); + if (!ve->ptm_driver) + goto out_mem; + + ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); + if (!ve->pts_driver) + goto out_mem; + + ve->ptm_driver->other = ve->pts_driver; + ve->pts_driver->other = ve->ptm_driver; + + ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), + GFP_KERNEL_UBC); + if (!ve->allocated_ptys) + goto out_mem; + idr_init(ve->allocated_ptys); +#endif + return 0; + +out_mem: + free_ve_tty_drivers(ve); + return -ENOMEM; +} + +static void free_ve_tty_drivers(struct ve_struct* ve) +{ +#ifdef CONFIG_LEGACY_PTYS + free_ve_tty_driver(ve->pty_driver); + free_ve_tty_driver(ve->pty_slave_driver); + ve->pty_driver = ve->pty_slave_driver = NULL; +#endif +#ifdef CONFIG_UNIX98_PTYS + free_ve_tty_driver(ve->ptm_driver); + free_ve_tty_driver(ve->pts_driver); + kfree(ve->allocated_ptys); + ve->ptm_driver = ve->pts_driver = NULL; + ve->allocated_ptys = NULL; +#endif +} + +static inline void __register_tty_driver(struct tty_driver *driver) +{ + list_add(&driver->tty_drivers, &tty_drivers); +} + +static inline void __unregister_tty_driver(struct tty_driver *driver) +{ + if (!driver) + return; + list_del(&driver->tty_drivers); +} + 
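/*
 * Editorial note (not part of the original patch): alloc_ve_tty_driver()
 * above backs the three per-tty pointer tables of a cloned driver with a
 * single allocation of base->num * 3 pointers:
 *
 *     p[0     .. num-1  ]  ->  driver->ttys
 *     p[num   .. 2*num-1]  ->  driver->termios
 *     p[2*num .. 3*num-1]  ->  driver->termios_locked
 *
 * Hence the single kfree(driver->ttys) in free_ve_tty_driver() releases all
 * three tables, and clear_termios() below assumes termios and termios_locked
 * each have exactly driver->num slots.
 */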
+static int register_ve_tty_drivers(struct ve_struct* ve) +{ + mutex_lock(&tty_mutex); +#ifdef CONFIG_UNIX98_PTYS + __register_tty_driver(ve->ptm_driver); + __register_tty_driver(ve->pts_driver); +#endif +#ifdef CONFIG_LEGACY_PTYS + __register_tty_driver(ve->pty_driver); + __register_tty_driver(ve->pty_slave_driver); +#endif + mutex_unlock(&tty_mutex); + + return 0; +} + +static void unregister_ve_tty_drivers(struct ve_struct* ve) +{ + VZTRACE("unregister_ve_tty_drivers\n"); + + mutex_lock(&tty_mutex); +#ifdef CONFIG_LEGACY_PTYS + __unregister_tty_driver(ve->pty_driver); + __unregister_tty_driver(ve->pty_slave_driver); +#endif +#ifdef CONFIG_UNIX98_PTYS + __unregister_tty_driver(ve->ptm_driver); + __unregister_tty_driver(ve->pts_driver); +#endif + mutex_unlock(&tty_mutex); +} + +static int init_ve_tty_drivers(struct ve_struct *ve) +{ + int err; + + if ((err = alloc_ve_tty_drivers(ve))) + goto err_ttyalloc; + if ((err = register_ve_tty_drivers(ve))) + goto err_ttyreg; + return 0; + +err_ttyreg: + free_ve_tty_drivers(ve); +err_ttyalloc: + return err; +} + +static void fini_ve_tty_drivers(struct ve_struct *ve) +{ + unregister_ve_tty_drivers(ve); + free_ve_tty_drivers(ve); +} + +/* + * Free the termios and termios_locked structures because + * we don't want to get memory leaks when modular tty + * drivers are removed from the kernel. + */ +static void clear_termios(struct tty_driver *driver) +{ + int i; + struct ktermios *tp; + + if (driver->termios == NULL) + return; + for (i = 0; i < driver->num; i++) { + tp = driver->termios[i]; + if (tp) { + driver->termios[i] = NULL; + kfree(tp); + } + tp = driver->termios_locked[i]; + if (tp) { + driver->termios_locked[i] = NULL; + kfree(tp); + } + } +} + + +/********************************************************************** + ********************************************************************** + * + * Pieces of VE network + * + ********************************************************************** + **********************************************************************/ + +#ifdef CONFIG_NET +#include +#include +#include +#include +#include +#include +#endif + +static int ve_dev_add(envid_t veid, char *dev_name) +{ + struct net_device *dev; + struct ve_struct *dst_ve; + struct net *dst_net; + int err = -ESRCH; + + dst_ve = get_ve_by_id(veid); + if (dst_ve == NULL) + goto out; + + dst_net = dst_ve->ve_ns->net_ns; + + rtnl_lock(); + read_lock(&dev_base_lock); + dev = __dev_get_by_name(&init_net, dev_name); + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = __dev_change_net_namespace(dev, dst_net, dev_name, + get_ve0(), dst_ve, get_exec_ub()); +out_unlock: + rtnl_unlock(); + real_put_ve(dst_ve); + + if (dev == NULL) + printk(KERN_WARNING "%s: device %s not found\n", + __func__, dev_name); +out: + return err; +} + +static int ve_dev_del(envid_t veid, char *dev_name) +{ + struct net_device *dev; + struct ve_struct *src_ve; + struct net *src_net; + int err = -ESRCH; + + src_ve = get_ve_by_id(veid); + if (src_ve == NULL) + goto out; + + src_net = src_ve->ve_ns->net_ns; + + rtnl_lock(); + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(src_net, dev_name); + read_unlock(&dev_base_lock); + if (dev == NULL) + goto out_unlock; + + err = __dev_change_net_namespace(dev, &init_net, dev_name, + src_ve, get_ve0(), netdev_bc(dev)->owner_ub); +out_unlock: + rtnl_unlock(); + real_put_ve(src_ve); + + if (dev == NULL) + printk(KERN_WARNING "%s: device %s not found\n", + __func__, dev_name); +out: + return err; +} + +int 
real_ve_dev_map(envid_t veid, int op, char *dev_name) +{ + if (!capable(CAP_SETVEID)) + return -EPERM; + switch (op) { + case VE_NETDEV_ADD: + return ve_dev_add(veid, dev_name); + case VE_NETDEV_DEL: + return ve_dev_del(veid, dev_name); + default: + return -EINVAL; + } +} + +static void ve_mapped_devs_cleanup(struct ve_struct *ve) +{ + struct net *net = ve->ve_ns->net_ns; + struct net_device *dev, *next; + int rv; + + rtnl_lock(); + for_each_netdev_safe(net, dev, next) { + /* Ignore unmoveable devices (i.e. loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + rv = __dev_change_net_namespace(dev, &init_net, dev->name, + ve, get_ve0(), netdev_bc(dev)->owner_ub); + if (rv < 0) + unregister_netdevice(dev); + } + rtnl_unlock(); +} + + +/********************************************************************** + ********************************************************************** + * + * VE information via /proc + * + ********************************************************************** + **********************************************************************/ +#ifdef CONFIG_PROC_FS +#if BITS_PER_LONG == 32 +#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) +#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" +#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" +#else +#define VESTAT_LINE_WIDTH (12 * 21) +#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" +#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" +#endif + +static int vestat_seq_show(struct seq_file *m, void *v) +{ + struct list_head *entry; + struct ve_struct *ve; + struct ve_struct *curve; + int cpu; + unsigned long user_ve, nice_ve, system_ve; + unsigned long long uptime; + cycles_t uptime_cycles, idle_time, strv_time, used; + + entry = (struct list_head *)v; + ve = list_entry(entry, struct ve_struct, ve_list); + + curve = get_exec_env(); + if (entry == ve_list_head.next || + (!ve_is_super(curve) && ve == curve)) { + /* print header */ + seq_printf(m, "%-*s\n", + VESTAT_LINE_WIDTH - 1, + "Version: 2.2"); + seq_printf(m, VESTAT_HEAD_FMT, "VEID", + "user", "nice", "system", + "uptime", "idle", + "strv", "uptime", "used", + "maxlat", "totlat", "numsched"); + } + + if (ve == get_ve0()) + return 0; + + user_ve = nice_ve = system_ve = 0; + idle_time = strv_time = used = 0; + + for_each_online_cpu(cpu) { + struct ve_cpu_stats *st; + + st = VE_CPU_STATS(ve, cpu); + user_ve += st->user; + nice_ve += st->nice; + system_ve += st->system; + used += st->used_time; + idle_time += ve_sched_get_idle_time(ve, cpu); + } + uptime_cycles = get_cycles() - ve->start_cycles; + uptime = get_jiffies_64() - ve->start_jiffies; + + seq_printf(m, VESTAT_LINE_FMT, ve->veid, + user_ve, nice_ve, system_ve, + (unsigned long long)uptime, + (unsigned long long)idle_time, + (unsigned long long)strv_time, + (unsigned long long)uptime_cycles, + (unsigned long long)used, + (unsigned long long)ve->sched_lat_ve.last.maxlat, + (unsigned long long)ve->sched_lat_ve.last.totlat, + ve->sched_lat_ve.last.count); + return 0; +} + +static void *ve_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ve_struct *curve; + struct list_head *entry; + loff_t l; + + curve = get_exec_env(); + read_lock(&ve_list_lock); + if (!ve_is_super(curve)) { + if (*pos != 0) + return NULL; + return curve; + } + + l = *pos; + list_for_each(entry, &ve_list_head) { + if (l == 0) + return entry; + l--; + } + return NULL; +} + 
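/*
 * Editorial illustration (not part of the original patch): a minimal
 * user-space sketch for reading the /proc/vz/vestat file produced by
 * vestat_seq_show() above.  It parses only the first five columns
 * (VEID, user, nice, system, uptime) as laid out by VESTAT_LINE_FMT and
 * skips the two header lines; everything else about the caller's setup is
 * assumed.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/vz/vestat", "r");
	char line[1024];
	unsigned int veid;
	unsigned long user_j, nice_j, system_j;
	unsigned long long uptime_j;

	if (!f) {
		perror("/proc/vz/vestat");
		return 1;
	}
	fgets(line, sizeof(line), f);		/* "Version: 2.2" line */
	fgets(line, sizeof(line), f);		/* column header line */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%u %lu %lu %lu %llu",
			   &veid, &user_j, &nice_j, &system_j, &uptime_j) == 5)
			printf("CT %u: user %lu nice %lu system %lu uptime %llu\n",
			       veid, user_j, nice_j, system_j, uptime_j);
	}
	fclose(f);
	return 0;
}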
+static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct list_head *entry; + + entry = (struct list_head *)v; + if (!ve_is_super(get_exec_env())) + return NULL; + (*pos)++; + return entry->next == &ve_list_head ? NULL : entry->next; +} + +static void ve_seq_stop(struct seq_file *m, void *v) +{ + read_unlock(&ve_list_lock); +} + +static struct seq_operations vestat_seq_op = { + .start = ve_seq_start, + .next = ve_seq_next, + .stop = ve_seq_stop, + .show = vestat_seq_show +}; + +static int vestat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vestat_seq_op); +} + +static struct file_operations proc_vestat_operations = { + .open = vestat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static int vz_version_show(struct seq_file *file, void* v) +{ + static const char ver[] = VZVERSION "\n"; + + return seq_puts(file, ver); +} + +static int vz_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, vz_version_show, NULL); +} + +static struct file_operations proc_vz_version_oparations = { + .open = vz_version_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline unsigned long ve_used_mem(struct user_beancounter *ub) +{ + extern int glob_ve_meminfo; + return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held : + ub->ub_parms[UB_PRIVVMPAGES].held ; +} + +static inline void ve_mi_replace(struct meminfo *mi) +{ +#ifdef CONFIG_BEANCOUNTERS + struct user_beancounter *ub; + unsigned long meminfo_val; + unsigned long nodettram; + unsigned long usedmem; + + meminfo_val = get_exec_env()->meminfo_val; + + if(!meminfo_val) + return; /* No virtualization */ + + nodettram = mi->si.totalram; + ub = current->mm->mm_ub; + usedmem = ve_used_mem(ub); + + memset(mi, 0, sizeof(*mi)); + + mi->si.totalram = (meminfo_val > nodettram) ? + nodettram : meminfo_val; + mi->si.freeram = (mi->si.totalram > usedmem) ? 
+ (mi->si.totalram - usedmem) : 0; +#else + return; +#endif +} + +static int meminfo_call(struct vnotifier_block *self, + unsigned long event, void *arg, int old_ret) +{ + if (event != VIRTINFO_MEMINFO) + return old_ret; + + ve_mi_replace((struct meminfo *)arg); + + return NOTIFY_OK; +} + + +static struct vnotifier_block meminfo_notifier_block = { + .notifier_call = meminfo_call +}; + +static int __init init_vecalls_proc(void) +{ + struct proc_dir_entry *de; + + de = create_proc_glob_entry_mod("vz/vestat", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + if (de == NULL) { + /* create "vz" subdirectory, if not exist */ + (void) create_proc_glob_entry("vz", + S_IFDIR|S_IRUGO|S_IXUGO, NULL); + de = create_proc_glob_entry_mod("vz/vestat", + S_IFREG|S_IRUSR, NULL, THIS_MODULE); + } + if (de) + de->proc_fops = &proc_vestat_operations; + else + printk(KERN_WARNING + "VZMON: can't make vestat proc entry\n"); + + de = create_proc_entry_mod("vz/devperms", S_IFREG | S_IRUSR, NULL, + THIS_MODULE); + if (de) + de->proc_fops = &proc_devperms_ops; + else + printk(KERN_WARNING + "VZMON: can't make devperms proc entry\n"); + + + de = create_proc_entry_mod("vz/version", S_IFREG | 0444, NULL, + THIS_MODULE); + if (de) + de->proc_fops = &proc_vz_version_oparations; + else + printk(KERN_WARNING + "VZMON: can't make version proc entry\n"); + + virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); + + return 0; +} + +static void fini_vecalls_proc(void) +{ + remove_proc_entry("vz/version", NULL); + remove_proc_entry("vz/devperms", NULL); + remove_proc_entry("vz/vestat", NULL); + virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); +} +#else +#define init_vecalls_proc() (0) +#define fini_vecalls_proc() do { } while (0) +#endif /* CONFIG_PROC_FS */ + + +/********************************************************************** + ********************************************************************** + * + * User ctl + * + ********************************************************************** + **********************************************************************/ + +int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int err; + + err = -ENOTTY; + switch(cmd) { + case VZCTL_MARK_ENV_TO_DOWN: { + /* Compatibility issue */ + err = 0; + } + break; + case VZCTL_SETDEVPERMS: { + /* Device type was mistakenly declared as dev_t + * in the old user-kernel interface. + * That's wrong, dev_t is a kernel internal type. + * I use `unsigned' not having anything better in mind. 
+ * 2001/08/11 SAW */ + struct vzctl_setdevperms s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_setdevperms(s.veid, s.type, + new_decode_dev(s.dev), s.mask); + } + break; +#ifdef CONFIG_INET + case VZCTL_VE_NETDEV: { + struct vzctl_ve_netdev d; + char *s; + err = -EFAULT; + if (copy_from_user(&d, (void __user *)arg, sizeof(d))) + break; + err = -ENOMEM; + s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); + if (s == NULL) + break; + err = -EFAULT; + if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { + s[IFNAMSIZ] = 0; + err = real_ve_dev_map(d.veid, d.op, s); + } + kfree(s); + } + break; +#endif + case VZCTL_ENV_CREATE: { + struct vzctl_env_create s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = real_env_create(s.veid, s.flags, s.class_id, + NULL, 0); + } + break; + case VZCTL_ENV_CREATE_DATA: { + struct vzctl_env_create_data s; + env_create_param_t *data; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err=-EINVAL; + if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || + s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || + s.data == 0) + break; + err = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + break; + + err = -EFAULT; + if (copy_from_user(data, (void __user *)s.data, + s.datalen)) + goto free_data; + err = real_env_create(s.veid, s.flags, s.class_id, + data, s.datalen); +free_data: + kfree(data); + } + break; + case VZCTL_GET_CPU_STAT: { + struct vzctl_cpustatctl s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_get_cpu_stat(s.veid, s.cpustat); + } + break; + case VZCTL_VE_MEMINFO: { + struct vzctl_ve_meminfo s; + err = -EFAULT; + if (copy_from_user(&s, (void __user *)arg, sizeof(s))) + break; + err = ve_set_meminfo(s.veid, s.val); + } + break; + } + return err; +} + +#ifdef CONFIG_COMPAT +int compat_vzcalls_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err; + + switch(cmd) { + case VZCTL_GET_CPU_STAT: { + /* FIXME */ + } + case VZCTL_COMPAT_ENV_CREATE_DATA: { + struct compat_vzctl_env_create_data cs; + struct vzctl_env_create_data __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.flags, &s->flags) || + put_user(cs.class_id, &s->class_id) || + put_user(compat_ptr(cs.data), &s->data) || + put_user(cs.datalen, &s->datalen)) + break; + err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA, + (unsigned long)s); + break; + } +#ifdef CONFIG_NET + case VZCTL_COMPAT_VE_NETDEV: { + struct compat_vzctl_ve_netdev cs; + struct vzctl_ve_netdev __user *s; + + s = compat_alloc_user_space(sizeof(*s)); + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + + if (put_user(cs.veid, &s->veid) || + put_user(cs.op, &s->op) || + put_user(compat_ptr(cs.dev_name), &s->dev_name)) + break; + err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s); + break; + } +#endif + case VZCTL_COMPAT_VE_MEMINFO: { + struct compat_vzctl_ve_meminfo cs; + err = -EFAULT; + if (copy_from_user(&cs, (void *)arg, sizeof(cs))) + break; + err = ve_set_meminfo(cs.veid, cs.val); + break; + } + default: + err = vzcalls_ioctl(file, cmd, arg); + break; + } + return err; +} +#endif + +static struct vzioctlinfo vzcalls = { + .type = VZCTLTYPE, + .ioctl = vzcalls_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_vzcalls_ioctl, +#endif + .owner = THIS_MODULE, +}; + + 
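/*
 * Editorial illustration (not part of the original patch): driving
 * VZCTL_ENV_CREATE through the vzctl character device registered by
 * kernel/ve/vzdev.c further below.  The header name <linux/vzcalluser.h>,
 * the /dev/vzctl node path and the VE_CREATE flag being exported to user
 * space are assumptions about the surrounding user-space environment; the
 * structure fields match the vzctl_env_create usage in vzcalls_ioctl() above.
 * On success the calling process is moved into the new container and the
 * ioctl returns its VEID.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vzcalluser.h>	/* assumed: vzctl_env_create, VZCTL_ENV_CREATE, VE_CREATE */

int main(void)
{
	struct vzctl_env_create req = {
		.veid     = 1234,	/* hypothetical container id */
		.flags    = VE_CREATE,
		.class_id = 0,
	};
	int fd = open("/dev/vzctl", O_RDWR);

	if (fd < 0) {
		perror("open /dev/vzctl");
		return 1;
	}
	if (ioctl(fd, VZCTL_ENV_CREATE, &req) < 0)
		perror("VZCTL_ENV_CREATE");
	close(fd);
	return 0;
}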
+/********************************************************************** + ********************************************************************** + * + * Init/exit stuff + * + ********************************************************************** + **********************************************************************/ + +static int __init init_vecalls_symbols(void) +{ + KSYMRESOLVE(real_do_env_free); + KSYMMODRESOLVE(vzmon); + return 0; +} + +static void fini_vecalls_symbols(void) +{ + KSYMMODUNRESOLVE(vzmon); + KSYMUNRESOLVE(real_do_env_free); +} + +static inline __init int init_vecalls_ioctls(void) +{ + vzioctl_register(&vzcalls); + return 0; +} + +static inline void fini_vecalls_ioctls(void) +{ + vzioctl_unregister(&vzcalls); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *table_header; + +static ctl_table kernel_table[] = { + { + .procname = "ve_allow_kthreads", + .data = &ve_allow_kthreads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 } +}; + +static ctl_table root_table[] = { + {CTL_KERN, "kernel", NULL, 0, 0555, kernel_table}, + { 0 } +}; + +static int init_vecalls_sysctl(void) +{ + table_header = register_sysctl_table(root_table); + if (!table_header) + return -ENOMEM ; + return 0; +} + +static void fini_vecalls_sysctl(void) +{ + unregister_sysctl_table(table_header); +} +#else +static int init_vecalls_sysctl(void) { return 0; } +static void fini_vecalls_sysctl(void) { ; } +#endif + +static int __init vecalls_init(void) +{ + int err; + + err = init_vecalls_sysctl(); + if (err) + goto out_vzmond; + + err = init_vzmond(); + if (err < 0) + goto out_sysctl; + + err = init_vecalls_symbols(); + if (err < 0) + goto out_sym; + + err = init_vecalls_proc(); + if (err < 0) + goto out_proc; + + err = init_vecalls_ioctls(); + if (err < 0) + goto out_ioctls; + + return 0; + +out_ioctls: + fini_vecalls_proc(); +out_proc: + fini_vecalls_symbols(); +out_sym: + fini_vzmond(); +out_sysctl: + fini_vecalls_sysctl(); +out_vzmond: + return err; +} + +static void vecalls_exit(void) +{ + fini_vecalls_ioctls(); + fini_vecalls_proc(); + fini_vecalls_symbols(); + fini_vzmond(); + fini_vecalls_sysctl(); +} + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Control"); +MODULE_LICENSE("GPL v2"); + +module_init(vecalls_init) +module_exit(vecalls_exit) diff -uprN linux-2.6.24/kernel/ve/veowner.c linux-2.6.24.ovz/kernel/ve/veowner.c --- linux-2.6.24/kernel/ve/veowner.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/veowner.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,237 @@ +/* + * kernel/ve/veowner.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +void prepare_ve0_process(struct task_struct *tsk) +{ + VE_TASK_INFO(tsk)->exec_env = get_ve0(); + VE_TASK_INFO(tsk)->owner_env = get_ve0(); + VE_TASK_INFO(tsk)->sleep_time = 0; + VE_TASK_INFO(tsk)->wakeup_stamp = 0; + VE_TASK_INFO(tsk)->sched_time = 0; + seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); + + if (tsk->pid) { + list_add_rcu(&tsk->ve_task_info.vetask_list, + &get_ve0()->vetask_lh); + atomic_inc(&get_ve0()->pcounter); + } +} + +/* + * ------------------------------------------------------------------------ + * proc entries + * ------------------------------------------------------------------------ + */ + +#ifdef CONFIG_PROC_FS +static void proc_move(struct proc_dir_entry *ddir, + struct proc_dir_entry *sdir, + const char *name) +{ + struct proc_dir_entry **p, *q; + int len; + + len = strlen(name); + for (p = &sdir->subdir, q = *p; q != NULL; p = &q->next, q = *p) + if (proc_match(len, name, q)) + break; + if (q == NULL) + return; + *p = q->next; + q->parent = ddir; + q->next = ddir->subdir; + ddir->subdir = q; +} +static void prepare_proc_misc(void) +{ + static char *table[] = { + "loadavg", + "uptime", + "meminfo", + "version", + "stat", + "filesystems", + "locks", + "swaps", + "mounts", + "net", + "cpuinfo", + "sysvipc", + "sys", + "fs", + "vz", + "cmdline", + "vmstat", + "modules", + NULL, + }; + char **p; + + for (p = table; *p != NULL; p++) + proc_move(&proc_root, ve0.proc_root, *p); +} +int prepare_proc(void) +{ + struct ve_struct *envid; + struct proc_dir_entry *de; + struct proc_dir_entry *ve_root; + + envid = set_exec_env(&ve0); + ve_root = ve0.proc_root->subdir; + /* move the whole tree to be visible in VE0 only */ + ve0.proc_root->subdir = proc_root.subdir; + for (de = ve0.proc_root->subdir; de->next != NULL; de = de->next) + de->parent = ve0.proc_root; + de->parent = ve0.proc_root; + de->next = ve_root; + + /* move back into the global scope some specific entries */ + proc_root.subdir = NULL; + prepare_proc_misc(); + proc_mkdir("vz", NULL); +#ifdef CONFIG_SYSVIPC + proc_mkdir("sysvipc", NULL); +#endif + proc_root_fs = proc_mkdir("fs", NULL); + /* XXX proc_tty_init(); */ + + /* XXX process inodes */ + + (void)set_exec_env(envid); + + (void)create_proc_glob_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); + return 0; +} + +static struct proc_dir_entry ve0_proc_root = { + .name = "/proc", + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2 +}; + +void prepare_ve0_proc_root(void) +{ + ve0.proc_root = &ve0_proc_root; +} +#endif + +/* + * ------------------------------------------------------------------------ + * Virtualized sysctl + * ------------------------------------------------------------------------ + */ +extern int ve_area_access_check; +#ifdef CONFIG_INET +static ctl_table vz_ipv4_route_table[] = { + { + .procname = "src_check", + .data = &ip_rt_src_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { 0 } +}; +static ctl_table vz_ipv4_table[] = { + {NET_IPV4_ROUTE, "route", NULL, 0, 0555, vz_ipv4_route_table}, + { 0 } +}; +static ctl_table vz_net_table[] = { + {NET_IPV4, "ipv4", NULL, 0, 0555, vz_ipv4_table}, + { 0 } +}; +#endif +static ctl_table vz_fs_table[] = { + { + .procname = "ve-area-access-check", + .data = &ve_area_access_check, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { 0 } 
+}; +static ctl_table root_table2[] = { +#ifdef CONFIG_INET + {CTL_NET, "net", NULL, 0, 0555, vz_net_table}, +#endif + {CTL_FS, "fs", NULL, 0, 0555, vz_fs_table}, + { 0 } +}; +int prepare_sysctl(void) +{ + struct ve_struct *envid; + + envid = set_exec_env(&ve0); + register_sysctl_table(root_table2); + (void)set_exec_env(envid); + return 0; +} + +void prepare_ve0_sysctl(void) +{ + INIT_LIST_HEAD(&ve0.sysctl_lh); +} + +/* + * ------------------------------------------------------------------------ + * XXX init_ve_system + * ------------------------------------------------------------------------ + */ + +void init_ve_system(void) +{ + struct task_struct *init_entry; + struct ve_struct *ve; + + ve = get_ve0(); + + init_entry = init_pid_ns.child_reaper; + /* if ve_move_task to VE0 (e.g. in cpt code) * + * occurs, ve_cap_bset on VE0 is required */ + ve->ve_cap_bset = CAP_INIT_EFF_SET; + +#ifdef CONFIG_INET + ve->_ipv4_devconf = &ipv4_devconf; + ve->_ipv4_devconf_dflt = &ipv4_devconf_dflt; +#endif + + read_lock(&init_entry->fs->lock); + ve->fs_rootmnt = init_entry->fs->rootmnt; + ve->fs_root = init_entry->fs->root; + read_unlock(&init_entry->fs->lock); + + /* common prepares */ +#ifdef CONFIG_PROC_FS + prepare_proc(); +#endif + prepare_sysctl(); +} diff -uprN linux-2.6.24/kernel/ve/vzdev.c linux-2.6.24.ovz/kernel/ve/vzdev.c --- linux-2.6.24/kernel/ve/vzdev.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/vzdev.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,154 @@ +/* + * kernel/ve/vzdev.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VZCTL_MAJOR 126 +#define VZCTL_NAME "vzctl" + +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo Interface"); +MODULE_LICENSE("GPL v2"); + +static LIST_HEAD(ioctls); +static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; + +static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd) +{ + struct vzioctlinfo *h; + + spin_lock(&ioctl_lock); + list_for_each_entry(h, &ioctls, list) { + if (h->type == _IOC_TYPE(cmd)) + goto found; + } + h = NULL; +found: + if (h && !try_module_get(h->owner)) + h = NULL; + spin_unlock(&ioctl_lock); + return h; +} + +static void vzctl_put_handler(struct vzioctlinfo *h) +{ + if (!h) + return; + + module_put(h->owner); +} + +long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOTTY; + h = vzctl_get_handler(cmd); + if (h && h->ioctl) + err = (*h->ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vzioctlinfo *h; + int err; + + err = -ENOIOCTLCMD; + h = vzctl_get_handler(cmd); + if (h && h->compat_ioctl) + err = (*h->compat_ioctl)(file, cmd, arg); + vzctl_put_handler(h); + + return err; +} + +void vzioctl_register(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_add(&inf->list, &ioctls); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_register); + +void vzioctl_unregister(struct vzioctlinfo *inf) +{ + spin_lock(&ioctl_lock); + list_del_init(&inf->list); + spin_unlock(&ioctl_lock); +} +EXPORT_SYMBOL(vzioctl_unregister); + +/* + * Init/exit stuff. 
+ */ +static struct file_operations vzctl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vzctl_ioctl, + .compat_ioctl = compat_vzctl_ioctl, +}; + +static struct class *vzctl_class; + +static void __exit vzctl_exit(void) +{ + class_device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); + class_destroy(vzctl_class); + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +} + +static int __init vzctl_init(void) +{ + int ret; + struct class_device *class_err; + + ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); + if (ret < 0) + goto out; + + vzctl_class = class_create(THIS_MODULE, "vzctl"); + if (IS_ERR(vzctl_class)) { + ret = PTR_ERR(vzctl_class); + goto out_cleandev; + } + + class_err = class_device_create(vzctl_class, NULL, MKDEV(VZCTL_MAJOR, 0), + NULL, VZCTL_NAME); + if (IS_ERR(class_err)) { + ret = PTR_ERR(class_err); + goto out_rmclass; + } + + goto out; + +out_rmclass: + class_destroy(vzctl_class); +out_cleandev: + unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); +out: + return ret; +} + +module_init(vzctl_init) +module_exit(vzctl_exit); diff -uprN linux-2.6.24/kernel/ve/vzevent.c linux-2.6.24.ovz/kernel/ve/vzevent.c --- linux-2.6.24/kernel/ve/vzevent.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/vzevent.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define NETLINK_UEVENT 31 +#define VZ_EVGRP_ALL 0x01 + +/* + * NOTE: the original idea was to send events via kobject_uevent(), + * however, it turns out that it has negative consequences like + * start of /sbin/hotplug which tries to react on our events in inadequate manner. + */ + +static struct sock *vzev_sock; + +static char *action_to_string(int action) +{ + switch (action) { + case KOBJ_MOUNT: + return "ve-mount"; + case KOBJ_UMOUNT: + return "ve-umount"; + case KOBJ_START: + return "ve-start"; + case KOBJ_STOP: + return "ve-stop"; + default: + return NULL; + } +} + +static int do_vzevent_send(int event, char *msg, int len) +{ + struct sk_buff *skb; + char *buf, *action; + int alen; + + action = action_to_string(event); + alen = strlen(action); + + skb = alloc_skb(len + 1 + alen, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + buf = skb_put(skb, len + 1 + alen); + memcpy(buf, action, alen); + buf[alen] = '@'; + memcpy(buf + alen + 1, msg, len); + (void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL); + return 0; +} + +int vzevent_send(int event, const char *attrs_fmt, ...) 
+{ + va_list args; + int len, err; + struct ve_struct *ve; + char *page; + + err = -ENOMEM; + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + goto out; + + va_start(args, attrs_fmt); + len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args); + va_end(args); + + ve = set_exec_env(get_ve0()); + err = do_vzevent_send(event, page, len); + (void)set_exec_env(ve); + free_page((unsigned long)page); +out: + return err; +} +EXPORT_SYMBOL(vzevent_send); + +static int ve_start(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(KOBJ_START, "%d", ve->veid); + return 0; +} + +static void ve_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + vzevent_send(KOBJ_STOP, "%d", ve->veid); +} + +static struct ve_hook ve_start_stop_hook = { + .init = ve_start, + .fini = ve_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_AFTERALL, +}; + +static int __init init_vzevent(void) +{ + vzev_sock = netlink_kernel_create(NETLINK_UEVENT, 0, NULL, THIS_MODULE); + if (vzev_sock == NULL) + return -ENOMEM; + ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook); + return 0; +} + +static void __exit exit_vzevent(void) +{ + ve_hook_unregister(&ve_start_stop_hook); + sock_release(vzev_sock->sk_socket); +} + +MODULE_LICENSE("GPL"); + +module_init(init_vzevent); +module_exit(exit_vzevent); diff -uprN linux-2.6.24/kernel/ve/vzwdog.c linux-2.6.24.ovz/kernel/ve/vzwdog.c --- linux-2.6.24/kernel/ve/vzwdog.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/ve/vzwdog.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,281 @@ +/* + * kernel/ve/vzwdog.c + * + * Copyright (C) 2000-2005 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Staff regading kernel thread polling VE validity */ +static int sleep_timeout = 60; +static struct task_struct *wdog_thread_tsk; + +extern void show_mem(void); + +static struct file *intr_file; +static char page[PAGE_SIZE]; + +static void parse_irq_list(int len) +{ + int i, k, skip; + for (i = 0; i < len; ) { + k = i; + while (i < len && page[i] != '\n' && page[i] != ':') + i++; + skip = 0; + if (i < len && page[i] != '\n') { + i++; /* skip ':' */ + while (i < len && (page[i] == ' ' || page[i] == '0')) + i++; + skip = (i < len && (page[i] < '0' || page[i] > '9')); + while (i < len && page[i] != '\n') + i++; + } + if (!skip) + printk("%.*s\n", i - k, page + k); + if (i < len) + i++; /* skip '\n' */ + } +} + +extern loff_t vfs_llseek(struct file *file, loff_t, int); +extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *); +extern struct file *filp_open(const char *filename, int flags, int mode); +extern int filp_close(struct file *filp, fl_owner_t id); +static void show_irq_list(void) +{ + mm_segment_t fs; + int r; + + fs = get_fs(); + set_fs(KERNEL_DS); + vfs_llseek(intr_file, 0, 0); + r = vfs_read(intr_file, (void __user *)page, sizeof(page), + &intr_file->f_pos); + set_fs(fs); + + if (r > 0) + parse_irq_list(r); +} + +static void show_alloc_latency(void) +{ + static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { + "A0", + "L0", + "H0", + "L1", + "H1" + }; + int i; + + printk("lat: "); + for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { + struct kstat_lat_struct *p; + cycles_t maxlat, avg0, avg1, avg2; + + p = &kstat_glob.alloc_lat[i]; + spin_lock_irq(&kstat_glb_lock); + 
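+		/* copy this allocation type's latency snapshot out under
+		 * kstat_glb_lock; the values are printed after unlocking */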
maxlat = p->last.maxlat; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("%s %Lu (%Lu %Lu %Lu)", + alloc_descr[i], + (unsigned long long)maxlat, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); + } + printk("\n"); +} + +static void show_schedule_latency(void) +{ + struct kstat_lat_pcpu_struct *p; + cycles_t maxlat, totlat, avg0, avg1, avg2; + unsigned long count; + + p = &kstat_glob.sched_lat; + spin_lock_irq(&kstat_glb_lock); + maxlat = p->last.maxlat; + totlat = p->last.totlat; + count = p->last.count; + avg0 = p->avg[0]; + avg1 = p->avg[1]; + avg2 = p->avg[2]; + spin_unlock_irq(&kstat_glb_lock); + + printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", + (unsigned long long)maxlat, + (unsigned long long)totlat, + count, + (unsigned long long)avg0, + (unsigned long long)avg1, + (unsigned long long)avg2); +} + +static void show_header(void) +{ + struct timeval tv; + + do_gettimeofday(&tv); + preempt_disable(); + printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", + tv.tv_sec, (long)tv.tv_usec, + (unsigned long long)get_jiffies_64(), + smp_processor_id()); +#ifdef CONFIG_FAIRSCHED + printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", + cycles_per_jiffy, HZ); +#else + printk("*** jiffies_per_second %u ***\n", HZ); +#endif + preempt_enable(); +} + +static void show_pgdatinfo(void) +{ + pg_data_t *pgdat; + + printk("pgdat:"); + for_each_online_pgdat(pgdat) { + printk(" %d: %lu,%lu,%lu", + pgdat->node_id, + pgdat->node_start_pfn, + pgdat->node_present_pages, + pgdat->node_spanned_pages); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(",%p", pgdat->node_mem_map); +#endif + } + printk("\n"); +} + +static void show_diskio(void) +{ + struct gendisk *gd; + char buf[BDEVNAME_SIZE]; + + printk("disk_io: "); + + list_for_each_entry(gd, &block_subsys.list, kobj.entry) { + char *name; + name = disk_name(gd, 0, buf); + if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && + isdigit(name[4])) + continue; + if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && + isdigit(name[3])) + continue; + printk("(%u,%u) %s r(%lu %lu %lu) w(%lu %lu %lu)\n", + gd->major, gd->first_minor, + name, + disk_stat_read(gd, ios[READ]), + disk_stat_read(gd, sectors[READ]), + disk_stat_read(gd, merges[READ]), + disk_stat_read(gd, ios[WRITE]), + disk_stat_read(gd, sectors[WRITE]), + disk_stat_read(gd, merges[WRITE])); + } + + printk("\n"); +} + +static void show_nrprocs(void) +{ + unsigned long _nr_running, _nr_sleeping, + _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; + + _nr_running = nr_running(); + _nr_unint = nr_uninterruptible(); + _nr_sleeping = nr_sleeping(); + _nr_zombie = nr_zombie; + _nr_dead = atomic_read(&nr_dead); + _nr_stopped = nr_stopped(); + + printk("VEnum: %d, proc R %lu, S %lu, D %lu, " + "Z %lu, X %lu, T %lu (tot %d)\n", + nr_ve, _nr_running, _nr_sleeping, _nr_unint, + _nr_zombie, _nr_dead, _nr_stopped, nr_threads); +} + +static void wdog_print(void) +{ + show_header(); + show_irq_list(); + show_pgdatinfo(); + show_mem(); + show_diskio(); + show_schedule_latency(); + show_alloc_latency(); + show_nrprocs(); +} + +static int wdog_loop(void* data) +{ + while (1) { + wdog_print(); + try_to_freeze(); + + set_current_state(TASK_UNINTERRUPTIBLE); + if (kthread_should_stop()) + break; + schedule_timeout(sleep_timeout*HZ); + } + return 0; +} + +static int __init wdog_init(void) +{ + struct file *file; + + file = filp_open("/proc/interrupts", 0, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + 
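+	/* keep /proc/interrupts open for the module's lifetime; wdog_loop()
+	 * re-reads it through show_irq_list() on every pass */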
intr_file = file; + + wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog"); + if (IS_ERR(wdog_thread_tsk)) { + filp_close(intr_file, NULL); + return -EBUSY; + } + return 0; +} + +static void __exit wdog_exit(void) +{ + kthread_stop(wdog_thread_tsk); + filp_close(intr_file, NULL); +} + +module_param(sleep_timeout, int, 0660); +MODULE_AUTHOR("SWsoft "); +MODULE_DESCRIPTION("Virtuozzo WDOG"); +MODULE_LICENSE("GPL v2"); + +module_init(wdog_init) +module_exit(wdog_exit) diff -uprN linux-2.6.24/kernel/vzfairsched.c linux-2.6.24.ovz/kernel/vzfairsched.c --- linux-2.6.24/kernel/vzfairsched.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.24.ovz/kernel/vzfairsched.c 2008-03-25 18:53:59.000000000 -0500 @@ -0,0 +1,648 @@ +/* + * Fair Scheduler + * + * Copyright (C) 2000-2008 SWsoft + * All rights reserved. + * + * Licensing governed by "linux/COPYING.SWsoft" file. + * + */ + +#include +#include +#include +#include + +struct fairsched_node fairsched_init_node = { + .id = FAIRSCHED_INIT_NODE_ID, + .tg = &init_task_group, +#ifdef CONFIG_VE + .owner_env = get_ve0(), +#endif + .weight = 1, +}; + +static DEFINE_MUTEX(fairsched_mutex); + +/* list protected with fairsched_mutex */ +static LIST_HEAD(fairsched_node_head); +static int fairsched_nr_nodes; + +void __init fairsched_init_early(void) +{ + list_add(&fairsched_init_node.nodelist, &fairsched_node_head); + fairsched_nr_nodes++; +} + +#define FSCHWEIGHT_BASE 512000 + +/****************************************************************************** + * cfs group shares = FSCHWEIGHT_BASE / fairsched weight + * + * vzctl cpuunits default 1000 + * cfs shares default value is 1024 (see init_task_group_load in sched.c) + * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024 + * ^--- from vzctl + * weight in 1..65535 --> shares in 7..512000 + * shares should be >1 (see comment in sched_group_set_shares function) + *****************************************************************************/ + +static struct fairsched_node *fairsched_find(unsigned int id) +{ + struct fairsched_node *p; + list_for_each_entry(p, &fairsched_node_head, nodelist) { + if (p->id == id) + return p; + } + return NULL; +} + +/****************************************************************************** + * System calls + * + * All do_xxx functions are called under fairsched mutex and after + * capability check. + * + * The binary interfaces follow some other Fair Scheduler implementations + * (although some system call arguments are not needed for our implementation). 
+ *****************************************************************************/ + +static int do_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + goto out; + if (newid < 0 || newid > INT_MAX) + goto out; + + retval = -EBUSY; + if (fairsched_find(newid) != NULL) + goto out; + + retval = -ENOMEM; + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (node == NULL) + goto out; + + node->tg = sched_create_group(); + if (IS_ERR(node->tg)) + goto out_free; + + node->id = newid; + node->weight = weight; + sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); +#ifdef CONFIG_VE + node->owner_env = get_exec_env(); +#endif + list_add(&node->nodelist, &fairsched_node_head); + fairsched_nr_nodes++; + + retval = newid; +out: + return retval; + +out_free: + kfree(node); + return retval; +} + +asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, + unsigned int newid) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mknod(parent, weight, newid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mknod); + +static int do_fairsched_rmnod(unsigned int id) +{ + struct fairsched_node *node; + int retval; + + retval = -EINVAL; + node = fairsched_find(id); + if (node == NULL) + goto out; + if (node == &fairsched_init_node) + goto out; + + retval = -EBUSY; + if (node->refcnt) + goto out; + + list_del(&node->nodelist); + fairsched_nr_nodes--; + + sched_destroy_group(node->tg); + kfree(node); + retval = 0; +out: + return retval; +} + +asmlinkage int sys_fairsched_rmnod(unsigned int id) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rmnod(id); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_rmnod); + +static int do_fairsched_chwt(unsigned int id, unsigned weight) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + if (weight < 1 || weight > FSCHWEIGHT_MAX) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + node->weight = weight; + sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); + + return 0; +} + +asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_chwt(id, weight); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +static int do_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + struct fairsched_node *node; + + if (id == 0) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + return 0; +} + +asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_vcpus(id, vcpus); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_vcpus); + +static int do_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + struct fairsched_node *node; + int retval; + + if (id == 0) + return -EINVAL; + if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31))) + return -EINVAL; + + node = fairsched_find(id); + if (node == NULL) + return -ENOENT; + + retval = -EINVAL; + switch (op) { + case FAIRSCHED_SET_RATE: + 
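+		/* record the requested rate and mark the node rate-limited so
+		 * that FAIRSCHED_GET_RATE can report it back */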
node->rate = rate; + node->rate_limited = 1; + retval = rate; + break; + case FAIRSCHED_DROP_RATE: + node->rate = 0; + node->rate_limited = 0; + retval = 0; + break; + case FAIRSCHED_GET_RATE: + if (node->rate_limited) + retval = node->rate; + else + retval = -ENODATA; + break; + } + return retval; +} + +asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_rate(id, op, rate); + mutex_unlock(&fairsched_mutex); + + return retval; +} + +static int do_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + struct task_struct *p; + struct fairsched_node *node; + int retval; + unsigned flags; + + retval = -ENOENT; + node = fairsched_find(nodeid); + if (node == NULL) + goto out; + + write_lock_irqsave(&tasklist_lock, flags); + retval = -ESRCH; + p = find_task_by_pid(pid); + if (p == NULL) + goto out_unlock; + + get_task_struct(p); + put_task_fairsched_node(p); + p->fsched_node = node; + get_task_fairsched_node(p); + write_unlock_irqrestore(&tasklist_lock, flags); + + smp_wmb(); + sched_move_task(p); + put_task_struct(p); + return 0; + +out_unlock: + write_unlock_irqrestore(&tasklist_lock, flags); +out: + return retval; +} + +asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) +{ + int retval; + + if (!capable(CAP_SETVEID)) + return -EPERM; + + mutex_lock(&fairsched_mutex); + retval = do_fairsched_mvpr(pid, nodeid); + mutex_unlock(&fairsched_mutex); + + return retval; +} +EXPORT_SYMBOL(sys_fairsched_mvpr); + +#ifdef CONFIG_PROC_FS + +/*********************************************************************/ +/* + * proc interface + */ +/*********************************************************************/ + +#include +#include +#include + +struct fairsched_node_dump { + int id; + unsigned weight; + unsigned rate; + int rate_limited; + int nr_pcpu; + int nr_tasks, nr_runtasks; +}; + +struct fairsched_dump { + int len; + struct fairsched_node_dump nodes[0]; +}; + +static struct fairsched_dump *fairsched_do_dump(int compat) +{ + int nr_nodes; + int len; + struct fairsched_dump *dump; + struct fairsched_node *node; + struct fairsched_node_dump *p; + +start: + nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1); + len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]); + dump = ub_vmalloc(len); + if (dump == NULL) + goto out; + + mutex_lock(&fairsched_mutex); + if (ve_is_super(get_exec_env()) && nr_nodes < fairsched_nr_nodes) + goto repeat; + p = dump->nodes; + list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) { + if ((char *)p - (char *)dump >= len) + break; + p->nr_tasks = 0; + p->nr_runtasks = 0; +#ifdef CONFIG_VE + if (!ve_accessible(node->owner_env, get_exec_env())) + continue; + p->nr_tasks = atomic_read(&node->owner_env->pcounter); + p->nr_runtasks = nr_running_ve(node->owner_env); +#endif + p->id = node->id; + p->weight = node->weight; + p->rate = node->rate; + p->rate_limited = node->rate_limited; + p->nr_pcpu = num_online_cpus(); + p++; + } + dump->len = p - dump->nodes; + mutex_unlock(&fairsched_mutex); + +out: + return dump; + +repeat: + mutex_unlock(&fairsched_mutex); + vfree(dump); + goto start; +} + +#define FAIRSCHED_PROC_HEADLINES 2 + +#define FAIRSHED_DEBUG " debug" + +#ifdef CONFIG_VE +/* + * File format is dictated by compatibility reasons. 
+ */ +static int fairsched_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + unsigned vid, nid, pid, r; + + dump = m->private; + p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL); + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.6 debug\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " veid " + " id " + " parent " + "weight " + " rate " + "tasks " + " run " + "cpus" + " " + "flg " + "ready " + " start_tag " + " value " + " delay" + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + vid = nid = pid = 0; + r = (unsigned long)v & 3; + if (p == dump->nodes) { + if (r == 2) + nid = p->id; + } else { + if (!r) + nid = p->id; + else if (r == 1) + vid = pid = p->id; + else + vid = p->id, nid = 1; + } + seq_printf(m, + "%10u " + "%10u %10u %6u %5u %5u %5u %4u" + " " + " %c%c %5u %20Lu %20Lu %20Lu" + "\n", + vid, + nid, + pid, + p->weight, + p->rate, + p->nr_tasks, + p->nr_runtasks, + p->nr_pcpu, + p->rate_limited ? 'L' : '.', + '.', + p->nr_runtasks, + 0ll, 0ll, 0ll); + } + + return 0; +} + +static void *fairsched_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + unsigned long l; + + dump = m->private; + if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES) + return NULL; + if (*pos < FAIRSCHED_PROC_HEADLINES) + return dump->nodes + *pos; + /* guess why... */ + l = (unsigned long)(dump->nodes + + ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3); + l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3; + return (void *)l; +} +static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched_seq_start(m, pos); +} +#endif /* CONFIG_VE */ + +static int fairsched2_seq_show(struct seq_file *m, void *v) +{ + struct fairsched_dump *dump; + struct fairsched_node_dump *p; + + dump = m->private; + p = v; + if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { + if (p == dump->nodes) + seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n"); + else if (p == dump->nodes + 1) + seq_printf(m, + " id " + "weight " + " rate " + " run " + "cpus" +#ifdef FAIRSHED_DEBUG + " " + "flg " + "ready " + " start_tag " + " value " + " delay" +#endif + "\n"); + } else { + p -= FAIRSCHED_PROC_HEADLINES; + seq_printf(m, + "%10u %6u %5u %5u %4u" +#ifdef FAIRSHED_DEBUG + " " + " %c%c %5u %20Lu %20Lu %20Lu" +#endif + "\n", + p->id, + p->weight, + p->rate, + p->nr_runtasks, + p->nr_pcpu +#ifdef FAIRSHED_DEBUG + , + p->rate_limited ? 
'L' : '.', + '.', + p->nr_runtasks, + 0ll, 0ll, 0ll +#endif + ); + } + + return 0; +} + +static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos) +{ + struct fairsched_dump *dump; + + dump = m->private; + if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES) + return NULL; + return dump->nodes + *pos; +} +static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return fairsched2_seq_start(m, pos); +} +static void fairsched2_seq_stop(struct seq_file *m, void *v) +{ +} + +#ifdef CONFIG_VE +static struct seq_operations fairsched_seq_op = { + .start = fairsched_seq_start, + .next = fairsched_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched_seq_show +}; +#endif +static struct seq_operations fairsched2_seq_op = { + .start = fairsched2_seq_start, + .next = fairsched2_seq_next, + .stop = fairsched2_seq_stop, + .show = fairsched2_seq_show +}; +static int fairsched_seq_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + int compat; + +#ifdef CONFIG_VE + compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1); + ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op); +#else + compat = 0; + ret = seq_open(file, &fairsched2_seq_op); +#endif + if (ret) + return ret; + m = file->private_data; + m->private = fairsched_do_dump(compat); + if (m->private == NULL) { + seq_release(inode, file); + ret = -ENOMEM; + } + return ret; +} +static int fairsched_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct fairsched_dump *dump; + + m = file->private_data; + dump = m->private; + m->private = NULL; + vfree(dump); + seq_release(inode, file); + return 0; +} +static struct file_operations proc_fairsched_operations = { + .open = fairsched_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = fairsched_seq_release +}; + +void __init fairsched_init_late(void) +{ + struct proc_dir_entry *entry; +#ifdef CONFIG_VE + entry = create_proc_glob_entry("fairsched", S_IRUGO, NULL); + if (entry) + entry->proc_fops = &proc_fairsched_operations; +#endif + entry = create_proc_glob_entry("fairsched2", S_IRUGO, NULL); + if (entry) + entry->proc_fops = &proc_fairsched_operations; +} + +#else + +void __init fairsched_init_late(void) { } + +#endif /* CONFIG_PROC_FS */ diff -uprN linux-2.6.24/lib/Kconfig.debug linux-2.6.24.ovz/lib/Kconfig.debug --- linux-2.6.24/lib/Kconfig.debug 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/lib/Kconfig.debug 2008-03-25 18:53:59.000000000 -0500 @@ -79,6 +79,14 @@ config HEADERS_CHECK exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in your build tree), to make sure they're suitable. +config SYSRQ_DEBUG + bool "Debugging via sysrq keys" + depends on MAGIC_SYSRQ + help + Say Y if you want to extend the functionality of the magic SysRq key. It will + provide you with debugging facilities such as dumping and + writing memory, resolving symbols, and more.
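Since the new option depends on MAGIC_SYSRQ, a kernel that wants these extra debugging handlers has to enable both symbols; a minimal .config fragment would look roughly like the following (illustrative only, not taken from any shipped configuration):

CONFIG_MAGIC_SYSRQ=y
CONFIG_SYSRQ_DEBUG=y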
+ config DEBUG_KERNEL bool "Kernel debugging" help diff -uprN linux-2.6.24/lib/bust_spinlocks.c linux-2.6.24.ovz/lib/bust_spinlocks.c --- linux-2.6.24/lib/bust_spinlocks.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/lib/bust_spinlocks.c 2008-03-25 18:53:59.000000000 -0500 @@ -12,10 +12,13 @@ #include #include #include - +#include void __attribute__((weak)) bust_spinlocks(int yes) { + if (printk_no_wake) + return; + if (yes) { ++oops_in_progress; } else { diff -uprN linux-2.6.24/lib/kobject.c linux-2.6.24.ovz/lib/kobject.c --- linux-2.6.24/lib/kobject.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/lib/kobject.c 2008-03-25 18:53:59.000000000 -0500 @@ -558,7 +558,7 @@ void kset_init(struct kset * k) INIT_LIST_HEAD(&k->list); spin_lock_init(&k->list_lock); } - +EXPORT_SYMBOL(kset_init); /** * kset_add - add a kset object to the hierarchy. @@ -632,6 +632,14 @@ struct kobject * kset_find_obj(struct ks return ret; } +/** + * subsystem_register - register a subsystem. + * @s: the subsystem we're registering. + * + * Once we register the subsystem, we want to make sure that + * the kset points back to this subsystem for correct usage of + * the rwsem. + */ int subsystem_register(struct kset *s) { return kset_register(s); diff -uprN linux-2.6.24/lib/kobject_uevent.c linux-2.6.24.ovz/lib/kobject_uevent.c --- linux-2.6.24/lib/kobject_uevent.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/lib/kobject_uevent.c 2008-03-25 18:53:59.000000000 -0500 @@ -36,6 +36,8 @@ static const char *kobject_actions[] = { [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", + [KOBJ_START] = "start", + [KOBJ_STOP] = "stop", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", }; diff -uprN linux-2.6.24/mm/filemap.c linux-2.6.24.ovz/mm/filemap.c --- linux-2.6.24/mm/filemap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/filemap.c 2008-03-25 18:53:59.000000000 -0500 @@ -42,6 +42,8 @@ #include +#include + static ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); @@ -121,6 +123,7 @@ void __remove_from_page_cache(struct pag radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + ub_io_release_debug(page); mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); @@ -1733,7 +1736,11 @@ static void __iov_iter_advance_iov(struc const struct iovec *iov = i->iov; size_t base = i->iov_offset; - while (bytes) { + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments. 
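The comment above alludes to the fact that readv()/writev() callers may legitimately pass zero-length iovec segments, so the iterator must be able to step over them rather than stall on them. A minimal user-space illustration using only the standard writev(2) interface follows; the strings and the use of stdout are just for demonstration.

#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        /* The middle segment is deliberately empty; the kernel's iov
         * iterator has to skip it instead of getting stuck on it. */
        struct iovec iov[3] = {
                { .iov_base = "hello ",  .iov_len = 6 },
                { .iov_base = "",        .iov_len = 0 },
                { .iov_base = "world\n", .iov_len = 6 },
        };
        ssize_t n = writev(STDOUT_FILENO, iov, 3);

        printf("wrote %zd bytes\n", n);
        return 0;
}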
+ */ + while (bytes || !iov->iov_len) { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -2251,6 +2258,7 @@ again: cond_resched(); + iov_iter_advance(i, copied); if (unlikely(copied == 0)) { /* * If we were unable to copy any data at all, we must @@ -2264,7 +2272,6 @@ again: iov_iter_single_seg_count(i)); goto again; } - iov_iter_advance(i, copied); pos += copied; written += copied; diff -uprN linux-2.6.24/mm/filemap_xip.c linux-2.6.24.ovz/mm/filemap_xip.c --- linux-2.6.24/mm/filemap_xip.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/filemap_xip.c 2008-03-25 18:53:59.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include /* * We do use our own empty page to avoid interference with other users @@ -195,6 +196,8 @@ __xip_unmap (struct address_space * mapp flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page, vma); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); diff -uprN linux-2.6.24/mm/fremap.c linux-2.6.24.ovz/mm/fremap.c --- linux-2.6.24/mm/fremap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/fremap.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,8 @@ #include #include +#include + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -35,6 +37,7 @@ static void zap_pte(struct mm_struct *mm if (pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page, vma); + pb_remove_ref(page, mm); page_cache_release(page); update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); @@ -61,8 +64,10 @@ static int install_file_pte(struct mm_st if (!pte) goto out; - if (!pte_none(*pte)) + if (!pte_none(*pte)) { zap_pte(mm, vma, addr, pte); + ub_unused_privvm_inc(mm, vma); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); /* @@ -190,10 +195,13 @@ asmlinkage long sys_remap_file_pages(uns */ if (mapping_cap_account_dirty(mapping)) { unsigned long addr; + struct file *file = vma->vm_file; flags &= MAP_NONBLOCK; - addr = mmap_region(vma->vm_file, start, size, + get_file(file); + addr = mmap_region(file, start, size, flags, vma->vm_flags, pgoff, 1); + fput(file); if (IS_ERR_VALUE(addr)) { err = addr; } else { @@ -234,4 +242,5 @@ out: return err; } +EXPORT_SYMBOL_GPL(sys_remap_file_pages); diff -uprN linux-2.6.24/mm/memory.c linux-2.6.24.ovz/mm/memory.c --- linux-2.6.24/mm/memory.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/memory.c 2008-03-25 18:53:59.000000000 -0500 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ #include #include #include +#include #include #include @@ -60,6 +62,11 @@ #include #include +#include +#include +#include +#include + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -103,18 +110,21 @@ void pgd_clear_bad(pgd_t *pgd) pgd_ERROR(*pgd); pgd_clear(pgd); } +EXPORT_SYMBOL_GPL(pgd_clear_bad); void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +EXPORT_SYMBOL_GPL(pud_clear_bad); void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } +EXPORT_SYMBOL_GPL(pmd_clear_bad); /* * Note: this doesn't free the actual pages themselves. 
That @@ -314,6 +324,7 @@ int __pte_alloc(struct mm_struct *mm, pm spin_unlock(&mm->page_table_lock); return 0; } +EXPORT_SYMBOL_GPL(__pte_alloc); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { @@ -414,6 +425,7 @@ struct page *vm_normal_page(struct vm_ar */ return pfn_to_page(pfn); } +EXPORT_SYMBOL_GPL(vm_normal_page); /* * copy one vm_area from one task to the other. Assumes the page tables @@ -424,7 +436,7 @@ struct page *vm_normal_page(struct vm_ar static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + unsigned long addr, int *rss, struct page_beancounter **pbc) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -479,6 +491,7 @@ copy_one_pte(struct mm_struct *dst_mm, s if (page) { get_page(page); page_dup_rmap(page, vma, addr); + pb_dup_ref(page, dst_mm, pbc); rss[!!PageAnon(page)]++; } @@ -486,20 +499,35 @@ out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); } +#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) +#ifdef CONFIG_BEANCOUNTERS +#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) +#else +#define same_ub(mm1, mm2) 1 +#endif + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[2], rss_tot; + struct page_beancounter *pbc; + int err; + err = -ENOMEM; + pbc = same_ub(src_mm, dst_mm) ? PBC_COPY_SAME : NULL; again: + if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) + goto out; rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) - return -ENOMEM; + goto out; src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -521,23 +549,32 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss, + &pbc); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); + rss_tot = rss[0] + rss[1]; + ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(dst_pte - 1, dst_ptl); cond_resched(); if (addr != end) goto again; - return 0; + + err = 0; +out: + pb_free_list(&pbc); + return err; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + pud_t *dst_pud, pud_t *src_pud, + struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -552,14 +589,16 @@ static inline int copy_pmd_range(struct if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + pgd_t *dst_pgd, pgd_t *src_pgd, + struct vm_area_struct *dst_vma, + struct 
vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -574,19 +613,21 @@ static inline int copy_pud_range(struct if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) +int __copy_page_range(struct vm_area_struct *dst_vma, + struct vm_area_struct *vma, + unsigned long addr, size_t size) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = vma->vm_mm; pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long end = addr + size; /* * Don't copy ptes where a page fault will fill them correctly. @@ -609,11 +650,22 @@ int copy_page_range(struct mm_struct *ds if (pgd_none_or_clear_bad(src_pgd)) continue; if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) + dst_vma, vma, addr, next)) return -ENOMEM; } while (dst_pgd++, src_pgd++, addr = next, addr != end); return 0; } +EXPORT_SYMBOL_GPL(__copy_page_range); + +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *dst_vma, struct vm_area_struct *vma) +{ + if (dst_vma->vm_mm != dst) + BUG(); + if (vma->vm_mm != src) + BUG(); + return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); +} static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, @@ -625,6 +677,7 @@ static unsigned long zap_pte_range(struc spinlock_t *ptl; int file_rss = 0; int anon_rss = 0; + int rss; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -679,6 +732,7 @@ static unsigned long zap_pte_range(struc file_rss--; } page_remove_rmap(page, vma); + pb_remove_ref(page, mm); tlb_remove_page(tlb, page); continue; } @@ -693,6 +747,8 @@ static unsigned long zap_pte_range(struc pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + rss = -(file_rss + anon_rss); + ub_unused_privvm_add(mm, vma, rss); add_mm_rss(mm, file_rss, anon_rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1551,6 +1607,7 @@ static int do_wp_page(struct mm_struct * int reuse = 0, ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + struct page_beancounter *pbc; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) @@ -1610,6 +1667,7 @@ static int do_wp_page(struct mm_struct * flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ClearPageCheckpointed(old_page); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; @@ -1623,6 +1681,9 @@ static int do_wp_page(struct mm_struct * gotten: pte_unmap_unlock(page_table, ptl); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; VM_BUG_ON(old_page == ZERO_PAGE(0)); @@ -1638,12 +1699,15 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { page_remove_rmap(old_page, vma); + pb_remove_ref(old_page, mm); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); } - } else + } else { + ub_unused_privvm_dec(mm, vma); inc_mm_counter(mm, anon_rss); + } flush_cache_page(vma, 
address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1658,6 +1722,7 @@ gotten: update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + pb_add_ref(new_page, mm, &pbc); /* Free the old page.. */ new_page = old_page; @@ -1667,6 +1732,7 @@ gotten: page_cache_release(new_page); if (old_page) page_cache_release(old_page); + pb_free(&pbc); unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { @@ -1687,6 +1753,8 @@ unlock: } return ret; oom: + pb_free(&pbc); +oom_nopb: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; @@ -2057,10 +2125,16 @@ static int do_swap_page(struct mm_struct swp_entry_t entry; pte_t pte; int ret = 0; + struct page_beancounter *pbc; + cycles_t start; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - goto out; + goto out_nostat; + + if (unlikely(pb_alloc(&pbc))) + return VM_FAULT_OOM; + start = get_cycles(); entry = pte_to_swp_entry(orig_pte); if (is_migration_entry(entry)) { migration_entry_wait(mm, pmd, address); @@ -2108,6 +2182,7 @@ static int do_swap_page(struct mm_struct /* The page isn't present yet, go ahead with the fault. */ inc_mm_counter(mm, anon_rss); + ub_percpu_inc(mm->mm_ub, swapin); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2117,10 +2192,11 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); + try_to_remove_exclusive_swap_page(page); unlock_page(page); if (write_access) { @@ -2136,9 +2212,15 @@ static int do_swap_page(struct mm_struct unlock: pte_unmap_unlock(page_table, ptl); out: + pb_free(&pbc); + spin_lock_irq(&kstat_glb_lock); + KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); + spin_unlock_irq(&kstat_glb_lock); +out_nostat: return ret; out_nomap: pte_unmap_unlock(page_table, ptl); + pb_free(&pbc); unlock_page(page); page_cache_release(page); return ret; @@ -2156,10 +2238,14 @@ static int do_anonymous_page(struct mm_s struct page *page; spinlock_t *ptl; pte_t entry; + struct page_beancounter *pbc; /* Allocate our own private page. 
*/ pte_unmap(page_table); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2175,17 +2261,22 @@ static int do_anonymous_page(struct mm_s inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); page_add_new_anon_rmap(page, vma, address); + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); unlock: + pb_free(&pbc); pte_unmap_unlock(page_table, ptl); return 0; release: page_cache_release(page); goto unlock; oom: + pb_free(&pbc); +oom_nopb: return VM_FAULT_OOM; } @@ -2212,6 +2303,7 @@ static int __do_fault(struct mm_struct * pte_t entry; int anon = 0; struct page *dirty_page = NULL; + struct page_beancounter *pbc; struct vm_fault vmf; int ret; int page_mkwrite = 0; @@ -2223,6 +2315,9 @@ static int __do_fault(struct mm_struct * BUG_ON(vma->vm_flags & VM_PFNMAP); + if (unlikely(pb_alloc(&pbc))) + goto oom_nopb; + if (likely(vma->vm_ops->fault)) { ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2233,9 +2328,9 @@ static int __do_fault(struct mm_struct * vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* no page was available -- either SIGBUS or OOM */ if (unlikely(vmf.page == NOPAGE_SIGBUS)) - return VM_FAULT_SIGBUS; + goto bus_nopg; else if (unlikely(vmf.page == NOPAGE_OOM)) - return VM_FAULT_OOM; + goto oom_nopg; } /* @@ -2311,6 +2406,8 @@ static int __do_fault(struct mm_struct * */ /* Only go through if we didn't race with anybody else... */ if (likely(pte_same(*page_table, orig_pte))) { + struct user_beancounter *ub; + flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) @@ -2328,6 +2425,25 @@ static int __do_fault(struct mm_struct * get_page(dirty_page); } } + ub = page_ub(page); + if (ub != NULL && +#ifdef CONFIG_BC_IO_ACCOUNTING + !((unsigned long)ub & PAGE_IO_MARK) && +#endif + ub->ub_magic == UB_MAGIC) { + /* + * WOW: Page was already charged as page_ub. This may + * happens for example then some driver export its low + * memory pages to user space. We can't account page as + * page_ub and page_bp at the same time. So uncharge + * page from UB counter. 
+ */ + WARN_ON_ONCE(1); + ub_page_uncharge(page, 0); + } + + pb_add_ref(page, mm, &pbc); + ub_unused_privvm_dec(mm, vma); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); @@ -2352,8 +2468,15 @@ out_unlocked: set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } - + pb_free(&pbc); return ret; +bus_nopg: + pb_free(&pbc); + return VM_FAULT_SIGBUS; +oom_nopg: + pb_free(&pbc); +oom_nopb: + return VM_FAULT_OOM; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, @@ -2531,6 +2654,27 @@ int handle_mm_fault(struct mm_struct *mm pmd_t *pmd; pte_t *pte; +#ifdef CONFIG_VZ_GENCALLS + do { + int ret; +#ifdef CONFIG_BEANCOUNTERS + struct task_beancounter *tbc; + + tbc = ¤t->task_bc; + if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) && + tbc->pgfault_allot) { + tbc->pgfault_allot--; + break; /* skip notifier */ + } +#endif + ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN, + (void *)1); + if (ret & NOTIFY_FAIL) + return VM_FAULT_SIGBUS; + if (ret & NOTIFY_OK) + return VM_FAULT_MINOR; /* retry */ + } while (0); +#endif __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); @@ -2573,6 +2717,8 @@ int __pud_alloc(struct mm_struct *mm, pg } #endif /* __PAGETABLE_PUD_FOLDED */ +EXPORT_SYMBOL_GPL(__pud_alloc); + #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. @@ -2601,6 +2747,8 @@ int __pmd_alloc(struct mm_struct *mm, pu } #endif /* __PAGETABLE_PMD_FOLDED */ +EXPORT_SYMBOL_GPL(__pmd_alloc); + int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; @@ -2620,6 +2768,8 @@ int make_pages_present(unsigned long add return ret == len ? 0 : -1; } +EXPORT_SYMBOL(make_pages_present); + /* * Map a vmalloc()-space virtual address to the physical page. */ diff -uprN linux-2.6.24/mm/mempolicy.c linux-2.6.24.ovz/mm/mempolicy.c --- linux-2.6.24/mm/mempolicy.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mempolicy.c 2008-03-25 18:53:59.000000000 -0500 @@ -89,6 +89,7 @@ #include #include #include +#include #include #include diff -uprN linux-2.6.24/mm/mempool.c linux-2.6.24.ovz/mm/mempool.c --- linux-2.6.24/mm/mempool.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mempool.c 2008-03-25 18:53:59.000000000 -0500 @@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_n init_waitqueue_head(&pool->wait); pool->alloc = alloc_fn; pool->free = free_fn; + if (alloc_fn == mempool_alloc_slab) + kmem_mark_nocharge((struct kmem_cache *)pool_data); /* * First pre-allocate the guaranteed number of buffers. 
@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int unsigned long flags; BUG_ON(new_min_nr <= 0); + gfp_mask &= ~__GFP_UBC; spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gf gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ + gfp_mask &= ~__GFP_UBC; gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); diff -uprN linux-2.6.24/mm/mlock.c linux-2.6.24.ovz/mm/mlock.c --- linux-2.6.24/mm/mlock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mlock.c 2008-03-25 18:53:59.000000000 -0500 @@ -8,10 +8,12 @@ #include #include #include +#include #include #include #include #include +#include int can_do_mlock(void) { @@ -36,6 +38,14 @@ static int mlock_fixup(struct vm_area_st goto out; } + if (newflags & VM_LOCKED) { + ret = ub_locked_charge(mm, end - start); + if (ret < 0) { + *prev = vma; + goto out; + } + } + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -49,13 +59,13 @@ static int mlock_fixup(struct vm_area_st if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) - goto out; + goto out_uncharge; } if (end != vma->vm_end) { ret = split_vma(mm, vma, end, 0); if (ret) - goto out; + goto out_uncharge; } success: @@ -74,13 +84,19 @@ success: pages = -pages; if (!(newflags & VM_IO)) ret = make_pages_present(start, end); - } + } else + ub_locked_uncharge(mm, end - start); mm->locked_vm -= pages; out: if (ret == -ENOMEM) ret = -EAGAIN; return ret; + +out_uncharge: + if (newflags & VM_LOCKED) + ub_locked_uncharge(mm, end - start); + goto out; } static int do_mlock(unsigned long start, size_t len, int on) @@ -157,6 +173,7 @@ asmlinkage long sys_mlock(unsigned long up_write(¤t->mm->mmap_sem); return error; } +EXPORT_SYMBOL_GPL(sys_mlock); asmlinkage long sys_munlock(unsigned long start, size_t len) { @@ -169,6 +186,7 @@ asmlinkage long sys_munlock(unsigned lon up_write(¤t->mm->mmap_sem); return ret; } +EXPORT_SYMBOL_GPL(sys_munlock); static int do_mlockall(int flags) { diff -uprN linux-2.6.24/mm/mmap.c linux-2.6.24.ovz/mm/mmap.c --- linux-2.6.24/mm/mmap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mmap.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -36,9 +38,12 @@ #define arch_mmap_check(addr, len, flags) (0) #endif +#include + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); +static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); /* * WARNING: the debugging will use recursive algorithms so never enable this @@ -100,6 +105,18 @@ int __vm_enough_memory(struct mm_struct vm_acct_memory(pages); +#ifdef CONFIG_BEANCOUNTERS + switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, + (void *)pages) + & (NOTIFY_OK | NOTIFY_FAIL)) { + case NOTIFY_OK: + return 0; + case NOTIFY_FAIL: + vm_unacct_memory(pages); + return -ENOMEM; + } +#endif + /* * Sometimes we want to use more memory than we have */ @@ -224,6 +241,9 @@ static struct vm_area_struct *remove_vma struct vm_area_struct *next = vma->vm_next; might_sleep(); + + ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, + vma->vm_flags, vma->vm_file); if (vma->vm_ops && 
vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -271,7 +291,7 @@ asmlinkage unsigned long sys_brk(unsigne goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) goto out; set_brk: mm->brk = brk; @@ -910,7 +930,7 @@ unsigned long do_mmap_pgoff(struct file prot |= PROT_EXEC; if (!len) - return -EINVAL; + return addr; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1026,6 +1046,9 @@ unsigned long do_mmap_pgoff(struct file if (error) return error; + if (!gr_acl_handle_mmap(file, prot)) + return -EACCES; + return mmap_region(file, addr, len, flags, vm_flags, pgoff, accountable); } @@ -1076,6 +1099,7 @@ unsigned long mmap_region(struct file *f struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + unsigned long ub_charged = 0; /* Clear old maps */ error = -ENOMEM; @@ -1107,6 +1131,11 @@ munmap_back: } } + if (ub_memory_charge(mm, len, vm_flags, file, + (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) + goto charge_error; + ub_charged = 1; + /* * Can we just expand an old private anonymous mapping? * The VM_SHARED test is necessary because shmem_zero_setup @@ -1122,7 +1151,8 @@ munmap_back: * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | + (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1150,6 +1180,19 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; + if (vm_flags != vma->vm_flags) { + /* + * ->vm_flags has been changed in f_op->mmap method. + * We have to recharge ub memory. + */ + ub_memory_uncharge(mm, len, vm_flags, file); + if (ub_memory_charge(mm, len, vma->vm_flags, file, + (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) { + ub_charged = 0; + error = -ENOMEM; + goto unmap_and_free_vma; + } + } } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) @@ -1214,6 +1257,9 @@ unmap_and_free_vma: free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: + if (ub_charged) + ub_memory_uncharge(mm, len, vm_flags, file); +charge_error: if (charged) vm_unacct_memory(charged); return error; @@ -1536,12 +1582,16 @@ static int acct_stack_growth(struct vm_a if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; + if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, + vma->vm_file, UB_SOFT)) + goto fail_charge; + /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory(grow)) - return -ENOMEM; + goto fail_sec; /* Ok, everything looks good - let it rip */ mm->total_vm += grow; @@ -1549,6 +1599,11 @@ static int acct_stack_growth(struct vm_a mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; + +fail_sec: + ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); +fail_charge: + return -ENOMEM; } #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) @@ -1829,6 +1884,7 @@ int split_vma(struct mm_struct * mm, str return 0; } +EXPORT_SYMBOL_GPL(split_vma); /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the @@ -1922,7 +1978,7 @@ static inline void verify_mm_writelocked * anonymous maps. 
eventually we may be able to do some * brk-specific accounting here. */ -unsigned long do_brk(unsigned long addr, unsigned long len) +static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma, * prev; @@ -1988,8 +2044,11 @@ unsigned long do_brk(unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + if (ub_memory_charge(mm, len, flags, NULL, soft)) + goto fail_charge; + if (security_vm_enough_memory(len >> PAGE_SHIFT)) - return -ENOMEM; + goto fail_sec; /* Can we just expand an old private anonymous mapping? */ if (vma_merge(mm, prev, addr, addr + len, flags, @@ -1999,11 +2058,10 @@ unsigned long do_brk(unsigned long addr, /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; - } + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | + (soft == UB_SOFT ? __GFP_SOFT_UBC : 0)); + if (!vma) + goto fail_alloc; vma->vm_mm = mm; vma->vm_start = addr; @@ -2019,8 +2077,19 @@ out: make_pages_present(addr, addr + len); } return addr; + +fail_alloc: + vm_unacct_memory(len >> PAGE_SHIFT); +fail_sec: + ub_memory_uncharge(mm, len, flags, NULL); +fail_charge: + return -ENOMEM; } +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + return __do_brk(addr, len, UB_SOFT); +} EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ @@ -2187,10 +2256,11 @@ static void special_mapping_close(struct { } -static struct vm_operations_struct special_mapping_vmops = { +struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .nopage = special_mapping_nopage, }; +EXPORT_SYMBOL_GPL(special_mapping_vmops); /* * Called with mm->mmap_sem held for writing. 
@@ -2215,7 +2285,7 @@ int install_special_mapping(struct mm_st vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags; + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; diff -uprN linux-2.6.24/mm/mmzone.c linux-2.6.24.ovz/mm/mmzone.c --- linux-2.6.24/mm/mmzone.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mmzone.c 2008-03-25 18:53:59.000000000 -0500 @@ -13,6 +13,7 @@ struct pglist_data *first_online_pgdat(v { return NODE_DATA(first_online_node); } +EXPORT_SYMBOL(first_online_pgdat); /* June 2006 */ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { @@ -22,6 +23,7 @@ struct pglist_data *next_online_pgdat(st return NULL; return NODE_DATA(nid); } +EXPORT_SYMBOL(next_online_pgdat); /* June 2006 */ /* * next_zone - helper magic for for_each_zone() diff -uprN linux-2.6.24/mm/mprotect.c linux-2.6.24.ovz/mm/mprotect.c --- linux-2.6.24/mm/mprotect.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mprotect.c 2008-03-25 18:53:59.000000000 -0500 @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -21,11 +22,14 @@ #include #include #include +#include #include #include #include #include +#include + static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) @@ -137,6 +141,8 @@ mprotect_fixup(struct vm_area_struct *vm unsigned long charged = 0; pgoff_t pgoff; int error; + unsigned long ch_size; + int ch_dir; int dirty_accountable = 0; if (newflags == oldflags) { @@ -144,6 +150,12 @@ mprotect_fixup(struct vm_area_struct *vm return 0; } + error = -ENOMEM; + ch_size = nrpages - pages_in_vma_range(vma, start, end); + ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); + if (ch_dir == PRIVVM_ERROR) + goto fail_ch; + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we @@ -156,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vm if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) - return -ENOMEM; + goto fail_sec; newflags |= VM_ACCOUNT; } } @@ -204,10 +216,16 @@ success: change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); + if (ch_dir == PRIVVM_TO_SHARED) + __ub_unused_privvm_dec(mm, ch_size); return 0; fail: vm_unacct_memory(charged); +fail_sec: + if (ch_dir == PRIVVM_TO_PRIVATE) + __ub_unused_privvm_dec(mm, ch_size); +fail_ch: return error; } @@ -269,6 +287,11 @@ sys_mprotect(unsigned long start, size_t if (start > vma->vm_start) prev = vma; + if (!gr_acl_handle_mprotect(vma->vm_file, prot)) { + error = -EACCES; + goto out; + } + for (nstart = start ; ; ) { unsigned long newflags; @@ -309,3 +332,4 @@ out: up_write(¤t->mm->mmap_sem); return error; } +EXPORT_SYMBOL_GPL(sys_mprotect); diff -uprN linux-2.6.24/mm/mremap.c linux-2.6.24.ovz/mm/mremap.c --- linux-2.6.24/mm/mremap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/mremap.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,6 +23,8 @@ #include #include +#include + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -167,17 +169,21 @@ static unsigned long move_vma(struct vm_ unsigned long hiwater_vm; int split = 0; + if (ub_memory_charge(mm, new_len, vm_flags, + 
vma->vm_file, UB_HARD)) + goto err; + /* * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. */ if (mm->map_count >= sysctl_max_map_count - 3) - return -ENOMEM; + goto err_nomem; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) - return -ENOMEM; + goto err_nomem; moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { @@ -236,7 +242,13 @@ static unsigned long move_vma(struct vm_ new_addr + new_len); } - return new_addr; + if (new_addr != -ENOMEM) + return new_addr; + +err_nomem: + ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); +err: + return -ENOMEM; } /* @@ -364,7 +376,15 @@ unsigned long do_mremap(unsigned long ad max_addr = vma->vm_next->vm_start; /* can we just expand the current mapping? */ if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long len; + int pages; + + len = new_len - old_len; + pages = len >> PAGE_SHIFT; + ret = -ENOMEM; + if (ub_memory_charge(mm, len, vma->vm_flags, + vma->vm_file, UB_HARD)) + goto out; vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL); diff -uprN linux-2.6.24/mm/oom_kill.c linux-2.6.24.ovz/mm/oom_kill.c --- linux-2.6.24/mm/oom_kill.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/oom_kill.c 2008-03-25 18:53:59.000000000 -0500 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,6 +28,9 @@ #include #include +#include +#include + int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; static DEFINE_SPINLOCK(zone_scan_mutex); @@ -194,15 +199,15 @@ static inline enum oom_constraint constr * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints) +struct task_struct *oom_select_bad_process(struct user_beancounter *ub) { struct task_struct *g, *p; struct task_struct *chosen = NULL; struct timespec uptime; - *ppoints = 0; + unsigned long chosen_points = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + do_each_thread_all(g, p) { unsigned long points; /* @@ -214,6 +219,8 @@ static struct task_struct *select_bad_pr /* skip the init task */ if (is_global_init(p)) continue; + if (ub_oom_task_skip(ub, p)) + continue; /* * This task already has access to memory reserves and is @@ -242,18 +249,18 @@ static struct task_struct *select_bad_pr return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + chosen_points = ULONG_MAX; } if (p->oomkilladj == OOM_DISABLE) continue; points = badness(p, uptime.tv_sec); - if (points > *ppoints || !chosen) { + if (points > chosen_points || !chosen) { chosen = p; - *ppoints = points; + chosen_points = points; } - } while_each_thread(g, p); + } while_each_thread_all(g, p); return chosen; } @@ -290,13 +297,16 @@ static void __oom_kill_task(struct task_ set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); + ub_oom_task_killed(p); } static int oom_kill_task(struct task_struct *p) { struct mm_struct *mm; + struct user_beancounter *ub; struct task_struct *g, *q; + task_lock(p); mm = p->mm; /* WARNING: mm may not be dereferenced since we did not obtain its @@ -308,16 +318,21 @@ static int oom_kill_task(struct task_str * However, this is of no concern to us. 
*/ - if (mm == NULL) + if (mm == NULL) { + task_unlock(p); return 1; + } + + ub = get_beancounter(mm_ub(mm)); + task_unlock(p); /* * Don't kill the process if any threads are set to OOM_DISABLE */ - do_each_thread(g, q) { + do_each_thread_all(g, q) { if (q->mm == mm && q->oomkilladj == OOM_DISABLE) return 1; - } while_each_thread(g, q); + } while_each_thread_all(g, q); __oom_kill_task(p, 1); @@ -326,16 +341,18 @@ static int oom_kill_task(struct task_str * but are in a different thread group. Don't let them have access * to memory reserves though, otherwise we might deplete all memory. */ - do_each_thread(g, q) { + do_each_thread_all(g, q) { if (q->mm == mm && !same_thread_group(q, p)) force_sig(SIGKILL, q); - } while_each_thread(g, q); + } while_each_thread_all(g, q); + ub_oom_mm_killed(ub); + put_beancounter(ub); return 0; } -static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, const char *message) +int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + const char *message) { struct task_struct *c; @@ -356,8 +373,8 @@ static int oom_kill_process(struct task_ return 0; } - printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, task_pid_nr(p), p->comm, points); + printk(KERN_ERR "%s: kill process %d (%s) or a child\n", + message, task_pid_nr(p), p->comm); /* Try to kill a child first */ list_for_each_entry(c, &p->children, sibling) { @@ -445,9 +462,9 @@ void clear_zonelist_oom(struct zonelist void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct task_struct *p; - unsigned long points = 0; unsigned long freed = 0; enum oom_constraint constraint; + struct user_beancounter *ub; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) @@ -457,16 +474,34 @@ void out_of_memory(struct zonelist *zone if (sysctl_panic_on_oom == 2) panic("out of memory. Compulsory panic_on_oom is selected.\n"); + if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL) + & (NOTIFY_OK | NOTIFY_FAIL)) + return; + + ub = NULL; + if (ub_oom_lock()) + goto out_oom_lock; + + read_lock(&tasklist_lock); + + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + dump_stack(); + show_mem(); + show_slab_info(); + } + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ constraint = constrained_alloc(zonelist, gfp_mask); - read_lock(&tasklist_lock); switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, points, + oom_kill_process(current, gfp_mask, order, "No available memory (MPOL_BIND)"); break; @@ -476,27 +511,33 @@ void out_of_memory(struct zonelist *zone /* Fall-through */ case CONSTRAINT_CPUSET: if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, points, + oom_kill_process(current, gfp_mask, order, "Out of memory (oom_kill_allocating_task)"); break; } retry: + put_beancounter(ub); + /* * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points); + ub = ub_oom_select_worst(); + p = oom_select_bad_process(ub); if (PTR_ERR(p) == -1UL) goto out; /* Found nothing?!?! Either we hang forever, or we panic. 
*/ if (!p) { + if (ub != NULL) + goto retry; read_unlock(&tasklist_lock); + ub_oom_unlock(); panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, gfp_mask, order, points, + if (oom_kill_process(p, gfp_mask, order, "Out of memory")) goto retry; @@ -505,7 +546,10 @@ retry: out: read_unlock(&tasklist_lock); + ub_oom_unlock(); + put_beancounter(ub); +out_oom_lock: /* * Give "p" a good chance of killing itself before we * retry to allocate memory unless "p" is current diff -uprN linux-2.6.24/mm/page-writeback.c linux-2.6.24.ovz/mm/page-writeback.c --- linux-2.6.24/mm/page-writeback.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/page-writeback.c 2008-03-25 18:53:59.000000000 -0500 @@ -35,6 +35,9 @@ #include #include +#include +#include + /* * The maximum number of pages to writeout in a single bdflush/kupdate * operation. We do this so we don't hold I_SYNC against an inode for @@ -822,6 +825,7 @@ retry: scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + struct user_beancounter *old_ub; /* * At this point we hold neither mapping->tree_lock nor @@ -852,7 +856,9 @@ retry: continue; } + old_ub = bc_io_switch_context(page); ret = (*writepage)(page, wbc, data); + bc_io_restore_context(old_ub); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { unlock_page(page); @@ -948,12 +954,15 @@ int write_one_page(struct page *page, in .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; + struct user_beancounter *old_ub; BUG_ON(!PageLocked(page)); if (wait) wait_on_page_writeback(page); + old_ub = bc_io_switch_context(page); + if (clear_page_dirty_for_io(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); @@ -966,6 +975,9 @@ int write_one_page(struct page *page, in } else { unlock_page(page); } + + bc_io_restore_context(old_ub); + return ret; } EXPORT_SYMBOL(write_one_page); @@ -997,6 +1009,9 @@ int __set_page_dirty_no_writeback(struct */ int __set_page_dirty_nobuffers(struct page *page) { + int acct; + + acct = 0; if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -1004,6 +1019,7 @@ int __set_page_dirty_nobuffers(struct pa if (!mapping) return 1; + acct = 0; write_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ @@ -1013,12 +1029,14 @@ int __set_page_dirty_nobuffers(struct pa __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); + acct = 1; } radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } write_unlock_irq(&mapping->tree_lock); + if (acct) + task_io_account_write(page, PAGE_CACHE_SIZE, 0); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1157,6 +1175,7 @@ int clear_page_dirty_for_io(struct page dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + ub_io_release_context(page, PAGE_CACHE_SIZE); return 1; } return 0; diff -uprN linux-2.6.24/mm/page_alloc.c linux-2.6.24.ovz/mm/page_alloc.c --- linux-2.6.24/mm/page_alloc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/page_alloc.c 2008-03-25 18:53:59.000000000 -0500 @@ -48,6 +48,9 @@ #include #include "internal.h" +#include +#include + /* * Array of node states. 
*/ @@ -99,6 +102,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z 32, }; +EXPORT_SYMBOL(nr_swap_pages); EXPORT_SYMBOL(totalram_pages); static char * const zone_names[MAX_NR_ZONES] = { @@ -464,8 +468,11 @@ static inline int free_pages_check(struc 1 << PG_reserved | 1 << PG_buddy )))) bad_page(page); - if (PageDirty(page)) + if (PageDirty(page)) { + ub_io_release_context(page, 0); __ClearPageDirty(page); + } else + ub_io_release_debug(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. But we shall soon need @@ -528,6 +535,7 @@ static void __free_pages_ok(struct page arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); + ub_page_uncharge(page, order); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); @@ -624,7 +632,8 @@ static int prep_new_page(struct page *pa page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk | + 1 << PG_checkpointed); set_page_private(page, 0); set_page_refcounted(page); @@ -1002,6 +1011,7 @@ static void fastcall free_hot_cold_page( kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + ub_page_uncharge(page, 0); local_irq_save(flags); __count_vm_event(PGFREE); list_add(&page->lru, &pcp->list); @@ -1448,6 +1458,31 @@ try_next_zone: return page; } +extern unsigned long cycles_per_jiffy; +static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order, + struct page *page, cycles_t time) +{ +#ifdef CONFIG_VE + int ind; + unsigned long flags; + + time = (jiffies - time) * cycles_per_jiffy; + if (!(gfp_mask & __GFP_WAIT)) + ind = 0; + else if (!(gfp_mask & __GFP_HIGHMEM)) + ind = (order > 0 ? 2 : 1); + else + ind = (order > 0 ? 4 : 3); + spin_lock_irqsave(&kstat_glb_lock, flags); + KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); + if (!page) + kstat_glob.alloc_fails[ind]++; + spin_unlock_irqrestore(&kstat_glb_lock, flags); +#endif +} + +int alloc_fail_warn; + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -1463,6 +1498,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i int do_retry; int alloc_flags; int did_some_progress; + cycles_t start; might_sleep_if(wait); @@ -1480,6 +1516,7 @@ restart: return NULL; } + start = jiffies; page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) @@ -1622,19 +1659,32 @@ nofail_alloc: do_retry = 1; } if (do_retry) { + if (total_swap_pages > 0 && nr_swap_pages == 0) { + out_of_memory(zonelist, gfp_mask, order); + goto restart; + } congestion_wait(WRITE, HZ/50); goto rebalance; } nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + __alloc_collect_stats(gfp_mask, order, NULL, start); + if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && + printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." 
" order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); show_mem(); } + return NULL; + got_pg: + __alloc_collect_stats(gfp_mask, order, page, start); + if (ub_page_charge(page, order, gfp_mask)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1702,6 +1752,18 @@ fastcall void free_pages(unsigned long a EXPORT_SYMBOL(free_pages); +unsigned int nr_free_lowpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_online_pgdat(pgdat) + pages += zone_page_state(&pgdat->node_zones[ZONE_NORMAL], NR_FREE_PAGES); + + return pages; +} +EXPORT_SYMBOL(nr_free_lowpages); + static unsigned int nr_free_zone_pages(int offset) { /* Just pick one node, since fallback list is circular */ diff -uprN linux-2.6.24/mm/rmap.c linux-2.6.24.ovz/mm/rmap.c --- linux-2.6.24/mm/rmap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/rmap.c 2008-03-25 18:53:59.000000000 -0500 @@ -50,6 +50,9 @@ #include #include +#include +#include + #include struct kmem_cache *anon_vma_cachep; @@ -93,6 +96,7 @@ int anon_vma_prepare(struct vm_area_stru } return 0; } +EXPORT_SYMBOL_GPL(anon_vma_prepare); void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { @@ -118,6 +122,7 @@ void anon_vma_link(struct vm_area_struct spin_unlock(&anon_vma->lock); } } +EXPORT_SYMBOL_GPL(anon_vma_link); void anon_vma_unlink(struct vm_area_struct *vma) { @@ -149,14 +154,14 @@ static void anon_vma_ctor(struct kmem_ca void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, anon_vma_ctor); } /* * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -175,12 +180,14 @@ out: rcu_read_unlock(); return NULL; } +EXPORT_SYMBOL_GPL(page_lock_anon_vma); -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(page_unlock_anon_vma); /* * At what user virtual address is page expected in @vma? @@ -644,6 +651,13 @@ void page_remove_rmap(struct page *page, page_clear_dirty(page); set_page_dirty(page); } + + /* + * Well, when a page is unmapped, we cannot keep PG_checkpointed + * flag, it is not accessible via process VM and we have no way + * to reset its state + */ + ClearPageCheckpointed(page); __dec_zone_page_state(page, PageAnon(page) ? 
NR_ANON_PAGES : NR_FILE_MAPPED); } @@ -735,6 +749,9 @@ static int try_to_unmap_one(struct page page_remove_rmap(page, vma); + ub_unused_privvm_inc(mm, vma); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); page_cache_release(page); out_unmap: @@ -825,6 +842,9 @@ static void try_to_unmap_cluster(unsigne set_page_dirty(page); page_remove_rmap(page, vma); + ub_percpu_inc(mm->mm_ub, unmap); + pb_remove_ref(page, mm); + ub_unused_privvm_inc(mm, vma); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff -uprN linux-2.6.24/mm/shmem.c linux-2.6.24.ovz/mm/shmem.c --- linux-2.6.24/mm/shmem.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/shmem.c 2008-03-25 18:53:59.000000000 -0500 @@ -54,6 +54,8 @@ #include #include +#include + /* This magic number is used in glibc for posix shared memory */ #define TMPFS_MAGIC 0x01021994 @@ -181,7 +183,7 @@ static inline void shmem_unacct_blocks(u static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; -static const struct file_operations shmem_file_operations; +const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; @@ -220,7 +222,7 @@ static void shmem_free_blocks(struct ino * * It has to be called with the spinlock held. */ -static void shmem_recalc_inode(struct inode *inode) +static void shmem_recalc_inode(struct inode *inode, long swp_freed) { struct shmem_inode_info *info = SHMEM_I(inode); long freed; @@ -230,6 +232,8 @@ static void shmem_recalc_inode(struct in info->alloced -= freed; shmem_unacct_blocks(info->flags, freed); shmem_free_blocks(inode, freed); + if (freed > swp_freed) + ub_tmpfs_respages_sub(info, freed - swp_freed); } } @@ -335,6 +339,11 @@ static void shmem_swp_set(struct shmem_i struct page *page = kmap_atomic_to_page(entry); set_page_private(page, page_private(page) + incdec); } + + if (incdec == 1) + ub_tmpfs_respages_dec(info); + else + ub_tmpfs_respages_inc(info); } /* @@ -351,14 +360,24 @@ static swp_entry_t *shmem_swp_alloc(stru struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct page *page = NULL; swp_entry_t *entry; + unsigned long ub_val; if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return ERR_PTR(-EINVAL); + ub_val = 0; + if (info->next_index <= index) { + ub_val = index + 1 - info->next_index; + if (ub_shmpages_charge(info, ub_val)) + return ERR_PTR(-ENOSPC); + } + while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); + if (sgp == SGP_READ) { + entry = shmem_swp_map(ZERO_PAGE(0)); + goto out; + } /* * Test free_blocks against 1 not 0, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: @@ -368,7 +387,8 @@ static swp_entry_t *shmem_swp_alloc(stru spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks <= 1) { spin_unlock(&sbinfo->stat_lock); - return ERR_PTR(-ENOSPC); + entry = ERR_PTR(-ENOSPC); + goto out; } sbinfo->free_blocks--; inode->i_blocks += BLOCKS_PER_PAGE; @@ -376,31 +396,43 @@ static swp_entry_t *shmem_swp_alloc(stru } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | + __GFP_UBC); if (page) set_page_private(page, 0); spin_lock(&info->lock); if (!page) { - shmem_free_blocks(inode, 1); - return 
ERR_PTR(-ENOMEM); + entry = ERR_PTR(-ENOMEM); + goto out_block; } if (sgp != SGP_WRITE && ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { entry = ERR_PTR(-EINVAL); - break; + goto out_dir; } - if (info->next_index <= index) + if (info->next_index <= index) { + ub_val = 0; info->next_index = index + 1; + } } if (page) { /* another task gave its page, or truncated the file */ shmem_free_blocks(inode, 1); shmem_dir_free(page); } - if (info->next_index <= index && !IS_ERR(entry)) + if (info->next_index <= index) info->next_index = index + 1; return entry; + +out_dir: + shmem_dir_free(page); +out_block: + shmem_free_blocks(inode, 1); +out: + if (ub_val) + ub_shmpages_uncharge(info, ub_val); + return entry; } /* @@ -509,6 +541,7 @@ static void shmem_truncate_range(struct return; spin_lock(&info->lock); + ub_shmpages_uncharge(info, info->next_index - idx); info->flags |= SHMEM_TRUNCATE; if (likely(end == (loff_t) -1)) { limit = info->next_index; @@ -695,7 +728,7 @@ done2: info->swapped -= nr_swaps_freed; if (nr_pages_to_free) shmem_free_blocks(inode, nr_pages_to_free); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, nr_swaps_freed); spin_unlock(&info->lock); /* @@ -782,6 +815,7 @@ static void shmem_delete_inode(struct in sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); } + shmi_ub_put(info); clear_inode(inode); } @@ -903,6 +937,12 @@ int shmem_unuse(swp_entry_t entry, struc return found; } +#ifdef CONFIG_BEANCOUNTERS +#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) +#else +#define shm_get_swap_page(info) (get_swap_page(NULL)) +#endif + /* * Move the page from the page cache to the swap cache. */ @@ -938,12 +978,12 @@ static int shmem_writepage(struct page * info = SHMEM_I(inode); if (info->flags & VM_LOCKED) goto redirty; - swap = get_swap_page(); + swap = shm_get_swap_page(info); if (!swap.val) goto redirty; spin_lock(&info->lock); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); goto unlock; @@ -1140,7 +1180,7 @@ repeat: goto failed; spin_lock(&info->lock); - shmem_recalc_inode(inode); + shmem_recalc_inode(inode, 0); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) { spin_unlock(&info->lock); @@ -1309,6 +1349,7 @@ repeat: clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); + ub_tmpfs_respages_inc(info); } done: if (*pagep != filepage) { @@ -1421,6 +1462,7 @@ shmem_get_inode(struct super_block *sb, inode->i_generation = get_seconds(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); + shmi_ub_set(info, get_exec_ub()); spin_lock_init(&info->lock); INIT_LIST_HEAD(&info->swaplist); @@ -2370,7 +2412,7 @@ static const struct address_space_operat .migratepage = migrate_page, }; -static const struct file_operations shmem_file_operations = { +const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, @@ -2381,6 +2423,7 @@ static const struct file_operations shme .splice_write = generic_file_splice_write, #endif }; +EXPORT_SYMBOL_GPL(shmem_file_operations); static const struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, @@ -2449,6 +2492,10 @@ static struct vm_operations_struct shmem #endif }; +int is_shmem_mapping(struct address_space *map) +{ + return (map != NULL && map->a_ops == &shmem_aops); +} static int shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount 
*mnt) @@ -2456,13 +2503,19 @@ static int shmem_get_sb(struct file_syst return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } -static struct file_system_type tmpfs_fs_type = { +struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; +EXPORT_SYMBOL(tmpfs_fs_type); + +#ifdef CONFIG_VE +#define shm_mnt (get_exec_env()->shmem_mnt) +#else static struct vfsmount *shm_mnt; +#endif static int __init init_tmpfs(void) { @@ -2503,6 +2556,36 @@ out4: } module_init(init_tmpfs) +static inline int shm_charge_ahead(struct inode *inode) +{ +#ifdef CONFIG_BEANCOUNTERS + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + swp_entry_t *entry; + + if (!inode->i_size) + return 0; + idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + /* + * Just touch info to allocate space for entry and + * make all UBC checks + */ + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, SGP_CACHE); + if (IS_ERR(entry)) + goto err; + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + return 0; + +err: + spin_unlock(&info->lock); + return PTR_ERR(entry); +#else + return 0; +#endif +} + /* * shmem_file_setup - get an unlinked file living in tmpfs * @@ -2550,6 +2633,9 @@ struct file *shmem_file_setup(char *name d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + error = shm_charge_ahead(inode); + if (error) + goto close_file; init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &shmem_file_operations); return file; @@ -2562,6 +2648,7 @@ put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /* * shmem_zero_setup - setup a shared anonymous mapping @@ -2579,6 +2666,8 @@ int shmem_zero_setup(struct vm_area_stru if (vma->vm_file) fput(vma->vm_file); + else if (vma->vm_flags & VM_WRITE) + __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; return 0; diff -uprN linux-2.6.24/mm/slab.c linux-2.6.24.ovz/mm/slab.c --- linux-2.6.24/mm/slab.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/slab.c 2008-03-25 18:53:59.000000000 -0500 @@ -110,30 +110,14 @@ #include #include #include +#include +#include #include #include #include -/* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. - * 0 for faster, smaller code (especially in the critical paths). - * - * STATS - 1 to collect stats for /proc/slabinfo. - * 0 for faster, smaller code (especially in the critical paths). - * - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) - */ - -#ifdef CONFIG_DEBUG_SLAB -#define DEBUG 1 -#define STATS 1 -#define FORCED_DEBUG 1 -#else -#define DEBUG 0 -#define STATS 0 -#define FORCED_DEBUG 0 -#endif +#include /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) @@ -172,18 +156,20 @@ #endif /* Legal flag mask for kmem_cache_create(). 
*/ -#if DEBUG +#if SLAB_DEBUG # define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_UBC | SLAB_NO_CHARGE) #endif /* @@ -304,11 +290,11 @@ struct kmem_list3 { /* * Need this for bootstrapping a per node allocator. */ -#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) +#define NUM_INIT_LISTS (3 * MAX_NUMNODES) struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 -#define SIZE_AC 1 -#define SIZE_L3 (1 + MAX_NUMNODES) +#define SIZE_AC MAX_NUMNODES +#define SIZE_L3 (2 * MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); @@ -372,87 +358,6 @@ static void kmem_list3_init(struct kmem_ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -/* - * struct kmem_cache - * - * manages a cache. - */ - -struct kmem_cache { -/* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; -/* 2) Cache tunables. Protected by cache_chain_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int buffer_size; - u32 reciprocal_buffer_size; -/* 3) touched by every alloc & free from the backend */ - - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 4) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. GFP_DMA */ - gfp_t gfpflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ - - /* constructor func */ - void (*ctor)(struct kmem_cache *, void *); - -/* 5) cache creation/removal */ - const char *name; - struct list_head next; - -/* 6) statistics */ -#if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; -#endif -#if DEBUG - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. buffer_size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. - */ - int obj_offset; - int obj_size; -#endif - /* - * We put nodelists[] at the end of kmem_cache, because we want to size - * this array to nr_node_ids slots instead of MAX_NUMNODES - * (see kmem_cache_init()) - * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of nodes. 
- */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - /* - * Do not add fields after nodelists[] - */ -}; - #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -467,12 +372,14 @@ struct kmem_cache { #define REAPTIMEOUT_CPUC (2*HZ) #define REAPTIMEOUT_LIST3 (4*HZ) -#if STATS +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_INC_SHRUNK(x) ((x)->shrunk++) + +#if SLAB_STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) -#define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -495,8 +402,6 @@ struct kmem_cache { #define STATS_INC_ACTIVE(x) do { } while (0) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) -#define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -509,7 +414,7 @@ struct kmem_cache { #define STATS_INC_FREEMISS(x) do { } while (0) #endif -#if DEBUG +#if SLAB_DEBUG /* * memory layout of objects: @@ -641,6 +546,8 @@ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include CACHE(ULONG_MAX) +#include + CACHE(ULONG_MAX) #undef CACHE }; EXPORT_SYMBOL(malloc_sizes); @@ -654,10 +561,17 @@ struct cache_names { static struct cache_names __initdata cache_names[] = { #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, #include + {NULL,}, +#undef CACHE +#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" }, +#include {NULL,} #undef CACHE }; +int malloc_cache_num; +EXPORT_SYMBOL(malloc_cache_num); + static struct arraycache_init initarray_cache __initdata = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = @@ -735,6 +649,7 @@ static inline void init_lock_keys(void) */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; +static spinlock_t cache_chain_lock; /* * chicken and egg problem: delay the per-cpu array allocation @@ -767,7 +682,9 @@ static inline struct kmem_cache *__find_ { struct cache_sizes *csizep = malloc_sizes; -#if DEBUG + if (gfpflags & __GFP_UBC) + csizep += malloc_cache_num; +#if SLAB_DEBUG /* This happens if someone tries to call * kmem_cache_create(), or __kmalloc(), before * the generic caches are initialized. @@ -797,9 +714,93 @@ static struct kmem_cache *kmem_find_gene return __find_general_cachep(size, gfpflags); } -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +{ + return (kmem_bufctl_t *) (slabp + 1); +} + +#ifdef CONFIG_BEANCOUNTERS +#define init_slab_ubps(cachep, slabp) do { \ + if (!((cachep)->flags & SLAB_UBC)) \ + break; \ + memset(slab_ubcs(cachep, slabp), 0, \ + (cachep)->num * sizeof(void *)); \ + } while (0) + +#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) +#define UB_EXTRA(flags) (flags & SLAB_UBC ? 
sizeof(void *) : 0) +#define set_cache_objuse(cachep) do { \ + (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ + (cachep)->num - 1) / (cachep)->num; \ + if (!OFF_SLAB(cachep)) \ + break; \ + (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ + (cachep)->num - 1) / (cachep)->num; \ + } while (0) + +void kmem_mark_nocharge(struct kmem_cache *cachep) +{ + cachep->flags |= SLAB_NO_CHARGE; +} + +int kmem_cache_objuse(struct kmem_cache *cachep) +{ + return cachep->objuse; +} + +EXPORT_SYMBOL(kmem_cache_objuse); + +int kmem_obj_objuse(void *obj) +{ + return virt_to_cache(obj)->objuse; +} + +unsigned long ub_cache_growth(struct kmem_cache *cachep) +{ + return (cachep->grown - cachep->reaped - cachep->shrunk) + << cachep->gfporder; +} + +#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ + (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ + sizeof(void *)))) + +struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj) +{ + struct slab *slabp; + int objnr; + + BUG_ON(!(cachep->flags & SLAB_UBC)); + slabp = virt_to_slab(obj); + objnr = (obj - slabp->s_mem) / cachep->buffer_size; + return slab_ubcs(cachep, slabp) + objnr; +} + +struct user_beancounter *slab_ub(void *obj) +{ + return *ub_slab_ptr(virt_to_cache(obj), obj); +} + +EXPORT_SYMBOL(slab_ub); + +#else +#define UB_ALIGN(flags) 1 +#define UB_EXTRA(flags) 0 +#define set_cache_objuse(c) do { } while (0) +#define init_slab_ubps(c, s) do { } while (0) +#endif + +static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) +{ + size_t size_noub; + + size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); + return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); +} + +static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); } /* @@ -844,20 +845,23 @@ static void cache_estimate(unsigned long * into account. */ nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + (buffer_size + sizeof(kmem_bufctl_t) + + UB_EXTRA(flags)); /* * This calculated number will be either the right * amount, or one greater than what we want. */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) + if (slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size) nr_objs--; + BUG_ON(slab_mgmt_size(nr_objs, align, flags) + + nr_objs * buffer_size > slab_size); if (nr_objs > SLAB_LIMIT) nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); + mgmt_size = slab_mgmt_size(nr_objs, align, flags); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -1408,6 +1412,23 @@ static void init_list(struct kmem_cache cachep->nodelists[nodeid] = ptr; local_irq_enable(); } +static int offslab_limit; + +/* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ +static void __init set_up_list3s(struct kmem_cache *cachep, int index) +{ + int node; + + for_each_online_node(node) { + cachep->nodelists[node] = &initkmem_list3[index + node]; + cachep->nodelists[node]->next_reap = jiffies + + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + } +} /* * Initialisation. 
Called after the page allocator have been initialised and @@ -1432,6 +1453,7 @@ void __init kmem_cache_init(void) if (i < MAX_NUMNODES) cache_cache.nodelists[i] = NULL; } + set_up_list3s(&cache_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1464,6 +1486,7 @@ void __init kmem_cache_init(void) /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); + spin_lock_init(&cache_chain_lock); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; @@ -1475,7 +1498,7 @@ void __init kmem_cache_init(void) */ cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + nr_node_ids * sizeof(struct kmem_list3 *); -#if DEBUG +#if SLAB_DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, @@ -1522,6 +1545,7 @@ void __init kmem_cache_init(void) slab_early_init = 0; + for (i = 0; i < 2; i++) { while (sizes->cs_size != ULONG_MAX) { /* * For performance, all the general caches are L1 aligned. @@ -1534,21 +1558,30 @@ void __init kmem_cache_init(void) sizes->cs_cachep = kmem_cache_create(names->name, sizes->cs_size, ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, + ARCH_KMALLOC_FLAGS|SLAB_PANIC| + (i ? SLAB_UBC : 0)|SLAB_NO_CHARGE, NULL); } + if (!(OFF_SLAB(sizes->cs_cachep))) + offslab_limit = sizes->cs_size; #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_create( - names->name_dma, + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + (i ? SLAB_UBC : 0) | SLAB_NO_CHARGE| SLAB_PANIC, NULL); #endif sizes++; names++; } + + sizes++; + names++; + if (!i) + malloc_cache_num = sizes - malloc_sizes; + } /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; @@ -1587,10 +1620,9 @@ void __init kmem_cache_init(void) { int nid; - /* Replace the static kmem_list3 structures for the boot cpu */ - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); - for_each_online_node(nid) { + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid); + init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -1719,7 +1751,7 @@ static void kmem_rcu_free(struct rcu_hea kmem_cache_free(cachep->slabp_cache, slab_rcu); } -#if DEBUG +#if SLAB_DEBUG #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1796,7 +1828,7 @@ static void dump_line(char *data, int of } #endif -#if DEBUG +#if SLAB_DEBUG static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) { @@ -1889,7 +1921,7 @@ static void check_poison_obj(struct kmem } #endif -#if DEBUG +#if SLAB_DEBUG /** * slab_destroy_objs - destroy a slab and its objects * @cachep: cache pointer being destroyed @@ -1960,22 +1992,6 @@ static void slab_destroy(struct kmem_cac } } -/* - * For setting up all the kmem_list3s for cache whose buffer_size is same as - * size of kmem_list3. 
- */ -static void __init set_up_list3s(struct kmem_cache *cachep, int index) -{ - int node; - - for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index + node]; - cachep->nodelists[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - } -} - static void __kmem_cache_destroy(struct kmem_cache *cachep) { int i; @@ -2013,7 +2029,6 @@ static void __kmem_cache_destroy(struct static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -2026,15 +2041,10 @@ static size_t calculate_slab_order(struc continue; if (flags & CFLGS_OFF_SLAB) { - /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). - */ - offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + int slab_size; - if (num > offslab_limit) + slab_size = slab_mgmt_size_noalign(num, flags); + if (slab_size > offslab_limit) break; } @@ -2099,7 +2109,7 @@ static int __init_refok setup_cpu_cache( g_cpucache_up = PARTIAL_L3; } else { int node; - for_each_node_state(node, N_NORMAL_MEMORY) { + for_each_online_node(node) { cachep->nodelists[node] = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); @@ -2197,9 +2207,9 @@ kmem_cache_create (const char *name, siz } } -#if DEBUG +#if SLAB_DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ -#if FORCED_DEBUG +#if SLAB_FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with * large objects, if the increased size would increase the object size @@ -2284,7 +2294,7 @@ kmem_cache_create (const char *name, siz if (!cachep) goto oops; -#if DEBUG +#if SLAB_DEBUG cachep->obj_size = size; /* @@ -2306,7 +2316,7 @@ kmem_cache_create (const char *name, siz else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) +#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - size; @@ -2338,8 +2348,7 @@ kmem_cache_create (const char *name, siz cachep = NULL; goto oops; } - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + slab_size = slab_mgmt_size(cachep->num, align, flags); /* * If the slab has been placed off-slab, and we have enough space then @@ -2352,8 +2361,7 @@ kmem_cache_create (const char *name, siz if (flags & CFLGS_OFF_SLAB) { /* really off slab. 
No need for manual alignment */ - slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + slab_size = slab_mgmt_size_noalign(cachep->num, flags); } cachep->colour_off = cache_line_size(); @@ -2390,7 +2398,10 @@ kmem_cache_create (const char *name, siz } /* cache setup completed, link it into the list */ + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); + set_cache_objuse(cachep); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", @@ -2400,7 +2411,7 @@ oops: } EXPORT_SYMBOL(kmem_cache_create); -#if DEBUG +#if SLAB_DEBUG static void check_irq_off(void) { BUG_ON(!irqs_disabled()); @@ -2496,10 +2507,11 @@ static int drain_freelist(struct kmem_ca } slabp = list_entry(p, struct slab, list); -#if DEBUG +#if SLAB_DEBUG BUG_ON(slabp->inuse); #endif list_del(&slabp->list); + STATS_INC_SHRUNK(cache); /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2579,10 +2591,14 @@ void kmem_cache_destroy(struct kmem_cach /* * the chain is never empty, cache_cache is never destroyed */ + spin_lock(&cache_chain_lock); list_del(&cachep->next); + spin_unlock(&cache_chain_lock); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); + spin_lock(&cache_chain_lock); list_add(&cachep->next, &cache_chain); + spin_unlock(&cache_chain_lock); mutex_unlock(&cache_chain_mutex); return; } @@ -2590,6 +2606,8 @@ void kmem_cache_destroy(struct kmem_cach if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) synchronize_rcu(); + + ub_kmemcache_free(cachep); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); } @@ -2615,7 +2633,8 @@ static struct slab *alloc_slabmgmt(struc if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags & ~GFP_THISNODE, nodeid); + local_flags & (~(__GFP_UBC | GFP_THISNODE)), + nodeid); if (!slabp) return NULL; } else { @@ -2626,14 +2645,10 @@ static struct slab *alloc_slabmgmt(struc slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; + init_slab_ubps(cachep, slabp); return slabp; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) -{ - return (kmem_bufctl_t *) (slabp + 1); -} - static void cache_init_objs(struct kmem_cache *cachep, struct slab *slabp) { @@ -2641,7 +2656,7 @@ static void cache_init_objs(struct kmem_ for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, slabp, i); -#if DEBUG +#if SLAB_DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) poison_obj(cachep, objp, POISON_FREE); @@ -2700,7 +2715,7 @@ static void *slab_get_obj(struct kmem_ca slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; -#if DEBUG +#if SLAB_DEBUG slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; WARN_ON(slabp->nodeid != nodeid); #endif @@ -2714,7 +2729,7 @@ static void slab_put_obj(struct kmem_cac { unsigned int objnr = obj_to_index(cachep, slabp, objp); -#if DEBUG +#if SLAB_DEBUG /* Verify that the slab belongs to the intended node */ WARN_ON(slabp->nodeid != nodeid); @@ -2802,7 +2817,7 @@ static int cache_grow(struct kmem_cache * 'nodeid'. 
*/ if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); + objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid); if (!objp) goto failed; @@ -2836,7 +2851,7 @@ failed: return 0; } -#if DEBUG +#if SLAB_DEBUG /* * Perform extra freeing checks: @@ -3050,12 +3065,12 @@ static inline void cache_alloc_debugchec gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); -#if DEBUG +#if SLAB_DEBUG kmem_flagcheck(cachep, flags); #endif } -#if DEBUG +#if SLAB_DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, void *caller) { @@ -3467,9 +3482,14 @@ __cache_alloc(struct kmem_cache *cachep, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); - local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); + if (objp && should_charge(cachep, flags) && + ub_slab_charge(cachep, objp, flags)) { + kmem_cache_free(cachep, objp); + objp = NULL; + } + local_irq_restore(save_flags); if (unlikely((flags & __GFP_ZERO) && objp)) memset(objp, 0, obj_size(cachep)); @@ -3503,6 +3523,7 @@ static void free_block(struct kmem_cache /* fixup slab chains */ if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { + STATS_INC_SHRUNK(cachep); l3->free_objects -= cachep->num; /* No need to drop any previously held * lock here, even if we have a off-slab slab @@ -3531,7 +3552,7 @@ static void cache_flusharray(struct kmem int node = numa_node_id(); batchcount = ac->batchcount; -#if DEBUG +#if SLAB_DEBUG BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); @@ -3552,7 +3573,7 @@ static void cache_flusharray(struct kmem free_block(cachep, ac->entry, batchcount, node); free_done: -#if STATS +#if SLAB_STATS { int i = 0; struct list_head *p; @@ -3586,6 +3607,9 @@ static inline void __cache_free(struct k check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + if (should_uncharge(cachep)) + ub_slab_uncharge(cachep, objp); + /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -3992,7 +4016,7 @@ static int enable_cpucache(struct kmem_c if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#if DEBUG +#if SLAB_DEBUG /* * With debugging enabled, large batchcount lead to excessively long * periods with disabled local interrupts. Limit the batchcount @@ -4060,6 +4084,7 @@ static void cache_reap(struct work_struc /* Give up. Setup the next iteration. */ goto out; + {KSTAT_PERF_ENTER(cache_reap) list_for_each_entry(searchp, &cache_chain, next) { check_irq_on(); @@ -4100,6 +4125,7 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); + KSTAT_PERF_LEAVE(cache_reap)} out: /* Set up the next iteration */ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); @@ -4113,7 +4139,7 @@ static void print_slabinfo_header(struct * Output format version, so at least we can change it * without _too_ many complaints. 
*/ -#if STATS +#if SLAB_STATS seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); #else seq_puts(m, "slabinfo - version: 2.1\n"); @@ -4122,14 +4148,82 @@ static void print_slabinfo_header(struct " "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); -#if STATS +#if SLAB_STATS seq_puts(m, " : globalstat " - " "); + " "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); } +#define SHOW_TOP_SLABS 10 + +static unsigned long get_cache_size(struct kmem_cache *cachep) +{ + unsigned long flags; + unsigned long slabs; + struct kmem_list3 *l3; + struct list_head *lh; + int node; + + slabs = 0; + + for_each_online_node (node) { + l3 = cachep->nodelists[node]; + if (l3 == NULL) + continue; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each (lh, &l3->slabs_full) + slabs++; + list_for_each (lh, &l3->slabs_partial) + slabs++; + list_for_each (lh, &l3->slabs_free) + slabs++; + spin_unlock_irqrestore(&l3->list_lock, flags); + } + + return slabs * (PAGE_SIZE << cachep->gfporder) + + (OFF_SLAB(cachep) ? + cachep->slabp_cache->buffer_size * slabs : 0); +} + +void show_slab_info(void) +{ + int i, j; + unsigned long size; + struct kmem_cache *ptr; + unsigned long sizes[SHOW_TOP_SLABS]; + struct kmem_cache *top[SHOW_TOP_SLABS]; + + memset(top, 0, sizeof(top)); + memset(sizes, 0, sizeof(sizes)); + + printk("Top %d caches:\n", SHOW_TOP_SLABS); + + spin_lock(&cache_chain_lock); + list_for_each_entry (ptr, &cache_chain, next) { + size = get_cache_size(ptr); + + j = 0; + for (i = 1; i < SHOW_TOP_SLABS; i++) + if (sizes[i] < sizes[j]) + j = i; + + if (size > sizes[j]) { + sizes[j] = size; + top[j] = ptr; + } + } + + for (i = 0; i < SHOW_TOP_SLABS; i++) + if (top[i]) + printk("%-21s: size %10lu objsize %10u\n", + top[i]->name, sizes[i], + top[i]->buffer_size); + spin_unlock(&cache_chain_lock); +} + static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; @@ -4208,19 +4302,20 @@ static int s_show(struct seq_file *m, vo if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs, cachep->buffer_size, cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", cachep->limit, cachep->batchcount, cachep->shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", active_slabs, num_slabs, shared_avail); -#if STATS +#if SLAB_STATS { /* list3 stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; unsigned long reaped = cachep->reaped; + unsigned long shrunk = cachep->shrunk; unsigned long errors = cachep->errors; unsigned long max_freeable = cachep->max_freeable; unsigned long node_allocs = cachep->node_allocs; @@ -4228,9 +4323,10 @@ static int s_show(struct seq_file *m, vo unsigned long overflows = cachep->node_overflow; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, + %4lu %4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + node_frees, overflows, shrunk); } /* cpu stats */ { diff -uprN linux-2.6.24/mm/slub.c linux-2.6.24.ovz/mm/slub.c --- linux-2.6.24/mm/slub.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/slub.c 2008-03-25 18:53:59.000000000 -0500 @@ -22,6 +22,8 @@ #include #include +#include + /* * Lock order: * 1. 
slab_lock(page) @@ -186,9 +188,11 @@ static inline void ClearSlabDebug(struct /* * Set of flags that will prevent slab merging + * + * FIXME - think over how to allow merging accountable slubs */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_UBC) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA) @@ -326,6 +330,81 @@ static inline int slab_index(void *p, st return (p - addr) / s->size; } +#ifdef CONFIG_BEANCOUNTERS +static inline void inc_cache_grown(struct kmem_cache *s) +{ + atomic_inc(&s->grown); +} + +static inline void dec_cache_grown(struct kmem_cache *s) +{ + atomic_dec(&s->grown); +} + +unsigned long ub_cache_growth(struct kmem_cache *cachep) +{ + return atomic_read(&cachep->grown) << cachep->order; +} + +static void __flush_cpu_slab(struct kmem_cache *s, int cpu); + +int kmem_cache_objuse(struct kmem_cache *cachep) +{ + return cachep->objuse; +} + +EXPORT_SYMBOL(kmem_cache_objuse); + +int kmem_obj_objuse(void *obj) +{ + return kmem_cache_objuse(virt_to_head_page(obj)->slab); +} + +EXPORT_SYMBOL(kmem_obj_objuse); + +#define page_ubs(pg) (pg->bc.slub_ubs) + +struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj) +{ + struct page *pg; + + BUG_ON(!(s->flags & SLAB_UBC)); + pg = virt_to_head_page(obj); + return page_ubs(pg) + slab_index(obj, s, page_address(pg)); +} + +EXPORT_SYMBOL(ub_slab_ptr); + +struct user_beancounter *slab_ub(void *obj) +{ + struct page *pg; + + pg = virt_to_head_page(obj); + BUG_ON(!(pg->slab->flags & SLAB_UBC)); + return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))]; +} + +EXPORT_SYMBOL(slab_ub); + +void kmem_mark_nocharge(struct kmem_cache *cachep) +{ + cachep->flags |= SLAB_NO_CHARGE; +} +#else +static inline void inc_cache_grown(struct kmem_cache *s) +{ +} + +static inline void dec_cache_grown(struct kmem_cache *s) +{ +} +#endif + +void show_slab_info(void) +{ + /* FIXME - show it */ +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -1042,6 +1121,8 @@ static struct page *allocate_slab(struct struct page * page; int pages = 1 << s->order; + flags &= ~__GFP_UBC; + if (s->order) flags |= __GFP_COMP; @@ -1064,9 +1145,12 @@ static struct page *allocate_slab(struct NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, pages); + inc_cache_grown(s); return page; } +static void __free_slab(struct kmem_cache *s, struct page *page); + static void setup_object(struct kmem_cache *s, struct page *page, void *object) { @@ -1090,6 +1174,18 @@ static struct page *new_slab(struct kmem if (!page) goto out; +#ifdef CONFIG_BEANCOUNTERS + if (s->flags & SLAB_UBC) { + BUG_ON(page_ubs(page) != NULL); + page_ubs(page) = kzalloc(s->objects * sizeof(void *), + flags & ~__GFP_UBC); + if (page_ubs(page) == NULL) { + __free_slab(s, page); + page = NULL; + goto out; + } + } +#endif n = get_node(s, page_to_nid(page)); if (n) atomic_long_inc(&n->nr_slabs); @@ -1137,6 +1233,13 @@ static void __free_slab(struct kmem_cach NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); +#ifdef CONFIG_BEANCOUNTERS + if (page_ubs(page) != NULL) { + BUG_ON(!(s->flags & SLAB_UBC)); + kfree(page_ubs(page)); + page_ubs(page) = NULL; + } +#endif __free_pages(page, s->order); } @@ -1159,6 +1262,8 @@ static void free_slab(struct kmem_cache call_rcu(head, rcu_free_slab); } else __free_slab(s, page); + + dec_cache_grown(s); } static void discard_slab(struct kmem_cache *s, struct page *page) @@ -1556,6 +1661,13 @@ static void __always_inline *slab_alloc( 
object = c->freelist; c->freelist = object[c->offset]; } + + if (object && should_charge(s, gfpflags) && + ub_slab_charge(s, object, gfpflags)) { + kmem_cache_free(s, object); + object = NULL; + } + local_irq_restore(flags); if (unlikely((gfpflags & __GFP_ZERO) && object)) @@ -1656,6 +1768,10 @@ static void __always_inline slab_free(st local_irq_save(flags); debug_check_no_locks_freed(object, s->objsize); + + if (should_uncharge(s)) + ub_slab_uncharge(s, x); + c = get_cpu_slab(s, smp_processor_id()); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; @@ -2208,6 +2324,9 @@ static int kmem_cache_open(struct kmem_c #ifdef CONFIG_NUMA s->defrag_ratio = 100; #endif +#ifdef CONFIG_BEANCOUNTERS + s->objuse = s->size + (sizeof(struct page) / s->objects); +#endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2334,6 +2453,10 @@ EXPORT_SYMBOL(kmem_cache_destroy); struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); +#ifdef CONFIG_BEANCOUNTERS +struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +EXPORT_SYMBOL(ub_kmalloc_caches); +#endif #ifdef CONFIG_ZONE_DMA static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; @@ -2379,6 +2502,11 @@ static struct kmem_cache *create_kmalloc { unsigned int flags = 0; + if (gfp_flags & __GFP_UBC) { + flags = SLAB_UBC | SLAB_NO_CHARGE; + gfp_flags &= ~__GFP_UBC; + } + if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; @@ -2506,11 +2634,14 @@ static struct kmem_cache *get_slab(size_ index = fls(size - 1); #ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) + if (unlikely((flags & SLUB_DMA))) { + BUG_ON(flags & __GFP_UBC); return dma_kmalloc_cache(index, flags); + } #endif - return &kmalloc_caches[index]; + + return __kmalloc_cache(flags, index); } void *__kmalloc(size_t size, gfp_t flags) @@ -2814,6 +2945,11 @@ void __init kmem_cache_init(void) create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", sizeof(struct kmem_cache_node), GFP_KERNEL); kmalloc_caches[0].refcount = -1; +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc", + sizeof(struct kmem_cache_node), GFP_KERNEL_UBC); + ub_kmalloc_caches[0].refcount = -1; +#endif caches++; hotplug_memory_notifier(slab_memory_callback, 1); @@ -2826,17 +2962,29 @@ void __init kmem_cache_init(void) if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[1], + "kmalloc-96-ubc", 96, GFP_KERNEL_UBC); +#endif caches++; } if (KMALLOC_MIN_SIZE <= 128) { create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[2], + "kmalloc-192-ubc", 192, GFP_KERNEL_UBC); +#endif caches++; } for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); +#ifdef CONFIG_BEANCOUNTERS + create_kmalloc_cache(&ub_kmalloc_caches[i], + "kmalloc-ubc", 1 << i, GFP_KERNEL_UBC); +#endif caches++; } @@ -2861,9 +3009,14 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { kmalloc_caches[i]. 
name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); +#ifdef CONFIG_BEANCOUNTERS + ub_kmalloc_caches[i].name = + kasprintf(GFP_KERNEL, "kmalloc-%d-ubc", 1 << i); +#endif + } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -3991,6 +4144,8 @@ static char *create_unique_id(struct kme *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (s->flags & SLAB_UBC) + *p++ = 'b'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); diff -uprN linux-2.6.24/mm/swap.c linux-2.6.24.ovz/mm/swap.c --- linux-2.6.24/mm/swap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/swap.c 2008-03-25 18:53:59.000000000 -0500 @@ -221,6 +221,7 @@ void fastcall lru_cache_add_active(struc __pagevec_lru_add_active(pvec); put_cpu_var(lru_add_active_pvecs); } +EXPORT_SYMBOL(lru_cache_add_active); /* * Drain pages out of the cpu's pagevecs. @@ -256,6 +257,8 @@ void lru_add_drain(void) put_cpu(); } +EXPORT_SYMBOL(lru_add_drain); + #ifdef CONFIG_NUMA static void lru_add_drain_per_cpu(struct work_struct *dummy) { diff -uprN linux-2.6.24/mm/swap_state.c linux-2.6.24.ovz/mm/swap_state.c --- linux-2.6.24/mm/swap_state.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/swap_state.c 2008-03-25 18:53:59.000000000 -0500 @@ -19,6 +19,9 @@ #include +#include +#include + /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_page_list, to make sync_page look nicer, and to allow @@ -43,6 +46,7 @@ struct address_space swapper_space = { .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, }; +EXPORT_SYMBOL(swapper_space); #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,14 +57,18 @@ static struct { unsigned long find_total; unsigned long noent_race; unsigned long exist_race; + unsigned long remove_race; } swap_cache_info; +EXPORT_SYMBOL(swap_cache_info); void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", + printk("Swap cache: add %lu, delete %lu, find %lu/%lu, " + "race %lu+%lu+%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total, - swap_cache_info.noent_race, swap_cache_info.exist_race); + swap_cache_info.noent_race, swap_cache_info.exist_race, + swap_cache_info.remove_race); printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } @@ -69,8 +77,7 @@ void show_swap_cache_info(void) * __add_to_swap_cache resembles add_to_page_cache on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp_mask) +int __add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; @@ -95,7 +102,9 @@ static int __add_to_swap_cache(struct pa return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) +EXPORT_SYMBOL(__add_to_swap_cache); + +int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -120,6 +129,8 @@ static int add_to_swap_cache(struct page return 0; } +EXPORT_SYMBOL(add_to_swap_cache); + /* * This must be called only on pages that have * been verified to be in the swap cache. 
@@ -154,7 +165,14 @@ int add_to_swap(struct page * page, gfp_ BUG_ON(!PageLocked(page)); for (;;) { - entry = get_swap_page(); + struct user_beancounter *ub; + + ub = pb_grab_page_ub(page); + if (IS_ERR(ub)) + return 0; + + entry = get_swap_page(ub); + put_beancounter(ub); if (!entry.val) return 0; @@ -240,6 +258,7 @@ int move_from_swap_cache(struct page *pa delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ ClearPageDirty(page); + ub_io_release_debug(page); set_page_dirty(page); } return err; @@ -255,10 +274,13 @@ int move_from_swap_cache(struct page *pa */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (!PageSwapCache(page)) + return; + if (!TestSetPageLocked(page)) { remove_exclusive_swap_page(page); unlock_page(page); - } + } else + INC_CACHE_INFO(remove_race); } /* @@ -368,3 +390,5 @@ struct page *read_swap_cache_async(swp_e page_cache_release(new_page); return found_page; } + +EXPORT_SYMBOL(read_swap_cache_async); diff -uprN linux-2.6.24/mm/swapfile.c linux-2.6.24.ovz/mm/swapfile.c --- linux-2.6.24/mm/swapfile.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/swapfile.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,6 +32,8 @@ #include #include +#include + DEFINE_SPINLOCK(swap_lock); unsigned int nr_swapfiles; long total_swap_pages; @@ -43,8 +45,12 @@ static const char Bad_offset[] = "Bad sw static const char Unused_offset[] = "Unused swap offset entry "; struct swap_list_t swap_list = {-1, -1}; +struct swap_info_struct swap_info[MAX_SWAPFILES]; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +EXPORT_SYMBOL(total_swap_pages); +EXPORT_SYMBOL(swap_lock); +EXPORT_SYMBOL(swap_list); +EXPORT_SYMBOL(swap_info); static DEFINE_MUTEX(swapon_mutex); @@ -171,7 +177,7 @@ no_page: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct user_beancounter *ub) { struct swap_info_struct *si; pgoff_t offset; @@ -192,6 +198,8 @@ swp_entry_t get_swap_page(void) wrapped++; } + if (si->flags & SWP_READONLY) + continue; if (!si->highest_bit) continue; if (!(si->flags & SWP_WRITEOK)) @@ -201,6 +209,7 @@ swp_entry_t get_swap_page(void) offset = scan_swap_map(si); if (offset) { spin_unlock(&swap_lock); + ub_swapentry_inc(si, offset, ub); return swp_entry(type, offset); } next = swap_list.next; @@ -212,6 +221,8 @@ noswap: return (swp_entry_t) {0}; } +EXPORT_SYMBOL(get_swap_page); + swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -219,7 +230,7 @@ swp_entry_t get_swap_page_of_type(int ty spin_lock(&swap_lock); si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) { nr_swap_pages--; offset = scan_swap_map(si); if (offset) { @@ -276,6 +287,7 @@ static int swap_entry_free(struct swap_i count--; p->swap_map[offset] = count; if (!count) { + ub_swapentry_dec(p, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -304,6 +316,8 @@ void swap_free(swp_entry_t entry) } } +EXPORT_SYMBOL(swap_free); + /* * How many references to page are currently swapped out? 
*/ @@ -385,6 +399,55 @@ int remove_exclusive_swap_page(struct pa return retval; } +int try_to_remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + + if (!PageSwapCache(page)) + return 0; + if (PageWriteback(page)) + return 0; + if (page_count(page) != 2) /* 2: us + cache */ + return 0; + + entry.val = page->private; + p = swap_info_get(entry); + if (!p) + return 0; + + if (!vm_swap_full() && + (p->flags & (SWP_ACTIVE|SWP_READONLY)) == SWP_ACTIVE) { + spin_unlock(&swap_lock); + return 0; + } + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[swp_offset(entry)] == 1) { + /* Recheck the page count with the swapcache lock held.. */ + write_lock_irq(&swapper_space.tree_lock); + if ((page_count(page) == 2) && !PageWriteback(page)) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + write_unlock_irq(&swapper_space.tree_lock); + } + spin_unlock(&swap_lock); + + if (retval) { + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. @@ -424,6 +487,7 @@ void free_swap_and_cache(swp_entry_t ent page_cache_release(page); } } +EXPORT_SYMBOL(free_swap_and_cache); #ifdef CONFIG_HIBERNATION /* @@ -507,11 +571,17 @@ unsigned int count_swap_pages(int type, * force COW, vm_page_prot omits write permission from any private vma. */ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { - inc_mm_counter(vma->vm_mm, anon_rss); + struct mm_struct *mm; + + mm = vma->vm_mm; + inc_mm_counter(mm, anon_rss); + ub_unused_privvm_dec(mm, vma); + pb_add_ref(page, mm, pb); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, + set_pte_at(mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, addr); swap_free(entry); @@ -524,7 +594,8 @@ static void unuse_pte(struct vm_area_str static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; @@ -538,7 +609,7 @@ static int unuse_pte_range(struct vm_are * Test inline before going to call unuse_pte. 
*/ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte++, addr, entry, page); + unuse_pte(vma, pte++, addr, entry, page, pb); found = 1; break; } @@ -549,7 +620,8 @@ static int unuse_pte_range(struct vm_are static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pmd_t *pmd; unsigned long next; @@ -559,7 +631,7 @@ static inline int unuse_pmd_range(struct next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (unuse_pte_range(vma, pmd, addr, next, entry, page)) + if (unuse_pte_range(vma, pmd, addr, next, entry, page, pb)) return 1; } while (pmd++, addr = next, addr != end); return 0; @@ -567,7 +639,8 @@ static inline int unuse_pmd_range(struct static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pud_t *pud; unsigned long next; @@ -577,14 +650,15 @@ static inline int unuse_pud_range(struct next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (unuse_pmd_range(vma, pud, addr, next, entry, page)) + if (unuse_pmd_range(vma, pud, addr, next, entry, page, pb)) return 1; } while (pud++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { pgd_t *pgd; unsigned long addr, end, next; @@ -605,14 +679,15 @@ static int unuse_vma(struct vm_area_stru next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (unuse_pud_range(vma, pgd, addr, next, entry, page)) + if (unuse_pud_range(vma, pgd, addr, next, entry, page, pb)) return 1; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, - swp_entry_t entry, struct page *page) + swp_entry_t entry, struct page *page, + struct page_beancounter **pb) { struct vm_area_struct *vma; @@ -627,7 +702,7 @@ static int unuse_mm(struct mm_struct *mm lock_page(page); } for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && unuse_vma(vma, entry, page)) + if (vma->anon_vma && unuse_vma(vma, entry, page, pb)) break; } up_read(&mm->mmap_sem); @@ -693,6 +768,7 @@ static int try_to_unuse(unsigned int typ int retval = 0; int reset_overflow = 0; int shmem; + struct page_beancounter *pb; /* * When searching mms for an entry, a good strategy is to @@ -744,6 +820,13 @@ static int try_to_unuse(unsigned int typ break; } + pb = NULL; + if (pb_alloc_all(&pb)) { + page_cache_release(page); + retval = -ENOMEM; + break; + } + /* * Don't hold on to start_mm if it looks like exiting. */ @@ -766,6 +849,20 @@ static int try_to_unuse(unsigned int typ lock_page(page); wait_on_page_writeback(page); + /* If read failed we cannot map not-uptodate page to + * user space. Actually, we are in serious troubles, + * we do not even know what process to kill. So, the only + * variant remains: to stop swapoff() and allow someone + * to kill processes to zap invalid pages. + */ + if (unlikely(!PageUptodate(page))) { + pb_free_list(&pb); + unlock_page(page); + page_cache_release(page); + retval = -EIO; + break; + } + /* * Remove all references to entry. 
* Whenever we reach init_mm, there's no address space @@ -777,7 +874,7 @@ static int try_to_unuse(unsigned int typ if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else - retval = unuse_mm(start_mm, entry, page); + retval = unuse_mm(start_mm, entry, page, &pb); } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -807,7 +904,7 @@ static int try_to_unuse(unsigned int typ set_start_mm = 1; shmem = shmem_unuse(entry, page); } else - retval = unuse_mm(mm, entry, page); + retval = unuse_mm(mm, entry, page, &pb); if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); @@ -821,6 +918,8 @@ static int try_to_unuse(unsigned int typ mmput(start_mm); start_mm = new_start_mm; } + + pb_free_list(&pb); if (retval) { unlock_page(page); page_cache_release(page); @@ -1183,6 +1282,10 @@ asmlinkage long sys_swapoff(const char _ int i, type, prev; int err; + /* VE admin check is just to be on the safe side, the admin may affect + * swaps only if he has access to special, i.e. if he has been granted + * access to the block device or if the swap file is in the area + * visible to him. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1282,6 +1385,7 @@ asmlinkage long sys_swapoff(const char _ spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + ub_swap_fini(p); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); @@ -1301,6 +1405,8 @@ out: return err; } +EXPORT_SYMBOL(sys_swapoff); + #ifdef CONFIG_PROC_FS /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) @@ -1635,9 +1741,16 @@ asmlinkage long sys_swapon(const char __ goto bad_swap; } + if (ub_swap_init(p, maxpages)) { + error = -ENOMEM; + goto bad_swap; + } + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); p->flags = SWP_ACTIVE; + if (swap_flags & SWAP_FLAG_READONLY) + p->flags |= SWP_READONLY; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1697,6 +1810,8 @@ out: return error; } +EXPORT_SYMBOL(sys_swapon); + void si_swapinfo(struct sysinfo *val) { unsigned int i; @@ -1756,6 +1871,8 @@ bad_file: goto out; } +EXPORT_SYMBOL(swap_duplicate); + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff -uprN linux-2.6.24/mm/truncate.c linux-2.6.24.ovz/mm/truncate.c --- linux-2.6.24/mm/truncate.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/truncate.c 2008-03-25 18:53:59.000000000 -0500 @@ -77,6 +77,7 @@ void cancel_dirty_page(struct page *page BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); + ub_io_release_context(page, account_size); } } } diff -uprN linux-2.6.24/mm/vmalloc.c linux-2.6.24.ovz/mm/vmalloc.c --- linux-2.6.24/mm/vmalloc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/vmalloc.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,9 @@ #include #include +#include +#include + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -280,6 +283,70 @@ static struct vm_struct *__find_vm_area( return tmp; } +struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) +{ + unsigned long addr, best_addr, delta, best_delta; + struct vm_struct **p, **best_p, *tmp, *area; + + area = kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + size += PAGE_SIZE; /* one-page gap at the end */ + addr = VMALLOC_START; + best_addr = 0UL; + best_p = NULL; + best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) && + (tmp->addr <= (void 
*)PAGE_ALIGN(VMALLOC_END)); + p = &tmp->next) { + if ((unsigned long)tmp->addr < addr) + continue; + if ((size + addr) < addr) + break; + delta = (unsigned long) tmp->addr - (size + addr); + if (delta < best_delta) { + best_delta = delta; + best_addr = addr; + best_p = p; + } + addr = tmp->size + (unsigned long)tmp->addr; + if (addr > VMALLOC_END-size) + break; + } + + if (!tmp || (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) { + /* check free area after list end */ + delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); + if (delta < best_delta) { + best_delta = delta; + best_addr = addr; + best_p = p; + } + } + if (best_addr) { + area->flags = flags; + /* allocate at the end of this area */ + area->addr = (void *)(best_addr + best_delta); + area->size = size; + area->next = *best_p; + area->pages = NULL; + area->nr_pages = 0; + area->phys_addr = 0; + *best_p = area; + /* check like in __vunmap */ + WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); + } else { + kfree(area); + area = NULL; + } + write_unlock(&vmlist_lock); + + return area; +} + /* Caller must hold vmlist_lock */ static struct vm_struct *__remove_vm_area(void *addr) { @@ -319,7 +386,7 @@ struct vm_struct *remove_vm_area(void *a return v; } -static void __vunmap(void *addr, int deallocate_pages) +static void __vunmap(void *addr, int deallocate_pages, int uncharge) { struct vm_struct *area; @@ -345,6 +412,8 @@ static void __vunmap(void *addr, int dea if (deallocate_pages) { int i; + if (uncharge) + dec_vmalloc_charged(area); for (i = 0; i < area->nr_pages; i++) { BUG_ON(!area->pages[i]); __free_page(area->pages[i]); @@ -373,7 +442,7 @@ static void __vunmap(void *addr, int dea void vfree(void *addr) { BUG_ON(in_interrupt()); - __vunmap(addr, 1); + __vunmap(addr, 1, 1); } EXPORT_SYMBOL(vfree); @@ -389,7 +458,7 @@ EXPORT_SYMBOL(vfree); void vunmap(void *addr) { BUG_ON(in_interrupt()); - __vunmap(addr, 0); + __vunmap(addr, 0, 0); } EXPORT_SYMBOL(vunmap); @@ -464,10 +533,12 @@ void *__vmalloc_area_node(struct vm_stru if (map_vm_area(area, prot, &pages)) goto fail; + + inc_vmalloc_charged(area, gfp_mask); return area->addr; fail: - vfree(area->addr); + __vunmap(area->addr, 1, 0); return NULL; } @@ -509,6 +580,21 @@ void *__vmalloc(unsigned long size, gfp_ } EXPORT_SYMBOL(__vmalloc); +static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot) +{ + struct vm_struct *area; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > num_physpages) + return NULL; + + area = get_vm_area_best(size, VM_ALLOC); + if (!area) + return NULL; + + return __vmalloc_area_node(area, mask, prot, -1); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -524,6 +610,26 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +void *ub_vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); +} +EXPORT_SYMBOL(ub_vmalloc); + +void *vmalloc_best(unsigned long size) +{ + return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +} + +EXPORT_SYMBOL(vmalloc_best); + +void *ub_vmalloc_best(unsigned long size) +{ + return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); +} + +EXPORT_SYMBOL(ub_vmalloc_best); + /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size @@ -564,6 +670,12 @@ void *vmalloc_node(unsigned long size, i } EXPORT_SYMBOL(vmalloc_node); +void *ub_vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, 
PAGE_KERNEL, node); +} +EXPORT_SYMBOL(ub_vmalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -823,3 +935,37 @@ void free_vm_area(struct vm_struct *area kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); + +void vprintstat(void) +{ + struct vm_struct *p, *last_p = NULL; + unsigned long addr, size, free_size, max_free_size; + int num; + + addr = VMALLOC_START; + size = max_free_size = 0; + num = 0; + + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + free_size = (unsigned long)p->addr - addr; + if (free_size > max_free_size) + max_free_size = free_size; + addr = (unsigned long)p->addr + p->size; + size += p->size; + ++num; + last_p = p; + } + if (last_p) { + free_size = VMALLOC_END - + ((unsigned long)last_p->addr + last_p->size); + if (free_size > max_free_size) + max_free_size = free_size; + } + read_unlock(&vmlist_lock); + + printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" + " Max_Free: %luKB Start: %lx End: %lx\n", + size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, + max_free_size/1024, VMALLOC_START, VMALLOC_END); +} diff -uprN linux-2.6.24/mm/vmscan.c linux-2.6.24.ovz/mm/vmscan.c --- linux-2.6.24/mm/vmscan.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/vmscan.c 2008-03-25 18:53:59.000000000 -0500 @@ -38,10 +38,14 @@ #include #include +#include +#include + #include #include #include +#include #include "internal.h" @@ -161,6 +165,9 @@ unsigned long shrink_slab(unsigned long if (scanned == 0) scanned = SWAP_CLUSTER_MAX; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + return 1; + if (!down_read_trylock(&shrinker_rwsem)) return 1; /* Assume we'll be able to shrink next time */ @@ -195,6 +202,9 @@ unsigned long shrink_slab(unsigned long int shrink_ret; int nr_before; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; + nr_before = (*shrinker->shrink)(0, gfp_mask); shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); if (shrink_ret == -1) @@ -209,6 +219,7 @@ unsigned long shrink_slab(unsigned long shrinker->nr += total_scan; } +done: up_read(&shrinker_rwsem); return ret; } @@ -322,6 +333,7 @@ static pageout_t pageout(struct page *pa */ if (PagePrivate(page)) { if (try_to_free_buffers(page)) { + ub_io_release_context(page, 0); ClearPageDirty(page); printk("%s: orphaned page\n", __FUNCTION__); return PAGE_CLEAN; @@ -1016,6 +1028,7 @@ force_reclaim_mapped: reclaim_mapped = 1; } + {KSTAT_PERF_ENTER(refill_inact) lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, @@ -1095,6 +1108,7 @@ force_reclaim_mapped: spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); + KSTAT_PERF_LEAVE(refill_inact)} } /* @@ -1133,6 +1147,8 @@ static unsigned long shrink_zone(int pri nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); nr_active -= nr_to_scan; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; shrink_active_list(nr_to_scan, zone, sc, priority); } @@ -1140,12 +1156,15 @@ static unsigned long shrink_zone(int pri nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); nr_inactive -= nr_to_scan; + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + goto done; nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, sc); } } throttle_vm_writeout(sc->gfp_mask); +done: return nr_reclaimed; } @@ -1189,6 +1208,9 @@ static unsigned long shrink_zones(int pr sc->all_unreclaimable = 0; nr_reclaimed += shrink_zone(priority, zone, sc); + + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) + 
break; } return nr_reclaimed; } @@ -1224,8 +1246,10 @@ unsigned long try_to_free_pages(struct z .order = order, }; + KSTAT_PERF_ENTER(ttfp); count_vm_event(ALLOCSTALL); + ub_oom_start(); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; @@ -1265,6 +1289,11 @@ unsigned long try_to_free_pages(struct z sc.may_writepage = 1; } + if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) { + ret = 1; + goto out; + } + /* Take a nap, wait for some writeback to complete */ if (sc.nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); @@ -1290,6 +1319,7 @@ out: zone->prev_priority = priority; } + KSTAT_PERF_LEAVE(ttfp); return ret; } diff -uprN linux-2.6.24/mm/vmstat.c linux-2.6.24.ovz/mm/vmstat.c --- linux-2.6.24/mm/vmstat.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/mm/vmstat.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,40 @@ #include #include #include +#include + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zone_page_state(&zones[i], NR_ACTIVE); + *inactive += zone_page_state(&zones[i], NR_INACTIVE); + *free += zone_page_state(&zones[i], NR_FREE_PAGES); + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_online_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -41,6 +75,20 @@ static void sum_vm_events(unsigned long } } +unsigned long vm_events(enum vm_event_item i) +{ + int cpu; + unsigned long sum; + struct vm_event_state *st; + + sum = 0; + for_each_online_cpu(cpu) { + st = &per_cpu(vm_event_states, cpu); + sum += st->event[i]; + } + + return (sum < 0 ? 0 : sum); +} /* * Accumulate the vm event counters across all CPUs. 
* The result is unavoidably approximate - it can change @@ -733,30 +781,40 @@ static void *vmstat_start(struct seq_fil unsigned long *v; #ifdef CONFIG_VM_EVENT_COUNTERS unsigned long *e; +#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \ + sizeof(struct vm_event_state)) +#else +#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)) #endif int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; -#ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); -#endif + v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + + if (ve_is_super(get_exec_env())) { + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_page_state(i); #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; -#endif + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; +#endif + } else + memset(v, 0, VMSTAT_BUFSIZE); + + if (virtinfo_notifier_call(VITYPE_GENERAL, + VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) { + kfree(v); + m->private = NULL; + return ERR_PTR(-ENOMSG); + } + return v + *pos; } diff -uprN linux-2.6.24/net/8021q/vlan.c linux-2.6.24.ovz/net/8021q/vlan.c --- linux-2.6.24/net/8021q/vlan.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/8021q/vlan.c 2008-03-25 18:53:59.000000000 -0500 @@ -33,6 +33,9 @@ #include #include +#include +#include + #include #include "vlan.h" #include "vlanproc.h" @@ -68,6 +71,44 @@ static struct packet_type vlan_packet_ty .func = vlan_skb_recv, /* VLAN receive method */ }; +#ifdef CONFIG_VE +static int vlan_start(void *data) +{ + int err; + + err = vlan_proc_init(); + if (err < 0) + goto out_proc; + + __module_get(THIS_MODULE); + return 0; + +out_proc: + return err; +} + +static void vlan_stop(void *data) +{ + struct ve_struct *ve; + + ve = (struct ve_struct *)data; + if (ve->_proc_vlan_dir == NULL) + return; + + vlan_proc_cleanup(); + ve->_proc_vlan_conf = NULL; + ve->_proc_vlan_dir = NULL; + module_put(THIS_MODULE); +} + +static struct ve_hook vlan_ve_hook = { + .init = vlan_start, + .fini = vlan_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_POST, +}; +#endif + /* End of global variables definitions. 
*/ /* @@ -106,6 +147,7 @@ static int __init vlan_proto_init(void) goto err2; vlan_ioctl_set(vlan_ioctl_handler); + ve_hook_register(VE_SS_CHAIN, &vlan_ve_hook); return 0; err2: @@ -124,6 +166,7 @@ static void __exit vlan_cleanup_module(v { int i; + ve_hook_unregister(&vlan_ve_hook); vlan_ioctl_set(NULL); vlan_netlink_fini(); @@ -147,14 +190,16 @@ module_init(vlan_proto_init); module_exit(vlan_cleanup_module); /* Must be invoked with RCU read lock (no preempt) */ -static struct vlan_group *__vlan_find_group(int real_dev_ifindex) +static struct vlan_group *__vlan_find_group(int real_dev_ifindex, + struct ve_struct *ve) { struct vlan_group *grp; struct hlist_node *n; int hash = vlan_grp_hashfn(real_dev_ifindex); hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) { - if (grp->real_dev_ifindex == real_dev_ifindex) + if (grp->real_dev_ifindex == real_dev_ifindex && + ve_accessible_strict(ve, grp->owner)) return grp; } @@ -168,7 +213,8 @@ static struct vlan_group *__vlan_find_gr struct net_device *__find_vlan_dev(struct net_device *real_dev, unsigned short VID) { - struct vlan_group *grp = __vlan_find_group(real_dev->ifindex); + struct vlan_group *grp = __vlan_find_group(real_dev->ifindex, + real_dev->owner_env); if (grp) return vlan_group_get_device(grp, VID); @@ -191,14 +237,14 @@ static struct vlan_group *vlan_group_all unsigned int size; unsigned int i; - grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL); + grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC); if (!grp) return NULL; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++) { - grp->vlan_devices_arrays[i] = kzalloc(size, GFP_KERNEL); + grp->vlan_devices_arrays[i] = kzalloc(size, GFP_KERNEL_UBC); if (!grp->vlan_devices_arrays[i]) goto err; } @@ -242,7 +288,7 @@ static int unregister_vlan_dev(struct ne return -EINVAL; ASSERT_RTNL(); - grp = __vlan_find_group(real_dev_ifindex); + grp = __vlan_find_group(real_dev_ifindex, real_dev->owner_env); ret = 0; @@ -282,6 +328,9 @@ static int unregister_vlan_dev(struct ne hlist_del_rcu(&grp->hlist); + put_ve(grp->owner); + grp->owner = NULL; + /* Free the group, after all cpu's are done. 
*/ call_rcu(&grp->rcu, vlan_rcu_free); @@ -388,6 +437,8 @@ void vlan_setup(struct net_device *new_d new_dev->do_ioctl = vlan_dev_ioctl; memset(new_dev->broadcast, 0, ETH_ALEN); + if (!ve_is_super(get_exec_env())) + new_dev->features |= NETIF_F_VIRTUAL; } static void vlan_transfer_operstate(const struct net_device *dev, struct net_device *vlandev) @@ -455,7 +506,7 @@ int register_vlan_dev(struct net_device struct vlan_group *grp, *ngrp = NULL; int err; - grp = __vlan_find_group(real_dev->ifindex); + grp = __vlan_find_group(real_dev->ifindex, real_dev->owner_env); if (!grp) { ngrp = grp = vlan_group_alloc(real_dev->ifindex); if (!grp) @@ -609,13 +660,12 @@ static void vlan_sync_address(struct net static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = ptr; - struct vlan_group *grp = __vlan_find_group(dev->ifindex); + struct vlan_group *grp; int i, flgs; struct net_device *vlandev; + struct ve_struct *env; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - + grp = __vlan_find_group(dev->ifindex, dev->owner_env); if (!grp) goto out; @@ -692,7 +742,9 @@ static int vlan_device_event(struct noti ret = unregister_vlan_dev(dev, VLAN_DEV_INFO(vlandev)->vlan_id); + env = set_exec_env(vlandev->owner_env); unregister_netdevice(vlandev); + set_exec_env(env); /* Group was destroyed? */ if (ret == 1) @@ -705,6 +757,17 @@ out: return NOTIFY_DONE; } +static inline int vlan_check_caps(void) +{ + if (capable(CAP_NET_ADMIN)) + return 1; +#ifdef CONFIG_VE + if (capable(CAP_VE_NET_ADMIN)) + return 1; +#endif + return 0; +} + /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver @@ -752,7 +815,7 @@ static int vlan_ioctl_handler(struct net switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, @@ -762,7 +825,7 @@ static int vlan_ioctl_handler(struct net case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, @@ -771,7 +834,7 @@ static int vlan_ioctl_handler(struct net case SET_VLAN_FLAG_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = vlan_dev_set_vlan_flag(dev, args.u.flag, @@ -780,7 +843,7 @@ static int vlan_ioctl_handler(struct net case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; if ((args.u.name_type >= 0) && (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { @@ -793,14 +856,14 @@ static int vlan_ioctl_handler(struct net case ADD_VLAN_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!vlan_check_caps()) break; err = unregister_vlan_device(dev); break; diff -uprN linux-2.6.24/net/8021q/vlan_dev.c linux-2.6.24.ovz/net/8021q/vlan_dev.c --- linux-2.6.24/net/8021q/vlan_dev.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/8021q/vlan_dev.c 2008-03-25 18:53:59.000000000 -0500 @@ -453,6 +453,7 @@ int vlan_dev_hard_header(struct sk_buff int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; struct net_device_stats *stats = vlan_dev_get_stats(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); @@ -507,13 +508,17 @@ int vlan_dev_hard_start_xmit(struct 
sk_b stats->tx_bytes += skb->len; skb->dev = VLAN_DEV_INFO(dev)->real_dev; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); dev_queue_xmit(skb); + set_exec_env(env); return 0; } int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ve_struct *env; struct net_device_stats *stats = vlan_dev_get_stats(dev); unsigned short veth_TCI; @@ -531,7 +536,10 @@ int vlan_dev_hwaccel_hard_start_xmit(str stats->tx_bytes += skb->len; skb->dev = VLAN_DEV_INFO(dev)->real_dev; + skb->owner_env = skb->dev->owner_env; + env = set_exec_env(skb->owner_env); dev_queue_xmit(skb); + set_exec_env(env); return 0; } diff -uprN linux-2.6.24/net/8021q/vlanproc.c linux-2.6.24.ovz/net/8021q/vlanproc.c --- linux-2.6.24/net/8021q/vlanproc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/8021q/vlanproc.c 2008-03-25 18:53:59.000000000 -0500 @@ -115,13 +115,21 @@ static const struct file_operations vlan * /proc/net/vlan */ +#ifdef CONFIG_VE +#define proc_vlan_dir (get_exec_env()->_proc_vlan_dir) +#else static struct proc_dir_entry *proc_vlan_dir; +#endif /* * /proc/net/vlan/config */ +#ifdef CONFIG_VE +#define proc_vlan_conf (get_exec_env()->_proc_vlan_conf) +#else static struct proc_dir_entry *proc_vlan_conf; +#endif /* Strings */ static const char *vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { @@ -155,7 +163,7 @@ void vlan_proc_cleanup(void) * Create /proc/net/vlan entries */ -int __init vlan_proc_init(void) +int vlan_proc_init(void) { proc_vlan_dir = proc_mkdir(name_root, init_net.proc_net); if (proc_vlan_dir) { diff -uprN linux-2.6.24/net/9p/trans_fd.c linux-2.6.24.ovz/net/9p/trans_fd.c --- linux-2.6.24/net/9p/trans_fd.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/9p/trans_fd.c 2008-03-25 18:53:59.000000000 -0500 @@ -459,14 +459,7 @@ static int __init p9_trans_fd_init(void) return 1; } - -static void __exit p9_trans_fd_exit(void) { - printk(KERN_ERR "Removal of 9p transports not implemented\n"); - BUG(); -} - module_init(p9_trans_fd_init); -module_exit(p9_trans_fd_exit); MODULE_AUTHOR("Latchesar Ionkov "); MODULE_AUTHOR("Eric Van Hensbergen "); diff -uprN linux-2.6.24/net/Kconfig linux-2.6.24.ovz/net/Kconfig --- linux-2.6.24/net/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -30,7 +30,7 @@ menu "Networking options" config NET_NS bool "Network namespace support" default n - depends on EXPERIMENTAL && !SYSFS + depends on EXPERIMENTAL help Allow user space to create what appear to be multiple instances of the network stack. 
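
A minimal user-space sketch of the pattern visible in the vlanproc.c hunk above, where the former file-scope globals proc_vlan_dir / proc_vlan_conf become per-container fields reached through get_exec_env() while existing code keeps the old identifier via a macro. This sketch is not taken from the patch: ve_struct, get_exec_env() and set_exec_env() are mocked here purely for illustration and are not the real kernel API.

/* ve_demo.c - mock of the per-VE "global" pattern; build with: gcc -o ve_demo ve_demo.c */
#include <stdio.h>

struct ve_struct {
	const char *name;
	const char *_proc_vlan_dir;	/* stands in for the real proc_dir_entry pointer */
};

static struct ve_struct ve0   = { "ve0",   NULL };
static struct ve_struct ve101 = { "ve101", NULL };

static struct ve_struct *exec_env = &ve0;	/* "current container" of the running task */

static struct ve_struct *get_exec_env(void) { return exec_env; }

static struct ve_struct *set_exec_env(struct ve_struct *ve)
{
	struct ve_struct *old = exec_env;
	exec_env = ve;
	return old;
}

/* Same trick as the patch: the old global identifier now resolves per container. */
#define proc_vlan_dir (get_exec_env()->_proc_vlan_dir)

static void vlan_proc_init_demo(void)
{
	proc_vlan_dir = "/proc/net/vlan (per-VE instance)";
}

int main(void)
{
	struct ve_struct *old;

	vlan_proc_init_demo();			/* initializes ve0's copy only */

	old = set_exec_env(&ve101);		/* "enter" another container */
	printf("%s: %s\n", get_exec_env()->name,
	       proc_vlan_dir ? proc_vlan_dir : "(not created)");
	vlan_proc_init_demo();			/* ve101 gets its own copy */
	printf("%s: %s\n", get_exec_env()->name, proc_vlan_dir);
	set_exec_env(old);

	printf("%s: %s\n", get_exec_env()->name, proc_vlan_dir);	/* ve0 unaffected */
	return 0;
}
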
diff -uprN linux-2.6.24/net/bridge/br.c linux-2.6.24.ovz/net/bridge/br.c --- linux-2.6.24/net/bridge/br.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br.c 2008-03-25 18:53:59.000000000 -0500 @@ -55,6 +55,7 @@ static int __init br_init(void) brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; + br_hard_xmit_hook = br_xmit; br_fdb_get_hook = br_fdb_get; br_fdb_put_hook = br_fdb_put; @@ -89,6 +90,7 @@ static void __exit br_deinit(void) br_fdb_put_hook = NULL; br_handle_frame_hook = NULL; + br_hard_xmit_hook = NULL; br_fdb_fini(); } diff -uprN linux-2.6.24/net/bridge/br_device.c linux-2.6.24.ovz/net/bridge/br_device.c --- linux-2.6.24/net/bridge/br_device.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_device.c 2008-03-25 18:53:59.000000000 -0500 @@ -40,16 +40,47 @@ int br_dev_xmit(struct sk_buff *skb, str skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); + skb->brmark = BR_ALREADY_SEEN; + if (dest[0] & 1) br_flood_deliver(br, skb); else if ((dst = __br_fdb_get(br, dest)) != NULL) - br_deliver(dst->dst, skb); + br_deliver(dst->dst, skb, 1); else br_flood_deliver(br, skb); return 0; } +int br_xmit(struct sk_buff *skb, struct net_bridge_port *port) +{ + struct net_bridge *br = port->br; + const unsigned char *dest = skb->data; + struct net_bridge_fdb_entry *dst; + + if (!br->via_phys_dev) + return 0; + + br->statistics.tx_packets++; + br->statistics.tx_bytes += skb->len; + + skb_reset_mac_header(skb); + skb_pull(skb, ETH_HLEN); + + skb->brmark = BR_ALREADY_SEEN; + + if (dest[0] & 1) + br_xmit_deliver(br, port, skb); + else if ((dst = __br_fdb_get(br, dest)) != NULL) + br_deliver(dst->dst, skb, 0); + else + br_xmit_deliver(br, port, skb); + + skb_push(skb, ETH_HLEN); + + return 0; +} + static int br_dev_open(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); diff -uprN linux-2.6.24/net/bridge/br_forward.c linux-2.6.24.ovz/net/bridge/br_forward.c --- linux-2.6.24/net/bridge/br_forward.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_forward.c 2008-03-25 18:53:59.000000000 -0500 @@ -78,14 +78,24 @@ static void __br_forward(const struct ne } /* called with rcu_read_lock */ -void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free) { if (should_deliver(to, skb)) { + if (!free) { + struct sk_buff *skb2; + + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + to->br->statistics.tx_dropped++; + return; + } + skb = skb2; + } __br_deliver(to, skb); return; } - kfree_skb(skb); + if (free) + kfree_skb(skb); } /* called with rcu_read_lock */ @@ -101,6 +111,7 @@ void br_forward(const struct net_bridge_ /* called under bridge lock */ static void br_flood(struct net_bridge *br, struct sk_buff *skb, + int free, void (*__packet_hook)(const struct net_bridge_port *p, struct sk_buff *skb)) { @@ -132,18 +143,41 @@ static void br_flood(struct net_bridge * return; } - kfree_skb(skb); + if (free) + kfree_skb(skb); } /* called with rcu_read_lock */ void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) { - br_flood(br, skb, __br_deliver); + br_flood(br, skb, 1, __br_deliver); +} + +/* called with rcu_read_lock */ +void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct net_bridge_port *p; + + list_for_each_entry_rcu(p, &br->port_list, list) { + if (p == port) + continue; + if (should_deliver(p, skb)) { + struct sk_buff *skb2; + + if ((skb2 = 
skb_clone(skb, GFP_ATOMIC)) == NULL) { + br->statistics.tx_dropped++; + return; + } + __br_deliver(p, skb2); + } + } } /* called under bridge lock */ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb) { - br_flood(br, skb, __br_forward); + skb->brmark = BR_ALREADY_SEEN; + br_flood(br, skb, 1, __br_forward); } diff -uprN linux-2.6.24/net/bridge/br_if.c linux-2.6.24.ovz/net/bridge/br_if.c --- linux-2.6.24/net/bridge/br_if.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_if.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ */ #include +#include #include #include #include @@ -160,6 +161,11 @@ static void del_br(struct net_bridge *br { struct net_bridge_port *p, *n; + if (br->master_dev) { + dev_put(br->master_dev); + br->master_dev = NULL; + } + list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } @@ -303,7 +309,7 @@ int br_del_bridge(const char *name) int ret = 0; rtnl_lock(); - dev = __dev_get_by_name(&init_net, name); + dev = __dev_get_by_name(current->nsproxy->net_ns, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ @@ -403,6 +409,10 @@ int br_add_if(struct net_bridge *br, str if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); + if (!(dev->features & NETIF_F_VIRTUAL)) { + dev_hold(dev); + br->master_dev = dev; + } spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, p); @@ -434,6 +444,16 @@ int br_del_if(struct net_bridge *br, str spin_lock_bh(&br->lock); br_stp_recalculate_bridge_id(br); br_features_recompute(br); + if (br->master_dev == dev) { + br->master_dev = NULL; + dev_put(dev); + list_for_each_entry(p, &br->port_list, list) + if (!(p->dev->features & NETIF_F_VIRTUAL)) { + dev_hold(p->dev); + br->master_dev = p->dev; + break; + } + } spin_unlock_bh(&br->lock); return 0; @@ -444,7 +464,7 @@ void __exit br_cleanup_bridges(void) struct net_device *dev, *nxt; rtnl_lock(); - for_each_netdev_safe(&init_net, dev, nxt) + for_each_netdev_safe(current->nsproxy->net_ns, dev, nxt) if (dev->priv_flags & IFF_EBRIDGE) del_br(dev->priv); rtnl_unlock(); diff -uprN linux-2.6.24/net/bridge/br_input.c linux-2.6.24.ovz/net/bridge/br_input.c --- linux-2.6.24/net/bridge/br_input.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_input.c 2008-03-25 18:53:59.000000000 -0500 @@ -24,13 +24,20 @@ const u8 br_group_address[ETH_ALEN] = { static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb) { - struct net_device *indev; + struct net_device *indev, *outdev; br->statistics.rx_packets++; br->statistics.rx_bytes += skb->len; indev = skb->dev; - skb->dev = br->dev; + if (!br->via_phys_dev) + skb->dev = br->dev; + else { + skb->brmark = BR_ALREADY_SEEN; + outdev = br->master_dev; + if (outdev) + skb->dev = outdev; + } NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, netif_receive_skb); @@ -58,7 +65,7 @@ int br_handle_frame_finish(struct sk_buf /* The packet skb2 goes to the local host (NULL to skip). 
*/ skb2 = NULL; - if (br->dev->flags & IFF_PROMISC) + if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev) skb2 = skb; dst = NULL; @@ -156,6 +163,9 @@ struct sk_buff *br_handle_frame(struct n } /* fall through */ case BR_STATE_LEARNING: + if (skb->brmark == BR_ALREADY_SEEN) + return 0; + if (!compare_ether_addr(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; diff -uprN linux-2.6.24/net/bridge/br_ioctl.c linux-2.6.24.ovz/net/bridge/br_ioctl.c --- linux-2.6.24/net/bridge/br_ioctl.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_ioctl.c 2008-03-25 18:53:59.000000000 -0500 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ static int get_bridge_ifindices(int *ind struct net_device *dev; int i = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(current->nsproxy->net_ns, dev) { if (i >= num) break; if (dev->priv_flags & IFF_EBRIDGE) @@ -91,7 +92,7 @@ static int add_del_if(struct net_bridge if (!capable(CAP_NET_ADMIN)) return -EPERM; - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, ifindex); if (dev == NULL) return -EINVAL; @@ -142,6 +143,7 @@ static int old_dev_ioctl(struct net_devi b.root_port = br->root_port; b.stp_enabled = (br->stp_enabled != BR_NO_STP); + b.via_phys_dev = br->via_phys_dev; b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); @@ -258,6 +260,13 @@ static int old_dev_ioctl(struct net_devi br_stp_set_enabled(br, args[1]); return 0; + case BRCTL_SET_VIA_ORIG_DEV: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br->via_phys_dev = args[1] ? 1 : 0; + return 0; + case BRCTL_SET_BRIDGE_PRIORITY: if (!capable(CAP_NET_ADMIN)) return -EPERM; diff -uprN linux-2.6.24/net/bridge/br_netlink.c linux-2.6.24.ovz/net/bridge/br_netlink.c --- linux-2.6.24/net/bridge/br_netlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_netlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -11,6 +11,7 @@ */ #include +#include #include #include #include "br_private.h" @@ -111,7 +112,7 @@ static int br_dump_ifinfo(struct sk_buff int idx; idx = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(current->nsproxy->net_ns, dev) { /* not a bridge port */ if (dev->br_port == NULL || idx < cb->args[0]) goto skip; @@ -156,7 +157,7 @@ static int br_rtm_setlink(struct sk_buff if (new_state > BR_STATE_BLOCKING) return -EINVAL; - dev = __dev_get_by_index(&init_net, ifm->ifi_index); + dev = __dev_get_by_index(current->nsproxy->net_ns, ifm->ifi_index); if (!dev) return -ENODEV; diff -uprN linux-2.6.24/net/bridge/br_notify.c linux-2.6.24.ovz/net/bridge/br_notify.c --- linux-2.6.24/net/bridge/br_notify.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_notify.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,9 +37,6 @@ static int br_device_event(struct notifi struct net_bridge_port *p = dev->br_port; struct net_bridge *br; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - /* not a port of a bridge */ if (p == NULL) return NOTIFY_DONE; diff -uprN linux-2.6.24/net/bridge/br_private.h linux-2.6.24.ovz/net/bridge/br_private.h --- linux-2.6.24/net/bridge/br_private.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_private.h 2008-03-25 18:53:59.000000000 -0500 @@ -90,6 +90,8 @@ struct net_bridge spinlock_t lock; struct list_head port_list; struct net_device *dev; + struct net_device *master_dev; + unsigned char 
via_phys_dev; struct net_device_stats statistics; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; @@ -139,6 +141,7 @@ static inline int br_is_root_bridge(cons /* br_device.c */ extern void br_dev_setup(struct net_device *dev); extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev); +extern int br_xmit(struct sk_buff *skb, struct net_bridge_port *port); /* br_fdb.c */ extern int br_fdb_init(void); @@ -165,12 +168,13 @@ extern void br_fdb_update(struct net_bri /* br_forward.c */ extern void br_deliver(const struct net_bridge_port *to, - struct sk_buff *skb); + struct sk_buff *skb, int free); extern int br_dev_queue_push_xmit(struct sk_buff *skb); extern void br_forward(const struct net_bridge_port *to, struct sk_buff *skb); extern int br_forward_finish(struct sk_buff *skb); extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); +extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb); extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb); /* br_if.c */ diff -uprN linux-2.6.24/net/bridge/br_stp_bpdu.c linux-2.6.24.ovz/net/bridge/br_stp_bpdu.c --- linux-2.6.24/net/bridge/br_stp_bpdu.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_stp_bpdu.c 2008-03-25 18:53:59.000000000 -0500 @@ -142,9 +142,6 @@ int br_stp_rcv(struct sk_buff *skb, stru struct net_bridge *br; const unsigned char *buf; - if (dev->nd_net != &init_net) - goto err; - if (!p) goto err; diff -uprN linux-2.6.24/net/bridge/br_sysfs_br.c linux-2.6.24.ovz/net/bridge/br_sysfs_br.c --- linux-2.6.24/net/bridge/br_sysfs_br.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/br_sysfs_br.c 2008-03-25 18:53:59.000000000 -0500 @@ -172,6 +172,27 @@ static ssize_t store_stp_state(struct de static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, store_stp_state); +static ssize_t show_via_phys_dev_state(struct device *cd, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%d\n", br->via_phys_dev); +} + +static void set_via_phys_dev_state(struct net_bridge *br, unsigned long val) +{ + br->via_phys_dev = val; +} + +static ssize_t store_via_phys_dev_state(struct device *cd, + struct device_attribute *attr, const char *buf, size_t len) +{ + return store_bridge_parm(cd, buf, len, set_via_phys_dev_state); +} + +static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state, + store_via_phys_dev_state); + static ssize_t show_priority(struct device *d, struct device_attribute *attr, char *buf) { @@ -340,6 +361,7 @@ static struct attribute *bridge_attrs[] &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, + &dev_attr_via_phys_dev.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, diff -uprN linux-2.6.24/net/bridge/netfilter/ebt_among.c linux-2.6.24.ovz/net/bridge/netfilter/ebt_among.c --- linux-2.6.24/net/bridge/netfilter/ebt_among.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/bridge/netfilter/ebt_among.c 2008-03-25 18:53:59.000000000 -0500 @@ -176,7 +176,7 @@ static int ebt_among_check(const char *t unsigned int datalen) { struct ebt_among_info *info = (struct ebt_among_info *) data; - int expected_length = sizeof(struct ebt_among_info); + int expected_length = EBT_ALIGN(sizeof(struct ebt_among_info)); const struct ebt_mac_wormhash *wh_dst, *wh_src; int err; @@ -185,7 +185,7 @@ static int ebt_among_check(const char *t expected_length += 
ebt_mac_wormhash_size(wh_dst); expected_length += ebt_mac_wormhash_size(wh_src); - if (datalen != EBT_ALIGN(expected_length)) { + if (datalen != expected_length) { printk(KERN_WARNING "ebtables: among: wrong size: %d " "against expected %d, rounded to %Zd\n", diff -uprN linux-2.6.24/net/core/datagram.c linux-2.6.24.ovz/net/core/datagram.c --- linux-2.6.24/net/core/datagram.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/datagram.c 2008-03-25 18:53:59.000000000 -0500 @@ -56,6 +56,8 @@ #include #include +#include + /* * Is a socket 'connection oriented' ? */ @@ -502,6 +504,7 @@ unsigned int datagram_poll(struct file * { struct sock *sk = sock->sk; unsigned int mask; + int no_ubc_space; poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -511,8 +514,14 @@ unsigned int datagram_poll(struct file * mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { + no_ubc_space = 0; mask |= POLLHUP; + } else { + no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ubc_space) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + } /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || @@ -529,7 +538,7 @@ unsigned int datagram_poll(struct file * } /* writable? */ - if (sock_writeable(sk)) + if (!no_ubc_space && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff -uprN linux-2.6.24/net/core/dev.c linux-2.6.24.ovz/net/core/dev.c --- linux-2.6.24/net/core/dev.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/dev.c 2008-03-25 18:53:59.000000000 -0500 @@ -122,6 +122,9 @@ #include "net-sysfs.h" +#include +#include + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. 
@@ -196,20 +199,6 @@ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - -static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) -{ - unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; -} - -static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) -{ - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; -} - /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -672,7 +661,7 @@ struct net_device *dev_getbyhwaddr(struc ASSERT_RTNL(); - for_each_netdev(&init_net, dev) + for_each_netdev(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; @@ -1210,6 +1199,8 @@ int call_netdevice_notifiers(unsigned lo return raw_notifier_call_chain(&netdev_chain, val, dev); } +EXPORT_SYMBOL(call_netdevice_notifiers); + /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); @@ -1530,6 +1521,23 @@ static int dev_gso_segment(struct sk_buf return 0; } +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); +static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct net_bridge_port *port; + + if (((port = rcu_dereference(dev->br_port)) == NULL) || + (skb->brmark == BR_ALREADY_SEEN)) + return 0; + + return br_hard_xmit_hook(skb, port); +} +#else +#define bridge_hard_start_xmit(skb, dev) (0) +#endif + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { if (likely(!skb->next)) { @@ -1543,6 +1551,8 @@ int dev_hard_start_xmit(struct sk_buff * goto gso; } + bridge_hard_start_xmit(skb, dev); + return dev->hard_start_xmit(skb, dev); } @@ -1553,6 +1563,9 @@ gso: skb->next = nskb->next; nskb->next = NULL; + + bridge_hard_start_xmit(skb, dev); + rc = dev->hard_start_xmit(nskb, dev); if (unlikely(rc)) { nskb->next = skb->next; @@ -2021,6 +2034,7 @@ int netif_receive_skb(struct sk_buff *sk struct net_device *orig_dev; int ret = NET_RX_DROP; __be16 type; + struct ve_struct *old_ve; /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) @@ -2043,6 +2057,16 @@ int netif_receive_skb(struct sk_buff *sk skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; +#ifdef CONFIG_VE + /* + * Skb might be alloced in another VE context, than its device works. + * So, set the correct owner_env. + */ + skb->owner_env = skb->dev->owner_env; + BUG_ON(skb->owner_env == NULL); +#endif + old_ve = set_exec_env(skb->owner_env); + pt_prev = NULL; rcu_read_lock(); @@ -2098,6 +2122,7 @@ ncls: out: rcu_read_unlock(); + (void)set_exec_env(old_ve); return ret; } @@ -2757,8 +2782,11 @@ static void __dev_set_promiscuity(struct dev->flags &= ~IFF_PROMISC; else dev->flags |= IFF_PROMISC; + /* Promiscous mode on these devices does not mean anything */ + if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + return; if (dev->flags != old_flags) { - printk(KERN_INFO "device %s %s promiscuous mode\n", + ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : "left"); audit_log(current->audit_context, GFP_ATOMIC, @@ -3423,11 +3451,20 @@ int dev_ioctl(struct net *net, unsigned * - require strict serialization. 
* - do not return a value */ + case SIOCSIFMTU: + case SIOCSIFHWADDR: case SIOCSIFFLAGS: + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + case SIOCSIFMETRIC: - case SIOCSIFMTU: case SIOCSIFMAP: - case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: @@ -3494,12 +3531,11 @@ int dev_ioctl(struct net *net, unsigned */ static int dev_new_index(struct net *net) { - static int ifindex; for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return ifindex; + if (++net->ifindex <= 0) + net->ifindex = 1; + if (!__dev_get_by_index(net, net->ifindex)) + return net->ifindex; } } @@ -3602,6 +3638,10 @@ int register_netdevice(struct net_device BUG_ON(!dev->nd_net); net = dev->nd_net; + ret = -EPERM; + if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) + goto out; + spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); @@ -3698,6 +3738,10 @@ int register_netdevice(struct net_device set_bit(__LINK_STATE_PRESENT, &dev->state); + dev->owner_env = get_exec_env(); + netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); + netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); @@ -3831,6 +3875,7 @@ static DEFINE_MUTEX(net_todo_run_mutex); void netdev_run_todo(void) { struct list_head list; + struct ve_struct *old_ve; /* Need to guard against multiple cpu's getting out of order. */ mutex_lock(&net_todo_run_mutex); @@ -3848,6 +3893,7 @@ void netdev_run_todo(void) list_replace_init(&net_todo_list, &list); spin_unlock(&net_todo_list_lock); + old_ve = get_exec_env(); while (!list_empty(&list)) { struct net_device *dev = list_entry(list.next, struct net_device, todo_list); @@ -3860,6 +3906,7 @@ void netdev_run_todo(void) continue; } + (void)set_exec_env(dev->owner_env); dev->reg_state = NETREG_UNREGISTERED; netdev_wait_allrefs(dev); @@ -3870,13 +3917,21 @@ void netdev_run_todo(void) BUG_TRAP(!dev->ip6_ptr); BUG_TRAP(!dev->dn_ptr); + put_beancounter(netdev_bc(dev)->exec_ub); + put_beancounter(netdev_bc(dev)->owner_ub); + netdev_bc(dev)->exec_ub = NULL; + netdev_bc(dev)->owner_ub = NULL; + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. + */ if (dev->destructor) dev->destructor(dev); /* Free network device */ kobject_put(&dev->dev.kobj); } - + (void)set_exec_env(old_ve); out: mutex_unlock(&net_todo_run_mutex); } @@ -3912,7 +3967,7 @@ struct net_device *alloc_netdev_mq(int s ~NETDEV_ALIGN_CONST; alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; - p = kzalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL_UBC); if (!p) { printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); return NULL; @@ -4023,11 +4078,15 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. 
*/ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, + struct ve_struct *src_ve, struct ve_struct *dst_ve, + struct user_beancounter *exec_ub) { char buf[IFNAMSIZ]; const char *destname; int err; + struct ve_struct *cur_ve; + struct user_beancounter *tmp_ub; ASSERT_RTNL(); @@ -4078,6 +4137,11 @@ int dev_change_net_namespace(struct net_ err = -ENODEV; unlist_netdevice(dev); + dev->owner_env = dst_ve; + tmp_ub = netdev_bc(dev)->exec_ub; + netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); + put_beancounter(tmp_ub); + synchronize_net(); /* Shutdown queueing discipline. */ @@ -4086,7 +4150,9 @@ int dev_change_net_namespace(struct net_ /* Notify protocols, that we are about to destroy this device. They should clean all the things. */ + cur_ve = set_exec_env(src_ve); call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + (void)set_exec_env(cur_ve); /* * Flush the unicast and multicast chains @@ -4116,7 +4182,9 @@ int dev_change_net_namespace(struct net_ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ + cur_ve = set_exec_env(dst_ve); call_netdevice_notifiers(NETDEV_REGISTER, dev); + (void)set_exec_env(cur_ve); synchronize_net(); err = 0; @@ -4124,6 +4192,14 @@ out: return err; } +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + struct ve_struct *ve = get_exec_env(); + struct user_beancounter *ub = get_exec_ub(); + + return __dev_change_net_namespace(dev, net, pat, ve, ve, ub); +} + static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) @@ -4322,7 +4398,7 @@ static struct hlist_head *netdev_create_ int i; struct hlist_head *hash; - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); @@ -4464,6 +4540,7 @@ EXPORT_SYMBOL(__dev_remove_pack); EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); +EXPORT_SYMBOL(__dev_change_net_namespace); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); @@ -4495,6 +4572,7 @@ EXPORT_SYMBOL(dev_get_flags); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_hard_xmit_hook); EXPORT_SYMBOL(br_fdb_get_hook); EXPORT_SYMBOL(br_fdb_put_hook); #endif diff -uprN linux-2.6.24/net/core/dst.c linux-2.6.24.ovz/net/core/dst.c --- linux-2.6.24/net/core/dst.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/dst.c 2008-03-25 18:53:59.000000000 -0500 @@ -278,11 +278,11 @@ static inline void dst_ifdown(struct dst if (!unregister) { dst->input = dst->output = dst_discard; } else { - dst->dev = init_net.loopback_dev; + dst->dev = dst->dev->nd_net->loopback_dev; dev_hold(dst->dev); dev_put(dev); if (dst->neighbour && dst->neighbour->dev == dev) { - dst->neighbour->dev = init_net.loopback_dev; + dst->neighbour->dev = dst->dev; dev_put(dev); dev_hold(dst->neighbour->dev); } @@ -294,12 +294,10 @@ static int dst_dev_event(struct notifier struct net_device *dev = ptr; struct dst_entry *dst, *last = NULL; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_UNREGISTER: case NETDEV_DOWN: + dst_gc_task(NULL); mutex_lock(&dst_gc_mutex); for (dst = dst_busy_list; dst; dst = dst->next) { last = dst; diff -uprN 
linux-2.6.24/net/core/ethtool.c linux-2.6.24.ovz/net/core/ethtool.c --- linux-2.6.24/net/core/ethtool.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/ethtool.c 2008-03-25 18:53:59.000000000 -0500 @@ -815,7 +815,7 @@ int dev_ethtool(struct net *net, struct case ETHTOOL_GPFLAGS: break; default: - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; } diff -uprN linux-2.6.24/net/core/fib_rules.c linux-2.6.24.ovz/net/core/fib_rules.c --- linux-2.6.24/net/core/fib_rules.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/fib_rules.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,7 +23,7 @@ int fib_default_rule_add(struct fib_rule { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_UBC); if (r == NULL) return -ENOMEM; @@ -245,7 +245,7 @@ static int fib_nl_newrule(struct sk_buff if (err < 0) goto errout; - rule = kzalloc(ops->rule_size, GFP_KERNEL); + rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC); if (rule == NULL) { err = -ENOMEM; goto errout; @@ -621,9 +621,6 @@ static int fib_rules_event(struct notifi struct net_device *dev = ptr; struct fib_rules_ops *ops; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - ASSERT_RTNL(); rcu_read_lock(); diff -uprN linux-2.6.24/net/core/filter.c linux-2.6.24.ovz/net/core/filter.c --- linux-2.6.24/net/core/filter.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/filter.c 2008-03-25 18:53:59.000000000 -0500 @@ -425,7 +425,7 @@ int sk_attach_filter(struct sock_fprog * if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { diff -uprN linux-2.6.24/net/core/neighbour.c linux-2.6.24.ovz/net/core/neighbour.c --- linux-2.6.24/net/core/neighbour.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/neighbour.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -35,6 +36,7 @@ #include #include #include +#include #define NEIGH_DEBUG 1 @@ -252,6 +254,7 @@ static struct neighbour *neigh_alloc(str int entries; entries = atomic_inc_return(&tbl->entries) - 1; + n = ERR_PTR(-ENOBUFS); if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -262,7 +265,7 @@ static struct neighbour *neigh_alloc(str n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); if (!n) - goto out_entries; + goto out_nomem; skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); @@ -281,6 +284,8 @@ static struct neighbour *neigh_alloc(str out: return n; +out_nomem: + n = ERR_PTR(-ENOMEM); out_entries: atomic_dec(&tbl->entries); goto out; @@ -393,12 +398,11 @@ struct neighbour *neigh_create(struct ne u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neighbour *n1, *rc, *n; - if (!n) { - rc = ERR_PTR(-ENOBUFS); + rc = n = neigh_alloc(tbl); + if (IS_ERR(n)) goto out; - } memcpy(n->primary_key, pkey, key_len); n->dev = dev; @@ -644,6 +648,8 @@ static void neigh_periodic_timer(unsigne struct neigh_table *tbl = (struct neigh_table *)arg; struct neighbour *n, **np; unsigned long expire, now = jiffies; + struct ve_struct *env = set_exec_env(tbl->owner_env); + struct user_beancounter *ub = set_exec_ub(tbl->owner_ub); NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); 
@@ -708,6 +714,8 @@ next_elt: mod_timer(&tbl->gc_timer, now + expire); write_unlock(&tbl->lock); + set_exec_ub(ub); + set_exec_env(env); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -735,6 +743,11 @@ static void neigh_timer_handler(unsigned struct neighbour *neigh = (struct neighbour *)arg; unsigned state; int notify = 0; + struct ve_struct *env; + struct user_beancounter *ub; + + env = set_exec_env(neigh->dev->owner_env); + ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); write_lock(&neigh->lock); @@ -838,6 +851,8 @@ out: neigh_update_notify(neigh); neigh_release(neigh); + (void)set_exec_ub(ub); + (void)set_exec_env(env); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) @@ -1212,6 +1227,9 @@ static void neigh_proxy_process(unsigned unsigned long now = jiffies; struct sk_buff *skb; + struct ve_struct *env = set_exec_env(tbl->owner_env); + struct user_beancounter *ub = set_exec_ub(tbl->owner_ub); + spin_lock(&tbl->proxy_queue.lock); skb = tbl->proxy_queue.next; @@ -1223,6 +1241,7 @@ static void neigh_proxy_process(unsigned skb = skb->next; if (tdif <= 0) { struct net_device *dev = back->dev; + __skb_unlink(back, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) tbl->proxy_redo(back); @@ -1230,6 +1249,7 @@ static void neigh_proxy_process(unsigned kfree_skb(back); dev_put(dev); + } else if (!sched_next || tdif < sched_next) sched_next = tdif; } @@ -1237,6 +1257,8 @@ static void neigh_proxy_process(unsigned if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); + (void)set_exec_ub(ub); + (void)set_exec_env(env); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, @@ -1333,12 +1355,15 @@ void neigh_parms_destroy(struct neigh_pa static struct lock_class_key neigh_table_proxy_queue_class; -void neigh_table_init_no_netlink(struct neigh_table *tbl) +int neigh_table_init_no_netlink(struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; + atomic_set(&tbl->entries, 0); + tbl->hash_chain_gc = 0; atomic_set(&tbl->parms.refcnt, 1); + tbl->parms.next = NULL; INIT_RCU_HEAD(&tbl->parms.rcu_head); tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); @@ -1346,18 +1371,26 @@ void neigh_table_init_no_netlink(struct if (!tbl->kmem_cachep) tbl->kmem_cachep = kmem_cache_create(tbl->id, tbl->entry_size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); + if (!tbl->kmem_cachep) + return -ENOMEM; + tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) - panic("cannot create neighbour cache statistics"); + return -ENOMEM; + + tbl->owner_env = get_ve(get_exec_env()); + tbl->owner_ub = get_beancounter(get_exec_ub()); #ifdef CONFIG_PROC_FS - tbl->pde = create_proc_entry(tbl->id, 0, init_net.proc_net_stat); - if (!tbl->pde) - panic("cannot create neighbour proc dir entry"); - tbl->pde->proc_fops = &neigh_stat_seq_fops; - tbl->pde->data = tbl; + if (ve_is_super(get_exec_env())) { + tbl->pde = create_proc_entry(tbl->id, 0, init_net.proc_net_stat); + if (!tbl->pde) + panic("cannot create neighbour proc dir entry"); + tbl->pde->proc_fops = &neigh_stat_seq_fops; + tbl->pde->data = tbl; + } #endif tbl->hash_mask = 1; @@ -1367,7 +1400,7 @@ void neigh_table_init_no_netlink(struct tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->hash_buckets || !tbl->phash_buckets) - panic("cannot allocate neighbour cache hashes"); + goto nomem; get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); @@ -1386,15 +1419,38 @@ 
void neigh_table_init_no_netlink(struct tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; + return 0; + +nomem: + if (tbl->hash_buckets) { + neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); + tbl->hash_buckets = NULL; + } + if (tbl->phash_buckets) { + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + } + if (tbl->stats) { + free_percpu(tbl->stats); + tbl->stats = NULL; + } + put_beancounter(tbl->owner_ub); + put_ve(tbl->owner_env); + return -ENOMEM; } -void neigh_table_init(struct neigh_table *tbl) +int neigh_table_init(struct neigh_table *tbl) { struct neigh_table *tmp; + int err; - neigh_table_init_no_netlink(tbl); + err = neigh_table_init_no_netlink(tbl); + if (err) + return err; write_lock(&neigh_tbl_lock); for (tmp = neigh_tables; tmp; tmp = tmp->next) { + if (!ve_accessible_strict(tmp->owner_env, get_exec_env())) + continue; if (tmp->family == tbl->family) break; } @@ -1407,6 +1463,7 @@ void neigh_table_init(struct neigh_table "family %d\n", tbl->family); dump_stack(); } + return 0; } int neigh_table_clear(struct neigh_table *tbl) @@ -1420,6 +1477,15 @@ int neigh_table_clear(struct neigh_table neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) printk(KERN_CRIT "neighbour leakage\n"); +#ifdef CONFIG_PROC_FS + if (ve_is_super(get_exec_env())) { + char name[strlen(tbl->id) + sizeof("net/stat/")]; + strcpy(name, "net/stat/"); + strcat(name, tbl->id); + remove_proc_glob_entry(name, NULL); + } +#endif + write_lock(&neigh_tbl_lock); for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { if (*tp == tbl) { @@ -1440,8 +1506,13 @@ int neigh_table_clear(struct neigh_table free_percpu(tbl->stats); tbl->stats = NULL; - kmem_cache_destroy(tbl->kmem_cachep); - tbl->kmem_cachep = NULL; + if (ve_is_super(get_exec_env())) { + kmem_cache_destroy(tbl->kmem_cachep); + tbl->kmem_cachep = NULL; + } + + put_beancounter(tbl->owner_ub); + put_ve(tbl->owner_env); return 0; } @@ -1477,6 +1548,8 @@ static int neigh_delete(struct sk_buff * if (tbl->family != ndm->ndm_family) continue; + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; read_unlock(&neigh_tbl_lock); if (nla_len(dst_attr) < tbl->key_len) @@ -1549,6 +1622,8 @@ static int neigh_add(struct sk_buff *skb if (tbl->family != ndm->ndm_family) continue; + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; read_unlock(&neigh_tbl_lock); if (nla_len(tb[NDA_DST]) < tbl->key_len) @@ -1816,6 +1891,9 @@ static int neightbl_set(struct sk_buff * if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) break; } @@ -2056,6 +2134,8 @@ static int neigh_dump_info(struct sk_buf s_t = cb->args[0]; for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { + if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) diff -uprN linux-2.6.24/net/core/net-sysfs.c linux-2.6.24.ovz/net/core/net-sysfs.c --- linux-2.6.24/net/core/net-sysfs.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/net-sysfs.c 2008-03-25 18:53:59.000000000 -0500 @@ -238,6 +238,27 @@ static struct device_attribute net_class {} }; +#ifdef CONFIG_VE +struct device_attribute ve_net_class_attributes[] = { + __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), + __ATTR(iflink, S_IRUGO, show_iflink, NULL), + __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), + __ATTR(features, S_IRUGO, show_features, 
NULL), + __ATTR(type, S_IRUGO, show_type, NULL), + __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), + __ATTR(address, S_IRUGO, show_address, NULL), + __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), + __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(dormant, S_IRUGO, show_dormant, NULL), + __ATTR(operstate, S_IRUGO, show_operstate, NULL), + __ATTR(mtu, S_IRUGO, show_mtu, NULL), + __ATTR(flags, S_IRUGO, show_flags, NULL), + __ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL), + {} +}; +EXPORT_SYMBOL(ve_net_class_attributes); +#endif + /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, @@ -431,7 +452,7 @@ static void netdev_release(struct device kfree((char *)dev - dev->padded); } -static struct class net_class = { +struct class net_class = { .name = "net", .dev_release = netdev_release, #ifdef CONFIG_SYSFS @@ -441,6 +462,13 @@ static struct class net_class = { .dev_uevent = netdev_uevent, #endif }; +EXPORT_SYMBOL(net_class); + +#ifndef CONFIG_VE +#define visible_net_class net_class +#else +#define visible_net_class (*get_exec_env()->net_class) +#endif /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. @@ -460,7 +488,7 @@ int netdev_register_kobject(struct net_d struct attribute_group **groups = net->sysfs_groups; device_initialize(dev); - dev->class = &net_class; + dev->class = &visible_net_class; dev->platform_data = net; dev->groups = groups; @@ -480,7 +508,15 @@ int netdev_register_kobject(struct net_d return device_add(dev); } +void prepare_sysfs_netdev(void) +{ +#ifdef CONFIG_VE + get_ve0()->net_class = &net_class; +#endif +} + int netdev_kobject_init(void) { + prepare_sysfs_netdev(); return class_register(&net_class); } diff -uprN linux-2.6.24/net/core/net_namespace.c linux-2.6.24.ovz/net/core/net_namespace.c --- linux-2.6.24/net/core/net_namespace.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/net_namespace.c 2008-03-25 18:53:59.000000000 -0500 @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,8 @@ static struct net *net_alloc(void) static void net_free(struct net *net) { + struct completion *sysfs_completion; + if (!net) return; @@ -75,7 +78,10 @@ static void net_free(struct net *net) return; } + sysfs_completion = net->sysfs_completion; kmem_cache_free(net_cachep, net); + if (sysfs_completion) + complete(sysfs_completion); } struct net *copy_net_ns(unsigned long flags, struct net *old_net) @@ -92,7 +98,13 @@ struct net *copy_net_ns(unsigned long fl new_net = net_alloc(); if (!new_net) goto out; - +#ifdef CONFIG_VE + new_net->owner_ve = get_exec_env(); + new_net->proc_net = get_exec_env()->_proc_net; + new_net->proc_net->data = new_net; + new_net->proc_net_stat = get_exec_env()->_proc_net_stat; + new_net->proc_net_stat->data = new_net; +#endif mutex_lock(&net_mutex); err = setup_net(new_net); if (err) @@ -118,6 +130,7 @@ static void cleanup_net(struct work_stru { struct pernet_operations *ops; struct net *net; + struct ve_struct *old_ve; net = container_of(work, struct net, work); @@ -128,11 +141,13 @@ static void cleanup_net(struct work_stru list_del(&net->list); rtnl_unlock(); + old_ve = set_exec_env(net->owner_ve); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); } + (void)set_exec_env(old_ve); mutex_unlock(&net_mutex); diff -uprN linux-2.6.24/net/core/rtnetlink.c 
linux-2.6.24.ovz/net/core/rtnetlink.c --- linux-2.6.24/net/core/rtnetlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/rtnetlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -1207,6 +1207,8 @@ static int rtnl_dump_all(struct sk_buff if (rtnl_msg_handlers[idx] == NULL || rtnl_msg_handlers[idx][type].dumpit == NULL) continue; + if (vz_security_family_check(idx)) + continue; if (idx > s_idx) memset(&cb->args[0], 0, sizeof(cb->args)); if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) @@ -1265,13 +1267,13 @@ static int rtnetlink_rcv_msg(struct sk_b return 0; family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) + if (family >= NPROTO || vz_security_family_check(family)) return -EAFNOSUPPORT; sz_idx = type>>2; kind = type&3; - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { @@ -1326,9 +1328,6 @@ static int rtnetlink_event(struct notifi { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_UNREGISTER: rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); diff -uprN linux-2.6.24/net/core/scm.c linux-2.6.24.ovz/net/core/scm.c --- linux-2.6.24/net/core/scm.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/scm.c 2008-03-25 18:53:59.000000000 -0500 @@ -36,6 +36,7 @@ #include #include +#include /* * Only allow a user to send credentials, that they could set with @@ -44,7 +45,9 @@ static __inline__ int scm_check_creds(struct ucred *creds) { - if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || + creds->pid == current->tgid || + capable(CAP_VE_SYS_ADMIN)) && ((creds->uid == current->uid || creds->uid == current->euid || creds->uid == current->suid) || capable(CAP_SETUID)) && ((creds->gid == current->gid || creds->gid == current->egid || @@ -71,7 +74,7 @@ static int scm_fp_copy(struct cmsghdr *c if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -282,7 +285,7 @@ struct scm_fp_list *scm_fp_dup(struct sc if (!fpl) return NULL; - new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL_UBC); if (new_fpl) { for (i=fpl->count-1; i>=0; i--) get_file(fpl->fp[i]); diff -uprN linux-2.6.24/net/core/skbuff.c linux-2.6.24.ovz/net/core/skbuff.c --- linux-2.6.24/net/core/skbuff.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/skbuff.c 2008-03-25 18:53:59.000000000 -0500 @@ -66,6 +66,8 @@ #include #include +#include + #include "kmap_skb.h" static struct kmem_cache *skbuff_head_cache __read_mostly; @@ -158,6 +160,10 @@ struct sk_buff *__alloc_skb(unsigned int if (!skb) goto out; + if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) + goto nobc; + + /* Get the DATA. Size must match skb_add_mtu(). 
*/ size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); @@ -174,6 +180,7 @@ struct sk_buff *__alloc_skb(unsigned int skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; + skb->owner_env = get_exec_env(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); @@ -196,6 +203,8 @@ struct sk_buff *__alloc_skb(unsigned int out: return skb; nodata: + ub_skb_free_bc(skb); +nobc: kmem_cache_free(cache, skb); skb = NULL; goto out; @@ -280,6 +289,7 @@ static void kfree_skbmem(struct sk_buff struct sk_buff *other; atomic_t *fclone_ref; + ub_skb_free_bc(skb); switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); @@ -313,6 +323,7 @@ static void skb_release_all(struct sk_bu #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif + ub_skb_uncharge(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -402,6 +413,11 @@ static void __copy_skb_header(struct sk_ new->tc_verd = old->tc_verd; #endif #endif +#ifdef CONFIG_VE + new->accounted = old->accounted; + new->redirected = old->redirected; +#endif + skb_copy_brmark(new, old); skb_copy_secmark(new, old); } @@ -419,6 +435,10 @@ static struct sk_buff *__skb_clone(struc n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; + C(owner_env); +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + C(brmark); +#endif n->destructor = NULL; C(iif); C(tail); @@ -428,6 +448,11 @@ static struct sk_buff *__skb_clone(struc C(truesize); atomic_set(&n->users, 1); +#ifdef CONFIG_VE + C(accounted); + C(redirected); +#endif + atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; @@ -483,6 +508,10 @@ struct sk_buff *skb_clone(struct sk_buff n->fclone = SKB_FCLONE_UNAVAILABLE; } + if (ub_skb_alloc_bc(n, gfp_mask)) { + kmem_cache_free(skbuff_head_cache, n); + return NULL; + } return __skb_clone(n, skb); } diff -uprN linux-2.6.24/net/core/sock.c linux-2.6.24.ovz/net/core/sock.c --- linux-2.6.24/net/core/sock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/sock.c 2008-03-25 18:53:59.000000000 -0500 @@ -125,6 +125,9 @@ #include #include +#include +#include + #include #ifdef CONFIG_INET @@ -249,7 +252,7 @@ static void sock_warn_obsolete_bsdism(co static char warncomm[TASK_COMM_LEN]; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); - printk(KERN_WARNING "process `%s' is using obsolete " + ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " "%s SO_BSDCOMPAT\n", warncomm, name); warned++; } @@ -278,6 +281,10 @@ int sock_queue_rcv_skb(struct sock *sk, goto out; } + err = ub_sockrcvbuf_charge(sk, skb); + if (err < 0) + goto out; + err = sk_filter(sk, skb); if (err) goto out; @@ -911,6 +918,7 @@ static void sk_prot_free(struct proto *p slab = prot->slab; security_sk_free(sk); + ub_sock_uncharge(sk); if (slab != NULL) kmem_cache_free(slab, sk); else @@ -940,6 +948,7 @@ struct sock *sk_alloc(struct net *net, i */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); + sk->owner_env = get_exec_env(); sk->sk_net = get_net(net); } @@ -1014,14 +1023,11 @@ struct sock *sk_clone(const struct sock if (filter != NULL) sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - newsk = NULL; - goto out; - } + if 
(ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0) + goto out_err; + + if (unlikely(xfrm_sk_clone_policy(newsk))) + goto out_err; newsk->sk_err = 0; newsk->sk_priority = 0; @@ -1045,14 +1051,23 @@ struct sock *sk_clone(const struct sock if (newsk->sk_prot->sockets_allocated) atomic_inc(newsk->sk_prot->sockets_allocated); } -out: return newsk; + +out_err: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sock_reset_flag(newsk, SOCK_TIMESTAMP); + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; } EXPORT_SYMBOL_GPL(sk_clone); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + extern int sysctl_tcp_use_sg; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1063,6 +1078,8 @@ void sk_setup_caps(struct sock *sk, stru else sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; } + if (!sysctl_tcp_use_sg) + sk->sk_route_caps &= ~NETIF_F_SG; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1221,11 +1238,9 @@ static long sock_wait_for_wmem(struct so /* * Generic send/receive buffer handlers */ - -static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, - unsigned long header_len, - unsigned long data_len, - int noblock, int *errcode) +struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, + unsigned long size2, int noblock, + int *errcode) { struct sk_buff *skb; gfp_t gfp_mask; @@ -1246,46 +1261,35 @@ static struct sk_buff *sock_alloc_send_p if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int npages; - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - skb_frag_t *frag; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len); - data_len -= PAGE_SIZE; - } + if (ub_sock_getwres_other(sk, skb_charge_size(size))) { + if (size2 < size) { + size = size2; + continue; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = ub_sock_wait_for_space(sk, timeo, + skb_charge_size(size)); + continue; + } + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(size, gfp_mask); + if (skb) /* Full success... 
*/ break; - } + ub_sock_retwres_other(sk, skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); err = -ENOBUFS; goto failure; } + ub_sock_retwres_other(sk, + skb_charge_size(size), + SOCK_MIN_UBCSPACE_CH); set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; @@ -1296,6 +1300,7 @@ static struct sk_buff *sock_alloc_send_p timeo = sock_wait_for_wmem(sk, timeo); } + ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); skb_set_owner_w(skb, sk); return skb; @@ -1305,11 +1310,12 @@ failure: *errcode = err; return NULL; } +EXPORT_SYMBOL(sock_alloc_send_skb2); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_skb2(sk, size, size, noblock, errcode); } static void __lock_sock(struct sock *sk) @@ -1621,10 +1627,12 @@ void fastcall lock_sock_nested(struct so __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock(&sk->sk_lock.slock); +#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); +#endif local_bh_enable(); } @@ -1632,11 +1640,12 @@ EXPORT_SYMBOL(lock_sock_nested); void fastcall release_sock(struct sock *sk) { +#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) /* * The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - +#endif spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); @@ -1863,7 +1872,7 @@ int proto_register(struct proto *prot, i if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", @@ -1881,7 +1890,7 @@ int proto_register(struct proto *prot, i sprintf(request_sock_slab_name, mask, prot->name); prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->rsk_prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", @@ -1902,7 +1911,7 @@ int proto_register(struct proto *prot, i prot->twsk_prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, prot->twsk_prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, + 0, SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; @@ -2058,10 +2067,26 @@ static const struct file_operations prot .release = seq_release, }; +static int proto_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) + return -ENOBUFS; + return 0; +} + +static void proto_net_exit(struct net *net) +{ + proc_net_remove(net, "protocols"); +} + +static struct pernet_operations proto_net_ops = { + .init = proto_net_init, + .exit = proto_net_exit, +}; + static int __init proto_init(void) { - /* register /proc/net/protocols */ - return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? 
-ENOBUFS : 0; + return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); diff -uprN linux-2.6.24/net/core/stream.c linux-2.6.24.ovz/net/core/stream.c --- linux-2.6.24/net/core/stream.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/core/stream.c 2008-03-25 18:53:59.000000000 -0500 @@ -111,8 +111,10 @@ EXPORT_SYMBOL(sk_stream_wait_close); * sk_stream_wait_memory - Wait for more memory for a socket * @sk: socket to wait for memory * @timeo_p: for how long + * @amount - amount of memory to wait for (in UB space!) */ -int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, + unsigned long amount) { int err = 0; long vm_wait = 0; @@ -134,8 +136,11 @@ int sk_stream_wait_memory(struct sock *s if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - if (sk_stream_memory_free(sk) && !vm_wait) - break; + if (amount == 0) { + if (sk_stream_memory_free(sk) && !vm_wait) + break; + } else + ub_sock_sndqueueadd_tcp(sk, amount); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; @@ -144,6 +149,8 @@ int sk_stream_wait_memory(struct sock *s sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; + if (amount > 0) + ub_sock_sndqueuedel(sk); if (vm_wait) { vm_wait -= current_timeo; @@ -170,6 +177,10 @@ do_interrupted: goto out; } +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + return __sk_stream_wait_memory(sk, timeo_p, 0); +} EXPORT_SYMBOL(sk_stream_wait_memory); void sk_stream_rfree(struct sk_buff *skb) diff -uprN linux-2.6.24/net/dccp/ipv6.c linux-2.6.24.ovz/net/dccp/ipv6.c --- linux-2.6.24/net/dccp/ipv6.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/dccp/ipv6.c 2008-03-25 18:53:59.000000000 -0500 @@ -574,6 +574,8 @@ static struct sock *dccp_v6_request_recv __ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + if (!sysctl_tcp_use_sg) + newsk->sk_route_caps &= ~NETIF_F_SG; newdp6 = (struct dccp6_sock *)newsk; newinet = inet_sk(newsk); newinet->pinet6 = &newdp6->inet6; diff -uprN linux-2.6.24/net/dccp/minisocks.c linux-2.6.24.ovz/net/dccp/minisocks.c --- linux-2.6.24/net/dccp/minisocks.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/dccp/minisocks.c 2008-03-25 18:53:59.000000000 -0500 @@ -19,6 +19,8 @@ #include #include +#include + #include "ackvec.h" #include "ccid.h" #include "dccp.h" @@ -56,7 +58,8 @@ void dccp_time_wait(struct sock *sk, int { struct inet_timewait_sock *tw = NULL; - if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &dccp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { diff -uprN linux-2.6.24/net/decnet/netfilter/dn_rtmsg.c linux-2.6.24.ovz/net/decnet/netfilter/dn_rtmsg.c --- linux-2.6.24/net/decnet/netfilter/dn_rtmsg.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/decnet/netfilter/dn_rtmsg.c 2008-03-25 18:53:59.000000000 -0500 @@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_sk if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); /* Eventually we might send routing messages too */ diff -uprN linux-2.6.24/net/ipv4/af_inet.c linux-2.6.24.ovz/net/ipv4/af_inet.c --- linux-2.6.24/net/ipv4/af_inet.c 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/af_inet.c 2008-03-25 18:53:59.000000000 -0500 @@ -115,6 +115,7 @@ #ifdef CONFIG_IP_MROUTE #include #endif +#include DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; @@ -253,9 +254,6 @@ static int inet_create(struct net *net, int try_loading_module = 0; int err; - if (net != &init_net) - return -EAFNOSUPPORT; - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM && !inet_ehash_secret) @@ -310,6 +308,10 @@ lookup_protocol: goto out_rcu_unlock; } + err = vz_security_protocol_check(answer->protocol); + if (err < 0) + goto out_rcu_unlock; + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -327,6 +329,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. otherwise sk will be uncharged to wrong resource + */ + err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) @@ -384,6 +393,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -398,6 +410,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + struct ve_struct *saved_env; + + saved_env = set_exec_env(sk->owner_env); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -415,6 +430,8 @@ int inet_release(struct socket *sock) timeout = sk->sk_lingertime; sock->sk = NULL; sk->sk_prot->close(sk, timeout); + + (void)set_exec_env(saved_env); } return 0; } @@ -1290,31 +1307,31 @@ static struct net_protocol icmp_protocol static int __init init_ipv4_mibs(void) { - if (snmp_mib_init((void **)net_statistics, + if (snmp_mib_init((void **)ve_net_statistics, sizeof(struct linux_mib), __alignof__(struct linux_mib)) < 0) goto err_net_mib; - if (snmp_mib_init((void **)ip_statistics, + if (snmp_mib_init((void **)ve_ip_statistics, sizeof(struct ipstats_mib), __alignof__(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp_mib_init((void **)icmp_statistics, + if (snmp_mib_init((void **)ve_icmp_statistics, sizeof(struct icmp_mib), __alignof__(struct icmp_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void **)icmpmsg_statistics, + if (snmp_mib_init((void **)ve_icmpmsg_statistics, sizeof(struct icmpmsg_mib), __alignof__(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)tcp_statistics, + if (snmp_mib_init((void **)ve_tcp_statistics, sizeof(struct tcp_mib), __alignof__(struct tcp_mib)) < 0) goto err_tcp_mib; - if (snmp_mib_init((void **)udp_statistics, + if (snmp_mib_init((void **)ve_udp_statistics, sizeof(struct udp_mib), __alignof__(struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)udplite_statistics, + if (snmp_mib_init((void **)ve_udplite_statistics, sizeof(struct udp_mib), __alignof__(struct udp_mib)) < 0) goto err_udplite_mib; @@ -1324,17 +1341,17 @@ static int __init init_ipv4_mibs(void) return 0; err_udplite_mib: - snmp_mib_free((void **)udp_statistics); + snmp_mib_free((void **)ve_udp_statistics); err_udp_mib: - snmp_mib_free((void **)tcp_statistics); + snmp_mib_free((void **)ve_tcp_statistics); err_tcp_mib: - snmp_mib_free((void **)icmpmsg_statistics); + snmp_mib_free((void **)ve_icmpmsg_statistics); err_icmpmsg_mib: - snmp_mib_free((void **)icmp_statistics); + snmp_mib_free((void **)ve_icmp_statistics); err_icmp_mib: - snmp_mib_free((void **)ip_statistics); + snmp_mib_free((void **)ve_ip_statistics); 
err_ip_mib: - snmp_mib_free((void **)net_statistics); + snmp_mib_free((void **)ve_net_statistics); err_net_mib: return -ENOMEM; } diff -uprN linux-2.6.24/net/ipv4/arp.c linux-2.6.24.ovz/net/ipv4/arp.c --- linux-2.6.24/net/ipv4/arp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/arp.c 2008-03-25 18:53:59.000000000 -0500 @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -170,7 +171,7 @@ struct neigh_ops arp_broken_ops = { .queue_xmit = dev_queue_xmit, }; -struct neigh_table arp_tbl = { +struct neigh_table global_arp_tbl = { .family = AF_INET, .entry_size = sizeof(struct neighbour) + 4, .key_len = 4, @@ -179,7 +180,7 @@ struct neigh_table arp_tbl = { .proxy_redo = parp_redo, .id = "arp_cache", .parms = { - .tbl = &arp_tbl, + .tbl = &global_arp_tbl, .base_reachable_time = 30 * HZ, .retrans_time = 1 * HZ, .gc_staletime = 60 * HZ, @@ -914,9 +915,6 @@ static int arp_rcv(struct sk_buff *skb, { struct arphdr *arp; - if (dev->nd_net != &init_net) - goto freeskb; - /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, (sizeof(struct arphdr) + (2 * dev->addr_len) + @@ -963,7 +961,7 @@ static int arp_req_set(struct arpreq *r, if (mask && mask != htonl(0xFFFFFFFF)) return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data); + dev = dev_getbyhwaddr(get_exec_env()->ve_ns->net_ns, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1128,7 +1126,8 @@ int arp_ioctl(unsigned int cmd, void __u switch (cmd) { case SIOCDARP: case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); @@ -1151,7 +1150,7 @@ int arp_ioctl(unsigned int cmd, void __u rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL) + if ((dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, r.arp_dev)) == NULL) goto out; /* Mmmm... It is wrong... 
ARPHRD_NETROM==0 */ @@ -1187,9 +1186,6 @@ static int arp_netdev_event(struct notif { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); @@ -1225,18 +1221,69 @@ static struct packet_type arp_packet_typ .func = arp_rcv, }; -static int arp_proc_init(void); +static const struct file_operations arp_seq_fops; -void __init arp_init(void) +static int arp_net_init(struct net *net) { - neigh_table_init(&arp_tbl); + struct ve_struct *ve = get_exec_env(); + int err; - dev_add_pack(&arp_packet_type); - arp_proc_init(); + ve->ve_arp_tbl = kmemdup(ve0.ve_arp_tbl, sizeof(struct neigh_table), + GFP_KERNEL); + if (ve->ve_arp_tbl == NULL) + return -ENOMEM; + + ve->ve_arp_tbl->parms.tbl = ve->ve_arp_tbl; + err = neigh_table_init(ve->ve_arp_tbl); + if (err) + goto out_free; #ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, + err = neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4", NULL, NULL); + if (err) + goto out_uninit; +#endif + if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops)) + goto out_deregister; + err = 0; +out: + return err; + +out_deregister: +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&ve->ve_arp_tbl->parms); +out_uninit: +#endif + neigh_table_clear(ve->ve_arp_tbl); +out_free: + kfree(ve->ve_arp_tbl); + ve->ve_arp_tbl = NULL; + goto out; +} + +static void arp_net_exit(struct net *net) +{ + struct ve_struct *ve = get_exec_env(); + + proc_net_remove(net, "arp"); +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&ve->ve_arp_tbl->parms); #endif + neigh_table_clear(ve->ve_arp_tbl); + kfree(ve->ve_arp_tbl); + ve->ve_arp_tbl = NULL; +} + +static struct pernet_operations arp_net_ops = { + .init = arp_net_init, + .exit = arp_net_exit, +}; + +void __init arp_init(void) +{ + get_ve0()->ve_arp_tbl = &global_arp_tbl; + register_pernet_subsys(&arp_net_ops); + dev_add_pack(&arp_packet_type); register_netdevice_notifier(&arp_netdev_notifier); } @@ -1370,21 +1417,6 @@ static const struct file_operations arp_ .llseek = seq_lseek, .release = seq_release_private, }; - -static int __init arp_proc_init(void) -{ - if (!proc_net_fops_create(&init_net, "arp", S_IRUGO, &arp_seq_fops)) - return -ENOMEM; - return 0; -} - -#else /* CONFIG_PROC_FS */ - -static int __init arp_proc_init(void) -{ - return 0; -} - #endif /* CONFIG_PROC_FS */ EXPORT_SYMBOL(arp_broken_ops); @@ -1392,7 +1424,7 @@ EXPORT_SYMBOL(arp_find); EXPORT_SYMBOL(arp_create); EXPORT_SYMBOL(arp_xmit); EXPORT_SYMBOL(arp_send); -EXPORT_SYMBOL(arp_tbl); +EXPORT_SYMBOL(global_arp_tbl); #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) EXPORT_SYMBOL(clip_tbl_hook); diff -uprN linux-2.6.24/net/ipv4/devinet.c linux-2.6.24.ovz/net/ipv4/devinet.c --- linux-2.6.24/net/ipv4/devinet.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/devinet.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +73,7 @@ struct ipv4_devconf ipv4_devconf = { }, }; -static struct ipv4_devconf ipv4_devconf_dflt = { +struct ipv4_devconf ipv4_devconf_dflt = { .data = { [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, @@ -82,7 +83,7 @@ static struct ipv4_devconf ipv4_devconf_ }, }; -#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ipv4_devconf_dflt, attr) +#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ve_ipv4_devconf_dflt, attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { 
[IFA_LOCAL] = { .type = NLA_U32 }, @@ -94,8 +95,14 @@ static const struct nla_policy ifa_ipv4_ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); +#ifdef CONFIG_VE +#define ve_ipv4_devconf_dflt (*(get_exec_env()->_ipv4_devconf_dflt)) +#else +#define ve_ipv4_devconf_dflt ipv4_devconf_dflt +#endif + static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL static void devinet_sysctl_register(struct in_device *in_dev, @@ -105,9 +112,9 @@ static void devinet_sysctl_unregister(st /* Locks all the inet devices. */ -static struct in_ifaddr *inet_alloc_ifa(void) +struct in_ifaddr *inet_alloc_ifa(void) { - struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); + struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_UBC); if (ifa) { INIT_RCU_HEAD(&ifa->rcu_head); @@ -115,6 +122,7 @@ static struct in_ifaddr *inet_alloc_ifa( return ifa; } +EXPORT_SYMBOL_GPL(inet_alloc_ifa); static void inet_rcu_free_ifa(struct rcu_head *head) { @@ -147,7 +155,7 @@ void in_dev_finish_destroy(struct in_dev } } -static struct in_device *inetdev_init(struct net_device *dev) +struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; @@ -188,6 +196,7 @@ out_kfree: in_dev = NULL; goto out; } +EXPORT_SYMBOL_GPL(inetdev_init); static void in_dev_rcu_put(struct rcu_head *head) { @@ -329,7 +338,7 @@ static void __inet_del_ifa(struct in_dev inet_free_ifa(ifa1); } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); @@ -387,7 +396,7 @@ static int __inet_insert_ifa(struct in_i return 0; } -static int inet_insert_ifa(struct in_ifaddr *ifa) +int inet_insert_ifa(struct in_ifaddr *ifa) { return __inet_insert_ifa(ifa, NULL, 0); } @@ -418,7 +427,7 @@ struct in_device *inetdev_by_index(int i struct net_device *dev; struct in_device *in_dev = NULL; read_lock(&dev_base_lock); - dev = __dev_get_by_index(&init_net, ifindex); + dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifindex); if (dev) in_dev = in_dev_get(dev); read_unlock(&dev_base_lock); @@ -438,6 +447,7 @@ struct in_ifaddr *inet_ifa_byprefix(stru } endfor_ifa(in_dev); return NULL; } +EXPORT_SYMBOL_GPL(inet_insert_ifa); static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -504,7 +514,7 @@ static struct in_ifaddr *rtm_to_ifaddr(s goto errout; } - dev = __dev_get_by_index(&init_net, ifm->ifa_index); + dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifm->ifa_index); if (dev == NULL) { err = -ENODEV; goto errout; @@ -608,6 +618,7 @@ int devinet_ioctl(unsigned int cmd, void char *colon; int ret = -EFAULT; int tryaddrmatch = 0; + struct net *net = get_exec_env()->ve_ns->net_ns; /* * Fetch the caller's info block into kernel space @@ -625,7 +636,7 @@ int devinet_ioctl(unsigned int cmd, void *colon = 0; #ifdef CONFIG_KMOD - dev_load(&init_net, ifr.ifr_name); + dev_load(net, ifr.ifr_name); #endif switch (cmd) { @@ -644,7 +655,7 @@ int devinet_ioctl(unsigned int cmd, void case SIOCSIFFLAGS: ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ @@ -652,7 +663,7 @@ int devinet_ioctl(unsigned int cmd, void case SIOCSIFDSTADDR: /* Set the destination address */ case 
SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EACCES; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) @@ -666,7 +677,7 @@ int devinet_ioctl(unsigned int cmd, void rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(&init_net, ifr.ifr_name)) == NULL) + if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL) goto done; if (colon) @@ -906,7 +917,7 @@ no_in_dev: */ read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { if ((in_dev = __in_dev_get_rcu(dev)) == NULL) continue; @@ -985,7 +996,7 @@ __be32 inet_confirm_addr(const struct ne read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { if ((in_dev = __in_dev_get_rcu(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -1048,9 +1059,6 @@ static int inetdev_event(struct notifier struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - ASSERT_RTNL(); if (!in_dev) { @@ -1182,7 +1190,7 @@ static int inet_dump_ifaddr(struct sk_bu s_ip_idx = ip_idx = cb->args[1]; idx = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -1241,7 +1249,7 @@ static void devinet_copy_dflt_conf(int i struct net_device *dev; read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1326,11 +1334,10 @@ void inet_forward_change(void) struct net_device *dev; int on = IPV4_DEVCONF_ALL(FORWARDING); - IPV4_DEVCONF_ALL(ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(FORWARDING) = on; read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1489,28 +1496,19 @@ static struct devinet_sysctl_table { }, }; -static void devinet_sysctl_register(struct in_device *in_dev, - struct ipv4_devconf *p) +static struct devinet_sysctl_table *__devinet_sysctl_register(char *dev_name, + int ifindex, struct ipv4_devconf *p) { int i; - struct net_device *dev = in_dev ? 
in_dev->dev : NULL; - struct devinet_sysctl_table *t = kmemdup(&devinet_sysctl, sizeof(*t), - GFP_KERNEL); - char *dev_name = NULL; + struct devinet_sysctl_table *t; + t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); if (!t) - return; + goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; - } - - if (dev) { - dev_name = dev->name; - t->devinet_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + t->devinet_vars[i].owner_env = get_exec_env(); } /* @@ -1520,8 +1518,9 @@ static void devinet_sysctl_register(stru */ dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) - goto free; + goto out_free_table; + t->devinet_dev[0].ctl_name = ifindex; t->devinet_dev[0].procname = dev_name; t->devinet_dev[0].child = t->devinet_vars; t->devinet_conf_dir[0].child = t->devinet_dev; @@ -1530,17 +1529,37 @@ static void devinet_sysctl_register(stru t->sysctl_header = register_sysctl_table(t->devinet_root_dir); if (!t->sysctl_header) - goto free_procname; + goto out_free_procname; - p->sysctl = t; - return; + return t; /* error path */ - free_procname: +out_free_procname: kfree(dev_name); - free: +out_free_table: kfree(t); - return; +out: + return NULL; +} + +static void devinet_sysctl_register(struct in_device *in_dev, + struct ipv4_devconf *p) +{ + struct net_device *dev; + char *dev_name; + int ifindex; + + dev = in_dev ? in_dev->dev : NULL; + + if (dev) { + dev_name = dev->name; + ifindex = dev->ifindex; + } else { + dev_name = "default"; + ifindex = NET_PROTO_CONF_DEFAULT; + } + + p->sysctl = __devinet_sysctl_register(dev_name, ifindex, p); } static void devinet_sysctl_unregister(struct ipv4_devconf *p) @@ -1553,8 +1572,176 @@ static void devinet_sysctl_unregister(st kfree(t); } } + +#ifdef CONFIG_VE +static ctl_table net_sysctl_tables[] = { + /* 0: net */ + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = &net_sysctl_tables[2], + }, + { .ctl_name = 0, }, + /* 2: net/ipv4 */ + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = &net_sysctl_tables[4], + }, + { .ctl_name = 0, }, + /* 4, 5: net/ipv4/[vars] */ + { + .ctl_name = NET_IPV4_FORWARD, + .procname = "ip_forward", + .data = &ipv4_devconf.data[NET_IPV4_CONF_FORWARDING-1], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_sysctl_forward, + .strategy = &ipv4_sysctl_forward_strategy, + }, + { + .ctl_name = NET_IPV4_ROUTE, + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = &net_sysctl_tables[7], + }, + { .ctl_name = 0 }, + /* 7: net/ipv4/route/flush */ + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .data = NULL, /* setuped below */ + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { .ctl_name = 0 }, +}; + +static int ip_forward_sysctl_register(struct ve_struct *ve, + struct ipv4_devconf *p) +{ + struct ctl_table_header *hdr; + ctl_table *root, *ipv4_table, *route_table; + + root = clone_sysctl_template(net_sysctl_tables); + if (root == NULL) + goto out; + + ipv4_table = root->child->child; + ipv4_table[0].data = &p->data[NET_IPV4_CONF_FORWARDING - 1]; + + route_table = ipv4_table[1].child; + route_table[0].data = &ipv4_flush_delay; + + hdr = register_sysctl_table(root); + if (hdr == NULL) + goto out_free; + + ve->forward_header = hdr; + ve->forward_table = root; + return 0; + +out_free: + 
free_sysctl_clone(root); +out: + return -ENOMEM; +} + +static inline void ip_forward_sysctl_unregister(struct ve_struct *ve) +{ + unregister_sysctl_table(ve->forward_header); + ve->forward_header = NULL; +} + +static inline void ip_forward_sysctl_free(struct ve_struct *ve) +{ + if (ve->forward_table == NULL) + return; + + free_sysctl_clone(ve->forward_table); + ve->forward_table = NULL; +} +#endif #endif +int devinet_sysctl_init(struct ve_struct *ve) +{ + int err = 0; +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_VE + struct ipv4_devconf *conf, *conf_def; + + err = -ENOMEM; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + goto err1; + + memcpy(conf, &ipv4_devconf, sizeof(*conf)); + conf->sysctl = __devinet_sysctl_register("all", + NET_PROTO_CONF_ALL, conf); + if (!conf->sysctl) + goto err2; + + conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); + if (!conf_def) + goto err3; + + memcpy(conf_def, &ipv4_devconf_dflt, sizeof(*conf_def)); + conf_def->sysctl = __devinet_sysctl_register("default", + NET_PROTO_CONF_DEFAULT, conf_def); + if (!conf_def->sysctl) + goto err4; + + err = ip_forward_sysctl_register(ve, conf); + if (err) + goto err5; + + ve->_ipv4_devconf = conf; + ve->_ipv4_devconf_dflt = conf_def; + return 0; + +err5: + devinet_sysctl_unregister(conf_def); +err4: + kfree(conf_def); +err3: + devinet_sysctl_unregister(conf); +err2: + kfree(conf); +err1: +#endif +#endif + return err; +} + +void devinet_sysctl_fini(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_VE + ip_forward_sysctl_unregister(ve); + devinet_sysctl_unregister(ve->_ipv4_devconf); + devinet_sysctl_unregister(ve->_ipv4_devconf_dflt); +#endif +#endif +} + +void devinet_sysctl_free(struct ve_struct *ve) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_VE + ip_forward_sysctl_free(ve); + kfree(ve->_ipv4_devconf); + kfree(ve->_ipv4_devconf_dflt); +#endif +#endif +} + void __init devinet_init(void) { register_gifconf(PF_INET, inet_gifconf); @@ -1566,7 +1753,8 @@ void __init devinet_init(void) #ifdef CONFIG_SYSCTL devinet_sysctl.sysctl_header = register_sysctl_table(devinet_sysctl.devinet_root_dir); - devinet_sysctl_register(NULL, &ipv4_devconf_dflt); + __devinet_sysctl_register("default", NET_PROTO_CONF_DEFAULT, + &ipv4_devconf_dflt); #endif } @@ -1575,3 +1763,7 @@ EXPORT_SYMBOL(inet_select_addr); EXPORT_SYMBOL(inetdev_by_index); EXPORT_SYMBOL(register_inetaddr_notifier); EXPORT_SYMBOL(unregister_inetaddr_notifier); +EXPORT_SYMBOL(inet_del_ifa); +EXPORT_SYMBOL(devinet_sysctl_init); +EXPORT_SYMBOL(devinet_sysctl_fini); +EXPORT_SYMBOL(devinet_sysctl_free); diff -uprN linux-2.6.24/net/ipv4/fib_frontend.c linux-2.6.24.ovz/net/ipv4/fib_frontend.c --- linux-2.6.24/net/ipv4/fib_frontend.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/fib_frontend.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -51,13 +52,18 @@ static struct sock *fibnl; +#ifdef CONFIG_VE +#define fib_table_hash (get_exec_env()->_fib_table_hash) +#else +static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +#endif + #ifndef CONFIG_IP_MULTIPLE_TABLES -struct fib_table *ip_fib_local_table; -struct fib_table *ip_fib_main_table; +struct fib_table *__ip_fib_local_table; +struct fib_table *__ip_fib_main_table; #define FIB_TABLE_HASHSZ 1 -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; static void __init fib4_rules_init(void) { @@ -67,10 +73,10 @@ static void __init fib4_rules_init(void) hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); } 
#else - #define FIB_TABLE_HASHSZ 256 -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +#endif +#ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_table *fib_new_table(u32 id) { struct fib_table *tb; @@ -217,7 +223,8 @@ int fib_validate_source(__be32 src, __be if (fib_lookup(&fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) + if (res.type != RTN_UNICAST && + (!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL)) goto e_inval_res; *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); @@ -345,7 +352,7 @@ static int rtentry_to_fib_config(int cmd colon = strchr(devname, ':'); if (colon) *colon = 0; - dev = __dev_get_by_name(&init_net, devname); + dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; @@ -418,7 +425,7 @@ int ip_rt_ioctl(unsigned int cmd, void _ switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&rt, arg, sizeof(rt))) @@ -869,9 +876,6 @@ static int fib_netdev_event(struct notif struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2); return NOTIFY_DONE; diff -uprN linux-2.6.24/net/ipv4/fib_hash.c linux-2.6.24.ovz/net/ipv4/fib_hash.c --- linux-2.6.24/net/ipv4/fib_hash.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/fib_hash.c 2008-03-25 18:53:59.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -72,11 +73,6 @@ struct fn_zone { * can be cheaper than memory lookup, so that FZ_* macros are used. */ -struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone *fn_zone_list; -}; - static inline u32 fn_hash(__be32 key, struct fn_zone *fz) { u32 h = ntohl(key)>>(32 - fz->fz_order); @@ -620,7 +616,7 @@ static int fn_hash_delete(struct fib_tab return -ESRCH; } -static int fn_flush_list(struct fn_zone *fz, int idx) +static int fn_flush_list(struct fn_zone *fz, int idx, int destroy) { struct hlist_head *head = &fz->fz_hash[idx]; struct hlist_node *node, *n; @@ -635,7 +631,9 @@ static int fn_flush_list(struct fn_zone list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { struct fib_info *fi = fa->fa_info; - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + if (fi == NULL) + continue; + if (destroy || (fi->fib_flags&RTNH_F_DEAD)) { write_lock_bh(&fib_hash_lock); list_del(&fa->fa_list); if (list_empty(&f->fn_alias)) { @@ -657,7 +655,7 @@ static int fn_flush_list(struct fn_zone return found; } -static int fn_hash_flush(struct fib_table *tb) +static int __fn_hash_flush(struct fib_table *tb, int destroy) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; @@ -667,11 +665,85 @@ static int fn_hash_flush(struct fib_tabl int i; for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); + found += fn_flush_list(fz, i, destroy); } return found; } +static int fn_hash_flush(struct fib_table *tb) +{ + return __fn_hash_flush(tb, 0); +} + +#ifdef CONFIG_VE +static void fn_free_zones(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz; + + while ((fz = table->fn_zone_list) != NULL) { + table->fn_zone_list = fz->fz_next; + fz_hash_free(fz->fz_hash, fz->fz_divisor); + kfree(fz); + } +} + +void fib_hash_destroy(struct fib_table *tb) +{ + __fn_hash_flush(tb, 1); + fn_free_zones(tb); + 
kfree(tb); +} + +/* + * Initialization of virtualized networking subsystem. + */ +int init_ve_route(struct ve_struct *ve) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ve->_fib_table_hash); i++) + INIT_HLIST_HEAD(&ve->_fib_table_hash[i]); + +#ifdef CONFIG_IP_MULTIPLE_TABLES + return fib_rules_create(); +#else + ve->_local_table = fib_hash_init(RT_TABLE_LOCAL); + if (!ve->_local_table) + return -ENOMEM; + ve->_main_table = fib_hash_init(RT_TABLE_MAIN); + if (!ve->_main_table) { + fib_hash_destroy(ve->_local_table); + return -ENOMEM; + } + + hlist_add_head_rcu(&ve->_local_table->tb_hlist, + &ve->_fib_table_hash[0]); + hlist_add_head_rcu(&ve->_main_table->tb_hlist, + &ve->_fib_table_hash[0]); + return 0; +#endif +} + +void fini_ve_route(struct ve_struct *ve) +{ + unsigned int bytes; +#ifdef CONFIG_IP_MULTIPLE_TABLES + fib_rules_destroy(); +#else + fib_hash_destroy(ve->_local_table); + fib_hash_destroy(ve->_main_table); +#endif + bytes = ve->_fib_hash_size * sizeof(struct hlist_head *); + fib_hash_free(ve->_fib_info_hash, bytes); + fib_hash_free(ve->_fib_info_laddrhash, bytes); + ve->_fib_info_hash = ve->_fib_info_laddrhash = NULL; +} + +EXPORT_SYMBOL(init_ve_route); +EXPORT_SYMBOL(fini_ve_route); +#endif + static inline int fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, @@ -761,7 +833,7 @@ static int fn_hash_dump(struct fib_table return skb->len; } -#ifdef CONFIG_IP_MULTIPLE_TABLES +#if defined(CONFIG_IP_MULTIPLE_TABLES) || defined(CONFIG_VE) struct fib_table * fib_hash_init(u32 id) #else struct fib_table * __init fib_hash_init(u32 id) @@ -771,14 +843,14 @@ struct fib_table * __init fib_hash_init( if (fn_hash_kmem == NULL) fn_hash_kmem = kmem_cache_create("ip_fib_hash", - sizeof(struct fib_node), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fib_node), 0, + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (fn_alias_kmem == NULL) fn_alias_kmem = kmem_cache_create("ip_fib_alias", - sizeof(struct fib_alias), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fib_alias), 0, + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), @@ -1052,13 +1124,28 @@ static const struct file_operations fib_ .release = seq_release_private, }; -int __init fib_proc_init(void) +static int fib_proc_net_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops)) + if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) return -ENOMEM; return 0; } +static void fib_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "route"); +} + +static struct pernet_operations fib_proc_net_ops = { + .init = fib_proc_net_init, + .exit = fib_proc_net_exit, +}; + +int __init fib_proc_init(void) +{ + return register_pernet_subsys(&fib_proc_net_ops); +} + void __init fib_proc_exit(void) { proc_net_remove(&init_net, "route"); diff -uprN linux-2.6.24/net/ipv4/fib_lookup.h linux-2.6.24.ovz/net/ipv4/fib_lookup.h --- linux-2.6.24/net/ipv4/fib_lookup.h 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/fib_lookup.h 2008-03-25 18:53:59.000000000 -0500 @@ -37,5 +37,6 @@ extern struct fib_alias *fib_find_alias( extern int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int *dflt); +void fib_hash_free(struct hlist_head *hash, int bytes); #endif /* _FIB_LOOKUP_H */ diff -uprN linux-2.6.24/net/ipv4/fib_rules.c linux-2.6.24.ovz/net/ipv4/fib_rules.c --- linux-2.6.24/net/ipv4/fib_rules.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/fib_rules.c 2008-03-25 18:53:59.000000000 -0500 
@@ -49,6 +49,85 @@ struct fib4_rule #endif }; +#ifdef CONFIG_VE +#define local_rule (*(get_exec_env()->_local_rule)) +#define fib4_rules (get_exec_env()->_fib_rules) +#else +#define local_rule loc_rule +static LIST_HEAD(fib4_rules); +#endif + +#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) +#ifdef CONFIG_VE +static inline void init_rule_struct(struct fib_rule *r, + u32 pref, unsigned char table, unsigned char action) +{ + memset(r, 0, sizeof(struct fib_rule)); + atomic_set(&r->refcnt, 1); + r->pref = pref; + r->table = table; + r->action = action; +} +#endif + +int fib_rules_create(void) +{ +#ifdef CONFIG_VE + struct fib_rule *default_rule, *main_rule, *loc_rule; + + default_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL_UBC); + if (default_rule == NULL) + goto out_def; + + main_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL_UBC); + if (main_rule == NULL) + goto out_main; + + loc_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL_UBC); + if (loc_rule == NULL) + goto out_loc; + + init_rule_struct(default_rule, 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST); + init_rule_struct(main_rule, 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST); + init_rule_struct(loc_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST); + + INIT_LIST_HEAD(&fib4_rules); + list_add_tail(&loc_rule->list, &fib4_rules); + list_add_tail(&main_rule->list, &fib4_rules); + list_add_tail(&default_rule->list, &fib4_rules); + get_exec_env()->_local_rule = loc_rule; + + return 0; + +out_loc: + kfree(main_rule); +out_main: + kfree(default_rule); +out_def: + return -1; +#else + return 0; +#endif +} + +void fib_rules_destroy(void) +{ +#ifdef CONFIG_VE + struct fib_rule *r; + struct list_head *pos, *tmp; + + rtnl_lock(); + list_for_each_safe (pos, tmp, &fib4_rules) { + r = list_entry(pos, struct fib_rule, list); + + list_del_rcu(pos); + fib_rule_put(r); + } + rtnl_unlock(); +#endif +} +#endif + #ifdef CONFIG_NET_CLS_ROUTE u32 fib_rules_tclass(struct fib_result *res) { diff -uprN linux-2.6.24/net/ipv4/fib_semantics.c linux-2.6.24.ovz/net/ipv4/fib_semantics.c --- linux-2.6.24/net/ipv4/fib_semantics.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/fib_semantics.c 2008-03-25 18:53:59.000000000 -0500 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include @@ -55,6 +57,24 @@ static struct hlist_head *fib_info_laddr static unsigned int fib_hash_size; static unsigned int fib_info_cnt; +void prepare_fib_info(void) +{ +#ifdef CONFIG_VE + get_ve0()->_fib_info_hash = fib_info_hash; + get_ve0()->_fib_info_laddrhash = fib_info_laddrhash; + get_ve0()->_fib_hash_size = fib_hash_size; + get_ve0()->_fib_info_cnt = fib_info_cnt; +#endif +} + +#ifdef CONFIG_VE +#define fib_info_hash (get_exec_env()->_fib_info_hash) +#define fib_info_laddrhash (get_exec_env()->_fib_info_laddrhash) +#define fib_hash_size (get_exec_env()->_fib_hash_size) +#define fib_info_cnt (get_exec_env()->_fib_info_cnt) +#endif + + #define DEVINDEX_HASHBITS 8 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; @@ -234,13 +254,15 @@ static struct fib_info *fib_find_info(co return NULL; } -static inline unsigned int fib_devindex_hashfn(unsigned int val) +static inline unsigned int fib_devindex_hashfn(unsigned int val, + envid_t veid) { unsigned int mask = DEVINDEX_HASHSIZE - 1; return (val ^ (val >> DEVINDEX_HASHBITS) ^ - (val >> (DEVINDEX_HASHBITS * 2))) & mask; + (val >> (DEVINDEX_HASHBITS * 2)) ^ + (veid ^ (veid >> 
16))) & mask; } /* Check, that the gateway is already configured. @@ -256,7 +278,7 @@ int ip_fib_check_default(__be32 gw, stru spin_lock(&fib_info_lock); - hash = fib_devindex_hashfn(dev->ifindex); + hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); head = &fib_info_devhash[hash]; hlist_for_each_entry(nh, node, head, nh_hash) { if (nh->nh_dev == dev && @@ -533,7 +555,7 @@ static int fib_check_nh(struct fib_confi return -EINVAL; if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) + if ((dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, nh->nh_oif)) == NULL) return -ENODEV; if (!(dev->flags&IFF_UP)) return -ENETDOWN; @@ -611,7 +633,7 @@ static struct hlist_head *fib_hash_alloc __get_free_pages(GFP_KERNEL, get_order(bytes)); } -static void fib_hash_free(struct hlist_head *hash, int bytes) +void fib_hash_free(struct hlist_head *hash, int bytes) { if (!hash) return; @@ -799,7 +821,7 @@ struct fib_info *fib_create_info(struct if (nhs != 1 || nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); + nh->nh_dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, fi->fib_nh->nh_oif); err = -ENODEV; if (nh->nh_dev == NULL) goto failure; @@ -842,7 +864,8 @@ link_it: if (!nh->nh_dev) continue; - hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nh->nh_dev->ifindex, + VEID(nh->nh_dev->owner_env)); head = &fib_info_devhash[hash]; hlist_add_head(&nh->nh_hash, head); } endfor_nexthops(fi) @@ -1054,7 +1077,8 @@ int fib_sync_down(__be32 local, struct n if (dev) { struct fib_info *prev_fi = NULL; - unsigned int hash = fib_devindex_hashfn(dev->ifindex); + unsigned int hash = fib_devindex_hashfn(dev->ifindex, + VEID(dev->owner_env)); struct hlist_head *head = &fib_info_devhash[hash]; struct hlist_node *node; struct fib_nh *nh; @@ -1119,7 +1143,7 @@ int fib_sync_up(struct net_device *dev) return 0; prev_fi = NULL; - hash = fib_devindex_hashfn(dev->ifindex); + hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); head = &fib_info_devhash[hash]; ret = 0; diff -uprN linux-2.6.24/net/ipv4/icmp.c linux-2.6.24.ovz/net/ipv4/icmp.c --- linux-2.6.24/net/ipv4/icmp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/icmp.c 2008-03-25 18:53:59.000000000 -0500 @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -513,7 +514,7 @@ void icmp_send(struct sk_buff *skb_in, i struct net_device *dev = NULL; if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(&init_net, rt->fl.iif); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, rt->fl.iif); if (dev) { saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); diff -uprN linux-2.6.24/net/ipv4/igmp.c linux-2.6.24.ovz/net/ipv4/igmp.c --- linux-2.6.24/net/ipv4/igmp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/igmp.c 2008-03-25 18:53:59.000000000 -0500 @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -2291,8 +2292,9 @@ static inline struct ip_mc_list *igmp_mc struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, state->dev) { struct in_device *in_dev; + in_dev = in_dev_get(state->dev); if (!in_dev) continue; @@ -2438,8 +2440,9 @@ static inline struct ip_sf_list *igmp_mc state->idev = NULL; state->im = NULL; - for_each_netdev(&init_net, 
state->dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, state->dev) { struct in_device *idev; + idev = in_dev_get(state->dev); if (unlikely(idev == NULL)) continue; @@ -2581,11 +2584,34 @@ static const struct file_operations igmp .release = seq_release_private, }; -int __init igmp_mc_proc_init(void) +static int igmp_net_init(struct net *net) { - proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops); - proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + if (!proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops)) + goto out_igmp; + if (!proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops)) + goto out_mcfilter; return 0; + +out_mcfilter: + proc_net_remove(net, "igmp"); +out_igmp: + return -ENOMEM; +} + +static void igmp_net_exit(struct net *net) +{ + proc_net_remove(net, "igmp"); + proc_net_remove(net, "mcfilter"); +} + +static struct pernet_operations igmp_net_ops = { + .init = igmp_net_init, + .exit = igmp_net_exit, +}; + +int __init igmp_mc_proc_init(void) +{ + return register_pernet_subsys(&igmp_net_ops); } #endif diff -uprN linux-2.6.24/net/ipv4/inet_connection_sock.c linux-2.6.24.ovz/net/ipv4/inet_connection_sock.c --- linux-2.6.24/net/ipv4/inet_connection_sock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/inet_connection_sock.c 2008-03-25 18:53:59.000000000 -0500 @@ -24,6 +24,9 @@ #include #include +#include +#include + #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); @@ -58,6 +61,7 @@ int inet_csk_bind_conflict(const struct sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && + ve_accessible_strict(sk->owner_env, sk2->owner_env) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -87,7 +91,9 @@ int inet_csk_get_port(struct inet_hashin struct hlist_node *node; struct inet_bind_bucket *tb; int ret; + struct ve_struct *env; + env = sk->owner_env; local_bh_disable(); if (!snum) { int remaining, rover, low, high; @@ -97,11 +103,15 @@ int inet_csk_get_port(struct inet_hashin rover = net_random() % remaining + low; do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(rover, + hashinfo->bhash_size, VEID(env))]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (!ve_accessible_strict(tb->owner_env, env)) + continue; if (tb->port == rover) goto next; + } break; next: spin_unlock(&head->lock); @@ -124,11 +134,15 @@ int inet_csk_get_port(struct inet_hashin */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(snum, + hashinfo->bhash_size, VEID(env))]; spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (!ve_accessible_strict(tb->owner_env, env)) + continue; if (tb->port == snum) goto tb_found; + } } tb = NULL; goto tb_not_found; @@ -147,7 +161,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum, env)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -555,7 +569,7 @@ void inet_csk_destroy_sock(struct sock * 
sk_refcnt_debug_release(sk); - atomic_dec(sk->sk_prot->orphan_count); + ub_dec_orphan_count(sk); sock_put(sk); } @@ -635,7 +649,7 @@ void inet_csk_listen_stop(struct sock *s sock_orphan(child); - atomic_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); inet_csk_destroy_sock(child); diff -uprN linux-2.6.24/net/ipv4/inet_diag.c linux-2.6.24.ovz/net/ipv4/inet_diag.c --- linux-2.6.24/net/ipv4/inet_diag.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/inet_diag.c 2008-03-25 18:53:59.000000000 -0500 @@ -706,7 +706,9 @@ static int inet_diag_dump(struct sk_buff struct inet_diag_req *r = NLMSG_DATA(cb->nlh); const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; + struct ve_struct *ve; + ve = get_exec_env(); handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); if (!handler) goto no_handler; @@ -729,6 +731,8 @@ static int inet_diag_dump(struct sk_buff sk_for_each(sk, node, &hashinfo->listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) { num++; continue; @@ -790,6 +794,8 @@ skip_listen_ht: sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible(sk->owner_env, ve)) + continue; if (num < s_num) goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) @@ -814,6 +820,8 @@ next_normal: inet_twsk_for_each(tw, node, &head->twchain) { + if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) + continue; if (num < s_num) goto next_dying; if (r->id.idiag_sport != tw->tw_sport && diff -uprN linux-2.6.24/net/ipv4/inet_fragment.c linux-2.6.24.ovz/net/ipv4/inet_fragment.c --- linux-2.6.24/net/ipv4/inet_fragment.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/inet_fragment.c 2008-03-25 18:53:59.000000000 -0500 @@ -224,7 +224,9 @@ static struct inet_frag_queue *inet_frag setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - +#ifdef CONFIG_VE + q->owner_ve = get_exec_env(); +#endif return q; } diff -uprN linux-2.6.24/net/ipv4/inet_hashtables.c linux-2.6.24.ovz/net/ipv4/inet_hashtables.c --- linux-2.6.24/net/ipv4/inet_hashtables.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/inet_hashtables.c 2008-03-25 18:53:59.000000000 -0500 @@ -29,7 +29,8 @@ */ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + struct ve_struct *ve) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); @@ -37,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucke tb->port = snum; tb->fastreuse = 0; INIT_HLIST_HEAD(&tb->owners); + tb->owner_env = ve; hlist_add_head(&tb->node, &head->chain); } return tb; @@ -66,10 +68,13 @@ void inet_bind_hash(struct sock *sk, str */ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) { - const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); - struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + int bhash; + struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; + bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size, + VEID(sk->owner_env)); + head = &hashinfo->bhash[bhash]; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -132,10 +137,14 @@ static struct sock *inet_lookup_listener struct sock *result = NULL, *sk; const struct hlist_node *node; int hiscore = -1; + struct ve_struct *env; + env = get_exec_env(); 
sk_for_each(sk, node, head) { const struct inet_sock *inet = inet_sk(sk); + if (!ve_accessible_strict(sk->owner_env, env)) + continue; if (inet->num == hnum && !ipv6_only_sock(sk)) { const __be32 rcv_saddr = inet->rcv_saddr; int score = sk->sk_family == PF_INET ? 1 : 0; @@ -168,13 +177,16 @@ struct sock *__inet_lookup_listener(stru { struct sock *sk = NULL; const struct hlist_head *head; + struct ve_struct *env; + env = get_exec_env(); read_lock(&hashinfo->lhash_lock); - head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]; if (!hlist_empty(head)) { const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); if (inet->num == hnum && !sk->sk_node.next && + ve_accessible_strict(sk->owner_env, env) && (!inet->rcv_saddr || inet->rcv_saddr == daddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && !sk->sk_bound_dev_if) @@ -193,7 +205,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_listener /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + struct ve_struct *ve) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -202,7 +215,7 @@ static int __inet_check_established(stru int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve)); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; @@ -216,7 +229,8 @@ static int __inet_check_established(stru sk_for_each(sk2, node, &head->twchain) { tw = inet_twsk(sk2); - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, + ports, dif, ve)) { if (twsk_unique(sk, sk2, twp)) goto unique; else @@ -227,7 +241,8 @@ static int __inet_check_established(stru /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, + ports, dif, ve)) goto not_unique; } @@ -278,7 +293,9 @@ int inet_hash_connect(struct inet_timewa struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; + struct ve_struct *ve; + ve = sk->owner_env; if (!snum) { int i, remaining, low, high, port; static u32 hint; @@ -292,7 +309,7 @@ int inet_hash_connect(struct inet_timewa local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size, VEID(ve))]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -300,19 +317,21 @@ int inet_hash_connect(struct inet_timewa * unique enough. 
*/ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { + if (tb->port == port && + ve_accessible_strict(tb->owner_env, ve)) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; if (!__inet_check_established(death_row, sk, port, - &tw)) + &tw, ve)) goto ok; goto next_port; } } - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port, ve); if (!tb) { spin_unlock(&head->lock); break; @@ -320,7 +339,7 @@ int inet_hash_connect(struct inet_timewa tb->fastreuse = -1; goto ok; - next_port: + next_port: spin_unlock(&head->lock); } local_bh_enable(); @@ -347,7 +366,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -357,7 +376,7 @@ ok: } else { spin_unlock(&head->lock); /* No definite answer... Walk to established hash table */ - ret = __inet_check_established(death_row, sk, snum, NULL); + ret = __inet_check_established(death_row, sk, snum, NULL, ve); out: local_bh_enable(); return ret; diff -uprN linux-2.6.24/net/ipv4/inet_timewait_sock.c linux-2.6.24.ovz/net/ipv4/inet_timewait_sock.c --- linux-2.6.24/net/ipv4/inet_timewait_sock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/inet_timewait_sock.c 2008-03-25 18:53:59.000000000 -0500 @@ -13,6 +13,8 @@ #include #include +#include + /* Must be called with locally disabled BHs. */ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) @@ -32,7 +34,8 @@ static void __inet_twsk_kill(struct inet write_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, + hashinfo->bhash_size, tw->tw_owner_env)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); @@ -65,7 +68,8 @@ void __inet_twsk_hashdance(struct inet_t Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. 
*/ - bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, + hashinfo->bhash_size, tw->tw_owner_env)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; BUG_TRAP(icsk->icsk_bind_hash); @@ -89,9 +93,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance) struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = - kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, - GFP_ATOMIC); + struct user_beancounter *ub; + struct inet_timewait_sock *tw; + + ub = set_exec_ub(sock_bc(sk)->ub); + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + GFP_ATOMIC); + (void)set_exec_ub(ub); + if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); @@ -139,6 +148,7 @@ static int inet_twdr_do_twkill_work(stru rescan: inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); + ub_timewait_dec(tw, twdr); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); inet_twsk_put(tw); @@ -237,6 +247,7 @@ void inet_twsk_deschedule(struct inet_ti { spin_lock(&twdr->death_lock); if (inet_twsk_del_dead_node(tw)) { + ub_timewait_dec(tw, twdr); inet_twsk_put(tw); if (--twdr->tw_count == 0) del_timer(&twdr->tw_timer); @@ -283,9 +294,10 @@ void inet_twsk_schedule(struct inet_time spin_lock(&twdr->death_lock); /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) + if (inet_twsk_del_dead_node(tw)) { + ub_timewait_dec(tw, twdr); twdr->tw_count--; - else + } else atomic_inc(&tw->tw_refcnt); if (slot >= INET_TWDR_RECYCLE_SLOTS) { @@ -321,6 +333,7 @@ void inet_twsk_schedule(struct inet_time hlist_add_head(&tw->tw_death_node, list); + ub_timewait_inc(tw, twdr); if (twdr->tw_count++ == 0) mod_timer(&twdr->tw_timer, jiffies + twdr->period); spin_unlock(&twdr->death_lock); @@ -355,6 +368,7 @@ void inet_twdr_twcal_tick(unsigned long &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); + ub_timewait_dec(tw, twdr); inet_twsk_put(tw); killed++; } diff -uprN linux-2.6.24/net/ipv4/ip_forward.c linux-2.6.24.ovz/net/ipv4/ip_forward.c --- linux-2.6.24/net/ipv4/ip_forward.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ip_forward.c 2008-03-25 18:53:59.000000000 -0500 @@ -93,6 +93,24 @@ int ip_forward(struct sk_buff *skb) goto drop; } + /* + * We try to optimize forwarding of VE packets: + * do not decrement TTL (and so save skb_cow) + * during forwarding of outgoing pkts from VE. + * For incoming pkts we still do ttl decr, + * since such skb is not cloned and does not require + * actual cow. So, there is at least one place + * in pkts path with mandatory ttl decr, that is + * sufficient to prevent routing loops. + */ + iph = ip_hdr(skb); + if ( +#ifdef CONFIG_IP_ROUTE_NAT + (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */ +#endif /* and */ + (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */ + goto no_ttl_decr; + /* We are about to mangle packet. Copy it! */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; @@ -101,6 +119,8 @@ int ip_forward(struct sk_buff *skb) /* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); +no_ttl_decr: + /* * We now generate an ICMP HOST REDIRECT giving the route * we calculated. 
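/*
 * Illustrative sketch only, not part of the patch: the save/switch/restore
 * beancounter pattern used by inet_twsk_alloc() above.  The timewait object
 * is charged to the beancounter that owns the parent socket by temporarily
 * switching the execution UB around the allocation, then the previous UB is
 * restored.  ve_charged_alloc() is a hypothetical wrapper that only isolates
 * the pattern.
 */
static void *ve_charged_alloc(struct sock *sk, struct kmem_cache *cachep)
{
	struct user_beancounter *ub;
	void *obj;

	ub = set_exec_ub(sock_bc(sk)->ub);	/* charge to the socket's UB */
	obj = kmem_cache_alloc(cachep, GFP_ATOMIC);
	(void)set_exec_ub(ub);			/* restore the previous UB */

	return obj;
}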
diff -uprN linux-2.6.24/net/ipv4/ip_fragment.c linux-2.6.24.ovz/net/ipv4/ip_fragment.c --- linux-2.6.24/net/ipv4/ip_fragment.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ip_fragment.c 2008-03-25 18:53:59.000000000 -0500 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -138,7 +139,8 @@ static int ip4_frag_match(struct inet_fr qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user); + qp->user == arg->user && + q->owner_ve == get_exec_env()); } /* Memory Tracking Functions. */ @@ -206,9 +208,12 @@ static void ip_evictor(void) */ static void ip_expire(unsigned long arg) { + struct inet_frag_queue *q = (struct inet_frag_queue *)arg; struct ipq *qp; + struct ve_struct *old_ve; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(q, struct ipq, q); + old_ve = set_exec_env(q->owner_ve); spin_lock(&qp->q.lock); @@ -223,7 +228,7 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in&FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) { + if ((head->dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); dev_put(head->dev); } @@ -231,6 +236,8 @@ static void ip_expire(unsigned long arg) out: spin_unlock(&qp->q.lock); ipq_put(qp); + + (void)set_exec_env(old_ve); } /* Find the correct entry in the "incomplete datagrams" queue for @@ -535,6 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, clone->csum = 0; clone->ip_summed = head->ip_summed; atomic_add(clone->truesize, &ip4_frags.mem); + clone->owner_env = head->owner_env; } skb_shinfo(head)->frag_list = head->next; @@ -607,6 +615,49 @@ int ip_defrag(struct sk_buff *skb, u32 u return -ENOMEM; } +#ifdef CONFIG_VE +/* XXX */ +void ip_fragment_cleanup(struct ve_struct *ve) +{ + int i, progress; + + /* All operations with fragment queues are performed from NET_RX/TX + * soft interrupts or from timer context. 
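/*
 * Illustrative sketch only, not part of the patch: the enter/restore VE
 * context pattern used by ip_expire() above.  Deferred work such as a
 * fragment timer fires in an arbitrary context, so the handler switches to
 * the VE that owns the queue before doing anything VE-visible (device
 * lookup, icmp_send()) and restores the old context on exit.
 * ve_timer_handler() is a hypothetical skeleton, not kernel code.
 */
static void ve_timer_handler(struct inet_frag_queue *q)
{
	struct ve_struct *old_ve;

	old_ve = set_exec_env(q->owner_ve);	/* enter the owning VE */

	/* ... per-VE work: dev_get_by_index(), icmp_send(), etc. ... */

	(void)set_exec_env(old_ve);		/* restore previous context */
}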
--Den */ + local_bh_disable(); + do { + progress = 0; + for (i = 0; i < INETFRAGS_HASHSZ; i++) { + struct ipq *qp; + struct hlist_node *p, *n; + + if (hlist_empty(&ip4_frags.hash[i])) + continue; +inner_restart: + read_lock(&ip4_frags.lock); + hlist_for_each_entry_safe(qp, p, n, + &ip4_frags.hash[i], q.list) { + if (!ve_accessible_strict(qp->q.owner_ve, ve)) + continue; + atomic_inc(&qp->q.refcnt); + read_unlock(&ip4_frags.lock); + + spin_lock(&qp->q.lock); + if (!(qp->q.last_in & COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->q.lock); + + ipq_put(qp); + progress = 1; + goto inner_restart; + } + read_unlock(&ip4_frags.lock); + } + } while (progress); + local_bh_enable(); +} +EXPORT_SYMBOL(ip_fragment_cleanup); +#endif + void __init ipfrag_init(void) { ip4_frags.ctl = &ip4_frags_ctl; diff -uprN linux-2.6.24/net/ipv4/ip_gre.c linux-2.6.24.ovz/net/ipv4/ip_gre.c --- linux-2.6.24/net/ipv4/ip_gre.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ip_gre.c 2008-03-25 18:53:59.000000000 -0500 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -262,7 +263,7 @@ static struct ip_tunnel * ipgre_tunnel_l int i; for (i=1; i<100; i++) { sprintf(name, "gre%d", i); - if (__dev_get_by_name(&init_net, name) == NULL) + if (__dev_get_by_name(get_exec_env()->ve_ns->net_ns, name) == NULL) break; } if (i==100) @@ -1210,7 +1211,7 @@ static int ipgre_tunnel_init(struct net_ } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + tdev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len; diff -uprN linux-2.6.24/net/ipv4/ip_input.c linux-2.6.24.ovz/net/ipv4/ip_input.c --- linux-2.6.24/net/ipv4/ip_input.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ip_input.c 2008-03-25 18:53:59.000000000 -0500 @@ -197,6 +197,8 @@ int ip_call_ra_chain(struct sk_buff *skb static int ip_local_deliver_finish(struct sk_buff *skb) { + if (skb->destructor) + skb_orphan(skb); __skb_pull(skb, ip_hdrlen(skb)); /* Point into the IP datagram, just past the header. */ @@ -380,9 +382,6 @@ int ip_rcv(struct sk_buff *skb, struct n struct iphdr *iph; u32 len; - if (dev->nd_net != &init_net) - goto drop; - /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ diff -uprN linux-2.6.24/net/ipv4/ip_output.c linux-2.6.24.ovz/net/ipv4/ip_output.c --- linux-2.6.24/net/ipv4/ip_output.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ip_output.c 2008-03-25 18:53:59.000000000 -0500 @@ -1342,12 +1342,13 @@ void ip_send_reply(struct sock *sk, stru char data[40]; } replyopts; struct ipcm_cookie ipc; - __be32 daddr; + __be32 saddr, daddr; struct rtable *rt = (struct rtable*)skb->dst; if (ip_options_echo(&replyopts.opt, skb)) return; + saddr = ip_hdr(skb)->daddr; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; @@ -1362,7 +1363,7 @@ void ip_send_reply(struct sock *sk, stru struct flowi fl = { .oif = arg->bound_dev_if, .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = rt->rt_spec_dst, + .saddr = saddr, .tos = RT_TOS(ip_hdr(skb)->tos) } }, /* Not quite clean, but right. 
*/ .uli_u = { .ports = diff -uprN linux-2.6.24/net/ipv4/ip_sockglue.c linux-2.6.24.ovz/net/ipv4/ip_sockglue.c --- linux-2.6.24/net/ipv4/ip_sockglue.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ip_sockglue.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -515,7 +516,8 @@ static int do_ip_setsockopt(struct sock val |= inet->tos & 3; } if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && - !capable(CAP_NET_ADMIN)) { + !capable(CAP_NET_ADMIN) && + !capable(CAP_VE_NET_ADMIN)) { err = -EPERM; break; } @@ -600,7 +602,7 @@ static int do_ip_setsockopt(struct sock dev_put(dev); } } else - dev = __dev_get_by_index(&init_net, mreq.imr_ifindex); + dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, mreq.imr_ifindex); err = -EADDRNOTAVAIL; diff -uprN linux-2.6.24/net/ipv4/ipconfig.c linux-2.6.24.ovz/net/ipv4/ipconfig.c --- linux-2.6.24/net/ipv4/ipconfig.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ipconfig.c 2008-03-25 18:53:59.000000000 -0500 @@ -185,19 +185,20 @@ static int __init ic_open_devs(void) struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; + struct net *net = get_exec_env()->ve_ns->net_ns; last = &ic_first_dev; rtnl_lock(); /* bring loopback device up first */ - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if (!(dev->flags & IFF_LOOPBACK)) continue; if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); } - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if (dev->flags & IFF_LOOPBACK) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : @@ -430,9 +431,6 @@ ic_rarp_recv(struct sk_buff *skb, struct unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; - if (dev->nd_net != &init_net) - goto drop; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -842,9 +840,6 @@ static int __init ic_bootp_recv(struct s struct ic_device *d; int len, ext_len; - if (dev->nd_net != &init_net) - goto drop; - /* Perform verifications before taking the lock. 
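/*
 * Illustrative sketch only, not part of the patch: the relaxed capability
 * test used in do_ip_setsockopt() above.  Inside a container the global
 * CAP_NET_ADMIN is not granted, so privileged-but-safe socket options are
 * additionally allowed under the VE-scoped CAP_VE_NET_ADMIN.
 * ve_may_admin_net() is a hypothetical helper that only names the pattern.
 */
static inline int ve_may_admin_net(void)
{
	return capable(CAP_NET_ADMIN) || capable(CAP_VE_NET_ADMIN);
}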
*/ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; diff -uprN linux-2.6.24/net/ipv4/ipip.c linux-2.6.24.ovz/net/ipv4/ipip.c --- linux-2.6.24/net/ipv4/ipip.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ipip.c 2008-03-25 18:53:59.000000000 -0500 @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -225,7 +226,7 @@ static struct ip_tunnel * ipip_tunnel_lo int i; for (i=1; i<100; i++) { sprintf(name, "tunl%d", i); - if (__dev_get_by_name(&init_net, name) == NULL) + if (__dev_get_by_name(get_exec_env()->ve_ns->net_ns, name) == NULL) break; } if (i==100) @@ -820,7 +821,7 @@ static int ipip_tunnel_init(struct net_d } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + tdev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); diff -uprN linux-2.6.24/net/ipv4/ipmr.c linux-2.6.24.ovz/net/ipv4/ipmr.c --- linux-2.6.24/net/ipv4/ipmr.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ipmr.c 2008-03-25 18:53:59.000000000 -0500 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -123,9 +124,10 @@ static struct timer_list ipmr_expire_tim static struct net_device *ipmr_new_tunnel(struct vifctl *v) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct net_device *dev; - dev = __dev_get_by_name(&init_net, "tunl0"); + dev = __dev_get_by_name(net, "tunl0"); if (dev) { int err; @@ -149,7 +151,7 @@ struct net_device *ipmr_new_tunnel(struc dev = NULL; - if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { + if (err == 0 && (dev = __dev_get_by_name(net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); @@ -1087,9 +1089,6 @@ static int ipmr_device_event(struct noti struct vif_device *v; int ct; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v=&vif_table[0]; diff -uprN linux-2.6.24/net/ipv4/ipvs/ip_vs_conn.c linux-2.6.24.ovz/net/ipv4/ipvs/ip_vs_conn.c --- linux-2.6.24/net/ipv4/ipvs/ip_vs_conn.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ipvs/ip_vs_conn.c 2008-03-25 18:53:59.000000000 -0500 @@ -920,7 +920,7 @@ int ip_vs_conn_init(void) /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); if (!ip_vs_conn_cachep) { vfree(ip_vs_conn_tab); return -ENOMEM; diff -uprN linux-2.6.24/net/ipv4/ipvs/ip_vs_core.c linux-2.6.24.ovz/net/ipv4/ipvs/ip_vs_core.c --- linux-2.6.24/net/ipv4/ipvs/ip_vs_core.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ipvs/ip_vs_core.c 2008-03-25 18:53:59.000000000 -0500 @@ -909,6 +909,10 @@ ip_vs_in(unsigned int hooknum, struct sk * Big tappo: only PACKET_HOST (neither loopback nor mcasts) * ... don't know why 1st test DOES NOT include 2nd (?) */ + /* + * VZ: the question above is right. + * The second test is superfluous. 
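/*
 * Illustrative sketch only, not part of the patch: how a slab cache is made
 * accountable to user beancounters, as done for ip_vs_conn above.  Passing
 * SLAB_UBC charges every object allocated from the cache to the beancounter
 * of the current execution context, so one container cannot exhaust the
 * cache for everyone.  "demo_cachep" is a hypothetical cache name.
 */
static struct kmem_cache *demo_cachep;

static int demo_cache_init(void)
{
	demo_cachep = kmem_cache_create("demo_cache",
			sizeof(struct ip_vs_conn), 0,
			SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}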
+ */ if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", diff -uprN linux-2.6.24/net/ipv4/ipvs/ip_vs_sync.c linux-2.6.24.ovz/net/ipv4/ipvs/ip_vs_sync.c --- linux-2.6.24/net/ipv4/ipvs/ip_vs_sync.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/ipvs/ip_vs_sync.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -404,7 +405,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev; struct inet_sock *inet = inet_sk(sk); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + if ((dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, ifname)) == NULL) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -425,11 +426,12 @@ static int set_mcast_if(struct sock *sk, */ static int set_sync_mesg_maxlen(int sync_state) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct net_device *dev; int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - @@ -440,7 +442,7 @@ static int set_sync_mesg_maxlen(int sync IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL) return -ENODEV; sync_recv_mesg_maxlen = dev->mtu - @@ -468,7 +470,7 @@ join_mcast_group(struct sock *sk, struct memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + if ((dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, ifname)) == NULL) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -489,7 +491,7 @@ static int bind_mcastif_addr(struct sock __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + if ((dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, ifname)) == NULL) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); diff -uprN linux-2.6.24/net/ipv4/netfilter/ip_queue.c linux-2.6.24.ovz/net/ipv4/netfilter/ip_queue.c --- linux-2.6.24/net/ipv4/netfilter/ip_queue.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ip_queue.c 2008-03-25 18:53:59.000000000 -0500 @@ -500,7 +500,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -530,8 +530,12 @@ __ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_skb(struct sk_buff *skb) { + struct ve_struct *old_ve; + mutex_lock(&ipqnl_mutex); + old_ve = set_exec_env(skb->owner_env); __ipq_rcv_skb(skb); + (void)set_exec_env(old_ve); mutex_unlock(&ipqnl_mutex); } @@ -541,9 +545,6 @@ ipq_rcv_dev_event(struct notifier_block { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -563,7 +564,7 @@ ipq_rcv_nl_event(struct notifier_block * if (event == NETLINK_URELEASE && n->protocol == 
NETLINK_FIREWALL && n->pid) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if (n->pid == peer_pid) __ipq_reset(); write_unlock_bh(&queue_lock); } diff -uprN linux-2.6.24/net/ipv4/netfilter/ip_tables.c linux-2.6.24.ovz/net/ipv4/netfilter/ip_tables.c --- linux-2.6.24/net/ipv4/netfilter/ip_tables.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ip_tables.c 2008-03-25 18:53:59.000000000 -0500 @@ -23,9 +23,11 @@ #include #include #include +#include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -482,8 +484,8 @@ mark_source_chains(struct xt_table_info int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + ve_printk(VE_LOG, "iptables: loop hook %u pos " + "%u %08X.\n", hook, pos, e->comefrom); return 0; } e->comefrom @@ -504,6 +506,13 @@ mark_source_chains(struct xt_table_info return 0; } + if (t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " + "negative verdict (%i)\n", + t->verdict); + return 0; + } + /* Return: backtrack through the last big jump. */ do { @@ -925,7 +934,7 @@ static inline struct xt_counters * alloc (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = ub_vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1254,7 +1263,7 @@ __do_replace(const char *name, unsigned void *loc_cpu_old_entry; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1433,7 +1442,7 @@ do_add_counters(void __user *user, unsig if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = ub_vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1778,18 +1787,18 @@ translate_compat_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (j = 0; j < NF_IP_NUMHOOKS; j++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(valid_hooks & (1 << j))) continue; - if (info->hook_entry[i] == 0xFFFFFFFF) { + if (info->hook_entry[j] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + j, hook_entries[j]); goto out_unlock; } - if (info->underflow[i] == 0xFFFFFFFF) { + if (info->underflow[j] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + j, underflows[j]); goto out_unlock; } } @@ -1800,9 +1809,9 @@ translate_compat_table(const char *name, goto out_unlock; newinfo->number = number; - for (i = 0; i < NF_IP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; + for (j = 0; j < NF_IP_NUMHOOKS; j++) { + newinfo->hook_entry[j] = info->hook_entry[j]; + newinfo->underflow[j] = info->underflow[j]; } entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; @@ -1908,15 +1917,22 @@ compat_do_replace(void __user *user, uns return ret; } +static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int); + static int compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && 
!capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_SET_REPLACE: ret = compat_do_replace(user, len); @@ -1927,8 +1943,7 @@ compat_do_ipt_set_ctl(struct sock *sk, i break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); - ret = -EINVAL; + ret = do_ipt_set_ctl(sk, cmd, user, len); } return ret; @@ -2029,9 +2044,14 @@ compat_do_ipt_get_ctl(struct sock *sk, i { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_GET_INFO: ret = get_info(user, len, 1); @@ -2051,9 +2071,14 @@ do_ipt_set_ctl(struct sock *sk, int cmd, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_SET_REPLACE: ret = do_replace(user, len); @@ -2076,9 +2101,14 @@ do_ipt_get_ctl(struct sock *sk, int cmd, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET].next) + return -ENOENT; +#endif + switch (cmd) { case IPT_SO_GET_INFO: ret = get_info(user, len, 0); @@ -2122,17 +2152,18 @@ do_ipt_get_ctl(struct sock *sk, int cmd, return ret; } -int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) +struct xt_table *ipt_register_table(struct xt_table *table, + const struct ipt_replace *repl) { int ret; struct xt_table_info *newinfo; static struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* choose the copy on our node/cpu * but dont care of preemption @@ -2147,28 +2178,30 @@ int ipt_register_table(struct xt_table * repl->underflow); if (ret != 0) { xt_free_table_info(newinfo); - return ret; + return ERR_PTR(ret); } - ret = xt_register_table(table, &bootstrap, newinfo); - if (ret != 0) { + table = virt_xt_register_table(table, &bootstrap, newinfo); + if (IS_ERR(table)) xt_free_table_info(newinfo); - return ret; - } - return 0; + return table; } void ipt_unregister_table(struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; + struct module *me; - private = xt_unregister_table(table); + me = table->me; + private = virt_xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + if (private->number > private->initial_entries) + module_put(me); xt_free_table_info(private); } @@ -2275,12 +2308,30 @@ static struct xt_match icmp_matchstruct .checkentry = icmp_checkentry, }; +static int init_iptables(void) +{ +#ifdef CONFIG_VE_IPTABLES + if (get_exec_env()->_xt_tables[AF_INET].next != NULL) + return -EEXIST; +#endif + + return xt_proto_init(AF_INET); +} + +static void fini_iptables(void) +{ +#ifdef CONFIG_VE_IPTABLES + get_exec_env()->_xt_tables[AF_INET].next = NULL; +#endif + xt_proto_fini(AF_INET); +} + static int __init ip_tables_init(void) { int ret; - ret = xt_proto_init(AF_INET); - if (ret < 0) + ret = 
init_iptables(); + if (ret) goto err1; /* Noone else will be downing sem now, so we won't sleep */ @@ -2299,6 +2350,10 @@ static int __init ip_tables_init(void) if (ret < 0) goto err5; + KSYMRESOLVE(init_iptables); + KSYMRESOLVE(fini_iptables); + KSYMMODRESOLVE(ip_tables); + printk(KERN_INFO "ip_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; @@ -2309,24 +2364,25 @@ err4: err3: xt_unregister_target(&ipt_standard_target); err2: - xt_proto_fini(AF_INET); + fini_iptables(); err1: return ret; } static void __exit ip_tables_fini(void) { + KSYMMODUNRESOLVE(ip_tables); + KSYMUNRESOLVE(init_iptables); + KSYMUNRESOLVE(fini_iptables); nf_unregister_sockopt(&ipt_sockopts); - xt_unregister_match(&icmp_matchstruct); xt_unregister_target(&ipt_error_target); xt_unregister_target(&ipt_standard_target); - - xt_proto_fini(AF_INET); + fini_iptables(); } EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table); EXPORT_SYMBOL(ipt_do_table); -module_init(ip_tables_init); +subsys_initcall(ip_tables_init); module_exit(ip_tables_fini); diff -uprN linux-2.6.24/net/ipv4/netfilter/ipt_CLUSTERIP.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_CLUSTERIP.c --- linux-2.6.24/net/ipv4/netfilter/ipt_CLUSTERIP.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_CLUSTERIP.c 2008-03-25 18:53:59.000000000 -0500 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -401,7 +402,7 @@ checkentry(const char *tablename, return false; } - dev = dev_get_by_name(&init_net, e->ip.iniface); + dev = dev_get_by_name(get_exec_env()->ve_ns->net_ns, e->ip.iniface); if (!dev) { printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); return false; diff -uprN linux-2.6.24/net/ipv4/netfilter/ipt_LOG.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_LOG.c --- linux-2.6.24/net/ipv4/netfilter/ipt_LOG.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_LOG.c 2008-03-25 18:53:59.000000000 -0500 @@ -46,32 +46,32 @@ static void dump_packet(const struct nf_ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
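/*
 * Illustrative sketch only, not part of the patch: the printk -> ve_printk
 * substitution applied throughout the ipt_LOG hunks in this file.  VE_LOG
 * directs the message to the log buffer of the current VE instead of the
 * host's global ring, so a container flooding LOG rules only pollutes its
 * own log.  demo_log_ports() is a hypothetical example of the converted
 * style.
 */
static void demo_log_ports(const struct tcphdr *th)
{
	/* same format string as before, only the sink changes */
	ve_printk(VE_LOG, "SPT=%u DPT=%u ",
		  ntohs(th->source), ntohs(th->dest));
}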
*/ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + ve_printk(VE_LOG, "SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) - printk("CE "); + ve_printk(VE_LOG, "CE "); if (ntohs(ih->frag_off) & IP_DF) - printk("DF "); + ve_printk(VE_LOG, "DF "); if (ntohs(ih->frag_off) & IP_MF) - printk("MF "); + ve_printk(VE_LOG, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { @@ -82,15 +82,15 @@ static void dump_packet(const struct nf_ op = skb_header_pointer(skb, iphoff+sizeof(_iph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } switch (ih->protocol) { @@ -99,7 +99,7 @@ static void dump_packet(const struct nf_ const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + ve_printk(VE_LOG, "PROTO=TCP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -108,41 +108,41 @@ static void dump_packet(const struct nf_ th = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IPT_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + ve_printk(VE_LOG, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + ve_printk(VE_LOG, "CWR "); if (th->ece) - printk("ECE "); + ve_printk(VE_LOG, "ECE "); if (th->urg) - printk("URG "); + ve_printk(VE_LOG, "URG "); if (th->ack) - printk("ACK "); + ve_printk(VE_LOG, "ACK "); if (th->psh) - printk("PSH "); + ve_printk(VE_LOG, "PSH "); if (th->rst) - printk("RST "); + ve_printk(VE_LOG, "RST "); if (th->syn) - printk("SYN "); + ve_printk(VE_LOG, "SYN "); if (th->fin) - printk("FIN "); + ve_printk(VE_LOG, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -155,15 +155,15 @@ static void dump_packet(const struct nf_ iphoff+ih->ihl*4+sizeof(_tcph), optsize, _opt); if (op == NULL) { - 
printk("TRUNCATED"); + ve_printk(VE_LOG, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + ve_printk(VE_LOG, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + ve_printk(VE_LOG, "%02X", op[i]); + ve_printk(VE_LOG, ") "); } break; } @@ -174,9 +174,9 @@ static void dump_packet(const struct nf_ if (ih->protocol == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + ve_printk(VE_LOG, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + ve_printk(VE_LOG, "PROTO=UDPLITE "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -185,13 +185,13 @@ static void dump_packet(const struct nf_ uh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -218,7 +218,7 @@ static void dump_packet(const struct nf_ [ICMP_ADDRESSREPLY] = 12 }; /* Max length: 11 "PROTO=ICMP " */ - printk("PROTO=ICMP "); + ve_printk(VE_LOG, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -227,19 +227,19 @@ static void dump_packet(const struct nf_ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (ich == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ich->type, ich->code); + ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } @@ -248,19 +248,19 @@ static void dump_packet(const struct nf_ case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + ve_printk(VE_LOG, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ - printk("PARAMETER=%u ", + ve_printk(VE_LOG, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ - printk("GATEWAY=%u.%u.%u.%u ", + ve_printk(VE_LOG, "GATEWAY=%u.%u.%u.%u ", NIPQUAD(ich->un.gateway)); /* Fall through */ case ICMP_DEST_UNREACH: @@ -268,16 +268,16 @@ static void dump_packet(const struct nf_ case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. 
*/ - printk("["); + ve_printk(VE_LOG, "["); dump_packet(info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); - printk("] "); + ve_printk(VE_LOG, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } @@ -290,19 +290,19 @@ static void dump_packet(const struct nf_ break; /* Max length: 9 "PROTO=AH " */ - printk("PROTO=AH "); + ve_printk(VE_LOG, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_ahdr), &_ahdr); if (ah == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { @@ -310,7 +310,7 @@ static void dump_packet(const struct nf_ const struct ip_esp_hdr *eh; /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=ESP "); + ve_printk(VE_LOG, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -319,25 +319,25 @@ static void dump_packet(const struct nf_ eh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] ", + ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(eh->spi)); + ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", ih->protocol); + ve_printk(VE_LOG, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + ve_printk(VE_LOG, "UID=%u ", skb->sk->sk_socket->file->f_uid); read_unlock_bh(&skb->sk->sk_callback_lock); } @@ -379,7 +379,7 @@ ipt_log_packet(unsigned int pf, loginfo = &default_loginfo; spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); @@ -390,30 +390,30 @@ ipt_log_packet(unsigned int pf, physindev = skb->nf_bridge->physindev; if (physindev && in != physindev) - printk("PHYSIN=%s ", physindev->name); + ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name); physoutdev = skb->nf_bridge->physoutdev; if (physoutdev && out != physoutdev) - printk("PHYSOUT=%s ", physoutdev->name); + ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name); } #endif if (in && !out) { /* MAC logging for input chain only. */ - printk("MAC="); + ve_printk(VE_LOG, "MAC="); if (skb->dev && skb->dev->hard_header_len && skb->mac_header != skb->network_header) { int i; const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) - printk("%02x%c", *p, + ve_printk(VE_LOG, "%02x%c", *p, i==skb->dev->hard_header_len - 1 ? 
' ':':'); } else - printk(" "); + ve_printk(VE_LOG, " "); } dump_packet(loginfo, skb, 0); - printk("\n"); + ve_printk(VE_LOG, "\n"); spin_unlock_bh(&log_lock); } diff -uprN linux-2.6.24/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_MASQUERADE.c --- linux-2.6.24/net/ipv4/netfilter/ipt_MASQUERADE.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_MASQUERADE.c 2008-03-25 18:53:59.000000000 -0500 @@ -103,6 +103,7 @@ masquerade_target(struct sk_buff *skb, return nf_nat_setup_info(ct, &newrange, hooknum); } +#if 0 static int device_cmp(struct nf_conn *i, void *ifindex) { @@ -125,9 +126,6 @@ static int masq_device_event(struct noti { const struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for conntracks which were associated with that device, @@ -165,6 +163,7 @@ static struct notifier_block masq_dev_no static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; +#endif static struct xt_target masquerade __read_mostly = { .name = "MASQUERADE", @@ -183,12 +182,16 @@ static int __init ipt_masquerade_init(vo ret = xt_register_target(&masquerade); +#if 0 +/* These notifiers are unnecessary and may + lead to oops in virtual environments */ if (ret == 0) { /* Register for device down reports */ register_netdevice_notifier(&masq_dev_notifier); /* Register IP address change reports */ register_inetaddr_notifier(&masq_inet_notifier); } +#endif return ret; } @@ -196,8 +199,8 @@ static int __init ipt_masquerade_init(vo static void __exit ipt_masquerade_fini(void) { xt_unregister_target(&masquerade); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); +/* unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier);*/ } module_init(ipt_masquerade_init); diff -uprN linux-2.6.24/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_REDIRECT.c --- linux-2.6.24/net/ipv4/netfilter/ipt_REDIRECT.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_REDIRECT.c 2008-03-25 18:53:59.000000000 -0500 @@ -77,8 +77,13 @@ redirect_target(struct sk_buff *skb, rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); - if (indev && (ifa = indev->ifa_list)) + if (indev && (ifa = indev->ifa_list)) { + /* because of venet device specific, we should use + * second ifa in the list */ + if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next) + ifa = ifa->ifa_next; newdst = ifa->ifa_local; + } rcu_read_unlock(); if (!newdst) diff -uprN linux-2.6.24/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_REJECT.c --- linux-2.6.24/net/ipv4/netfilter/ipt_REJECT.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_REJECT.c 2008-03-25 18:53:59.000000000 -0500 @@ -221,13 +221,13 @@ static bool check(const char *tablename, const struct ipt_entry *e = e_void; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { - printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); + ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n"); return false; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO)) { - printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); + ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n"); return false; } } diff -uprN 
linux-2.6.24/net/ipv4/netfilter/ipt_TOS.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_TOS.c --- linux-2.6.24/net/ipv4/netfilter/ipt_TOS.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_TOS.c 2008-03-25 18:53:59.000000000 -0500 @@ -57,7 +57,7 @@ checkentry(const char *tablename, && tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST && tos != IPTOS_NORMALSVC) { - printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + ve_printk(VE_LOG, KERN_WARNING "TOS: bad tos value %#x\n", tos); return false; } return true; diff -uprN linux-2.6.24/net/ipv4/netfilter/ipt_recent.c linux-2.6.24.ovz/net/ipv4/netfilter/ipt_recent.c --- linux-2.6.24/net/ipv4/netfilter/ipt_recent.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/ipt_recent.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,19 @@ MODULE_PARM_DESC(ip_list_perms, "permiss MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); +#include + +#if defined(CONFIG_VE_IPTABLES) +#define tables (get_exec_env()->_ipt_recent->tables) +#define proc_dir (get_exec_env()->_ipt_recent->proc_dir) +#else +static LIST_HEAD(tables); +static struct proc_dir_entry *proc_dir; +#endif /* CONFIG_VE_IPTABLES */ + +static int init_ipt_recent(struct ve_struct *ve); +static void fini_ipt_recent(struct ve_struct *ve); + struct recent_entry { struct list_head list; struct list_head lru_list; @@ -74,12 +88,10 @@ struct recent_table { struct list_head iphash[0]; }; -static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_dir; static const struct file_operations recent_fops; #endif @@ -256,6 +268,9 @@ ipt_recent_checkentry(const char *tablen strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return false; + if (init_ipt_recent(get_exec_env())) + return 0; + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (t != NULL) { @@ -298,6 +313,13 @@ ipt_recent_destroy(const struct xt_match { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; + struct ve_struct *ve; + + ve = get_exec_env(); +#ifdef CONFIG_VE_IPTABLES + if (!ve->_ipt_recent) + return; +#endif mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); @@ -312,6 +334,8 @@ ipt_recent_destroy(const struct xt_match kfree(t); } mutex_unlock(&recent_mutex); + if (!ve_is_super(ve) && list_empty(&tables)) + fini_ipt_recent(ve); } #ifdef CONFIG_PROC_FS @@ -465,6 +489,49 @@ static struct xt_match recent_match __re .me = THIS_MODULE, }; +static int init_ipt_recent(struct ve_struct *ve) +{ + int err = 0; + +#ifdef CONFIG_VE_IPTABLES + if (ve->_ipt_recent) + return 0; + + ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL); + if (!ve->_ipt_recent) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&tables); +#endif +#ifdef CONFIG_PROC_FS + if (err) + return err; + proc_dir = proc_mkdir("ipt_recent", ve->ve_ns->net_ns->proc_net); + if (proc_dir == NULL) { + err = -ENOMEM; + goto out_mem; + } +#endif +out: + return err; +out_mem: +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_ipt_recent); +#endif + goto out; +} + +static void fini_ipt_recent(struct ve_struct *ve) +{ + remove_proc_entry("ipt_recent", ve->ve_ns->net_ns->proc_net); +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_ipt_recent); + ve->_ipt_recent = NULL; +#endif +} + static int __init 
ipt_recent_init(void) { int err; @@ -474,25 +541,24 @@ static int __init ipt_recent_init(void) ip_list_hash_size = 1 << fls(ip_list_tot); err = xt_register_match(&recent_match); -#ifdef CONFIG_PROC_FS if (err) return err; - proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); - if (proc_dir == NULL) { + + err = init_ipt_recent(&ve0); + if (err) { xt_unregister_match(&recent_match); - err = -ENOMEM; + return err; } -#endif - return err; + + return 0; } static void __exit ipt_recent_exit(void) { BUG_ON(!list_empty(&tables)); + + fini_ipt_recent(&ve0); xt_unregister_match(&recent_match); -#ifdef CONFIG_PROC_FS - remove_proc_entry("ipt_recent", init_net.proc_net); -#endif } module_init(ipt_recent_init); diff -uprN linux-2.6.24/net/ipv4/netfilter/iptable_filter.c linux-2.6.24.ovz/net/ipv4/netfilter/iptable_filter.c --- linux-2.6.24/net/ipv4/netfilter/iptable_filter.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/iptable_filter.c 2008-03-25 18:53:59.000000000 -0500 @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -19,6 +20,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("iptables filter table"); +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_filter (get_exec_env()->_ve_ipt_filter_pf) +#else +#define ve_packet_filter &packet_filter +#endif + #define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) static struct @@ -26,7 +34,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[3]; struct ipt_error term; -} initial_table __initdata = { +} initial_table = { .repl = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, @@ -67,7 +75,7 @@ ipt_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, &packet_filter); + return ipt_do_table(skb, hook, in, out, ve_packet_filter); } static unsigned int @@ -86,7 +94,7 @@ ipt_local_out_hook(unsigned int hook, return NF_ACCEPT; } - return ipt_do_table(skb, hook, in, out, &packet_filter); + return ipt_do_table(skb, hook, in, out, ve_packet_filter); } static struct nf_hook_ops ipt_ops[] = { @@ -117,22 +125,19 @@ static struct nf_hook_ops ipt_ops[] = { static int forward = NF_ACCEPT; module_param(forward, bool, 0000); -static int __init iptable_filter_init(void) +int init_iptable_filter(void) { int ret; - - if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; + struct ipt_table *tmp_filter; /* Register table */ - ret = ipt_register_table(&packet_filter, &initial_table.repl); - if (ret < 0) - return ret; + tmp_filter = ipt_register_table(&packet_filter, + &initial_table.repl); + if (IS_ERR(tmp_filter)) + return PTR_ERR(tmp_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = tmp_filter; +#endif /* Register hooks */ ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); @@ -142,14 +147,50 @@ static int __init iptable_filter_init(vo return ret; cleanup_table: - ipt_unregister_table(&packet_filter); + ipt_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif return ret; } -static void __exit iptable_filter_fini(void) +void fini_iptable_filter(void) { nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - ipt_unregister_table(&packet_filter); + ipt_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; 
+#endif +} + +static int __init iptable_filter_init(void) +{ + int err; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + err = init_iptable_filter(); + if (err < 0) + return err; + + KSYMRESOLVE(init_iptable_filter); + KSYMRESOLVE(fini_iptable_filter); + KSYMMODRESOLVE(iptable_filter); + return 0; +} + +static void __exit iptable_filter_fini(void) +{ + KSYMMODUNRESOLVE(iptable_filter); + KSYMUNRESOLVE(init_iptable_filter); + KSYMUNRESOLVE(fini_iptable_filter); + fini_iptable_filter(); } module_init(iptable_filter_init); diff -uprN linux-2.6.24/net/ipv4/netfilter/iptable_mangle.c linux-2.6.24.ovz/net/ipv4/netfilter/iptable_mangle.c --- linux-2.6.24/net/ipv4/netfilter/iptable_mangle.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/iptable_mangle.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,7 +34,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[5]; struct ipt_error term; -} initial_table __initdata = { +} initial_table = { .repl = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, @@ -72,6 +73,13 @@ static struct xt_table packet_mangler = .af = AF_INET, }; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_mangler (get_exec_env()->_ipt_mangle_table) +#else +#define ve_packet_mangler &packet_mangler +#endif + /* The work comes in here from netfilter.c. */ static unsigned int ipt_route_hook(unsigned int hook, @@ -80,7 +88,7 @@ ipt_route_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, &packet_mangler); + return ipt_do_table(skb, hook, in, out, ve_packet_mangler); } static unsigned int @@ -112,7 +120,7 @@ ipt_local_hook(unsigned int hook, daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, hook, in, out, &packet_mangler); + ret = ipt_do_table(skb, hook, in, out, ve_packet_mangler); /* Reroute for ANY change. 
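/*
 * Illustrative sketch only, not part of the patch: the per-VE table
 * indirection used by iptable_filter above and iptable_mangle here.  With
 * CONFIG_VE_IPTABLES the netfilter hook consults the table registered for
 * the current VE rather than the single global table; without it the macro
 * falls back to the global table, so the hook body stays identical.
 * ve_demo_table and demo_hook() are hypothetical names for the pattern.
 */
#ifdef CONFIG_VE_IPTABLES
#define ve_demo_table	(get_exec_env()->_ve_ipt_filter_pf)
#else
#define ve_demo_table	&packet_filter
#endif

static unsigned int
demo_hook(unsigned int hook, struct sk_buff *skb,
	  const struct net_device *in, const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	/* exactly the stock hook, but run against the per-VE table */
	return ipt_do_table(skb, hook, in, out, ve_demo_table);
}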
*/ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -166,14 +174,19 @@ static struct nf_hook_ops ipt_ops[] = { }, }; -static int __init iptable_mangle_init(void) +int init_iptable_mangle(void) { int ret; + struct ipt_table *tmp_mangler; /* Register table */ - ret = ipt_register_table(&packet_mangler, &initial_table.repl); - if (ret < 0) - return ret; + tmp_mangler = ipt_register_table(&packet_mangler, + &initial_table.repl); + if (IS_ERR(tmp_mangler)) + return PTR_ERR(tmp_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = tmp_mangler; +#endif /* Register hooks */ ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); @@ -183,14 +196,42 @@ static int __init iptable_mangle_init(vo return ret; cleanup_table: - ipt_unregister_table(&packet_mangler); + ipt_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif return ret; } -static void __exit iptable_mangle_fini(void) +void fini_iptable_mangle(void) { nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - ipt_unregister_table(&packet_mangler); + ipt_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif +} + +static int __init iptable_mangle_init(void) +{ + int err; + + err = init_iptable_mangle(); + if (err < 0) + return err; + + KSYMRESOLVE(init_iptable_mangle); + KSYMRESOLVE(fini_iptable_mangle); + KSYMMODRESOLVE(iptable_mangle); + return 0; +} + +static void __exit iptable_mangle_fini(void) +{ + KSYMMODUNRESOLVE(iptable_mangle); + KSYMUNRESOLVE(init_iptable_mangle); + KSYMUNRESOLVE(fini_iptable_mangle); + fini_iptable_mangle(); } module_init(iptable_mangle_init); diff -uprN linux-2.6.24/net/ipv4/netfilter/iptable_raw.c linux-2.6.24.ovz/net/ipv4/netfilter/iptable_raw.c --- linux-2.6.24/net/ipv4/netfilter/iptable_raw.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/iptable_raw.c 2008-03-25 18:53:59.000000000 -0500 @@ -93,12 +93,13 @@ static struct nf_hook_ops ipt_ops[] = { static int __init iptable_raw_init(void) { + struct xt_table *tmp; int ret; /* Register table */ - ret = ipt_register_table(&packet_raw, &initial_table.repl); - if (ret < 0) - return ret; + tmp = ipt_register_table(&packet_raw, &initial_table.repl); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); /* Register hooks */ ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); diff -uprN linux-2.6.24/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c linux-2.6.24.ovz/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c --- linux-2.6.24/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2008-03-25 18:53:59.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -426,66 +427,204 @@ MODULE_ALIAS("nf_conntrack-" __stringify MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); -static int __init nf_conntrack_l3proto_ipv4_init(void) +#ifdef CONFIG_VE_IPTABLES +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +static int nf_ct_proto_ipv4_sysctl_init(void) { - int ret = 0; + struct nf_conntrack_l3proto *ipv4 = ve_nf_conntrack_l3proto_ipv4; - need_conntrack(); + ipv4->ctl_table_header = NULL; + ipv4->ctl_table_path = nf_net_ipv4_netfilter_sysctl_path; + ipv4->ctl_table = clone_sysctl_template(ip_ct_sysctl_table); + if (ipv4->ctl_table == NULL) + return -ENOMEM; + + ipv4->ctl_table[0].data = &ve_nf_conntrack_max; + ipv4->ctl_table[1].data = &ve_nf_conntrack_count; + ipv4->ctl_table[3].data = 
&ve_nf_conntrack_checksum; + ipv4->ctl_table[4].data = &ve_nf_ct_log_invalid; - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - printk(KERN_ERR "Unable to register netfilter socket option\n"); - return ret; - } + return 0; +} + +static void nf_ct_proto_ipv4_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) + free_sysctl_clone(ve_nf_conntrack_l3proto_ipv4->ctl_table); +} +#else +static inline int nf_ct_proto_ipv4_sysctl_init(void) +{ + return 0; +} +static inline void nf_ct_proto_ipv4_sysctl_cleanup(void) +{ +} +#endif /* SYSCTL && NF_CONNTRACK_PROC_COMPAT */ + +/* + * Functions init/fini_nf_ct_l3proto_ipv4 glue distributed nf_conntrack + * virtualization efforts. They are to be called from 2 places: + * + * 1) on loading/unloading module nf_conntrack_ipv4 from + * nf_conntrack_l3proto_ipv4_init/fini + * 2) on start/stop ve - from do_ve_iptables + */ +static int nf_ct_proto_ipv4_init(void) +{ + struct nf_conntrack_l3proto *ipv4; + + if (ve_is_super(get_exec_env())) { + ipv4 = &nf_conntrack_l3proto_ipv4; + goto out; + } + ipv4 = kmemdup(&nf_conntrack_l3proto_ipv4, + sizeof(struct nf_conntrack_l3proto), GFP_KERNEL); + if (!ipv4) + return -ENOMEM; +out: + ve_nf_conntrack_l3proto_ipv4 = ipv4; + return 0; +} + +static void nf_ct_proto_ipv4_fini(void) +{ + if (!ve_is_super(get_exec_env())) + kfree(ve_nf_conntrack_l3proto_ipv4); +} +#endif + +int init_nf_ct_l3proto_ipv4(void) +{ + int ret = -ENOMEM; + +#ifdef CONFIG_VE_IPTABLES + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); + + ret = nf_ct_proto_ipv4_init(); + if (ret < 0) + goto err_out; + ret = nf_ct_proto_ipv4_sysctl_init(); + if (ret < 0) + goto no_mem_ipv4; + ret = nf_ct_proto_tcp_sysctl_init(); + if (ret < 0) + goto no_mem_tcp; + ret = nf_ct_proto_udp_sysctl_init(); + if (ret < 0) + goto no_mem_udp; + ret = nf_ct_proto_icmp_sysctl_init(); + if (ret < 0) + goto no_mem_icmp; +#endif /* CONFIG_VE_IPTABLES */ - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp4); if (ret < 0) { printk("nf_conntrack_ipv4: can't register tcp.\n"); - goto cleanup_sockopt; + goto cleanup_sys; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp4); if (ret < 0) { printk("nf_conntrack_ipv4: can't register udp.\n"); - goto cleanup_tcp; + goto unreg_tcp; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmp); if (ret < 0) { printk("nf_conntrack_ipv4: can't register icmp.\n"); - goto cleanup_udp; + goto unreg_udp; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv4); if (ret < 0) { printk("nf_conntrack_ipv4: can't register ipv4\n"); - goto cleanup_icmp; + goto unreg_icmp; } ret = nf_register_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); if (ret < 0) { printk("nf_conntrack_ipv4: can't register hooks.\n"); - goto cleanup_ipv4; + goto unreg_ipv4; } -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) ret = nf_conntrack_ipv4_compat_init(); if (ret < 0) - goto cleanup_hooks; -#endif + goto unreg_hooks; + return 0; + +unreg_hooks: + nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); +unreg_ipv4: + nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4); +unreg_icmp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp); +unreg_udp: + 
nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4); +unreg_tcp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4); +cleanup_sys: +#ifdef CONFIG_VE_IPTABLES +no_mem_icmp: + nf_ct_proto_udp_sysctl_cleanup(); +no_mem_udp: + nf_ct_proto_tcp_sysctl_cleanup(); +no_mem_tcp: + nf_ct_proto_ipv4_sysctl_cleanup(); +no_mem_ipv4: + nf_ct_proto_ipv4_fini(); +err_out: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ return ret; -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - cleanup_hooks: +} +EXPORT_SYMBOL(init_nf_ct_l3proto_ipv4); + +void fini_nf_ct_l3proto_ipv4(void) +{ + nf_conntrack_ipv4_compat_fini(); nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); -#endif - cleanup_ipv4: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - cleanup_icmp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4); + +#ifdef CONFIG_VE_IPTABLES + nf_ct_proto_icmp_sysctl_cleanup(); + nf_ct_proto_udp_sysctl_cleanup(); + nf_ct_proto_tcp_sysctl_cleanup(); + nf_ct_proto_ipv4_sysctl_cleanup(); + nf_ct_proto_ipv4_fini(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ +} +EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv4); + +static int __init nf_conntrack_l3proto_ipv4_init(void) +{ + int ret = 0; + + need_conntrack(); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + return ret; + } + + ret = init_nf_ct_l3proto_ipv4(); + if (ret < 0) { + printk(KERN_ERR "Unable to initialize netfilter protocols\n"); + goto cleanup_sockopt; + } + KSYMRESOLVE(init_nf_ct_l3proto_ipv4); + KSYMRESOLVE(fini_nf_ct_l3proto_ipv4); + KSYMMODRESOLVE(nf_conntrack_ipv4); + return ret; + cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst); return ret; @@ -494,14 +633,12 @@ static int __init nf_conntrack_l3proto_i static void __exit nf_conntrack_l3proto_ipv4_fini(void) { synchronize_net(); -#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - nf_conntrack_ipv4_compat_fini(); -#endif - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + + KSYMMODUNRESOLVE(nf_conntrack_ipv4); + KSYMUNRESOLVE(init_nf_ct_l3proto_ipv4); + KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv4); + + fini_nf_ct_l3proto_ipv4(); nf_unregister_sockopt(&so_getorigdst); } diff -uprN linux-2.6.24/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c linux-2.6.24.ovz/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c --- linux-2.6.24/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c 2008-03-25 18:53:59.000000000 -0500 @@ -9,7 +9,9 @@ */ #include #include +#include #include +#include #include #include @@ -43,8 +45,8 @@ static struct 
hlist_node *ct_get_first(s for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - if (!hlist_empty(&nf_conntrack_hash[st->bucket])) - return nf_conntrack_hash[st->bucket].first; + if (!hlist_empty(&ve_nf_conntrack_hash[st->bucket])) + return ve_nf_conntrack_hash[st->bucket].first; } return NULL; } @@ -58,7 +60,7 @@ static struct hlist_node *ct_get_next(st while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = nf_conntrack_hash[st->bucket].first; + head = ve_nf_conntrack_hash[st->bucket].first; } return head; } @@ -196,8 +198,8 @@ static struct hlist_node *ct_expect_get_ struct ct_expect_iter_state *st = seq->private; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - if (!hlist_empty(&nf_ct_expect_hash[st->bucket])) - return nf_ct_expect_hash[st->bucket].first; + if (!hlist_empty(&ve_nf_ct_expect_hash[st->bucket])) + return ve_nf_ct_expect_hash[st->bucket].first; } return NULL; } @@ -211,7 +213,7 @@ static struct hlist_node *ct_expect_get_ while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = nf_ct_expect_hash[st->bucket].first; + head = ve_nf_ct_expect_hash[st->bucket].first; } return head; } @@ -326,7 +328,7 @@ static void ct_cpu_seq_stop(struct seq_f static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count); struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -377,39 +379,104 @@ static const struct file_operations ct_c .release = seq_release_private, }; -int __init nf_conntrack_ipv4_compat_init(void) +#ifdef CONFIG_VE_IPTABLES +#define ve_ip_ct_net_table (get_exec_env()->_nf_conntrack->_ip_ct_net_table) +#define ve_ip_ct_netfilter_table (get_exec_env()->_nf_conntrack->_ip_ct_netfilter_table) +#define ve_ip_ct_sysctl_header (get_exec_env()->_nf_conntrack->_ip_ct_sysctl_header) +#else +#define ve_ip_ct_net_table ip_ct_net_table +#define ve_ip_ct_netfilter_table ip_ct_netfilter_table +#define ve_ip_ct_sysctl_header ip_ct_sysctl_header +#endif + +static ctl_table ip_ct_netfilter_table[] = { + { + .procname = "ip_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + {} +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + {} +}; + +int nf_conntrack_ipv4_compat_init(void) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); + proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, + proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, &ip_exp_file_ops); if (!proc_exp) goto err2; - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, init_net.proc_net_stat); + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, net->proc_net_stat); if (!proc_stat) goto err3; proc_stat->proc_fops = &ct_cpu_seq_fops; proc_stat->owner = THIS_MODULE; + if (ve_is_super(get_exec_env())) { + ve_ip_ct_net_table = ip_ct_net_table; + } else { + ve_ip_ct_net_table = 
clone_sysctl_template(ip_ct_net_table); + if (!ve_ip_ct_net_table) + goto err4; + } + ve_ip_ct_netfilter_table = ve_ip_ct_net_table[0].child[0].child; + ve_ip_ct_netfilter_table[0].data = &ve_nf_conntrack_max; + ve_ip_ct_sysctl_header = register_sysctl_table(ve_ip_ct_net_table); + if (!ve_ip_ct_sysctl_header) + goto err5; + return 0; +err5: + if (!ve_is_super(get_exec_env())) + free_sysctl_clone(ve_ip_ct_net_table); +err4: + remove_proc_entry("ip_conntrack", net->proc_net_stat); err3: - proc_net_remove(&init_net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack_expect"); err2: - proc_net_remove(&init_net, "ip_conntrack"); + proc_net_remove(net, "ip_conntrack"); err1: return -ENOMEM; } -void __exit nf_conntrack_ipv4_compat_fini(void) +void nf_conntrack_ipv4_compat_fini(void) { - remove_proc_entry("ip_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "ip_conntrack_expect"); - proc_net_remove(&init_net, "ip_conntrack"); + struct net *net = get_exec_env()->ve_ns->net_ns; + + unregister_sysctl_table(ve_ip_ct_sysctl_header); + if (!ve_is_super(get_exec_env())) + free_sysctl_clone(ve_ip_ct_net_table); + remove_proc_entry("ip_conntrack", net->proc_net_stat); + proc_net_remove(net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack"); } diff -uprN linux-2.6.24/net/ipv4/netfilter/nf_conntrack_proto_icmp.c linux-2.6.24.ovz/net/ipv4/netfilter/nf_conntrack_proto_icmp.c --- linux-2.6.24/net/ipv4/netfilter/nf_conntrack_proto_icmp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/nf_conntrack_proto_icmp.c 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -19,7 +20,7 @@ #include #include -static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; +unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; static int icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -99,7 +100,7 @@ static int icmp_packet(struct nf_conn *c } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmp_timeout); } return NF_ACCEPT; @@ -156,7 +157,7 @@ icmp_error_message(struct sk_buff *skb, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
*/ if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { + ve_nf_conntrack_l3proto_ipv4, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } @@ -334,3 +335,63 @@ struct nf_conntrack_l4proto nf_conntrack #endif #endif }; + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_icmp_sysctl_init(void) +{ + struct nf_conntrack_l4proto *icmp; + + if (ve_is_super(get_exec_env())) { + icmp = &nf_conntrack_l4proto_icmp; + goto out; + } + + icmp = kmemdup(&nf_conntrack_l4proto_icmp, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!icmp) + goto no_mem_ct; + + icmp->ctl_table_header = &ve_icmp_sysctl_header; + icmp->ctl_table = clone_sysctl_template(icmp_sysctl_table); + if (icmp->ctl_table == NULL) + goto no_mem_sys; + icmp->ctl_table[0].data = &ve_nf_ct_icmp_timeout; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + icmp->ctl_compat_table_header = ve_icmp_compat_sysctl_header; + icmp->ctl_compat_table = + clone_sysctl_template(icmp_compat_sysctl_table); + if (icmp->ctl_compat_table == NULL) + goto no_mem_compat; + icmp->ctl_compat_table[0].data = &ve_nf_ct_icmp_timeout; +#endif +out: + ve_nf_ct_icmp_timeout = nf_ct_icmp_timeout; + + ve_nf_conntrack_l4proto_icmp = icmp; + return 0; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +no_mem_compat: + free_sysctl_clone(icmp->ctl_table); +#endif +no_mem_sys: + kfree(icmp); +no_mem_ct: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_init); + +void nf_ct_proto_icmp_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + free_sysctl_clone( + ve_nf_conntrack_l4proto_icmp->ctl_compat_table); +#endif + free_sysctl_clone(ve_nf_conntrack_l4proto_icmp->ctl_table); + kfree(ve_nf_conntrack_l4proto_icmp); + } +} +EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff -uprN linux-2.6.24/net/ipv4/netfilter/nf_nat_core.c linux-2.6.24.ovz/net/ipv4/netfilter/nf_nat_core.c --- linux-2.6.24/net/ipv4/netfilter/nf_nat_core.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/nf_nat_core.c 2008-03-25 18:53:59.000000000 -0500 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -33,6 +35,9 @@ static DEFINE_RWLOCK(nf_nat_lock); +#define MAX_IP_NAT_PROTO 256 + +static struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]; static struct nf_conntrack_l3proto *l3proto = NULL; /* Calculated at init based on memory size */ @@ -41,13 +46,22 @@ static int nf_nat_vmalloced; static struct hlist_head *bysource; -#define MAX_IP_NAT_PROTO 256 -static struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]; +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_nat_protos (get_exec_env()->_nf_conntrack->_nf_nat_protos) +#define ve_nf_nat_l3proto (get_exec_env()->_nf_conntrack->_nf_nat_l3proto) +#define ve_bysource (get_exec_env()->_nf_conntrack->_bysource) +#define ve_nf_nat_vmalloced (get_exec_env()->_nf_conntrack->_nf_nat_vmalloced) +#else +#define ve_nf_nat_protos nf_nat_protos +#define ve_nf_nat_l3proto l3proto +#define ve_bysource bysource +#define ve_nf_nat_vmalloced nf_nat_vmalloced +#endif static inline struct nf_nat_protocol * __nf_nat_proto_find(u_int8_t protonum) { - return rcu_dereference(nf_nat_protos[protonum]); + return rcu_dereference(ve_nf_nat_protos[protonum]); } struct nf_nat_protocol * @@ -151,7 +165,7 @@ find_appropriate_src(const struct nf_con struct hlist_node *n; read_lock_bh(&nf_nat_lock); - hlist_for_each_entry(nat, n, 
&bysource[h], bysource) { + hlist_for_each_entry(nat, n, &ve_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple)) { /* Copy source part from reply tuple. */ @@ -332,7 +346,7 @@ nf_nat_setup_info(struct nf_conn *ct, /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); nat->ct = ct; - hlist_add_head(&nat->bysource, &bysource[srchash]); + hlist_add_head(&nat->bysource, &ve_bysource[srchash]); write_unlock_bh(&nf_nat_lock); } @@ -424,7 +438,6 @@ int nf_nat_icmp_reply_translation(struct struct icmphdr icmp; struct iphdr ip; } *inside; - struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple inner, target; int hdrlen = ip_hdrlen(skb); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -461,16 +474,14 @@ int nf_nat_icmp_reply_translation(struct "dir %s\n", skb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); - /* rcu_read_lock()ed by nf_hook_slow */ - l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - if (!nf_ct_get_tuple(skb, ip_hdrlen(skb) + sizeof(struct icmphdr), (ip_hdrlen(skb) + sizeof(struct icmphdr) + inside->ip.ihl * 4), (u_int16_t)AF_INET, inside->ip.protocol, - &inner, l3proto, l4proto)) + &inner, ve_nf_nat_l3proto, + __nf_ct_l4proto_find(PF_INET, inside->ip.protocol))) return 0; /* Change inner back to look like incoming packet. We do the @@ -520,11 +531,11 @@ int nf_nat_protocol_register(struct nf_n int ret = 0; write_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (ve_nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } - rcu_assign_pointer(nf_nat_protos[proto->protonum], proto); + rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], proto); out: write_unlock_bh(&nf_nat_lock); return ret; @@ -535,7 +546,7 @@ EXPORT_SYMBOL(nf_nat_protocol_register); void nf_nat_protocol_unregister(struct nf_nat_protocol *proto) { write_lock_bh(&nf_nat_lock); - rcu_assign_pointer(nf_nat_protos[proto->protonum], + rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], &nf_nat_unknown_protocol); write_unlock_bh(&nf_nat_lock); synchronize_rcu(); @@ -626,46 +637,60 @@ static struct nf_ct_ext_type nat_extend .flags = NF_CT_EXT_F_PREALLOC, }; -static int __init nf_nat_init(void) +int nf_nat_init(void) { size_t i; int ret; - ret = nf_ct_extend_register(&nat_extend); - if (ret < 0) { - printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); - return ret; + if (ve_is_super(get_exec_env())) { + ret = nf_ct_extend_register(&nat_extend); + if (ret < 0) { + printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + return ret; + } } /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &nf_nat_vmalloced); - if (!bysource) { + ve_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, + &ve_nf_nat_vmalloced); + if (!ve_bysource) { ret = -ENOMEM; goto cleanup_extend; } - +#ifdef CONFIG_VE_IPTABLES + ve_nf_nat_protos = kcalloc(MAX_IP_NAT_PROTO, sizeof(void *), GFP_KERNEL); + if (!ve_nf_nat_protos) { + ret = -ENOMEM; + goto cleanup_hash; + } +#endif /* Sew in builtin protocols. 
*/ write_lock_bh(&nf_nat_lock); for (i = 0; i < MAX_IP_NAT_PROTO; i++) - rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol); - rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); - rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); - rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); + rcu_assign_pointer(ve_nf_nat_protos[i], &nf_nat_unknown_protocol); + rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); + rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); + rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); write_unlock_bh(&nf_nat_lock); for (i = 0; i < nf_nat_htable_size; i++) { - INIT_HLIST_HEAD(&bysource[i]); + INIT_HLIST_HEAD(&ve_bysource[i]); } - /* Initialize fake conntrack so that NAT will skip it */ - nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + if (ve_is_super(get_exec_env())) { + /* Initialize fake conntrack so that NAT will skip it */ + nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + } - l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); + ve_nf_nat_l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); return 0; +#ifdef CONFIG_VE_IPTABLES +cleanup_hash: +#endif + nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size); cleanup_extend: nf_ct_extend_unregister(&nat_extend); return ret; @@ -683,16 +708,43 @@ static int clean_nat(struct nf_conn *i, return 0; } -static void __exit nf_nat_cleanup(void) +void nf_nat_cleanup(void) { nf_ct_iterate_cleanup(&clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); - nf_ct_l3proto_put(l3proto); - nf_ct_extend_unregister(&nat_extend); + nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size); + nf_ct_l3proto_put(ve_nf_nat_l3proto); +#ifdef CONFIG_VE_IPTABLES + kfree(ve_nf_nat_protos); +#endif + if (ve_is_super(get_exec_env())) + nf_ct_extend_unregister(&nat_extend); +} + +static int __init init(void) +{ + int rv; + + rv = nf_nat_init(); + if (rv < 0) + return rv; + + KSYMRESOLVE(nf_nat_init); + KSYMRESOLVE(nf_nat_cleanup); + KSYMMODRESOLVE(nf_nat); + return 0; +} + +static void __exit fini(void) +{ + KSYMMODUNRESOLVE(nf_nat); + KSYMUNRESOLVE(nf_nat_cleanup); + KSYMUNRESOLVE(nf_nat_init); + + nf_nat_cleanup(); } MODULE_LICENSE("GPL"); -module_init(nf_nat_init); -module_exit(nf_nat_cleanup); +module_init(init); +module_exit(fini); diff -uprN linux-2.6.24/net/ipv4/netfilter/nf_nat_rule.c linux-2.6.24.ovz/net/ipv4/netfilter/nf_nat_rule.c --- linux-2.6.24/net/ipv4/netfilter/nf_nat_rule.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/netfilter/nf_nat_rule.c 2008-03-25 18:53:59.000000000 -0500 @@ -24,6 +24,13 @@ #include #include +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_nat_table \ + (get_exec_env()->_nf_conntrack->_nf_nat_table) +#else +#define ve_nf_nat_table &nat_table +#endif + #define NAT_VALID_HOOKS ((1< #include #include +#include #include #include @@ -324,30 +325,64 @@ static struct nf_hook_ops nf_nat_ops[] = }, }; -static int __init nf_nat_standalone_init(void) +int init_nftable_nat(void) { - int ret = 0; + int ret; - need_ipv4_conntrack(); + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); -#ifdef CONFIG_XFRM - BUG_ON(ip_nat_decode_session != NULL); - ip_nat_decode_session = nat_decode_session; -#endif ret = nf_nat_rule_init(); if (ret < 0) { printk("nf_nat_init: can't setup rules.\n"); - goto cleanup_decode_session; + goto out_modput; } ret = 
nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); if (ret < 0) { printk("nf_nat_init: can't register hooks.\n"); goto cleanup_rule_init; } + return 0; + +cleanup_rule_init: + nf_nat_rule_cleanup(); +out_modput: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); return ret; +} - cleanup_rule_init: +void fini_nftable_nat(void) +{ + nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); nf_nat_rule_cleanup(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +} + +static int __init nf_nat_standalone_init(void) +{ + int ret = 0; + + need_ipv4_conntrack(); + +#ifdef CONFIG_XFRM + BUG_ON(ip_nat_decode_session != NULL); + ip_nat_decode_session = nat_decode_session; +#endif + + if (!ip_conntrack_disable_ve0) { + ret = init_nftable_nat(); + if (ret < 0) + goto cleanup_decode_session; + } + + KSYMRESOLVE(init_nftable_nat); + KSYMRESOLVE(fini_nftable_nat); + KSYMMODRESOLVE(iptable_nat); + + return ret; + cleanup_decode_session: #ifdef CONFIG_XFRM ip_nat_decode_session = NULL; @@ -358,8 +393,12 @@ static int __init nf_nat_standalone_init static void __exit nf_nat_standalone_fini(void) { - nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); - nf_nat_rule_cleanup(); + KSYMMODUNRESOLVE(iptable_nat); + KSYMUNRESOLVE(init_nftable_nat); + KSYMUNRESOLVE(fini_nftable_nat); + + if (!ip_conntrack_disable_ve0) + fini_nftable_nat(); #ifdef CONFIG_XFRM ip_nat_decode_session = NULL; synchronize_net(); diff -uprN linux-2.6.24/net/ipv4/proc.c linux-2.6.24.ovz/net/ipv4/proc.c --- linux-2.6.24/net/ipv4/proc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/proc.c 2008-03-25 18:53:59.000000000 -0500 @@ -51,6 +51,9 @@ */ static int sockstat_seq_show(struct seq_file *seq, void *v) { + if (!ve_is_super(get_exec_env())) + return 0; + socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), @@ -240,7 +243,7 @@ static void icmpmsg_put(struct seq_file count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - if (snmp_fold_field((void **) icmpmsg_statistics, i)) + if (snmp_fold_field((void **) ve_icmpmsg_statistics, i)) out[count++] = i; if (count < PERLINE) continue; @@ -252,7 +255,7 @@ static void icmpmsg_put(struct seq_file seq_printf(seq, "\nIcmpMsg: "); for (j = 0; j < PERLINE; ++j) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) ve_icmpmsg_statistics, out[j])); seq_putc(seq, '\n'); } @@ -264,7 +267,7 @@ static void icmpmsg_put(struct seq_file seq_printf(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", snmp_fold_field((void **) - icmpmsg_statistics, out[j])); + ve_icmpmsg_statistics, out[j])); } #undef PERLINE @@ -281,18 +284,18 @@ static void icmp_put(struct seq_file *se for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) ve_icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS)); + 
snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) icmpmsg_statistics, + snmp_fold_field((void **) ve_icmpmsg_statistics, icmpmibmap[i].index | 0x100)); } @@ -313,7 +316,7 @@ static int snmp_seq_show(struct seq_file for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)ip_statistics, + snmp_fold_field((void **)ve_ip_statistics, snmp4_ipstats_list[i].entry)); icmp_put(seq); /* RFC 2011 compatibility */ @@ -328,11 +331,11 @@ static int snmp_seq_show(struct seq_file /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void **)tcp_statistics, + snmp_fold_field((void **)ve_tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void **)tcp_statistics, + snmp_fold_field((void **)ve_tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -343,7 +346,7 @@ static int snmp_seq_show(struct seq_file seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)udp_statistics, + snmp_fold_field((void **)ve_udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -354,7 +357,7 @@ static int snmp_seq_show(struct seq_file seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)udplite_statistics, + snmp_fold_field((void **)ve_udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -390,7 +393,7 @@ static int netstat_seq_show(struct seq_f seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net_statistics, + snmp_fold_field((void **)ve_net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -400,7 +403,7 @@ static int netstat_seq_show(struct seq_f seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)ip_statistics, + snmp_fold_field((void **)ve_ip_statistics, snmp4_ipextstats_list[i].entry)); seq_putc(seq, '\n'); @@ -420,26 +423,38 @@ static const struct file_operations nets .release = single_release, }; -int __init ip_misc_proc_init(void) +static int ipv4_proc_net_init(struct net *net) { - int rc = 0; - - if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) + if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; - - if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) + if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) goto out_snmp; - - if (!proc_net_fops_create(&init_net, "sockstat", S_IRUGO, &sockstat_seq_fops)) - goto out_sockstat; -out: - return rc; -out_sockstat: - proc_net_remove(&init_net, "snmp"); + return 0; out_snmp: - proc_net_remove(&init_net, "netstat"); + proc_net_remove(net, "netstat"); out_netstat: - rc = -ENOMEM; - goto out; + return -ENOMEM; } +static void ipv4_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "snmp"); + proc_net_remove(net, "netstat"); +} + +static struct pernet_operations ipv4_proc_net_ops = { + .init = ipv4_proc_net_init, + .exit = ipv4_proc_net_exit, +}; + +int __init ip_misc_proc_init(void) +{ + int rv; + + if (!proc_net_fops_create(&init_net, "sockstat", S_IRUGO, &sockstat_seq_fops)) + 
return -ENOMEM; + rv = register_pernet_subsys(&ipv4_proc_net_ops); + if (rv < 0) + proc_net_remove(&init_net, "sockstat"); + return rv; +} diff -uprN linux-2.6.24/net/ipv4/raw.c linux-2.6.24.ovz/net/ipv4/raw.c --- linux-2.6.24/net/ipv4/raw.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/raw.c 2008-03-25 18:53:59.000000000 -0500 @@ -114,7 +114,8 @@ struct sock *__raw_v4_lookup(struct sock if (inet->num == num && !(inet->daddr && inet->daddr != raddr) && !(inet->rcv_saddr && inet->rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) && + ve_accessible_strict(sk->owner_env, get_exec_env())) goto found; /* gotcha */ } sk = NULL; @@ -804,8 +805,12 @@ static struct sock *raw_get_first(struct struct hlist_node *node; sk_for_each(sk, node, &raw_v4_htable[state->bucket]) - if (sk->sk_family == PF_INET) + if (sk->sk_family == PF_INET) { + if (!ve_accessible(sk->owner_env, + get_exec_env())) + continue; goto found; + } } sk = NULL; found: @@ -819,8 +824,13 @@ static struct sock *raw_get_next(struct do { sk = sk_next(sk); try_again: - ; - } while (sk && sk->sk_family != PF_INET); + if (!sk) + break; + if (sk->sk_family != PF_INET) + continue; + if (ve_accessible(sk->owner_env, get_exec_env())) + break; + } while (1); if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { sk = sk_head(&raw_v4_htable[state->bucket]); @@ -919,13 +929,28 @@ static const struct file_operations raw_ .release = seq_release_private, }; -int __init raw_proc_init(void) +static int raw_net_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops)) + if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) return -ENOMEM; return 0; } +static void raw_net_exit(struct net *net) +{ + proc_net_remove(net, "raw"); +} + +static struct pernet_operations raw_net_ops = { + .init = raw_net_init, + .exit = raw_net_exit, +}; + +int __init raw_proc_init(void) +{ + return register_pernet_subsys(&raw_net_ops); +} + void __init raw_proc_exit(void) { proc_net_remove(&init_net, "raw"); diff -uprN linux-2.6.24/net/ipv4/route.c linux-2.6.24.ovz/net/ipv4/route.c --- linux-2.6.24/net/ipv4/route.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/route.c 2008-03-25 18:53:59.000000000 -0500 @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +117,8 @@ #define RT_GC_TIMEOUT (300*HZ) +int ip_rt_src_check = 1; + static int ip_rt_min_delay = 2 * HZ; static int ip_rt_max_delay = 10 * HZ; static int ip_rt_max_size; @@ -266,11 +269,28 @@ static unsigned int rt_hash_code(u32 dad rt_hash_code((__force u32)(__be32)(daddr),\ (__force u32)(__be32)(saddr) ^ ((idx) << 5)) +void prepare_rt_cache(void) +{ +#ifdef CONFIG_VE + struct rtable *r; + int i; + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(rt_hash_lock_addr(i)); + for (r = rt_hash_table[i].chain; r; r = r->u.dst.rt_next) { + r->fl.owner_env = get_ve0(); + } + spin_unlock_bh(rt_hash_lock_addr(i)); + } +#endif +} + #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { int bucket; }; +static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r); static struct rtable *rt_cache_get_first(struct seq_file *seq) { struct rtable *r = NULL; @@ -283,6 +303,8 @@ static struct rtable *rt_cache_get_first break; rcu_read_unlock_bh(); } + if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) + return rt_cache_get_next(seq, r); return rcu_dereference(r); } @@ -290,6 +312,7 @@ static struct rtable 
*rt_cache_get_next( { struct rt_cache_iter_state *st = seq->private; +loop: r = r->u.dst.rt_next; while (!r) { rcu_read_unlock_bh(); @@ -298,6 +321,8 @@ static struct rtable *rt_cache_get_next( rcu_read_lock_bh(); r = rt_hash_table[st->bucket].chain; } + if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) + goto loop; return rcu_dereference(r); } @@ -556,7 +581,8 @@ static inline int compare_keys(struct fl (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | (fl1->oif ^ fl2->oif) | - (fl1->iif ^ fl2->iif)) == 0; + (fl1->iif ^ fl2->iif)) == 0 && + ve_accessible_strict(fl1->owner_env, fl2->owner_env); } static void rt_check_expire(struct work_struct *work) @@ -608,26 +634,105 @@ static void rt_check_expire(struct work_ schedule_delayed_work(&expires_work, ip_rt_gc_interval); } +typedef unsigned long rt_flush_gen_t; + +#ifdef CONFIG_VE + +static rt_flush_gen_t rt_flush_gen; + +/* called under rt_flush_lock */ +static void set_rt_flush_required(struct ve_struct *env) +{ + /* + * If the global generation rt_flush_gen is equal to G, then + * the pass considering entries labelled by G is yet to come. + */ + env->rt_flush_required = rt_flush_gen; +} + +static spinlock_t rt_flush_lock; +static rt_flush_gen_t reset_rt_flush_required(void) +{ + rt_flush_gen_t g; + + spin_lock_bh(&rt_flush_lock); + g = rt_flush_gen++; + spin_unlock_bh(&rt_flush_lock); + return g; +} + +static int check_rt_flush_required(struct ve_struct *env, rt_flush_gen_t gen) +{ + /* can be checked without the lock */ + return env->rt_flush_required >= gen; +} + +#else + +static void set_rt_flush_required(struct ve_struct *env) +{ +} + +static rt_flush_gen_t reset_rt_flush_required(void) +{ + return 0; +} + +#endif + /* This can run from both BH and non-BH contexts, the latter * in the case of a forced flush event. 
*/ static void rt_run_flush(unsigned long dummy) { int i; - struct rtable *rth, *next; + struct rtable * rth, * next; + struct rtable * tail; + rt_flush_gen_t gen; rt_deadline = 0; get_random_bytes(&rt_hash_rnd, 4); + gen = reset_rt_flush_required(); + for (i = rt_hash_mask; i >= 0; i--) { +#ifdef CONFIG_VE + struct rtable ** prev, * p; + + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + + /* defer releasing the head of the list after spin_unlock */ + for (tail = rth; tail; tail = tail->u.dst.rt_next) + if (!check_rt_flush_required(tail->fl.owner_env, gen)) + break; + if (rth != tail) + rt_hash_table[i].chain = tail; + + /* call rt_free on entries after the tail requiring flush */ + prev = &rt_hash_table[i].chain; + for (p = *prev; p; p = next) { + next = p->u.dst.rt_next; + if (!check_rt_flush_required(p->fl.owner_env, gen)) { + prev = &p->u.dst.rt_next; + } else { + *prev = next; + rt_free(p); + } + } + +#else spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; + tail = NULL; + +#endif spin_unlock_bh(rt_hash_lock_addr(i)); - for (; rth; rth = next) { + for (; rth != tail; rth = next) { next = rth->u.dst.rt_next; rt_free(rth); } @@ -663,6 +768,8 @@ void rt_cache_flush(int delay) delay = tmo; } + set_rt_flush_required(get_exec_env()); + if (delay <= 0) { spin_unlock_bh(&rt_flush_lock); rt_run_flush(0); @@ -678,9 +785,30 @@ void rt_cache_flush(int delay) static void rt_secret_rebuild(unsigned long dummy) { + int i; + struct rtable *rth, *next; unsigned long now = jiffies; - rt_cache_flush(0); + spin_lock_bh(&rt_flush_lock); + del_timer(&rt_flush_timer); + spin_unlock_bh(&rt_flush_lock); + + rt_deadline = 0; + get_random_bytes(&rt_hash_rnd, 4); + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; + spin_unlock_bh(rt_hash_lock_addr(i)); + + for (; rth; rth = next) { + next = rth->u.dst.rt_next; + rt_free(rth); + } + } + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); } @@ -1026,6 +1154,9 @@ void ip_rt_redirect(__be32 old_gw, __be3 __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; + struct ve_struct *ve; + + ve = get_exec_env(); if (!in_dev) return; @@ -1057,6 +1188,10 @@ void ip_rt_redirect(__be32 old_gw, __be3 if (rth->fl.fl4_dst != daddr || rth->fl.fl4_src != skeys[i] || rth->fl.oif != ikeys[k] || +#ifdef CONFIG_VE + !ve_accessible_strict(rth->fl.owner_env, + ve) || +#endif rth->fl.iif != 0) { rthp = &rth->u.dst.rt_next; continue; @@ -1095,6 +1230,9 @@ void ip_rt_redirect(__be32 old_gw, __be3 rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.xfrm = NULL; +#ifdef CONFIG_VE + rt->fl.owner_env = ve; +#endif rt->rt_flags |= RTCF_REDIRECTED; @@ -1389,8 +1527,9 @@ static void ipv4_dst_ifdown(struct dst_e { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != init_net.loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); + if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = + in_dev_get(dev->nd_net->loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1540,9 +1679,12 @@ static int ip_route_input_mc(struct sk_b #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = rth->fl.iif = 
dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; + rth->u.dst.dev = get_exec_env()->ve_ns->net_ns->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1678,6 +1820,9 @@ static inline int __mkroute_input(struct rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; rth->u.dst.dev = (out_dev)->dev; @@ -1799,7 +1944,7 @@ static int ip_route_input_slow(struct sk if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - init_net.loopback_dev->ifindex, + get_exec_env()->ve_ns->net_ns->loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1861,11 +2006,14 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; + rth->u.dst.dev = get_exec_env()->ve_ns->net_ns->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { @@ -1933,6 +2081,9 @@ int ip_route_input(struct sk_buff *skb, rth->fl.iif == iif && rth->fl.oif == 0 && rth->fl.mark == skb->mark && +#ifdef CONFIG_VE + rth->fl.owner_env == get_exec_env() && +#endif rth->fl.fl4_tos == tos) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); @@ -2050,6 +2201,9 @@ static inline int __mkroute_output(struc rth->fl.mark = oldflp->mark; rth->rt_dst = fl->fl4_dst; rth->rt_src = fl->fl4_src; +#ifdef CONFIG_VE + rth->fl.owner_env = get_exec_env(); +#endif rth->rt_iif = oldflp->oif ? : dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ @@ -2121,6 +2275,8 @@ static inline int ip_mkroute_output(stru static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) { + struct net *net = get_exec_env()->ve_ns->net_ns; + struct net_device * loopback_dev = net->loopback_dev; u32 tos = RT_FL_TOS(oldflp); struct flowi fl = { .nl_u = { .ip4_u = { .daddr = oldflp->fl4_dst, @@ -2131,7 +2287,7 @@ static int ip_route_output_slow(struct r RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = init_net.loopback_dev->ifindex, + .iif = loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2152,10 +2308,13 @@ static int ip_route_output_slow(struct r ZERONET(oldflp->fl4_src)) goto out; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); - if (dev_out == NULL) - goto out; + if (ip_rt_src_check) { + /* It is equivalent to + inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + } /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: @@ -2182,6 +2341,12 @@ static int ip_route_output_slow(struct r Luckily, this hack is good workaround. 
*/ + if (dev_out == NULL) { + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + } + fl.oif = dev_out->ifindex; goto make_route; } @@ -2192,7 +2357,7 @@ static int ip_route_output_slow(struct r if (oldflp->oif) { - dev_out = dev_get_by_index(&init_net, oldflp->oif); + dev_out = dev_get_by_index(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -2225,9 +2390,9 @@ static int ip_route_output_slow(struct r fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = init_net.loopback_dev; + dev_out = loopback_dev; dev_hold(dev_out); - fl.oif = init_net.loopback_dev->ifindex; + fl.oif = loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2272,7 +2437,7 @@ static int ip_route_output_slow(struct r fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = init_net.loopback_dev; + dev_out = loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) @@ -2326,6 +2491,7 @@ int __ip_route_output_key(struct rtable rth->fl.iif == 0 && rth->fl.oif == flp->oif && rth->fl.mark == flp->mark && + ve_accessible_strict(rth->fl.owner_env, get_exec_env()) && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { dst_use(&rth->u.dst, jiffies); @@ -2569,7 +2735,7 @@ static int inet_rtm_getroute(struct sk_b if (iif) { struct net_device *dev; - dev = __dev_get_by_index(&init_net, iif); + dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; @@ -2661,22 +2827,22 @@ void ip_rt_multicast_event(struct in_dev } #ifdef CONFIG_SYSCTL -static int flush_delay; +int ipv4_flush_delay; -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, +int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + rt_cache_flush(ipv4_flush_delay); return 0; } return -EINVAL; } -static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, +int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, @@ -2697,7 +2863,7 @@ ctl_table ipv4_route_table[] = { { .ctl_name = NET_IPV4_ROUTE_FLUSH, .procname = "flush", - .data = &flush_delay, + .data = &ipv4_flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = &ipv4_sysctl_rtcache_flush, @@ -2984,7 +3150,7 @@ int __init ip_rt_init(void) struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - init_net.proc_net_stat))) { + init_net.proc_net_stat))) { return -ENOMEM; } rtstat_pde->proc_fops = &rt_cpu_seq_fops; diff -uprN linux-2.6.24/net/ipv4/sysctl_net_ipv4.c linux-2.6.24.ovz/net/ipv4/sysctl_net_ipv4.c --- linux-2.6.24/net/ipv4/sysctl_net_ipv4.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/sysctl_net_ipv4.c 2008-03-25 18:53:59.000000000 -0500 @@ -24,6 +24,9 @@ /* From af_inet.c */ extern int sysctl_ip_nonlocal_bind; +int sysctl_tcp_use_sg = 1; +EXPORT_SYMBOL(sysctl_tcp_use_sg); + #ifdef CONFIG_SYSCTL static int zero; static int tcp_retr1_max = 255; @@ -35,7 +38,6 @@ struct ipv4_config ipv4_config; #ifdef CONFIG_SYSCTL -static int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -50,7 +52,7 @@ int ipv4_sysctl_forward(ctl_table *ctl, 
return ret; } -static int ipv4_sysctl_forward_strategy(ctl_table *table, +int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) @@ -292,7 +294,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_FORWARD, .procname = "ip_forward", - .data = &IPV4_DEVCONF_ALL(FORWARDING), + .data = &IPV4_DEVCONF(ipv4_devconf, FORWARDING), .maxlen = sizeof(int), .mode = 0644, .proc_handler = &ipv4_sysctl_forward, @@ -548,6 +550,13 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .procname = "tcp_use_sg", + .data = &sysctl_tcp_use_sg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { @@ -748,6 +757,20 @@ ctl_table ipv4_table[] = { .extra1 = &zero }, { + .procname = "tcp_max_tw_kmem_fraction", + .data = &sysctl_tcp_max_tw_kmem_fraction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_max_tw_buckets_ub", + .data = &sysctl_tcp_max_tw_buckets_ub, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, diff -uprN linux-2.6.24/net/ipv4/tcp.c linux-2.6.24.ovz/net/ipv4/tcp.c --- linux-2.6.24/net/ipv4/tcp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/tcp.c 2008-03-25 18:53:59.000000000 -0500 @@ -266,6 +266,10 @@ #include #include +#include +#include +#include + #include #include @@ -323,6 +327,7 @@ unsigned int tcp_poll(struct file *file, unsigned int mask; struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); + int check_send_space; poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) @@ -337,6 +342,21 @@ unsigned int tcp_poll(struct file *file, if (sk->sk_err) mask = POLLERR; + check_send_space = 1; +#ifdef CONFIG_BEANCOUNTERS + if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { + unsigned long size; + size = MAX_TCP_HEADER + tp->mss_cache; + if (size > SOCK_MIN_UBCSPACE) + size = SOCK_MIN_UBCSPACE; + size = skb_charge_size(size); + if (ub_sock_makewres_tcp(sk, size)) { + check_send_space = 0; + ub_sock_sndqueueadd_tcp(sk, size); + } + } +#endif + /* * POLLHUP is certainly not done right. 
But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -380,7 +400,7 @@ unsigned int tcp_poll(struct file *file, sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ @@ -531,16 +551,23 @@ static ssize_t do_tcp_sendpages(struct s int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); + unsigned long chargesize = 0; if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + chargesize = skb_charge_size(MAX_TCP_HEADER + + tp->mss_cache); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; skb = sk_stream_alloc_pskb(sk, 0, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); skb_entail(sk, skb); copy = size_goal; @@ -596,10 +623,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); @@ -636,12 +668,8 @@ ssize_t tcp_sendpage(struct socket *sock return res; } -#define TCP_PAGE(sk) (sk->sk_sndmsg_page) -#define TCP_OFF(sk) (sk->sk_sndmsg_off) - -static inline int select_size(struct sock *sk) +static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { @@ -700,6 +728,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru while (--iovlen >= 0) { int seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; + unsigned long chargesize = 0; iov++; @@ -710,18 +739,27 @@ int tcp_sendmsg(struct kiocb *iocb, stru if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { + unsigned long size; new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. */ + chargesize = 0; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk), - 0, sk->sk_allocation); + size = select_size(sk, tp); + chargesize = skb_charge_size(MAX_TCP_HEADER + + size); + if (ub_sock_getwres_tcp(sk, chargesize) < 0) + goto wait_for_ubspace; + skb = sk_stream_alloc_pskb(sk, size, 0, + sk->sk_allocation); if (!skb) goto wait_for_memory; + ub_skb_set_charge(skb, sk, chargesize, + UB_TCPSNDBUF); /* * Check whether we can use HW checksum. @@ -767,6 +805,7 @@ new_segment: } else if (page) { if (off == PAGE_SIZE) { put_page(page); + ub_sock_tcp_detachpage(sk); TCP_PAGE(sk) = page = NULL; off = 0; } @@ -780,6 +819,9 @@ new_segment: goto wait_for_memory; if (!page) { + chargesize = PAGE_SIZE; + if (ub_sock_tcp_chargepage(sk) < 0) + goto wait_for_ubspace; /* Allocate new cache page. 
*/ if (!(page = sk_stream_alloc_page(sk))) goto wait_for_memory; @@ -811,7 +853,8 @@ new_segment: } else if (off + copy < PAGE_SIZE) { get_page(page); TCP_PAGE(sk) = page; - } + } else + ub_sock_tcp_detachpage(sk); } TCP_OFF(sk) = off + copy; @@ -842,10 +885,15 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: + ub_sock_retwres_tcp(sk, chargesize, + skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); + chargesize = 0; +wait_for_ubspace: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = __sk_stream_wait_memory(sk, &timeo, chargesize); + if (err != 0) goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); @@ -945,7 +993,18 @@ void tcp_cleanup_rbuf(struct sock *sk, i #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { + printk("KERNEL: assertion: skb==NULL || " + "before(tp->copied_seq, skb->end_seq)\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, + tp->copied_seq, tp->rcv_nxt); + printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", + skb->len, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + } #endif if (inet_csk_ack_scheduled(sk)) { @@ -1199,7 +1258,23 @@ int tcp_recvmsg(struct kiocb *iocb, stru goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - BUG_TRAP(flags & MSG_PEEK); + if (!(flags & MSG_PEEK)) { + printk("KERNEL: assertion: flags&MSG_PEEK\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + printk("skb->len=%d, *seq=%d, skb->seq=%d, " + "skb->end_seq=%d, offset=%d\n", + skb->len, *seq, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + offset); + } skb = skb->next; } while (skb != (struct sk_buff *)&sk->sk_receive_queue); @@ -1262,8 +1337,19 @@ int tcp_recvmsg(struct kiocb *iocb, stru tp->ucopy.len = len; - BUG_TRAP(tp->copied_seq == tp->rcv_nxt || - (flags & (MSG_PEEK | MSG_TRUNC))); + if (!(tp->copied_seq == tp->rcv_nxt || + (flags&(MSG_PEEK|MSG_TRUNC)))) { + printk("KERNEL: assertion: tp->copied_seq == " + "tp->rcv_nxt || ...\n"); + printk("VE%u pid %d comm %.16s\n", + (get_exec_env() ? + VEID(get_exec_env()) : 0), + current->pid, current->comm); + printk("flags=0x%x, len=%d, copied_seq=%d, " + "rcv_nxt=%d\n", flags, + (int)len, tp->copied_seq, + tp->rcv_nxt); + } /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1639,7 +1725,7 @@ adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); - atomic_inc(sk->sk_prot->orphan_count); + ub_inc_orphan_count(sk); /* It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); @@ -1689,12 +1775,19 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { + int orphans = ub_get_orphan_count(sk); + sk_stream_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, - atomic_read(sk->sk_prot->orphan_count))) { - if (net_ratelimit()) + if (ub_too_many_orphans(sk, orphans)) { + if (net_ratelimit()) { + int ubid = 0; +#ifdef CONFIG_USER_RESOURCE + ubid = sock_has_ubc(sk) ? 
+ top_beancounter(sock_bc(sk)->ub)->ub_uid : 0; +#endif printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + "sockets (%d in CT%d)\n", orphans, ubid); + } tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); @@ -1770,6 +1863,7 @@ int tcp_disconnect(struct sock *sk, int tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->advmss = 65535; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2412,6 +2506,7 @@ void tcp_done(struct sock *sk) EXPORT_SYMBOL_GPL(tcp_done); extern void __skb_cb_too_small_for_tcp(int, int); +extern unsigned int nr_free_lowpages(void); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -2437,7 +2532,7 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -2505,6 +2600,11 @@ void __init tcp_init(void) sysctl_tcp_mem[1] = limit; sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096) + sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096; + if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096) + sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); diff -uprN linux-2.6.24/net/ipv4/tcp_input.c linux-2.6.24.ovz/net/ipv4/tcp_input.c --- linux-2.6.24/net/ipv4/tcp_input.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/tcp_input.c 2008-03-25 18:53:59.000000000 -0500 @@ -72,6 +72,8 @@ #include #include +#include + int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; @@ -310,7 +312,7 @@ static void tcp_grow_window(struct sock /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_memory_pressure) { + ub_tcp_rmem_allows_expand(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -379,6 +381,8 @@ static void tcp_init_buffer_space(struct tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; + + ub_tcp_update_maxadvmss(sk); } /* 5. Recalculate window clamp after socket hit its memory bounds. */ @@ -391,7 +395,7 @@ static void tcp_clamp_window(struct sock if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && + !ub_tcp_memory_pressure(sk) && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -3747,7 +3751,7 @@ queue_and_out: !sk_stream_rmem_schedule(sk, skb))) { if (tcp_prune_queue(sk) < 0 || !sk_stream_rmem_schedule(sk, skb)) - goto drop; + goto drop_part; } sk_stream_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -3791,6 +3795,12 @@ out_of_window: drop: __kfree_skb(skb); return; + +drop_part: + if (after(tp->copied_seq, tp->rcv_nxt)) + tp->rcv_nxt = tp->copied_seq; + __kfree_skb(skb); + return; } /* Out of window. F.e. zero window probe. 
*/ @@ -3962,6 +3972,10 @@ tcp_collapse(struct sock *sk, struct sk_ nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { + kfree_skb(nskb); + return; + } skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); skb_set_network_header(nskb, (skb_network_header(skb) - @@ -4063,7 +4077,7 @@ static int tcp_prune_queue(struct sock * if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (tcp_memory_pressure) + else if (ub_tcp_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4142,7 +4156,7 @@ static int tcp_should_expand_sndbuf(stru return 0; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ @@ -4587,6 +4601,10 @@ int tcp_rcv_established(struct sock *sk, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; + /* This is OK not to try to free memory here. + * Do this below on slow path. Den */ + if (ub_tcprcvbuf_charge(sk, skb) < 0) + goto step5; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); diff -uprN linux-2.6.24/net/ipv4/tcp_ipv4.c linux-2.6.24.ovz/net/ipv4/tcp_ipv4.c --- linux-2.6.24/net/ipv4/tcp_ipv4.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/tcp_ipv4.c 2008-03-25 18:53:59.000000000 -0500 @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,8 @@ #include #include +#include + #include #include #include @@ -720,7 +723,8 @@ static void tcp_v4_timewait_ack(struct s struct tcp_timewait_sock *tcptw = tcp_twsk(sk); tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, - tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_rcv_wnd >> + (tw->tw_rcv_wscale & TW_WSCALE_MASK), tcptw->tw_ts_recent); inet_twsk_put(tw); @@ -1245,6 +1249,7 @@ struct request_sock_ops tcp_request_sock .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, }; +EXPORT_SYMBOL_GPL(tcp_request_sock_ops); #ifdef CONFIG_TCP_MD5SIG static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { @@ -1555,6 +1560,10 @@ static __sum16 tcp_v4_checksum_init(stru int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; + struct user_beancounter *ub; + + ub = set_exec_ub(sock_bc(sk)->ub); + #ifdef CONFIG_TCP_MD5SIG /* * We really want to reject the packet as early as possible @@ -1573,7 +1582,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc goto reset; } TCP_CHECK_TIMER(sk); - return 0; + goto restore_context; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) @@ -1589,7 +1598,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc rsk = nsk; goto reset; } - return 0; + goto restore_context; } } @@ -1599,6 +1608,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc goto reset; } TCP_CHECK_TIMER(sk); + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1610,7 +1622,7 @@ discard: * might be destroyed here. This current version compiles correctly, * but you have been warned. */ - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -1870,6 +1882,8 @@ static int tcp_v4_init_sock(struct sock tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; + tp->advmss = 65535; /* max value */ + tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1931,6 +1945,8 @@ int tcp_v4_destroy_sock(struct sock *sk) * If sendmsg cached page exists, toss it. 
*/ if (sk->sk_sndmsg_page) { + /* queue is empty, uncharge */ + ub_sock_tcp_detachpage(sk); __free_page(sk->sk_sndmsg_page); sk->sk_sndmsg_page = NULL; } @@ -1945,16 +1961,34 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) +static inline struct inet_timewait_sock *tw_head(struct hlist_head *head, + envid_t veid) { - return hlist_empty(head) ? NULL : - list_entry(head->first, struct inet_timewait_sock, tw_node); + struct inet_timewait_sock *tw; + struct hlist_node *pos; + + if (hlist_empty(head)) + return NULL; + hlist_for_each_entry(tw, pos, head, tw_node) { + if (!ve_accessible_veid(tw->tw_owner_env, veid)) + continue; + return tw; + } + return NULL; } -static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) +static inline struct inet_timewait_sock * + tw_next(struct inet_timewait_sock *tw, envid_t veid) { - return tw->tw_node.next ? - hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; + while (1) { + if (tw->tw_node.next == NULL) + return NULL; + tw = hlist_entry(tw->tw_node.next, typeof(*tw), tw_node); + if (!ve_accessible_veid(tw->tw_owner_env, veid)) + continue; + return tw; + } + return NULL; /* make compiler happy */ } static void *listening_get_next(struct seq_file *seq, void *cur) @@ -1963,7 +1997,9 @@ static void *listening_get_next(struct s struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; + struct ve_struct *ve; + ve = get_exec_env(); if (!sk) { st->bucket = 0; sk = sk_head(&tcp_hashinfo.listening_hash[0]); @@ -2003,6 +2039,8 @@ get_req: } get_sk: sk_for_each_from(sk, node) { + if (!ve_accessible(sk->owner_env, ve)) + continue; if (sk->sk_family == st->family) { cur = sk; goto out; @@ -2043,7 +2081,9 @@ static void *established_get_first(struc { struct tcp_iter_state* st = seq->private; void *rc = NULL; + struct ve_struct *ve; + ve = get_exec_env(); for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; @@ -2052,6 +2092,8 @@ static void *established_get_first(struc read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { + if (!ve_accessible(sk->owner_env, ve)) + continue; if (sk->sk_family != st->family) { continue; } @@ -2061,6 +2103,8 @@ static void *established_get_first(struc st->state = TCP_SEQ_STATE_TIME_WAIT; inet_twsk_for_each(tw, node, &tcp_hashinfo.ehash[st->bucket].twchain) { + if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) + continue; if (tw->tw_family != st->family) { continue; } @@ -2080,16 +2124,17 @@ static void *established_get_next(struct struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; + struct ve_struct *ve; + ve = get_exec_env(); ++st->num; if (st->state == TCP_SEQ_STATE_TIME_WAIT) { tw = cur; - tw = tw_next(tw); + tw = tw_next(tw, VEID(ve)); get_tw: - while (tw && tw->tw_family != st->family) { - tw = tw_next(tw); - } + while (tw && tw->tw_family != st->family) + tw = tw_next(tw, VEID(ve)); if (tw) { cur = tw; goto out; @@ -2108,12 +2153,14 @@ get_tw: sk = sk_next(sk); sk_for_each_from(sk, node) { + if (!ve_accessible(sk->owner_env, ve)) + continue; if (sk->sk_family == st->family) goto found; } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); + tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain, VEID(ve)); goto get_tw; found: cur = sk; @@ -2255,7 +2302,7 @@ int 
tcp_proc_register(struct tcp_seq_afi afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(current->nsproxy->net_ns, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -2267,7 +2314,8 @@ void tcp_proc_unregister(struct tcp_seq_ { if (!afinfo) return; - proc_net_remove(&init_net, afinfo->name); + + proc_net_remove(current->nsproxy->net_ns, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } @@ -2406,15 +2454,30 @@ static struct tcp_seq_afinfo tcp4_seq_af .seq_fops = &tcp4_seq_fops, }; -int __init tcp4_proc_init(void) +static int tcp4_proc_net_init(struct net *net) { return tcp_proc_register(&tcp4_seq_afinfo); } -void tcp4_proc_exit(void) +static void tcp4_proc_net_exit(struct net *net) { tcp_proc_unregister(&tcp4_seq_afinfo); } + +static struct pernet_operations tcp4_proc_net_ops = { + .init = tcp4_proc_net_init, + .exit = tcp4_proc_net_exit, +}; + +int __init tcp4_proc_init(void) +{ + return register_pernet_subsys(&tcp4_proc_net_ops); +} + +void tcp4_proc_exit(void) +{ + unregister_pernet_subsys(&tcp4_proc_net_ops); +} #endif /* CONFIG_PROC_FS */ DEFINE_PROTO_INUSE(tcp) @@ -2463,6 +2526,87 @@ void __init tcp_v4_init(struct net_proto panic("Failed to create the TCP control socket.\n"); } +#ifdef CONFIG_VE +static void tcp_kill_ve_onesk(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check the assumed state of the socket. */ + if (!sock_flag(sk, SOCK_DEAD)) { + static int printed; +invalid: + if (!printed) + printk(KERN_DEBUG "Killing sk: dead %d, state %d, " + "wrseq %u unseq %u, wrqu %d.\n", + sock_flag(sk, SOCK_DEAD), sk->sk_state, + tp->write_seq, tp->snd_una, + !skb_queue_empty(&sk->sk_write_queue)); + printed = 1; + return; + } + + tcp_send_active_reset(sk, GFP_ATOMIC); + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSING: + /* In these 2 states the peer may want us to retransmit + * some data and/or FIN. Entering "resetting mode" + * instead. + */ + tcp_time_wait(sk, TCP_CLOSE, 0); + break; + case TCP_FIN_WAIT2: + /* By some reason the socket may stay in this state + * without turning into a TW bucket. Fix it. + */ + tcp_time_wait(sk, TCP_FIN_WAIT2, 0); + break; + case TCP_LAST_ACK: + /* Just jump into CLOSED state. */ + tcp_done(sk); + break; + default: + /* The socket must be already close()d. 
*/ + goto invalid; + } +} + +void tcp_v4_kill_ve_sockets(struct ve_struct *envid) +{ + struct inet_ehash_bucket *head; + int i; + + /* alive */ + local_bh_disable(); + head = tcp_hashinfo.ehash; + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + struct sock *sk; + struct hlist_node *node; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); +more_work: + write_lock(lock); + sk_for_each(sk, node, &head[i].chain) { + if (ve_accessible_strict(sk->owner_env, envid)) { + sock_hold(sk); + write_unlock(lock); + + bh_lock_sock(sk); + /* sk might have disappeared from the hash before + * we got the lock */ + if (sk->sk_state != TCP_CLOSE) + tcp_kill_ve_onesk(sk); + bh_unlock_sock(sk); + sock_put(sk); + goto more_work; + } + } + write_unlock(lock); + } + local_bh_enable(); +} +EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); +#endif + EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_prot); diff -uprN linux-2.6.24/net/ipv4/tcp_minisocks.c linux-2.6.24.ovz/net/ipv4/tcp_minisocks.c --- linux-2.6.24/net/ipv4/tcp_minisocks.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/tcp_minisocks.c 2008-03-25 18:53:59.000000000 -0500 @@ -28,6 +28,9 @@ #include #include +#include +#include + #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ #else @@ -36,6 +39,11 @@ int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; int sysctl_tcp_abort_on_overflow __read_mostly; +int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384; +int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536; + +EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction); +EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub); struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, @@ -51,6 +59,7 @@ struct inet_timewait_death_row tcp_death .twcal_hand = -1, .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, (unsigned long)&tcp_death_row), + .ub_managed = 1, }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -279,7 +288,8 @@ void tcp_time_wait(struct sock *sk, int if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets && + ub_timewait_check(sk, &tcp_death_row)) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { @@ -292,6 +302,8 @@ void tcp_time_wait(struct sock *sk, int tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + if (sk->sk_user_data != NULL) + tw->tw_rcv_wscale |= TW_WSCALE_SPEC; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -326,6 +338,7 @@ void tcp_time_wait(struct sock *sk, int } } while (0); #endif + tw->tw_owner_env = VEID(sk->owner_env); /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); @@ -346,11 +359,16 @@ void tcp_time_wait(struct sock *sk, int TCP_TIMEWAIT_LEN); inet_twsk_put(tw); } else { + int ubid = 0; /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. 
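The time-wait hunks above add a second gate in front of inet_twsk_alloc(): besides the global sysctl_max_tw_buckets limit, ub_timewait_check(sk, &tcp_death_row) must also agree, parameterized by the new knobs sysctl_tcp_max_tw_buckets_ub and sysctl_tcp_max_tw_kmem_fraction and the .ub_managed flag on tcp_death_row. The helper's body lives in the ub headers added elsewhere in the patch; the fragment below is only a sketch of the kind of per-beancounter test those knobs imply, and ub_timewait_count() is a hypothetical accessor, not a symbol taken from these hunks.

	/* Sketch only, not the patch's implementation: a per-beancounter
	 * time-wait admission test shaped after the knobs declared above.
	 * ub_timewait_count() is hypothetical; the real helper also weighs
	 * time-wait kmem usage via sysctl_tcp_max_tw_kmem_fraction.
	 */
	static inline int ub_timewait_check_sketch(struct sock *sk,
			struct inet_timewait_death_row *twdr)
	{
		struct user_beancounter *ub;

		if (!twdr->ub_managed || !sock_has_ubc(sk))
			return 1;		/* not managed: always allow */

		ub = top_beancounter(sock_bc(sk)->ub);
		return ub_timewait_count(ub) < sysctl_tcp_max_tw_buckets_ub;
	}

Either way the outcome matches the hunk that follows: when the check fails, no TW bucket is allocated and the overflow message now names the container (CT id) that hit its limit.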
*/ - LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); +#ifdef CONFIG_BEANCOUNTERS + if (sock_has_ubc(sk)) + ubid = top_beancounter(sock_bc(sk)->ub)->ub_uid; +#endif + LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid); } tcp_update_metrics(sk); @@ -391,6 +409,8 @@ struct sock *tcp_create_openreq_child(st struct tcp_sock *newtp; /* Now setup tcp_sock */ + newsk->owner_env = sk->owner_env; + newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; diff -uprN linux-2.6.24/net/ipv4/tcp_output.c linux-2.6.24.ovz/net/ipv4/tcp_output.c --- linux-2.6.24/net/ipv4/tcp_output.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/tcp_output.c 2008-03-25 18:53:59.000000000 -0500 @@ -41,6 +41,9 @@ #include #include +#include +#include + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -439,6 +442,13 @@ static void tcp_syn_build_options(__be32 #endif } +static int skb_header_size(struct sock *sk, int tcp_hlen) +{ + struct ip_options *opt = inet_sk(sk)->opt; + return tcp_hlen + sizeof(struct iphdr) + + (opt ? opt->optlen : 0) + ETH_HLEN /* For hard header */; +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -457,6 +467,7 @@ static int tcp_transmit_skb(struct sock struct tcp_sock *tp; struct tcp_skb_cb *tcb; int tcp_header_size; + int header_size; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *md5; __u8 *md5_hash_location; @@ -516,6 +527,20 @@ static int tcp_transmit_skb(struct sock TCPOLEN_SACK_PERBLOCK)); } + /* Unfortunately, we can have skb from outside world here + * with size insufficient for header. It is impossible to make + * guess when we queue skb, so the decision should be made + * here. Den + */ + header_size = skb_header_size(sk, tcp_header_size); + if (skb->data - header_size < skb->head) { + int delta = header_size - skb_headroom(skb); + err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta), + 0, GFP_ATOMIC); + if (err) + return err; + } + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); @@ -692,15 +717,23 @@ int tcp_fragment(struct sock *sk, struct if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; + if (skb_cloned(skb) && skb_is_nonlinear(skb)) { + unsigned long chargesize; + chargesize = skb_bc(skb)->charged; + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + ub_sock_tcp_unchargesend(sk, chargesize); + ub_tcpsndbuf_charge_forced(sk, skb); + } /* Get a new skb... force flag on. */ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } sk_charge_skb(sk, buff); nlen = skb->len - len - nsize; @@ -1000,6 +1033,8 @@ unsigned int tcp_current_mss(struct sock return mss_now; } +EXPORT_SYMBOL(tcp_current_mss); + /* Congestion window validation. 
(RFC2861) */ static void tcp_cwnd_validate(struct sock *sk) @@ -1186,6 +1221,11 @@ static int tso_fragment(struct sock *sk, if (unlikely(buff == NULL)) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOMEM; + } + sk_charge_skb(sk, buff); buff->truesize += nlen; skb->truesize -= nlen; @@ -1516,6 +1556,8 @@ void __tcp_push_pending_frames(struct so } } +EXPORT_SYMBOL(__tcp_push_pending_frames); + /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ @@ -1636,7 +1678,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < full_space/2) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (ub_tcp_shrink_rcvbuf(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); if (free_space < mss) @@ -2080,6 +2122,7 @@ void tcp_send_fin(struct sock *sk) break; yield(); } + ub_tcpsndbuf_charge_forced(sk, skb); /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); @@ -2150,6 +2193,10 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) return -ENOMEM; + if (ub_tcpsndbuf_charge(sk, skb) < 0) { + kfree_skb(nskb); + return -ENOMEM; + } tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); @@ -2275,6 +2322,7 @@ static void tcp_connect_init(struct sock struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + static int once = 0; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -2294,9 +2342,23 @@ static void tcp_connect_init(struct sock tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); + if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) { + once = 1; + + printk("Oops in connect_init! dst->advmss=%d\n", + dst_metric(dst, RTAX_ADVMSS)); + printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU)); + printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, " + "advmss=%d, user_mss=%d\n", + sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss, + tp->mss_cache, tp->advmss, tp->rx_opt.user_mss); + } + if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->advmss == 0) + tp->advmss = 1460; tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), @@ -2337,6 +2399,10 @@ int tcp_connect(struct sock *sk) buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; + if (ub_tcpsndbuf_charge(sk, buff) < 0) { + kfree_skb(buff); + return -ENOBUFS; + } /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); diff -uprN linux-2.6.24/net/ipv4/tcp_timer.c linux-2.6.24.ovz/net/ipv4/tcp_timer.c --- linux-2.6.24/net/ipv4/tcp_timer.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/tcp_timer.c 2008-03-25 18:53:59.000000000 -0500 @@ -22,6 +22,8 @@ #include #include +#include +#include int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; @@ -67,7 +69,8 @@ static void tcp_write_err(struct sock *s static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = atomic_read(&tcp_orphan_count); + int orphans = ub_get_orphan_count(sk); + int orph = orphans; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. 
*/ @@ -78,10 +81,16 @@ static int tcp_out_of_resources(struct s if (sk->sk_err_soft) orphans <<= 1; - if (tcp_too_many_orphans(sk, orphans)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - + if (ub_too_many_orphans(sk, orphans)) { + if (net_ratelimit()) { + int ubid = 0; +#ifdef CONFIG_USER_RESOURCE + ubid = sock_has_ubc(sk) ? + top_beancounter(sock_bc(sk)->ub)->ub_uid : 0; +#endif + printk(KERN_INFO "Orphaned socket dropped " + "(%d,%d in CT%d)\n", orph, orphans, ubid); + } /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || @@ -167,9 +176,12 @@ static int tcp_write_timeout(struct sock static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct ve_struct *env; struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + env = set_exec_env(sk->owner_env); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ @@ -218,11 +230,12 @@ static void tcp_delack_timer(unsigned lo TCP_CHECK_TIMER(sk); out: - if (tcp_memory_pressure) + if (ub_tcp_memory_pressure(sk)) sk_stream_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } static void tcp_probe_timer(struct sock *sk) @@ -277,8 +290,11 @@ static void tcp_probe_timer(struct sock static void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); + env = set_exec_env(sk->owner_env); + if (!tp->packets_out) goto out; @@ -375,15 +391,19 @@ out_reset_timer: if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); -out:; +out: + (void)set_exec_env(env); } static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); int event; + env = set_exec_env(sk->owner_env); + bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ @@ -417,6 +437,7 @@ out: out_unlock: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } /* @@ -444,10 +465,13 @@ void tcp_set_keepalive(struct sock *sk, static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; + struct ve_struct *env; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; + env = set_exec_env(sk->owner_env); + /* Only process if socket is not in use. 
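Every timer handler touched in tcp_timer.c (delack, retransmit, write, keepalive) repeats the same discipline: on entry, switch the execution environment to the VE that owns the socket, and restore the previous environment on every exit path, so memory charging, resets, and statistics land in the right container. A condensed sketch of that pattern, using only identifiers that appear in the hunks above (the function name is a placeholder):

	/* Condensed form of the per-VE context switch repeated by the
	 * tcp_timer.c hunks above.  set_exec_env() returns the previously
	 * active environment, which must be restored on every return path.
	 */
	static void sample_tcp_timer(unsigned long data)
	{
		struct sock *sk = (struct sock *)data;
		struct ve_struct *env;

		env = set_exec_env(sk->owner_env);	/* run as the socket's VE */

		bh_lock_sock(sk);
		/* ... timer work: charges and stats are attributed to this VE ... */
		bh_unlock_sock(sk);

		sock_put(sk);
		(void)set_exec_env(env);		/* back to the caller's VE */
	}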
*/ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -519,4 +543,5 @@ death: out: bh_unlock_sock(sk); sock_put(sk); + (void)set_exec_env(env); } diff -uprN linux-2.6.24/net/ipv4/udp.c linux-2.6.24.ovz/net/ipv4/udp.c --- linux-2.6.24/net/ipv4/udp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/udp.c 2008-03-25 18:53:59.000000000 -0500 @@ -115,13 +115,15 @@ struct hlist_head udp_hash[UDP_HTABLE_SI DEFINE_RWLOCK(udp_hash_lock); static inline int __udp_lib_lport_inuse(__u16 num, - const struct hlist_head udptable[]) + const struct hlist_head udptable[], + struct ve_struct *ve) { struct sock *sk; struct hlist_node *node; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_hash == num) + sk_for_each(sk, node, &udptable[udp_hashfn(num, VEID(ve))]) + if (sk->sk_hash == num && + ve_accessible_strict(sk->owner_env, ve)) return 1; return 0; } @@ -143,7 +145,9 @@ int __udp_lib_get_port(struct sock *sk, struct hlist_head *head; struct sock *sk2; int error = 1; + struct ve_struct *ve; + ve = get_exec_env(); write_lock_bh(&udp_hash_lock); if (!snum) { @@ -160,7 +164,7 @@ int __udp_lib_get_port(struct sock *sk, for (i = 0; i < UDP_HTABLE_SIZE; i++) { int size = 0; - head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(rover, VEID(ve))]; if (hlist_empty(head)) goto gotit; @@ -182,7 +186,7 @@ int __udp_lib_get_port(struct sock *sk, /* 2nd pass: find hole in shortest hash chain */ rover = best; for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { - if (! __udp_lib_lport_inuse(rover, udptable)) + if (! __udp_lib_lport_inuse(rover, udptable, ve)) goto gotit; rover += UDP_HTABLE_SIZE; if (rover > high) @@ -197,12 +201,13 @@ int __udp_lib_get_port(struct sock *sk, gotit: snum = rover; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(snum, VEID(ve))]; sk_for_each(sk2, node, head) if (sk2->sk_hash == snum && sk2 != sk && (!sk2->sk_reuse || !sk->sk_reuse) && + ve_accessible_strict(sk2->owner_env, ve) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (*saddr_comp)(sk, sk2) ) @@ -212,7 +217,7 @@ gotit: inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(snum, VEID(ve))]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -253,12 +258,15 @@ static struct sock *__udp4_lib_lookup(__ struct hlist_node *node; unsigned short hnum = ntohs(dport); int badness = -1; + struct ve_struct *ve; + ve = get_exec_env(); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + sk_for_each(sk, node, &udptable[udp_hashfn(hnum, VEID(ve))]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { + if (sk->sk_hash == hnum && !ipv6_only_sock(sk) && + ve_accessible_strict(sk->owner_env, ve)) { int score = (sk->sk_family == PF_INET ? 
1 : 0); if (inet->rcv_saddr) { if (inet->rcv_saddr != daddr) @@ -1047,7 +1055,8 @@ static int __udp4_lib_mcast_deliver(stru int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest), + VEID(skb->owner_env))]); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1464,10 +1473,14 @@ static struct sock *udp_get_first(struct { struct sock *sk; struct udp_iter_state *state = seq->private; + struct ve_struct *env; + env = get_exec_env(); for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { struct hlist_node *node; sk_for_each(sk, node, state->hashtable + state->bucket) { + if (!ve_accessible(sk->owner_env, env)) + continue; if (sk->sk_family == state->family) goto found; } @@ -1484,8 +1497,13 @@ static struct sock *udp_get_next(struct do { sk = sk_next(sk); try_again: - ; - } while (sk && sk->sk_family != state->family); + if (!sk) + break; + if (sk->sk_family != state->family) + continue; + if (ve_accessible(sk->owner_env, get_exec_env())) + break; + } while (1); if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { sk = sk_head(state->hashtable + state->bucket); diff -uprN linux-2.6.24/net/ipv4/xfrm4_policy.c linux-2.6.24.ovz/net/ipv4/xfrm4_policy.c --- linux-2.6.24/net/ipv4/xfrm4_policy.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv4/xfrm4_policy.c 2008-03-25 18:53:59.000000000 -0500 @@ -295,7 +295,8 @@ static void xfrm4_dst_ifdown(struct dst_ xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); + struct in_device *loopback_idev = + in_dev_get(dev->nd_net->loopback_dev); BUG_ON(!loopback_idev); do { diff -uprN linux-2.6.24/net/ipv6/addrconf.c linux-2.6.24.ovz/net/ipv6/addrconf.c --- linux-2.6.24/net/ipv6/addrconf.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/addrconf.c 2008-03-25 18:53:59.000000000 -0500 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -101,6 +102,7 @@ #define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) #ifdef CONFIG_SYSCTL +static struct addrconf_sysctl_table * __addrconf_sysctl_register(struct inet6_dev *idev, char *devname, int ifindex, struct ipv6_devconf *p); static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); static void addrconf_sysctl_unregister(struct ipv6_devconf *p); #endif @@ -129,8 +131,6 @@ static DEFINE_SPINLOCK(addrconf_verify_l static void addrconf_join_anycast(struct inet6_ifaddr *ifp); static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); -static int addrconf_ifdown(struct net_device *dev, int how); - static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); @@ -145,7 +145,7 @@ static int ipv6_chk_same_addr(const stru static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); -struct ipv6_devconf ipv6_devconf __read_mostly = { +struct ipv6_devconf global_ipv6_devconf __read_mostly = { .forwarding = 0, .hop_limit = IPV6_DEFAULT_HOPLIMIT, .mtu6 = IPV6_MIN_MTU, @@ -178,7 +178,7 @@ struct ipv6_devconf ipv6_devconf __read_ .accept_source_route = 0, /* we do not accept RH0 by default. 
*/ }; -static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { +struct ipv6_devconf global_ipv6_devconf_dflt __read_mostly = { .forwarding = 0, .hop_limit = IPV6_DEFAULT_HOPLIMIT, .mtu6 = IPV6_MIN_MTU, @@ -210,6 +210,12 @@ static struct ipv6_devconf ipv6_devconf_ .accept_source_route = 0, /* we do not accept RH0 by default. */ }; +#ifdef CONFIG_VE +#define ipv6_devconf_dflt (*(get_exec_env()->_ipv6_devconf_dflt)) +#else +#define ipv6_devconf_dflt global_ipv6_devconf_dflt +#endif + /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; @@ -375,9 +381,8 @@ static struct inet6_dev * ipv6_add_dev(s dev->type == ARPHRD_SIT || #endif dev->type == ARPHRD_NONE) { - printk(KERN_INFO - "%s: Disabled Privacy Extensions\n", - dev->name); + ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n", + dev->name)); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); @@ -458,12 +463,12 @@ static void addrconf_forward_change(void struct inet6_dev *idev; read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding); - idev->cnf.forwarding = ipv6_devconf.forwarding; + int changed = (!idev->cnf.forwarding) ^ (!ve_ipv6_devconf.forwarding); + idev->cnf.forwarding = ve_ipv6_devconf.forwarding; if (changed) dev_forward_change(idev); } @@ -543,7 +548,7 @@ ipv6_add_addr(struct inet6_dev *idev, co goto out; } - ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC); if (ifa == NULL) { ADBG(("ipv6_add_addr: malloc failed\n")); @@ -920,7 +925,7 @@ int ipv6_dev_get_saddr(struct net_device read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; @@ -1215,9 +1220,10 @@ int ipv6_chk_addr(struct in6_addr *addr, read_lock_bh(&addrconf_hash_lock); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { if (ipv6_addr_equal(&ifp->addr, addr) && - !(ifp->flags&IFA_F_TENTATIVE)) { + !(ifp->flags&IFA_F_TENTATIVE) && + ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) { if (dev == NULL || ifp->idev->dev == dev || - !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) + !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) break; } } @@ -1235,7 +1241,9 @@ int ipv6_chk_same_addr(const struct in6_ for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { if (ipv6_addr_equal(&ifp->addr, addr)) { - if (dev == NULL || ifp->idev->dev == dev) + if ((dev == NULL && + ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) + || ifp->idev->dev == dev) break; } } @@ -1249,9 +1257,10 @@ struct inet6_ifaddr * ipv6_get_ifaddr(st read_lock_bh(&addrconf_hash_lock); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { - if (ipv6_addr_equal(&ifp->addr, addr)) { + if (ipv6_addr_equal(&ifp->addr, addr) && + ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) { if (dev == NULL || ifp->idev->dev == dev || - !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) { in6_ifa_hold(ifp); break; } @@ -1739,7 +1748,7 @@ ok: #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && - !ipv6_devconf.forwarding) + !ve_ipv6_devconf.forwarding) addr_flags = IFA_F_OPTIMISTIC; #endif @@ 
-1856,6 +1865,7 @@ ok: */ int addrconf_set_dstaddr(void __user *arg) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct in6_ifreq ireq; struct net_device *dev; int err = -EINVAL; @@ -1866,7 +1876,7 @@ int addrconf_set_dstaddr(void __user *ar if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) goto err_exit; - dev = __dev_get_by_index(&init_net, ireq.ifr6_ifindex); + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); err = -ENODEV; if (dev == NULL) @@ -1897,7 +1907,7 @@ int addrconf_set_dstaddr(void __user *ar if (err == 0) { err = -ENOBUFS; - if ((dev = __dev_get_by_name(&init_net, p.name)) == NULL) + if ((dev = __dev_get_by_name(net, p.name)) == NULL) goto err_exit; err = dev_open(dev); } @@ -1912,7 +1922,7 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, +int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft) { struct inet6_ifaddr *ifp; @@ -1927,7 +1937,7 @@ static int inet6_addr_add(int ifindex, s if (!valid_lft || prefered_lft > valid_lft) return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, ifindex)) == NULL) + if ((dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifindex)) == NULL) return -ENODEV; if ((idev = addrconf_add_dev(dev)) == NULL) @@ -1971,6 +1981,7 @@ static int inet6_addr_add(int ifindex, s return PTR_ERR(ifp); } +EXPORT_SYMBOL_GPL(inet6_addr_add); static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) { @@ -1978,7 +1989,7 @@ static int inet6_addr_del(int ifindex, s struct inet6_dev *idev; struct net_device *dev; - if ((dev = __dev_get_by_index(&init_net, ifindex)) == NULL) + if ((dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifindex)) == NULL) return -ENODEV; if ((idev = __in6_dev_get(dev)) == NULL) @@ -2011,7 +2022,7 @@ int addrconf_add_ifaddr(void __user *arg struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2029,7 +2040,7 @@ int addrconf_del_ifaddr(void __user *arg struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) @@ -2073,7 +2084,7 @@ static void sit_add_v4_addrs(struct inet return; } - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { struct in_device * in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr * ifa; @@ -2143,7 +2154,7 @@ static void addrconf_add_linklocal(struc #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (idev->cnf.optimistic_dad && - !ipv6_devconf.forwarding) + !ve_ipv6_devconf.forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -2225,16 +2236,17 @@ ipv6_inherit_linklocal(struct inet6_dev static void ip6_tnl_add_linklocal(struct inet6_dev *idev) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct net_device *link_dev; /* first try to inherit the link-local address from the link device */ if (idev->dev->iflink && - (link_dev = __dev_get_by_index(&init_net, idev->dev->iflink))) { + (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { if (!ipv6_inherit_linklocal(idev, link_dev)) return; } /* then try to inherit it from any device */ - for_each_netdev(&init_net, link_dev) { + for_each_netdev(net, link_dev) { if (!ipv6_inherit_linklocal(idev, link_dev)) return; } @@ -2267,9 +2279,6 @@ static int addrconf_notify(struct notifi int 
run_pending = 0; int err; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch(event) { case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { @@ -2412,7 +2421,7 @@ static struct notifier_block ipv6_dev_no .priority = 0 }; -static int addrconf_ifdown(struct net_device *dev, int how) +int addrconf_ifdown(struct net_device *dev, int how) { struct inet6_dev *idev; struct inet6_ifaddr *ifa, **bifa; @@ -2527,10 +2536,14 @@ static int addrconf_ifdown(struct net_de } return 0; } +EXPORT_SYMBOL_GPL(addrconf_ifdown); static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct ve_struct *old_env; + + old_env = set_exec_env(ifp->idev->dev->owner_env); if (ifp->idev->cnf.forwarding) goto out; @@ -2569,6 +2582,7 @@ static void addrconf_rs_timer(unsigned l out: in6_ifa_put(ifp); + (void)set_exec_env(old_env); } /* @@ -2645,6 +2659,9 @@ static void addrconf_dad_timer(unsigned struct inet6_dev *idev = ifp->idev; struct in6_addr unspec; struct in6_addr mcaddr; + struct ve_struct *old_env; + + old_env = set_exec_env(ifp->idev->dev->owner_env); read_lock_bh(&idev->lock); if (idev->dead) { @@ -2677,6 +2694,7 @@ static void addrconf_dad_timer(unsigned ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); out: in6_ifa_put(ifp); + (void)set_exec_env(old_env); } static void addrconf_dad_completed(struct inet6_ifaddr *ifp) @@ -2744,8 +2762,11 @@ static struct inet6_ifaddr *if6_get_firs for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { ifa = inet6_addr_lst[state->bucket]; - if (ifa) - break; + while (ifa) { + if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env())) + return ifa; + ifa = ifa->lst_next; + } } return ifa; } @@ -2756,6 +2777,11 @@ static struct inet6_ifaddr *if6_get_next ifa = ifa->lst_next; try_again: + while (ifa) { + if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env())) + break; + ifa = ifa->lst_next; + } if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { ifa = inet6_addr_lst[state->bucket]; goto try_again; @@ -2870,6 +2896,7 @@ static void addrconf_verify(unsigned lon struct inet6_ifaddr *ifp; unsigned long now, next; int i; + struct ve_struct *old_env; spin_lock_bh(&addrconf_verify_lock); now = jiffies; @@ -2890,6 +2917,8 @@ restart: if (ifp->flags & IFA_F_PERMANENT) continue; + old_env = set_exec_env(ifp->idev->dev->owner_env); + spin_lock(&ifp->lock); age = (now - ifp->tstamp) / HZ; @@ -2905,9 +2934,11 @@ restart: in6_ifa_hold(ifp); read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); + (void)set_exec_env(old_env); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { spin_unlock(&ifp->lock); + set_exec_env(old_env); continue; } else if (age >= ifp->prefered_lft) { /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ @@ -2929,6 +2960,7 @@ restart: ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); + (void)set_exec_env(old_env); goto restart; } #ifdef CONFIG_IPV6_PRIVACY @@ -2950,6 +2982,7 @@ restart: ipv6_create_tempaddr(ifpub, ifp); in6_ifa_put(ifpub); in6_ifa_put(ifp); + (void)set_exec_env(old_env); goto restart; } } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) @@ -2962,6 +2995,7 @@ restart: next = ifp->tstamp + ifp->prefered_lft * HZ; spin_unlock(&ifp->lock); } + (void)set_exec_env(old_env); } read_unlock(&addrconf_hash_lock); } @@ -3083,7 +3117,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, s valid_lft = INFINITY_LIFE_TIME; } - dev = __dev_get_by_index(&init_net, ifm->ifa_index); + dev = 
__dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifm->ifa_index); if (dev == NULL) return -ENODEV; @@ -3267,7 +3301,7 @@ static int inet6_dump_addr(struct sk_buf s_ip_idx = ip_idx = cb->args[1]; idx = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -3376,7 +3410,7 @@ static int inet6_rtm_getaddr(struct sk_b ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = __dev_get_by_index(&init_net, ifm->ifa_index); + dev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifm->ifa_index); if ((ifa = ipv6_get_ifaddr(addr, dev, 1)) == NULL) { err = -EADDRNOTAVAIL; @@ -3588,7 +3622,7 @@ static int inet6_dump_ifinfo(struct sk_b read_lock(&dev_base_lock); idx = 0; - for_each_netdev(&init_net, dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) { if (idx < s_idx) goto cont; if ((idev = in6_dev_get(dev)) == NULL) @@ -3747,7 +3781,7 @@ int addrconf_sysctl_forward(ctl_table *c ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && valp != &ipv6_devconf_dflt.forwarding) { - if (valp != &ipv6_devconf.forwarding) { + if (valp != &ve_ipv6_devconf.forwarding) { if ((!*valp) ^ (!val)) { struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; if (idev == NULL) @@ -3755,7 +3789,7 @@ int addrconf_sysctl_forward(ctl_table *c dev_forward_change(idev); } } else { - ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; + ipv6_devconf_dflt.forwarding = ve_ipv6_devconf.forwarding; addrconf_forward_change(); } if (*valp) @@ -3797,7 +3831,7 @@ static int addrconf_sysctl_forward_strat } if (valp != &ipv6_devconf_dflt.forwarding) { - if (valp != &ipv6_devconf.forwarding) { + if (valp != &ve_ipv6_devconf.forwarding) { struct inet6_dev *idev = (struct inet6_dev *)table->extra1; int changed; if (unlikely(idev == NULL)) @@ -3833,7 +3867,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_FORWARDING, .procname = "forwarding", - .data = &ipv6_devconf.forwarding, + .data = &global_ipv6_devconf.forwarding, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &addrconf_sysctl_forward, @@ -3842,7 +3876,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_HOP_LIMIT, .procname = "hop_limit", - .data = &ipv6_devconf.hop_limit, + .data = &global_ipv6_devconf.hop_limit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -3850,7 +3884,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_MTU, .procname = "mtu", - .data = &ipv6_devconf.mtu6, + .data = &global_ipv6_devconf.mtu6, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3858,7 +3892,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA, .procname = "accept_ra", - .data = &ipv6_devconf.accept_ra, + .data = &global_ipv6_devconf.accept_ra, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3866,7 +3900,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_REDIRECTS, .procname = "accept_redirects", - .data = &ipv6_devconf.accept_redirects, + .data = &global_ipv6_devconf.accept_redirects, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3874,7 +3908,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_AUTOCONF, .procname = "autoconf", - .data = &ipv6_devconf.autoconf, + .data = &global_ipv6_devconf.autoconf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3882,7 +3916,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_DAD_TRANSMITS, .procname = "dad_transmits", - 
.data = &ipv6_devconf.dad_transmits, + .data = &global_ipv6_devconf.dad_transmits, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3890,7 +3924,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_SOLICITS, .procname = "router_solicitations", - .data = &ipv6_devconf.rtr_solicits, + .data = &global_ipv6_devconf.rtr_solicits, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3898,7 +3932,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL, .procname = "router_solicitation_interval", - .data = &ipv6_devconf.rtr_solicit_interval, + .data = &global_ipv6_devconf.rtr_solicit_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -3907,7 +3941,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY, .procname = "router_solicitation_delay", - .data = &ipv6_devconf.rtr_solicit_delay, + .data = &global_ipv6_devconf.rtr_solicit_delay, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -3916,7 +3950,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_FORCE_MLD_VERSION, .procname = "force_mld_version", - .data = &ipv6_devconf.force_mld_version, + .data = &global_ipv6_devconf.force_mld_version, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3925,7 +3959,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_USE_TEMPADDR, .procname = "use_tempaddr", - .data = &ipv6_devconf.use_tempaddr, + .data = &global_ipv6_devconf.use_tempaddr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3933,7 +3967,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_TEMP_VALID_LFT, .procname = "temp_valid_lft", - .data = &ipv6_devconf.temp_valid_lft, + .data = &global_ipv6_devconf.temp_valid_lft, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3941,7 +3975,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_TEMP_PREFERED_LFT, .procname = "temp_prefered_lft", - .data = &ipv6_devconf.temp_prefered_lft, + .data = &global_ipv6_devconf.temp_prefered_lft, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3949,7 +3983,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_REGEN_MAX_RETRY, .procname = "regen_max_retry", - .data = &ipv6_devconf.regen_max_retry, + .data = &global_ipv6_devconf.regen_max_retry, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3957,7 +3991,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR, .procname = "max_desync_factor", - .data = &ipv6_devconf.max_desync_factor, + .data = &global_ipv6_devconf.max_desync_factor, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3966,7 +4000,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_MAX_ADDRESSES, .procname = "max_addresses", - .data = &ipv6_devconf.max_addresses, + .data = &global_ipv6_devconf.max_addresses, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3974,7 +4008,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_DEFRTR, .procname = "accept_ra_defrtr", - .data = &ipv6_devconf.accept_ra_defrtr, + .data = &global_ipv6_devconf.accept_ra_defrtr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3982,7 +4016,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_PINFO, .procname = "accept_ra_pinfo", - .data = &ipv6_devconf.accept_ra_pinfo, + .data = 
&global_ipv6_devconf.accept_ra_pinfo, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3991,7 +4025,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_RTR_PREF, .procname = "accept_ra_rtr_pref", - .data = &ipv6_devconf.accept_ra_rtr_pref, + .data = &global_ipv6_devconf.accept_ra_rtr_pref, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -3999,7 +4033,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_RTR_PROBE_INTERVAL, .procname = "router_probe_interval", - .data = &ipv6_devconf.rtr_probe_interval, + .data = &global_ipv6_devconf.rtr_probe_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -4009,7 +4043,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, .procname = "accept_ra_rt_info_max_plen", - .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .data = &global_ipv6_devconf.accept_ra_rt_info_max_plen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -4019,7 +4053,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_PROXY_NDP, .procname = "proxy_ndp", - .data = &ipv6_devconf.proxy_ndp, + .data = &global_ipv6_devconf.proxy_ndp, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -4027,7 +4061,7 @@ static struct addrconf_sysctl_table { .ctl_name = NET_IPV6_ACCEPT_SOURCE_ROUTE, .procname = "accept_source_route", - .data = &ipv6_devconf.accept_source_route, + .data = &global_ipv6_devconf.accept_source_route, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -4036,7 +4070,7 @@ static struct addrconf_sysctl_table { .ctl_name = CTL_UNNUMBERED, .procname = "optimistic_dad", - .data = &ipv6_devconf.optimistic_dad, + .data = &global_ipv6_devconf.optimistic_dad, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -4093,27 +4127,21 @@ static struct addrconf_sysctl_table }, }; -static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +static struct addrconf_sysctl_table *__addrconf_sysctl_register( + struct inet6_dev *idev, char *dev_name, + int ifindex, struct ipv6_devconf *p) { int i; - struct net_device *dev = idev ? 
idev->dev : NULL; struct addrconf_sysctl_table *t; - char *dev_name = NULL; t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) - return; + return NULL; + for (i=0; t->addrconf_vars[i].data; i++) { - t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + t->addrconf_vars[i].data += (char*)p - (char*)&global_ipv6_devconf; t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ } - if (dev) { - dev_name = dev->name; - t->addrconf_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; - } /* * Make a copy of dev_name, because '.procname' is regarded as const @@ -4124,6 +4152,7 @@ static void addrconf_sysctl_register(str if (!dev_name) goto free; + t->addrconf_dev[0].ctl_name = ifindex; t->addrconf_dev[0].procname = dev_name; t->addrconf_dev[0].child = t->addrconf_vars; @@ -4134,9 +4163,7 @@ static void addrconf_sysctl_register(str t->sysctl_header = register_sysctl_table(t->addrconf_root_dir); if (t->sysctl_header == NULL) goto free_procname; - else - p->sysctl = t; - return; + return t; /* error path */ free_procname: @@ -4144,7 +4171,26 @@ static void addrconf_sysctl_register(str free: kfree(t); - return; + return NULL; +} + +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +{ + struct net_device *dev; + char *dev_name; + int ifindex; + + dev = idev ? idev->dev : NULL; + + if (dev) { + dev_name = dev->name; + ifindex = dev->ifindex; + } else { + dev_name = "default"; + ifindex = NET_PROTO_CONF_DEFAULT; + } + + p->sysctl = __addrconf_sysctl_register(idev, dev_name, ifindex, p); } static void addrconf_sysctl_unregister(struct ipv6_devconf *p) @@ -4158,9 +4204,64 @@ static void addrconf_sysctl_unregister(s } } +#ifdef CONFIG_VE +int addrconf_sysctl_init(struct ve_struct *ve) +{ + int err = 0; + struct ipv6_devconf *conf, *conf_def; + + err = -ENOMEM; -#endif + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + goto err1; + + memcpy(conf, &global_ipv6_devconf, sizeof(*conf)); + conf->sysctl = __addrconf_sysctl_register(NULL, "all", + NET_PROTO_CONF_ALL, conf); + if (!conf->sysctl) + goto err2; + + conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); + if (!conf_def) + goto err3; + + memcpy(conf_def, &global_ipv6_devconf_dflt, sizeof(*conf_def)); + conf_def->sysctl = __addrconf_sysctl_register(NULL, "default", + NET_PROTO_CONF_DEFAULT, conf_def); + if (!conf_def->sysctl) + goto err4; + + ve->_ipv6_devconf = conf; + ve->_ipv6_devconf_dflt = conf_def; + return 0; + +err4: + kfree(conf_def); +err3: + addrconf_sysctl_unregister(conf); +err2: + kfree(conf); +err1: + return err; +} +EXPORT_SYMBOL(addrconf_sysctl_init); +void addrconf_sysctl_fini(struct ve_struct *ve) +{ + addrconf_sysctl_unregister(ve->_ipv6_devconf); + addrconf_sysctl_unregister(ve->_ipv6_devconf_dflt); +} +EXPORT_SYMBOL(addrconf_sysctl_fini); + +void addrconf_sysctl_free(struct ve_struct *ve) +{ + kfree(ve->_ipv6_devconf); + kfree(ve->_ipv6_devconf_dflt); +} +EXPORT_SYMBOL(addrconf_sysctl_free); +#endif /* CONFIG_VE */ +#endif /* CONFIG_SYSCTL */ /* * Device notifier */ @@ -4187,6 +4288,11 @@ int __init addrconf_init(void) { int err = 0; +#ifdef CONFIG_VE + get_ve0()->_ipv6_devconf = &global_ipv6_devconf; + get_ve0()->_ipv6_devconf_dflt = &global_ipv6_devconf_dflt; +#endif + /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup * before it can bring up and give link-local addresses @@ -4239,7 +4345,7 @@ int __init addrconf_init(void) 
#ifdef CONFIG_SYSCTL addrconf_sysctl.sysctl_header = register_sysctl_table(addrconf_sysctl.addrconf_root_dir); - addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); + addrconf_sysctl_register(NULL, &global_ipv6_devconf_dflt); #endif return 0; @@ -4258,8 +4364,8 @@ void __exit addrconf_cleanup(void) unregister_netdevice_notifier(&ipv6_dev_notf); #ifdef CONFIG_SYSCTL - addrconf_sysctl_unregister(&ipv6_devconf_dflt); - addrconf_sysctl_unregister(&ipv6_devconf); + addrconf_sysctl_unregister(&global_ipv6_devconf_dflt); + addrconf_sysctl_unregister(&global_ipv6_devconf); #endif rtnl_lock(); diff -uprN linux-2.6.24/net/ipv6/af_inet6.c linux-2.6.24.ovz/net/ipv6/af_inet6.c --- linux-2.6.24/net/ipv6/af_inet6.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/af_inet6.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,10 @@ #ifdef CONFIG_IPV6_TUNNEL #include #endif +#ifdef CONFIG_IPV6_MIP6 +#include +#endif +#include #include #include @@ -94,9 +99,6 @@ static int inet6_create(struct net *net, int try_loading_module = 0; int err; - if (net != &init_net) - return -EAFNOSUPPORT; - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM && !inet_ehash_secret) @@ -149,6 +151,10 @@ lookup_protocol: goto out_rcu_unlock; } + err = vz_security_protocol_check(answer->protocol); + if (err < 0) + goto out_rcu_unlock; + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -166,6 +172,13 @@ lookup_protocol: if (sk == NULL) goto out; + err = -ENOBUFS; + if (ub_sock_charge(sk, PF_INET6, sock->type)) + goto out_sk_free; + /* if charge was successful, sock_init_data() MUST be called to + * set sk->sk_type. otherwise sk will be uncharged to wrong resource + */ + sock_init_data(sock, sk); err = 0; @@ -240,6 +253,9 @@ out: out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_free: + sk_free(sk); + return err; } @@ -302,7 +318,7 @@ int inet6_bind(struct socket *sock, stru err = -EINVAL; goto out; } - dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out; @@ -713,31 +729,31 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static int __init init_ipv6_mibs(void) { - if (snmp_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), + if (snmp_mib_init((void **)ve_ipv6_statistics, sizeof (struct ipstats_mib), __alignof__(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), + if (snmp_mib_init((void **)ve_icmpv6_statistics, sizeof (struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void **)icmpv6msg_statistics, + if (snmp_mib_init((void **)ve_icmpv6msg_statistics, sizeof (struct icmpv6msg_mib), __alignof__(struct icmpv6_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), + if (snmp_mib_init((void **)ve_udp_stats_in6, sizeof (struct udp_mib), __alignof__(struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)udplite_stats_in6, sizeof (struct udp_mib), + if (snmp_mib_init((void **)ve_udplite_stats_in6, sizeof (struct udp_mib), __alignof__(struct udp_mib)) < 0) goto err_udplite_mib; return 0; err_udplite_mib: - snmp_mib_free((void **)udp_stats_in6); + snmp_mib_free((void **)ve_udp_stats_in6); err_udp_mib: - snmp_mib_free((void **)icmpv6msg_statistics); + snmp_mib_free((void **)ve_icmpv6msg_statistics); 
err_icmpmsg_mib: - snmp_mib_free((void **)icmpv6_statistics); + snmp_mib_free((void **)ve_icmpv6_statistics); err_icmp_mib: - snmp_mib_free((void **)ipv6_statistics); + snmp_mib_free((void **)ve_ipv6_statistics); err_ip_mib: return -ENOMEM; @@ -745,11 +761,11 @@ err_ip_mib: static void cleanup_ipv6_mibs(void) { - snmp_mib_free((void **)ipv6_statistics); - snmp_mib_free((void **)icmpv6_statistics); - snmp_mib_free((void **)icmpv6msg_statistics); - snmp_mib_free((void **)udp_stats_in6); - snmp_mib_free((void **)udplite_stats_in6); + snmp_mib_free((void **)ve_ipv6_statistics); + snmp_mib_free((void **)ve_icmpv6_statistics); + snmp_mib_free((void **)ve_icmpv6msg_statistics); + snmp_mib_free((void **)ve_udp_stats_in6); + snmp_mib_free((void **)ve_udplite_stats_in6); } static int __init inet6_init(void) diff -uprN linux-2.6.24/net/ipv6/anycast.c linux-2.6.24.ovz/net/ipv6/anycast.c --- linux-2.6.24/net/ipv6/anycast.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/anycast.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -82,7 +83,7 @@ int ipv6_sock_ac_join(struct sock *sk, i struct net_device *dev = NULL; struct inet6_dev *idev; struct ipv6_ac_socklist *pac; - int ishost = !ipv6_devconf.forwarding; + int ishost = !ve_ipv6_devconf.forwarding; int err = 0; if (!capable(CAP_NET_ADMIN)) @@ -112,10 +113,10 @@ int ipv6_sock_ac_join(struct sock *sk, i } else { /* router, no matching interface: just pick one */ - dev = dev_get_by_flags(&init_net, IFF_UP, IFF_UP|IFF_LOOPBACK); + dev = dev_get_by_flags(get_exec_env()->ve_ns->net_ns, IFF_UP, IFF_UP|IFF_LOOPBACK); } } else - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifindex); if (dev == NULL) { err = -ENODEV; @@ -196,7 +197,7 @@ int ipv6_sock_ac_drop(struct sock *sk, i write_unlock_bh(&ipv6_sk_ac_lock); - dev = dev_get_by_index(&init_net, pac->acl_ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, pac->acl_ifindex); if (dev) { ipv6_dev_ac_dec(dev, &pac->acl_addr); dev_put(dev); @@ -224,7 +225,7 @@ void ipv6_sock_ac_close(struct sock *sk) if (pac->acl_ifindex != prev_index) { if (dev) dev_put(dev); - dev = dev_get_by_index(&init_net, pac->acl_ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -429,7 +430,7 @@ int ipv6_chk_acast_addr(struct net_devic if (dev) return ipv6_chk_acast_dev(dev, addr); read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) + for_each_netdev(get_exec_env()->ve_ns->net_ns, dev) if (ipv6_chk_acast_dev(dev, addr)) { found = 1; break; @@ -453,8 +454,9 @@ static inline struct ifacaddr6 *ac6_get_ struct ac6_iter_state *state = ac6_seq_private(seq); state->idev = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, state->dev) { struct inet6_dev *idev; + idev = in6_dev_get(state->dev); if (!idev) continue; @@ -485,6 +487,8 @@ static struct ifacaddr6 *ac6_get_next(st state->idev = NULL; break; } + if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) + continue; state->idev = in6_dev_get(state->dev); if (!state->idev) continue; diff -uprN linux-2.6.24/net/ipv6/datagram.c linux-2.6.24.ovz/net/ipv6/datagram.c --- linux-2.6.24/net/ipv6/datagram.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/datagram.c 2008-03-25 18:53:59.000000000 -0500 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include 
#include @@ -544,7 +545,7 @@ int datagram_send_ctl(struct msghdr *msg if (!src_info->ipi6_ifindex) return -EINVAL; else { - dev = dev_get_by_index(&init_net, src_info->ipi6_ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, src_info->ipi6_ifindex); if (!dev) return -ENODEV; } diff -uprN linux-2.6.24/net/ipv6/exthdrs.c linux-2.6.24.ovz/net/ipv6/exthdrs.c --- linux-2.6.24/net/ipv6/exthdrs.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/exthdrs.c 2008-03-25 18:53:59.000000000 -0500 @@ -352,7 +352,7 @@ static int ipv6_rthdr_rcv(struct sk_buff int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; - int accept_source_route = ipv6_devconf.accept_source_route; + int accept_source_route = ve_ipv6_devconf.accept_source_route; idev = in6_dev_get(skb->dev); if (idev) { diff -uprN linux-2.6.24/net/ipv6/inet6_connection_sock.c linux-2.6.24.ovz/net/ipv6/inet6_connection_sock.c --- linux-2.6.24/net/ipv6/inet6_connection_sock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/inet6_connection_sock.c 2008-03-25 18:53:59.000000000 -0500 @@ -25,6 +25,8 @@ #include #include #include +#include +#include int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) @@ -35,6 +37,7 @@ int inet6_csk_bind_conflict(const struct /* We must walk the whole port owner list in this case. -DaveM */ sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && + ve_accessible_strict(sk->owner_env, sk2->owner_env) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && diff -uprN linux-2.6.24/net/ipv6/inet6_hashtables.c linux-2.6.24.ovz/net/ipv6/inet6_hashtables.c --- linux-2.6.24/net/ipv6/inet6_hashtables.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/inet6_hashtables.c 2008-03-25 18:53:59.000000000 -0500 @@ -67,7 +67,8 @@ struct sock *__inet6_lookup_established( /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); + struct ve_struct *env = get_exec_env(); + unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport, VEID(env)); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); @@ -75,7 +76,7 @@ struct sock *__inet6_lookup_established( read_lock(lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. */ - if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif, env)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ @@ -88,6 +89,7 @@ struct sock *__inet6_lookup_established( if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + ve_accessible_strict(tw->tw_owner_env, VEID(env)) && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } @@ -110,9 +112,15 @@ struct sock *inet6_lookup_listener(struc const struct hlist_node *node; struct sock *result = NULL; int score, hiscore = 0; + struct ve_struct *env; + + env = get_exec_env(); read_lock(&hashinfo->lhash_lock); - sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { + sk_for_each(sk, node, &hashinfo->listening_hash[ + inet_lhashfn(hnum, VEID(env))]) { + if (!ve_accessible_strict(sk->owner_env, env)) + continue; if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { const struct ipv6_pinfo *np = inet6_sk(sk); @@ -163,7 +171,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + struct ve_struct *ve) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -173,7 +182,7 @@ static int __inet6_check_established(str const int dif = sk->sk_bound_dev_if; const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); const unsigned int hash = inet6_ehashfn(daddr, lport, saddr, - inet->dport); + inet->dport, VEID(ve)); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; @@ -193,7 +202,8 @@ static int __inet6_check_established(str sk2->sk_family == PF_INET6 && ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && - (!sk2->sk_bound_dev_if || sk2->sk_bound_dev_if == dif)) { + (!sk2->sk_bound_dev_if || sk2->sk_bound_dev_if == dif) && + ve_accessible_strict(tw->tw_owner_env, VEID(ve))) { if (twsk_unique(sk, sk2, twp)) goto unique; else @@ -204,7 +214,7 @@ static int __inet6_check_established(str /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif, ve)) goto not_unique; } @@ -253,7 +263,9 @@ int inet6_hash_connect(struct inet_timew struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; + struct ve_struct *ve; + ve = sk->owner_env; if (snum == 0) { int i, port, low, high, remaining; static u32 hint; @@ -267,7 +279,7 @@ int inet6_hash_connect(struct inet_timew local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size, VEID(ve))]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -275,20 +287,21 @@ int inet6_hash_connect(struct inet_timew * unique enough. 
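In the lookup code above the container is threaded through twice: the VEID is mixed into inet6_ehashfn()/inet_lhashfn(), and every candidate socket or timewait bucket is additionally filtered with ve_accessible_strict(), so identical address/port pairs in different containers never see each other. A rough sketch of the hashing half; ve_ehashfn() and its exact mixing are illustrative assumptions, not the patch's real function body:

#include <linux/in6.h>
#include <linux/jhash.h>

static inline unsigned int ve_ehashfn(const struct in6_addr *laddr, u16 lport,
				      const struct in6_addr *faddr, __be16 fport,
				      u32 veid)
{
	u32 h;

	/* fold a few address words and both ports together ... */
	h  = (__force u32)laddr->s6_addr32[3] ^ lport;
	h ^= (__force u32)faddr->s6_addr32[3] ^ (__force u32)fport;
	/* ... and let the container id perturb the result, so the same
	 * 4-tuple used in two VEs lands in two different hash chains */
	return jhash_1word(h, veid);
}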
*/ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { + if (tb->port == port && + ve_accessible_strict(tb->owner_env, ve)) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; if (!__inet6_check_established(death_row, sk, port, - &tw)) + &tw, ve)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - head, port); + head, port, ve); if (!tb) { spin_unlock(&head->lock); break; @@ -323,7 +336,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); @@ -334,7 +347,7 @@ ok: } else { spin_unlock(&head->lock); /* No definite answer... Walk to established hash table */ - ret = __inet6_check_established(death_row, sk, snum, NULL); + ret = __inet6_check_established(death_row, sk, snum, NULL, ve); out: local_bh_enable(); return ret; diff -uprN linux-2.6.24/net/ipv6/ip6_fib.c linux-2.6.24.ovz/net/ipv6/ip6_fib.c --- linux-2.6.24/net/ipv6/ip6_fib.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/ip6_fib.c 2008-03-25 18:53:59.000000000 -0500 @@ -174,12 +174,28 @@ static struct fib6_table fib6_main_tbl = }, }; +#ifdef CONFIG_VE +static inline void prepare_fib6_table(void) +{ + get_ve0()->_fib6_table = &fib6_main_tbl; +} + +#define fib6_main_tbl (*(get_exec_env()->_fib6_table)) +#else +#define prepare_fib6_table() do { } while (0) +#endif + #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 1 #endif + +#ifdef CONFIG_VE +#define fib_table_hash (get_exec_env()->_fib6_table_hash) +#else static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +#endif static void fib6_link_table(struct fib6_table *tb) { @@ -193,11 +209,16 @@ static void fib6_link_table(struct fib6_ h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1); - /* - * No protection necessary, this is the only list mutatation - * operation, tables never disappear once they exist. 
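The "#define fib6_main_tbl (*(get_exec_env()->_fib6_table))" line above is the same redirection idiom used for the MIB arrays: an identifier that used to name one global structure now expands to the calling container's instance, so the remaining ip6_fib.c code needs no further edits. A stripped-down model of the trick; every name below is invented for the example:

struct table { int entries; };

static struct table host_table;			/* the former single global           */
static struct table *cur_table = &host_table;	/* stand-in for the get_exec_env() field */

#define main_table (*cur_table)			/* legacy name, per-container storage */

static void add_entry(void)
{
	main_table.entries++;			/* unmodified callers keep compiling  */
}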
- */ + write_lock_bh(&tb->tb6_lock); hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]); + write_unlock_bh(&tb->tb6_lock); +} + +static void fib6_unlink_table(struct fib6_table *tb) +{ + write_lock_bh(&tb->tb6_lock); + hlist_del_rcu(&tb->tb6_hlist); + write_unlock_bh(&tb->tb6_lock); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -209,6 +230,16 @@ static struct fib6_table fib6_local_tbl }, }; +#ifdef CONFIG_VE +static inline void prepare_fib6_local_table(void) +{ + get_ve0()->_fib6_local_table = &fib6_local_tbl; +} +#define fib6_local_tbl (*(get_exec_env())->_fib6_local_table) +#else +#define prepare_fib6_local_table() do { } while (0) +#endif + static struct fib6_table *fib6_alloc_table(u32 id) { struct fib6_table *table; @@ -261,12 +292,18 @@ struct fib6_table *fib6_get_table(u32 id return NULL; } -static void __init fib6_tables_init(void) +void fib6_tables_init(void) { fib6_link_table(&fib6_main_tbl); fib6_link_table(&fib6_local_tbl); } +void fib6_tables_cleanup(void) +{ + fib6_unlink_table(&fib6_main_tbl); + fib6_unlink_table(&fib6_local_tbl); +} + #else struct fib6_table *fib6_new_table(u32 id) @@ -285,11 +322,16 @@ struct dst_entry *fib6_rule_lookup(struc return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags); } -static void __init fib6_tables_init(void) +void fib6_tables_init(void) { fib6_link_table(&fib6_main_tbl); } +void fib6_tables_cleanup(void) +{ + fib6_unlink_table(&fib6_main_tbl); +} + #endif static int fib6_dump_node(struct fib6_walker_t *w) @@ -1371,9 +1413,13 @@ void fib6_clean_all(int (*func)(struct r for (h = 0; h < FIB_TABLE_HASHSZ; h++) { hlist_for_each_entry_rcu(table, node, &fib_table_hash[h], tb6_hlist) { + struct ve_struct *old_env; + + old_env = set_exec_env(table->owner_env); write_lock_bh(&table->tb6_lock); fib6_clean_tree(&table->tb6_root, func, prune, arg); write_unlock_bh(&table->tb6_lock); + (void)set_exec_env(old_env); } } rcu_read_unlock(); @@ -1441,6 +1487,8 @@ static int fib6_age(struct rt6_info *rt, static DEFINE_SPINLOCK(fib6_gc_lock); +LIST_HEAD(fib6_table_list); + void fib6_run_gc(unsigned long dummy) { if (dummy != ~0UL) { @@ -1473,9 +1521,13 @@ void __init fib6_init(void) { fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + prepare_fib6_table(); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + prepare_fib6_local_table(); +#endif fib6_tables_init(); __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); diff -uprN linux-2.6.24/net/ipv6/ip6_flowlabel.c linux-2.6.24.ovz/net/ipv6/ip6_flowlabel.c --- linux-2.6.24/net/ipv6/ip6_flowlabel.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/ip6_flowlabel.c 2008-03-25 18:53:59.000000000 -0500 @@ -448,6 +448,9 @@ int ipv6_flowlabel_opt(struct sock *sk, struct ip6_flowlabel *fl, *fl1 = NULL; + if (!ve_is_super(get_exec_env())) + return -EPERM; + if (optlen < sizeof(freq)) return -EINVAL; diff -uprN linux-2.6.24/net/ipv6/ip6_input.c linux-2.6.24.ovz/net/ipv6/ip6_input.c --- linux-2.6.24/net/ipv6/ip6_input.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/ip6_input.c 2008-03-25 18:53:59.000000000 -0500 @@ -61,11 +61,6 @@ int ipv6_rcv(struct sk_buff *skb, struct u32 pkt_len; struct inet6_dev *idev; - if (dev->nd_net != &init_net) { - kfree_skb(skb); - return 0; - } - if (skb->pkt_type == PACKET_OTHERHOST) { kfree_skb(skb); return 0; diff -uprN linux-2.6.24/net/ipv6/ip6_output.c linux-2.6.24.ovz/net/ipv6/ip6_output.c --- linux-2.6.24/net/ipv6/ip6_output.c 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/ip6_output.c 2008-03-25 18:53:59.000000000 -0500 @@ -378,7 +378,7 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); - if (ipv6_devconf.forwarding == 0) + if (ve_ipv6_devconf.forwarding == 0) goto error; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { @@ -422,7 +422,7 @@ int ip6_forward(struct sk_buff *skb) } /* XXX: idev->cnf.proxy_ndp? */ - if (ipv6_devconf.proxy_ndp && + if (ve_ipv6_devconf.proxy_ndp && pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) @@ -488,6 +488,20 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } + /* + * We try to optimize forwarding of VE packets: + * do not decrement TTL (and so save skb_cow) + * during forwarding of outgoing pkts from VE. + * For incoming pkts we still do ttl decr, + * since such skb is not cloned and does not require + * actual cow. So, there is at least one place + * in pkts path with mandatory ttl decr, that is + * sufficient to prevent routing loops. + */ + hdr = ipv6_hdr(skb); + if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */ + goto no_ttl_decr; + if (skb_cow(skb, dst->dev->hard_header_len)) { IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -499,6 +513,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; +no_ttl_decr: IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); diff -uprN linux-2.6.24/net/ipv6/ipv6_sockglue.c linux-2.6.24.ovz/net/ipv6/ipv6_sockglue.c --- linux-2.6.24/net/ipv6/ipv6_sockglue.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/ipv6_sockglue.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -544,7 +545,7 @@ done: if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) goto e_inval; - if (__dev_get_by_index(&init_net, val) == NULL) { + if (__dev_get_by_index(get_exec_env()->ve_ns->net_ns, val) == NULL) { retv = -ENODEV; break; } @@ -1021,7 +1022,7 @@ static int do_ipv6_getsockopt(struct soc dst_release(dst); } if (val < 0) - val = ipv6_devconf.hop_limit; + val = ve_ipv6_devconf.hop_limit; break; } diff -uprN linux-2.6.24/net/ipv6/mcast.c linux-2.6.24.ovz/net/ipv6/mcast.c --- linux-2.6.24/net/ipv6/mcast.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/mcast.c 2008-03-25 18:53:59.000000000 -0500 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -156,7 +157,7 @@ static int ip6_mc_leave_src(struct sock #define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 -#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \ +#define MLD_V1_SEEN(idev) (ve_ipv6_devconf.force_mld_version == 1 || \ (idev)->cnf.force_mld_version == 1 || \ ((idev)->mc_v1_seen && \ time_before(jiffies, (idev)->mc_v1_seen))) @@ -215,7 +216,7 @@ int ipv6_sock_mc_join(struct sock *sk, i dst_release(&rt->u.dst); } } else - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifindex); if (dev == NULL) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); @@ -248,6 +249,7 @@ int ipv6_sock_mc_join(struct sock *sk, i return 0; } +EXPORT_SYMBOL_GPL(ipv6_sock_mc_join); /* * socket leave on multicast group @@ -266,7 +268,7 @@ int ipv6_sock_mc_drop(struct sock *sk, i *lnk = mc_lst->next; write_unlock_bh(&ipv6_sk_mc_lock); - 
if ((dev = dev_get_by_index(&init_net, mc_lst->ifindex)) != NULL) { + if ((dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, mc_lst->ifindex)) != NULL) { struct inet6_dev *idev = in6_dev_get(dev); (void) ip6_mc_leave_src(sk, mc_lst, idev); @@ -301,7 +303,7 @@ static struct inet6_dev *ip6_mc_find_dev dst_release(&rt->u.dst); } } else - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, ifindex); if (!dev) return NULL; @@ -332,7 +334,7 @@ void ipv6_sock_mc_close(struct sock *sk) np->ipv6_mc_list = mc_lst->next; write_unlock_bh(&ipv6_sk_mc_lock); - dev = dev_get_by_index(&init_net, mc_lst->ifindex); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = in6_dev_get(dev); @@ -2170,15 +2172,18 @@ static void igmp6_leave_group(struct ifm static void mld_gq_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); idev->mc_gq_running = 0; mld_send_report(idev, NULL); __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_timer_expire(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); mld_send_cr(idev); if (idev->mc_ifc_count) { @@ -2187,6 +2192,7 @@ static void mld_ifc_timer_expire(unsigne mld_ifc_start_timer(idev, idev->mc_maxdelay); } __in6_dev_put(idev); + set_exec_env(old_env); } static void mld_ifc_event(struct inet6_dev *idev) @@ -2201,6 +2207,7 @@ static void mld_ifc_event(struct inet6_d static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env); if (MLD_V1_SEEN(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); @@ -2212,6 +2219,7 @@ static void igmp6_timer_handler(unsigned ma->mca_flags &= ~MAF_TIMER_RUNNING; spin_unlock(&ma->mca_lock); ma_put(ma); + set_exec_env(old_env); } /* Device going down */ @@ -2326,8 +2334,9 @@ static inline struct ifmcaddr6 *igmp6_mc struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); state->idev = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, state->dev) { struct inet6_dev *idev; + idev = in6_dev_get(state->dev); if (!idev) continue; @@ -2454,8 +2463,9 @@ static inline struct ip6_sf_list *igmp6_ state->idev = NULL; state->im = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev(get_exec_env()->ve_ns->net_ns, state->dev) { struct inet6_dev *idev; + idev = in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; diff -uprN linux-2.6.24/net/ipv6/ndisc.c linux-2.6.24.ovz/net/ipv6/ndisc.c --- linux-2.6.24/net/ipv6/ndisc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/ndisc.c 2008-03-25 18:53:59.000000000 -0500 @@ -128,7 +128,7 @@ static struct neigh_ops ndisc_direct_ops .queue_xmit = dev_queue_xmit, }; -struct neigh_table nd_tbl = { +struct neigh_table global_nd_tbl = { .family = AF_INET6, .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), .key_len = sizeof(struct in6_addr), @@ -139,7 +139,7 @@ struct neigh_table nd_tbl = { .proxy_redo = pndisc_redo, .id = "ndisc_cache", .parms = { - .tbl = &nd_tbl, + .tbl = &global_nd_tbl, .base_reachable_time = 30 * HZ, .retrans_time = 1 * HZ, .gc_staletime = 60 * HZ, @@ -787,7 +787,7 @@ static void ndisc_recv_ns(struct sk_buff if (ipv6_chk_acast_addr(dev, &msg->target) || 
(idev->cnf.forwarding && - (ipv6_devconf.proxy_ndp || idev->cnf.proxy_ndp) && + (ve_ipv6_devconf.proxy_ndp || idev->cnf.proxy_ndp) && (pneigh = pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) != NULL)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && @@ -928,7 +928,7 @@ static void ndisc_recv_na(struct sk_buff * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && - ipv6_devconf.forwarding && ipv6_devconf.proxy_ndp && + ve_ipv6_devconf.forwarding && ve_ipv6_devconf.proxy_ndp && pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) { /* XXX: idev->cnf.prixy_ndp */ goto out; @@ -1610,9 +1610,6 @@ static int ndisc_netdev_event(struct not { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); @@ -1729,6 +1726,55 @@ static int ndisc_ifinfo_sysctl_strategy( #endif +static int ndisc_net_init(struct net *net) +{ + struct ve_struct *ve = get_exec_env(); + int err; + + ve->ve_nd_tbl = kmemdup(ve0.ve_nd_tbl, sizeof(struct neigh_table), + GFP_KERNEL); + if (ve->ve_nd_tbl == NULL) + return -ENOMEM; + ve->ve_nd_tbl->parms.tbl = ve->ve_nd_tbl; + + err = neigh_table_init(ve->ve_nd_tbl); + if (err) + goto out_free; +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, + "ipv6", + &ndisc_ifinfo_sysctl_change, + &ndisc_ifinfo_sysctl_strategy); +#endif + err = 0; +out: + return err; + +out_free: + kfree(ve->ve_nd_tbl); + ve->ve_nd_tbl = NULL; + goto out; +} + +static void ndisc_net_exit(struct net *net) +{ + struct ve_struct *ve = get_exec_env(); + + if (ve->ve_nd_tbl) { +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&ve->ve_nd_tbl->parms); +#endif + neigh_table_clear(ve->ve_nd_tbl); + kfree(ve->ve_nd_tbl); + ve->ve_nd_tbl = NULL; + } +} + +static struct pernet_operations ndisc_net_ops = { + .init = ndisc_net_init, + .exit = ndisc_net_exit, +}; + int __init ndisc_init(struct net_proto_family *ops) { struct ipv6_pinfo *np; @@ -1755,15 +1801,8 @@ int __init ndisc_init(struct net_proto_f /* * Initialize the neighbour table */ - - neigh_table_init(&nd_tbl); - -#ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, - "ipv6", - &ndisc_ifinfo_sysctl_change, - &ndisc_ifinfo_sysctl_strategy); -#endif + get_ve0()->ve_nd_tbl = &global_nd_tbl; + register_pernet_subsys(&ndisc_net_ops); register_netdevice_notifier(&ndisc_netdev_notifier); return 0; @@ -1772,10 +1811,7 @@ int __init ndisc_init(struct net_proto_f void ndisc_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); -#ifdef CONFIG_SYSCTL - neigh_sysctl_unregister(&nd_tbl.parms); -#endif - neigh_table_clear(&nd_tbl); + unregister_pernet_subsys(&ndisc_net_ops); sock_release(ndisc_socket); ndisc_socket = NULL; /* For safety. 
*/ } diff -uprN linux-2.6.24/net/ipv6/netfilter/ip6_queue.c linux-2.6.24.ovz/net/ipv6/netfilter/ip6_queue.c --- linux-2.6.24/net/ipv6/netfilter/ip6_queue.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/ip6_queue.c 2008-03-25 18:53:59.000000000 -0500 @@ -489,7 +489,7 @@ __ipq_rcv_skb(struct sk_buff *skb) if (type <= IPQM_BASE) return; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); write_lock_bh(&queue_lock); @@ -519,8 +519,12 @@ __ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_skb(struct sk_buff *skb) { + struct ve_struct *old_ve; + mutex_lock(&ipqnl_mutex); + old_ve = set_exec_env(skb->owner_env); __ipq_rcv_skb(skb); + (void)set_exec_env(old_ve); mutex_unlock(&ipqnl_mutex); } @@ -530,9 +534,6 @@ ipq_rcv_dev_event(struct notifier_block { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -552,7 +553,7 @@ ipq_rcv_nl_event(struct notifier_block * if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW && n->pid) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if (n->pid == peer_pid) __ipq_reset(); write_unlock_bh(&queue_lock); } diff -uprN linux-2.6.24/net/ipv6/netfilter/ip6_tables.c linux-2.6.24.ovz/net/ipv6/netfilter/ip6_tables.c --- linux-2.6.24/net/ipv6/netfilter/ip6_tables.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/ip6_tables.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,7 @@ #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -1211,9 +1212,14 @@ do_ip6t_set_ctl(struct sock *sk, int cmd { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET6].next) + return -ENOENT; +#endif + switch (cmd) { case IP6T_SO_SET_REPLACE: ret = do_replace(user, len); @@ -1236,9 +1242,14 @@ do_ip6t_get_ctl(struct sock *sk, int cmd { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_VE_NET_ADMIN)) return -EPERM; +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_xt_tables[AF_INET6].next) + return -ENOENT; +#endif + switch (cmd) { case IP6T_SO_GET_INFO: { char name[IP6T_TABLE_MAXNAMELEN]; @@ -1334,18 +1345,18 @@ do_ip6t_get_ctl(struct sock *sk, int cmd return ret; } -int ip6t_register_table(struct xt_table *table, +struct ip6t_table *ip6t_register_table(struct xt_table *table, const struct ip6t_replace *repl) { int ret; struct xt_table_info *newinfo; static struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* choose the copy on our node/cpu */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; @@ -1358,28 +1369,29 @@ int ip6t_register_table(struct xt_table repl->underflow); if (ret != 0) { xt_free_table_info(newinfo); - return ret; + return ERR_PTR(ret); } - ret = xt_register_table(table, &bootstrap, newinfo); - if (ret != 0) { + table = virt_xt_register_table(table, &bootstrap, newinfo); + if (IS_ERR(table)) xt_free_table_info(newinfo); - return ret; - } - - return 0; + return table; } void ip6t_unregister_table(struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; + struct module *me; - private = 
xt_unregister_table(table); + me = table->me; + private = virt_xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + if (private->number > private->initial_entries) + module_put(me); xt_free_table_info(private); } @@ -1474,12 +1486,30 @@ static struct xt_match icmp6_matchstruct .family = AF_INET6, }; +static int init_ip6tables(void) +{ +#ifdef CONFIG_VE_IPTABLES + if (get_exec_env()->_xt_tables[AF_INET6].next != NULL) + return -EEXIST; +#endif + + return xt_proto_init(AF_INET6); +} + +static void fini_ip6tables(void) +{ +#ifdef CONFIG_VE_IPTABLES + get_exec_env()->_xt_tables[AF_INET6].next = NULL; +#endif + xt_proto_fini(AF_INET6); +} + static int __init ip6_tables_init(void) { int ret; - ret = xt_proto_init(AF_INET6); - if (ret < 0) + ret = init_ip6tables(); + if (ret) goto err1; /* Noone else will be downing sem now, so we won't sleep */ @@ -1498,6 +1528,10 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; + KSYMRESOLVE(init_ip6tables); + KSYMRESOLVE(fini_ip6tables); + KSYMMODRESOLVE(ip6_tables); + printk(KERN_INFO "ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; @@ -1508,18 +1542,21 @@ err4: err3: xt_unregister_target(&ip6t_standard_target); err2: - xt_proto_fini(AF_INET6); + fini_ip6tables(); err1: return ret; } static void __exit ip6_tables_fini(void) { + KSYMMODUNRESOLVE(ip6_tables); + KSYMUNRESOLVE(init_ip6tables); + KSYMUNRESOLVE(fini_ip6tables); nf_unregister_sockopt(&ip6t_sockopts); xt_unregister_match(&icmp6_matchstruct); xt_unregister_target(&ip6t_error_target); xt_unregister_target(&ip6t_standard_target); - xt_proto_fini(AF_INET6); + fini_ip6tables(); } /* @@ -1605,5 +1642,5 @@ EXPORT_SYMBOL(ip6t_do_table); EXPORT_SYMBOL(ip6t_ext_hdr); EXPORT_SYMBOL(ipv6_find_hdr); -module_init(ip6_tables_init); +subsys_initcall(ip6_tables_init); module_exit(ip6_tables_fini); diff -uprN linux-2.6.24/net/ipv6/netfilter/ip6table_filter.c linux-2.6.24.ovz/net/ipv6/netfilter/ip6table_filter.c --- linux-2.6.24/net/ipv6/netfilter/ip6table_filter.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/ip6table_filter.c 2008-03-25 18:53:59.000000000 -0500 @@ -11,12 +11,20 @@ #include #include +#include #include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("ip6tables filter table"); +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_filter (get_exec_env()->_ve_ip6t_filter_pf) +#else +#define ve_packet_filter &packet_filter +#endif + #define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) static struct @@ -24,7 +32,7 @@ static struct struct ip6t_replace repl; struct ip6t_standard entries[3]; struct ip6t_error term; -} initial_table __initdata = { +} initial_table = { .repl = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, @@ -65,7 +73,7 @@ ip6t_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, &packet_filter); + return ip6t_do_table(skb, hook, in, out, ve_packet_filter); } static unsigned int @@ -85,7 +93,7 @@ ip6t_local_out_hook(unsigned int hook, } #endif - return ip6t_do_table(skb, hook, in, out, &packet_filter); + return ip6t_do_table(skb, hook, in, out, ve_packet_filter); } static struct nf_hook_ops ip6t_ops[] = { @@ -116,22 +124,19 @@ static struct nf_hook_ops ip6t_ops[] = { static int forward = NF_ACCEPT; 
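Since ip6t_register_table() above now returns the registered (possibly per-VE) table instead of an int, its callers, starting with the filter table setup that follows, switch to the standard err.h pointer-encoding idiom. A condensed sketch of that calling pattern; example_register_filter() is a hypothetical wrapper name:

#include <linux/err.h>

static int example_register_filter(void)
{
	struct ip6t_table *tmp;

	tmp = ip6t_register_table(&packet_filter, &initial_table.repl);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);	/* the error code travels inside the pointer */
	ve_packet_filter = tmp;		/* remember this container's table instance  */
	return 0;
}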
module_param(forward, bool, 0000); -static int __init ip6table_filter_init(void) +int init_ip6table_filter(void) { int ret; - - if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; + struct ip6t_table *tmp_filter; /* Register table */ - ret = ip6t_register_table(&packet_filter, &initial_table.repl); - if (ret < 0) - return ret; + tmp_filter = ip6t_register_table(&packet_filter, + &initial_table.repl); + if (IS_ERR(tmp_filter)) + return PTR_ERR(tmp_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = tmp_filter; +#endif /* Register hooks */ ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); @@ -141,14 +146,50 @@ static int __init ip6table_filter_init(v return ret; cleanup_table: - ip6t_unregister_table(&packet_filter); + ip6t_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif return ret; } -static void __exit ip6table_filter_fini(void) +void fini_ip6table_filter(void) { nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - ip6t_unregister_table(&packet_filter); + ip6t_unregister_table(ve_packet_filter); +#ifdef CONFIG_VE_IPTABLES + ve_packet_filter = NULL; +#endif +} + +static int __init ip6table_filter_init(void) +{ + int err; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + err = init_ip6table_filter(); + if (err < 0) + return err; + + KSYMRESOLVE(init_ip6table_filter); + KSYMRESOLVE(fini_ip6table_filter); + KSYMMODRESOLVE(ip6table_filter); + return 0; +} + +static void __exit ip6table_filter_fini(void) +{ + KSYMMODUNRESOLVE(ip6table_filter); + KSYMUNRESOLVE(init_ip6table_filter); + KSYMUNRESOLVE(fini_ip6table_filter); + fini_ip6table_filter(); } module_init(ip6table_filter_init); diff -uprN linux-2.6.24/net/ipv6/netfilter/ip6table_mangle.c linux-2.6.24.ovz/net/ipv6/netfilter/ip6table_mangle.c --- linux-2.6.24/net/ipv6/netfilter/ip6table_mangle.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/ip6table_mangle.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ */ #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); @@ -26,7 +27,7 @@ static struct struct ip6t_replace repl; struct ip6t_standard entries[5]; struct ip6t_error term; -} initial_table __initdata = { +} initial_table = { .repl = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, @@ -65,6 +66,13 @@ static struct xt_table packet_mangler = .af = AF_INET6, }; +#ifdef CONFIG_VE_IPTABLES +#include +#define ve_packet_mangler (get_exec_env()->_ip6t_mangle_table) +#else +#define ve_packet_mangler &packet_mangler +#endif + /* The work comes in here from netfilter.c. 
*/ static unsigned int ip6t_route_hook(unsigned int hook, @@ -73,7 +81,7 @@ ip6t_route_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, &packet_mangler); + return ip6t_do_table(skb, hook, in, out, ve_packet_mangler); } static unsigned int @@ -108,7 +116,7 @@ ip6t_local_hook(unsigned int hook, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, hook, in, out, &packet_mangler); + ret = ip6t_do_table(skb, hook, in, out, ve_packet_mangler); if (ret != NF_DROP && ret != NF_STOLEN && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) @@ -158,14 +166,19 @@ static struct nf_hook_ops ip6t_ops[] = { }, }; -static int __init ip6table_mangle_init(void) +int init_ip6table_mangle(void) { int ret; + struct ip6t_table *tmp_mangler; /* Register table */ - ret = ip6t_register_table(&packet_mangler, &initial_table.repl); - if (ret < 0) - return ret; + tmp_mangler = ip6t_register_table(&packet_mangler, + &initial_table.repl); + if (IS_ERR(tmp_mangler)) + return PTR_ERR(tmp_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = tmp_mangler; +#endif /* Register hooks */ ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); @@ -175,14 +188,42 @@ static int __init ip6table_mangle_init(v return ret; cleanup_table: - ip6t_unregister_table(&packet_mangler); + ip6t_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif return ret; } -static void __exit ip6table_mangle_fini(void) +void fini_ip6table_mangle(void) { nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - ip6t_unregister_table(&packet_mangler); + ip6t_unregister_table(ve_packet_mangler); +#ifdef CONFIG_VE_IPTABLES + ve_packet_mangler = NULL; +#endif +} + +static int __init ip6table_mangle_init(void) +{ + int err; + + err = init_ip6table_mangle(); + if (err < 0) + return err; + + KSYMRESOLVE(init_ip6table_mangle); + KSYMRESOLVE(fini_ip6table_mangle); + KSYMMODRESOLVE(ip6table_mangle); + return 0; +} + +static void __exit ip6table_mangle_fini(void) +{ + KSYMMODUNRESOLVE(ip6table_mangle); + KSYMUNRESOLVE(init_ip6table_mangle); + KSYMUNRESOLVE(fini_ip6table_mangle); + fini_ip6table_mangle(); } module_init(ip6table_mangle_init); diff -uprN linux-2.6.24/net/ipv6/netfilter/ip6table_raw.c linux-2.6.24.ovz/net/ipv6/netfilter/ip6table_raw.c --- linux-2.6.24/net/ipv6/netfilter/ip6table_raw.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/ip6table_raw.c 2008-03-25 18:53:59.000000000 -0500 @@ -74,11 +74,12 @@ static struct nf_hook_ops ip6t_ops[] = { static int __init ip6table_raw_init(void) { int ret; + struct ip6t_table *tmp; /* Register table */ - ret = ip6t_register_table(&packet_raw, &initial_table.repl); - if (ret < 0) - return ret; + tmp = ip6t_register_table(&packet_raw, &initial_table.repl); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); /* Register hooks */ ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); diff -uprN linux-2.6.24/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c linux-2.6.24.ovz/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c --- linux-2.6.24/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c 2008-03-25 18:53:59.000000000 -0500 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -394,39 +395,101 @@ MODULE_ALIAS("nf_conntrack-" __stringify MODULE_LICENSE("GPL"); 
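Several asynchronous paths patched earlier, fib6_clean_all(), the MLD timer handlers and ipq_rcv_skb(), share one idiom: switch the execution environment to the owner of the object being serviced, do the work, then restore the previous environment. A minimal sketch of the pattern in a timer callback; do_per_ve_work() is only a placeholder:

static void example_timer_expire(unsigned long data)
{
	struct inet6_dev *idev = (struct inet6_dev *)data;
	struct ve_struct *old_env;

	old_env = set_exec_env(idev->dev->owner_env);	/* run as the owning VE          */
	do_per_ve_work(idev);				/* get_exec_env() now sees it    */
	(void)set_exec_env(old_env);			/* always restore on the way out */
}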
MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); -static int __init nf_conntrack_l3proto_ipv6_init(void) +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +static int nf_ct_proto_ipv6_sysctl_init(void) { - int ret = 0; + struct nf_conntrack_l3proto *ipv6; - need_conntrack(); + if (ve_is_super(get_exec_env())) { + ipv6 = &nf_conntrack_l3proto_ipv6; + goto out; + } + + ipv6 = kmemdup(&nf_conntrack_l3proto_ipv6, + sizeof(struct nf_conntrack_l3proto), GFP_KERNEL); + if (!ipv6) + goto no_mem_ct; + + ipv6->ctl_table_path = nf_net_netfilter_sysctl_path; + ipv6->ctl_table = clone_sysctl_template(nf_ct_ipv6_sysctl_table); + if (!ipv6->ctl_table) + goto no_mem_sys; + + ipv6->ctl_table[0].data = &ve_nf_ct_frag6_timeout; + ipv6->ctl_table[1].data = &ve_nf_ct_frag6_low_thresh; + ipv6->ctl_table[2].data = &ve_nf_ct_frag6_high_thresh; +out: + ve_nf_ct_frag6_timeout = nf_frags_ctl.timeout; + ve_nf_ct_frag6_low_thresh = nf_frags_ctl.low_thresh; + ve_nf_ct_frag6_high_thresh = nf_frags_ctl.high_thresh; + + ve_nf_conntrack_l3proto_ipv6 = ipv6; + return 0; +no_mem_sys: + kfree(ipv6); +no_mem_ct: + return -ENOMEM; +} + +static void nf_ct_proto_ipv6_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { + free_sysctl_clone(ve_nf_conntrack_l3proto_ipv6->ctl_table); + kfree(ve_nf_conntrack_l3proto_ipv6); + } +} +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ + +int init_nf_ct_l3proto_ipv6(void) +{ + int ret = -ENOMEM; + +#ifdef CONFIG_VE_IPTABLES + if (!ve_is_super(get_exec_env())) + __module_get(THIS_MODULE); + + ret = nf_ct_proto_ipv6_sysctl_init(); + if (ret < 0) + goto no_mem_ipv6; + ret = nf_ct_proto_tcp_sysctl_init(); + if (ret < 0) + goto no_mem_tcp; + ret = nf_ct_proto_udp_sysctl_init(); + if (ret < 0) + goto no_mem_udp; + ret = nf_ct_proto_icmpv6_sysctl_init(); + if (ret < 0) + goto no_mem_icmp; +#endif /* CONFIG_VE_IPTABLES */ ret = nf_ct_frag6_init(); if (ret < 0) { printk("nf_conntrack_ipv6: can't initialize frag6.\n"); - return ret; + goto cleanup_sys; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); + + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register tcp.\n"); goto cleanup_frag6; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register udp.\n"); - goto cleanup_tcp; + goto unreg_tcp; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); + ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmpv6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register icmpv6.\n"); - goto cleanup_udp; + goto unreg_udp; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv6); if (ret < 0) { printk("nf_conntrack_ipv6: can't register ipv6\n"); - goto cleanup_icmpv6; + goto unreg_icmpv6; } ret = nf_register_hooks(ipv6_conntrack_ops, @@ -434,32 +497,80 @@ static int __init nf_conntrack_l3proto_i if (ret < 0) { printk("nf_conntrack_ipv6: can't register pre-routing defrag " "hook.\n"); - goto cleanup_ipv6; + goto unreg_ipv6; } - return ret; + return 0; - cleanup_ipv6: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - cleanup_icmpv6: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - 
cleanup_frag6: +unreg_ipv6: + nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv6); +unreg_icmpv6: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6); +unreg_udp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6); +unreg_tcp: + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6); +cleanup_frag6: nf_ct_frag6_cleanup(); +cleanup_sys: +#ifdef CONFIG_VE_IPTABLES +no_mem_icmp: + nf_ct_proto_udp_sysctl_cleanup(); +no_mem_udp: + nf_ct_proto_tcp_sysctl_cleanup(); +no_mem_tcp: + nf_ct_proto_ipv6_sysctl_cleanup(); +no_mem_ipv6: + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ return ret; } +EXPORT_SYMBOL(init_nf_ct_l3proto_ipv6); -static void __exit nf_conntrack_l3proto_ipv6_fini(void) +void fini_nf_ct_l3proto_ipv6(void) { - synchronize_net(); nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv6); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6); + nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6); nf_ct_frag6_cleanup(); + +#ifdef CONFIG_VE_IPTABLES + nf_ct_proto_icmpv6_sysctl_cleanup(); + nf_ct_proto_udp_sysctl_cleanup(); + nf_ct_proto_tcp_sysctl_cleanup(); + nf_ct_proto_ipv6_sysctl_cleanup(); + if (!ve_is_super(get_exec_env())) + module_put(THIS_MODULE); +#endif /* CONFIG_VE_IPTABLES */ +} +EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv6); + +static int __init nf_conntrack_l3proto_ipv6_init(void) +{ + int ret = 0; + + need_conntrack(); + + ret = init_nf_ct_l3proto_ipv6(); + if (ret < 0) { + printk(KERN_ERR "Unable to initialize netfilter protocols\n"); + return ret; + } + KSYMRESOLVE(init_nf_ct_l3proto_ipv6); + KSYMRESOLVE(fini_nf_ct_l3proto_ipv6); + KSYMMODRESOLVE(nf_conntrack_ipv6); + return 0; +} + +static void __exit nf_conntrack_l3proto_ipv6_fini(void) +{ + synchronize_net(); + KSYMMODUNRESOLVE(nf_conntrack_ipv6); + KSYMUNRESOLVE(init_nf_ct_l3proto_ipv6); + KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv6); + fini_nf_ct_l3proto_ipv6(); } module_init(nf_conntrack_l3proto_ipv6_init); diff -uprN linux-2.6.24/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c linux-2.6.24.ovz/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c --- linux-2.6.24/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c 2008-03-25 18:53:59.000000000 -0500 @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -100,7 +101,7 @@ static int icmpv6_packet(struct nf_conn } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmpv6_timeout); } return NF_ACCEPT; @@ -156,7 +157,7 @@ icmpv6_error_message(struct sk_buff *skb /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
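nf_ct_proto_ipv6_sysctl_init() above, and the icmpv6 variant a little further down, follow one recipe: the host environment keeps the original protocol descriptor, while every other VE gets a kmemdup() copy whose ctl_table is a clone_sysctl_template() duplicate with the .data pointers re-aimed at per-VE variables. Boiled down, and with error handling omitted, the recipe looks like this:

	struct nf_conntrack_l4proto *p = &nf_conntrack_l4proto_icmpv6;

	if (!ve_is_super(get_exec_env())) {
		p = kmemdup(p, sizeof(*p), GFP_KERNEL);		/* private descriptor    */
		p->ctl_table = clone_sysctl_template(icmpv6_sysctl_table);
		p->ctl_table[0].data = &ve_nf_ct_icmpv6_timeout;	/* per-VE tunable */
	}
	ve_nf_conntrack_l4proto_icmpv6 = p;			/* what this VE will use */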
*/ if (!nf_ct_invert_tuple(&intuple, &origtuple, - &nf_conntrack_l3proto_ipv6, inproto)) { + ve_nf_conntrack_l3proto_ipv6, inproto)) { pr_debug("icmpv6_error: Can't invert tuple\n"); return -NF_ACCEPT; } @@ -294,3 +295,47 @@ struct nf_conntrack_l4proto nf_conntrack .ctl_table = icmpv6_sysctl_table, #endif }; + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_icmpv6_sysctl_init(void) +{ + struct nf_conntrack_l4proto *icmp6; + + if (ve_is_super(get_exec_env())) { + icmp6 = &nf_conntrack_l4proto_icmpv6; + goto out; + } + + icmp6 = kmemdup(&nf_conntrack_l4proto_icmpv6, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!icmp6) + goto no_mem_ct; + + icmp6->ctl_table_header = &ve_icmpv6_sysctl_header; + icmp6->ctl_table = clone_sysctl_template(icmpv6_sysctl_table); + if (!icmp6->ctl_table) + goto no_mem_sys; + + icmp6->ctl_table[0].data = &ve_nf_ct_icmpv6_timeout; +out: + ve_nf_ct_icmpv6_timeout = nf_ct_icmpv6_timeout; + + ve_nf_conntrack_l4proto_icmpv6 = icmp6; + return 0; + +no_mem_sys: + kfree(icmp6); +no_mem_ct: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_init); + +void nf_ct_proto_icmpv6_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { + free_sysctl_clone(ve_nf_conntrack_l4proto_icmpv6->ctl_table); + kfree(ve_nf_conntrack_l4proto_icmpv6); + } +} +EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff -uprN linux-2.6.24/net/ipv6/netfilter/nf_conntrack_reasm.c linux-2.6.24.ovz/net/ipv6/netfilter/nf_conntrack_reasm.c --- linux-2.6.24/net/ipv6/netfilter/nf_conntrack_reasm.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/netfilter/nf_conntrack_reasm.c 2008-03-25 18:53:59.000000000 -0500 @@ -44,6 +44,7 @@ #include #include #include +#include #define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ #define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ @@ -76,9 +77,16 @@ struct inet_frags_ctl nf_frags_ctl __rea .timeout = IPV6_FRAG_TIMEOUT, .secret_interval = 10 * 60 * HZ, }; - static struct inet_frags nf_frags; +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_frags (get_exec_env()->_nf_conntrack->_nf_frags6) +#define ve_nf_frags_ctl (get_exec_env()->_nf_conntrack->_nf_frags6_ctl) +#else +#define ve_nf_frags nf_frags +#define ve_nf_frags_ctl nf_frags_ctl +#endif + static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -90,7 +98,7 @@ static unsigned int ip6qhashfn(__be32 id a += JHASH_GOLDEN_RATIO; b += JHASH_GOLDEN_RATIO; - c += nf_frags.rnd; + c += ve_nf_frags.rnd; __jhash_mix(a, b, c); a += (__force u32)saddr->s6_addr32[3]; @@ -125,7 +133,7 @@ static inline void frag_kfree_skb(struct { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &nf_frags.mem); + atomic_sub(skb->truesize, &ve_nf_frags.mem); nf_skb_free(skb); kfree_skb(skb); } @@ -134,7 +142,7 @@ static inline void frag_kfree_skb(struct static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) { - inet_frag_put(&fq->q, &nf_frags); + inet_frag_put(&fq->q, &ve_nf_frags); } /* Kill fq entry. 
It is not destroyed immediately, @@ -142,12 +150,12 @@ static __inline__ void fq_put(struct nf_ */ static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) { - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q, &ve_nf_frags); } static void nf_ct_frag6_evictor(void) { - inet_frag_evictor(&nf_frags); + inet_frag_evictor(&ve_nf_frags); } static void nf_ct_frag6_expire(unsigned long data) @@ -183,7 +191,7 @@ fq_find(__be32 id, struct in6_addr *src, arg.dst = dst; hash = ip6qhashfn(id, src, dst); - q = inet_frag_find(&nf_frags, &arg, hash); + q = inet_frag_find(&ve_nf_frags, &arg, hash); if (q == NULL) goto oom; @@ -352,7 +360,7 @@ static int nf_ct_frag6_queue(struct nf_c skb->dev = NULL; fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &nf_frags.mem); + atomic_add(skb->truesize, &ve_nf_frags.mem); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -361,9 +369,9 @@ static int nf_ct_frag6_queue(struct nf_c fq->nhoffset = nhoff; fq->q.last_in |= FIRST_IN; } - write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &nf_frags.lru_list); - write_unlock(&nf_frags.lock); + write_lock(&ve_nf_frags.lock); + list_move_tail(&fq->q.lru_list, &ve_nf_frags.lru_list); + write_unlock(&ve_nf_frags.lock); return 0; err: @@ -429,7 +437,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_que clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &nf_frags.mem); + atomic_add(clone->truesize, &ve_nf_frags.mem); } /* We have to remove fragment header from datagram and to relocate @@ -443,7 +451,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_que skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &nf_frags.mem); + atomic_sub(head->truesize, &ve_nf_frags.mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -453,7 +461,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_que else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &nf_frags.mem); + atomic_sub(fp->truesize, &ve_nf_frags.mem); } head->next = NULL; @@ -603,7 +611,7 @@ struct sk_buff *nf_ct_frag6_gather(struc goto ret_orig; } - if (atomic_read(&nf_frags.mem) > nf_frags_ctl.high_thresh) + if (atomic_read(&ve_nf_frags.mem) > ve_nf_frags_ctl.high_thresh) nf_ct_frag6_evictor(); fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); @@ -674,23 +682,24 @@ int nf_ct_frag6_kfree_frags(struct sk_bu int nf_ct_frag6_init(void) { - nf_frags.ctl = &nf_frags_ctl; - nf_frags.hashfn = nf_hashfn; - nf_frags.constructor = ip6_frag_init; - nf_frags.destructor = NULL; - nf_frags.skb_free = nf_skb_free; - nf_frags.qsize = sizeof(struct nf_ct_frag6_queue); - nf_frags.match = ip6_frag_match; - nf_frags.frag_expire = nf_ct_frag6_expire; - inet_frags_init(&nf_frags); + memcpy(&ve_nf_frags_ctl, &nf_frags_ctl, sizeof(struct inet_frags_ctl)); + ve_nf_frags.ctl = &ve_nf_frags_ctl; + ve_nf_frags.hashfn = nf_hashfn; + ve_nf_frags.constructor = ip6_frag_init; + ve_nf_frags.destructor = NULL; + ve_nf_frags.skb_free = nf_skb_free; + ve_nf_frags.qsize = sizeof(struct nf_ct_frag6_queue); + ve_nf_frags.match = ip6_frag_match; + ve_nf_frags.frag_expire = nf_ct_frag6_expire; + inet_frags_init(&ve_nf_frags); return 0; } void nf_ct_frag6_cleanup(void) { - inet_frags_fini(&nf_frags); + inet_frags_fini(&ve_nf_frags); - nf_frags_ctl.low_thresh = 0; + 
ve_nf_frags_ctl.low_thresh = 0; nf_ct_frag6_evictor(); } diff -uprN linux-2.6.24/net/ipv6/proc.c linux-2.6.24.ovz/net/ipv6/proc.c --- linux-2.6.24/net/ipv6/proc.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/proc.c 2008-03-25 18:53:59.000000000 -0500 @@ -22,15 +22,21 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include +#ifdef CONFIG_VE +#define proc_net_devsnmp6 (get_exec_env()->_proc_net_devsnmp6) +#else static struct proc_dir_entry *proc_net_devsnmp6; +#endif static int sockstat6_seq_show(struct seq_file *seq, void *v) { @@ -171,11 +177,11 @@ static int snmp6_seq_show(struct seq_fil snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg); } else { - snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics); - snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); - snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list); + snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list); + snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, (void **)ve_icmpv6msg_statistics); + snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list); + snmp6_seq_show_item(seq, (void **)ve_udplite_stats_in6, snmp6_udplite6_list); } return 0; } @@ -233,12 +239,27 @@ int snmp6_unregister_dev(struct inet6_de return -ENOENT; if (!idev || !idev->stats.proc_dir_entry) return -EINVAL; - remove_proc_entry(idev->stats.proc_dir_entry->name, + remove_proc_glob_entry(idev->stats.proc_dir_entry->name, proc_net_devsnmp6); idev->stats.proc_dir_entry = NULL; return 0; } +int ve_snmp_proc_init(struct ve_struct *ve) +{ + ve->_proc_net_devsnmp6 = proc_mkdir("dev_snmp6", ve->_proc_net); + if (!ve->_proc_net_devsnmp6) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(ve_snmp_proc_init); + +void ve_snmp_proc_fini(struct ve_struct *ve) +{ + remove_proc_entry("dev_snmp6", ve->_proc_net); +} +EXPORT_SYMBOL(ve_snmp_proc_fini); + int __init ipv6_misc_proc_init(void) { int rc = 0; @@ -246,8 +267,7 @@ int __init ipv6_misc_proc_init(void) if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops)) goto proc_snmp6_fail; - proc_net_devsnmp6 = proc_mkdir("dev_snmp6", init_net.proc_net); - if (!proc_net_devsnmp6) + if (ve_snmp_proc_init(get_exec_env())) goto proc_dev_snmp6_fail; if (!proc_net_fops_create(&init_net, "sockstat6", S_IRUGO, &sockstat6_seq_fops)) @@ -256,7 +276,7 @@ out: return rc; proc_sockstat6_fail: - proc_net_remove(&init_net, "dev_snmp6"); + ve_snmp_proc_fini(get_exec_env()); proc_dev_snmp6_fail: proc_net_remove(&init_net, "snmp6"); proc_snmp6_fail: @@ -267,7 +287,7 @@ proc_snmp6_fail: void ipv6_misc_proc_exit(void) { proc_net_remove(&init_net, "sockstat6"); - proc_net_remove(&init_net, "dev_snmp6"); + ve_snmp_proc_fini(get_exec_env()); proc_net_remove(&init_net, "snmp6"); } diff -uprN linux-2.6.24/net/ipv6/raw.c linux-2.6.24.ovz/net/ipv6/raw.c --- linux-2.6.24/net/ipv6/raw.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/raw.c 2008-03-25 18:53:59.000000000 -0500 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,10 @@ struct sock *__raw_v6_lookup(struct sock if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; + if 
(!ve_accessible_strict(sk->owner_env, + get_exec_env())) + continue; + if (!ipv6_addr_any(&np->rcv_saddr)) { if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) goto found; @@ -283,7 +288,7 @@ static int rawv6_bind(struct sock *sk, s if (!sk->sk_bound_dev_if) goto out; - dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out; @@ -1200,8 +1205,13 @@ static struct sock *raw6_get_next(struct do { sk = sk_next(sk); try_again: - ; - } while (sk && sk->sk_family != PF_INET6); + if (!sk) + break; + if (sk->sk_family != PF_INET6) + continue; + if (ve_accessible(sk->owner_env, get_exec_env())) + break; + } while (1); if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) { sk = sk_head(&raw_v6_htable[state->bucket]); diff -uprN linux-2.6.24/net/ipv6/reassembly.c linux-2.6.24.ovz/net/ipv6/reassembly.c --- linux-2.6.24/net/ipv6/reassembly.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/reassembly.c 2008-03-25 18:53:59.000000000 -0500 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,7 @@ struct frag_queue int iif; unsigned int csum; __u16 nhoffset; + struct ve_struct *owner_ve; }; struct inet_frags_ctl ip6_frags_ctl __read_mostly = { @@ -151,7 +153,8 @@ int ip6_frag_match(struct inet_frag_queu fq = container_of(q, struct frag_queue, q); return (fq->id == arg->id && ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst)); + ipv6_addr_equal(&fq->daddr, arg->dst) && + fq->owner_ve == get_exec_env()); } EXPORT_SYMBOL(ip6_frag_match); @@ -203,8 +206,10 @@ static void ip6_frag_expire(unsigned lon { struct frag_queue *fq; struct net_device *dev = NULL; + struct ve_struct *old_ve; fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + old_ve = set_exec_env(fq->owner_ve); spin_lock(&fq->q.lock); @@ -213,7 +218,7 @@ static void ip6_frag_expire(unsigned lon fq_kill(fq); - dev = dev_get_by_index(&init_net, fq->iif); + dev = dev_get_by_index(get_exec_env()->ve_ns->net_ns, fq->iif); if (!dev) goto out; @@ -238,6 +243,8 @@ out: dev_put(dev); spin_unlock(&fq->q.lock); fq_put(fq); + + (void)set_exec_env(old_ve); } static __inline__ struct frag_queue * @@ -511,6 +518,7 @@ static int ip6_frag_reasm(struct frag_qu clone->csum = 0; clone->ip_summed = head->ip_summed; atomic_add(clone->truesize, &ip6_frags.mem); + clone->owner_env = head->owner_env; } /* We have to remove fragment header from datagram and to relocate diff -uprN linux-2.6.24/net/ipv6/route.c linux-2.6.24.ovz/net/ipv6/route.c --- linux-2.6.24/net/ipv6/route.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/route.c 2008-03-25 18:53:59.000000000 -0500 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -216,9 +216,10 @@ static void ip6_dst_ifdown(struct dst_en { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *loopback_dev = dev->nd_net->loopback_dev; - if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) { - struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev); + if (dev != loopback_dev && idev != NULL && idev->dev == dev) { + struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); if (loopback_idev != NULL) { rt->rt6i_idev = loopback_idev; in6_dev_put(idev); @@ -668,8 +669,9 @@ static struct rt6_info *ip6_pol_route(st int 
strict = 0; int attempts = 3; int err; - int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; - + int reachable; + + reachable = ve_ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; strict |= flags & RT6_LOOKUP_F_IFACE; relookup: @@ -1033,7 +1035,7 @@ static int ipv6_get_mtu(struct net_devic int ipv6_get_hoplimit(struct net_device *dev) { - int hoplimit = ipv6_devconf.hop_limit; + int hoplimit = ve_ipv6_devconf.hop_limit; struct inet6_dev *idev; idev = in6_dev_get(dev); @@ -1050,6 +1052,7 @@ int ipv6_get_hoplimit(struct net_device int ip6_route_add(struct fib6_config *cfg) { + struct net *net = get_exec_env()->ve_ns->net_ns; int err; struct rt6_info *rt = NULL; struct net_device *dev = NULL; @@ -1065,7 +1068,7 @@ int ip6_route_add(struct fib6_config *cf #endif if (cfg->fc_ifindex) { err = -ENODEV; - dev = dev_get_by_index(&init_net, cfg->fc_ifindex); + dev = dev_get_by_index(net, cfg->fc_ifindex); if (!dev) goto out; idev = in6_dev_get(dev); @@ -1122,13 +1125,15 @@ int ip6_route_add(struct fib6_config *cf */ if ((cfg->fc_flags & RTF_REJECT) || (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + struct net *net = get_exec_env()->ve_ns->net_ns; + /* hold loopback dev/idev if we haven't done so. */ - if (dev != init_net.loopback_dev) { + if (dev != net->loopback_dev) { if (dev) { dev_put(dev); in6_dev_put(idev); } - dev = init_net.loopback_dev; + dev = net->loopback_dev; dev_hold(dev); idev = in6_dev_get(dev); if (!idev) { @@ -1827,18 +1832,19 @@ struct rt6_info *addrconf_dst_alloc(stru const struct in6_addr *addr, int anycast) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct rt6_info *rt = ip6_dst_alloc(); if (rt == NULL) return ERR_PTR(-ENOMEM); - dev_hold(init_net.loopback_dev); + dev_hold(net->loopback_dev); in6_dev_hold(idev); rt->u.dst.flags = DST_HOST; rt->u.dst.input = ip6_input; rt->u.dst.output = ip6_output; - rt->rt6i_dev = init_net.loopback_dev; + rt->rt6i_dev = net->loopback_dev; rt->rt6i_idev = idev; rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); @@ -1850,10 +1856,12 @@ struct rt6_info *addrconf_dst_alloc(stru rt->rt6i_flags |= RTF_ANYCAST; else rt->rt6i_flags |= RTF_LOCAL; - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - if (rt->rt6i_nexthop == NULL) { - dst_free(&rt->u.dst); - return ERR_PTR(-ENOMEM); + rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev); + if (IS_ERR(rt->rt6i_nexthop)) { + void *err = rt->rt6i_nexthop; + rt->rt6i_nexthop = NULL; + dst_free((struct dst_entry *) rt); + return err; } ipv6_addr_copy(&rt->rt6i_dst.addr, addr); @@ -2129,8 +2137,12 @@ static int rt6_fill_node(struct sk_buff if (rt->u.dst.neighbour) NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); + if (rt->u.dst.dev) { + struct net_device *odev = rt->rt6i_dev; + if (rt == &ip6_null_entry) + odev = get_exec_env()->ve_ns->net_ns->loopback_dev; + NLA_PUT_U32(skb, RTA_OIF, odev->ifindex); + } NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); @@ -2164,6 +2176,7 @@ int rt6_dump_route(struct rt6_info *rt, static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = get_exec_env()->ve_ns->net_ns; struct nlattr *tb[RTA_MAX+1]; struct rt6_info *rt; struct sk_buff *skb; @@ -2200,7 +2213,7 @@ static int inet6_rtm_getroute(struct sk_ if (iif) { struct net_device *dev; - dev = 
__dev_get_by_index(&init_net, iif); + dev = __dev_get_by_index(net, iif); if (!dev) { err = -ENODEV; goto errout; @@ -2501,3 +2514,54 @@ void ip6_route_cleanup(void) fib6_gc_cleanup(); kmem_cache_destroy(ip6_dst_ops.kmem_cachep); } + +#ifdef CONFIG_VE +int init_ve_route6(struct ve_struct *ve) +{ + struct ve_struct *old_env = set_exec_env(ve); + ve->_fib6_table = kzalloc(sizeof(struct fib6_table), GFP_KERNEL_UBC); + if (!ve->_fib6_table) { + set_exec_env(old_env); + return -ENOMEM; + } + ve->_fib6_table->owner_env = ve; + ve->_fib6_table->tb6_id = RT6_TABLE_MAIN; + ve->_fib6_table->tb6_root.leaf = &ip6_null_entry; + ve->_fib6_table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | + RTN_RTINFO; +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + ve->_fib6_local_table = kzalloc(sizeof(struct fib6_table), + GFP_KERNEL_UBC); + if (!ve->_fib6_local_table) { + kfree(ve->_fib6_table); + set_exec_env(old_env); + return -ENOMEM; + } + ve->_fib6_local_table->owner_env = ve; + ve->_fib6_local_table->tb6_id = RT6_TABLE_LOCAL; + ve->_fib6_local_table->tb6_root.leaf = &ip6_null_entry; + ve->_fib6_local_table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | + RTN_RTINFO; +#endif + fib6_tables_init(); + set_exec_env(old_env); + return 0; +} +EXPORT_SYMBOL(init_ve_route6); + +void fini_ve_route6(struct ve_struct *ve) +{ + struct ve_struct *old_env = set_exec_env(ve); + + if (ve->_fib6_table) { + rt6_ifdown(NULL); + fib6_tables_cleanup(); + kfree(ve->_fib6_table); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + kfree(ve->_fib6_local_table); +#endif + } + set_exec_env(old_env); +} +EXPORT_SYMBOL(fini_ve_route6); +#endif diff -uprN linux-2.6.24/net/ipv6/sit.c linux-2.6.24.ovz/net/ipv6/sit.c --- linux-2.6.24/net/ipv6/sit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/sit.c 2008-03-25 18:53:59.000000000 -0500 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include +#include + /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -65,6 +68,26 @@ static int ipip6_fb_tunnel_init(struct n static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); +#ifdef CONFIG_VE +struct ve_sit_tunnels { + struct net_device *_ipip6_fb_tunnel_dev; + struct ip_tunnel *_tunnels_r_l[HASH_SIZE]; + struct ip_tunnel *_tunnels_r[HASH_SIZE]; + struct ip_tunnel *_tunnels_l[HASH_SIZE]; + struct ip_tunnel *_tunnels_wc[1]; + struct ip_tunnel **_tunnels[4]; + rwlock_t _ipip6_lock; +}; + +#define ipip6_fb_tunnel_dev \ + (get_exec_env()->_sit_tunnels->_ipip6_fb_tunnel_dev) +#define tunnels_r_l (get_exec_env()->_sit_tunnels->_tunnels_r_l) +#define tunnels_r (get_exec_env()->_sit_tunnels->_tunnels_r) +#define tunnels_l (get_exec_env()->_sit_tunnels->_tunnels_l) +#define tunnels_wc (get_exec_env()->_sit_tunnels->_tunnels_wc) +#define tunnels (get_exec_env()->_sit_tunnels->_tunnels) +#define ipip6_lock (get_exec_env()->_sit_tunnels->_ipip6_lock) +#else static struct net_device *ipip6_fb_tunnel_dev; static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; @@ -74,6 +97,7 @@ static struct ip_tunnel *tunnels_wc[1]; static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; static DEFINE_RWLOCK(ipip6_lock); +#endif static struct ip_tunnel * ipip6_tunnel_lookup(__be32 remote, __be32 local) { @@ -167,7 +191,7 @@ static struct ip_tunnel * ipip6_tunnel_l int i; for (i=1; i<100; i++) { sprintf(name, "sit%d", i); - if (__dev_get_by_name(&init_net, name) == NULL) + if (__dev_get_by_name(get_exec_env()->ve_ns->net_ns, name) 
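
[Illustrative note, not part of the patch] init_ve_route6()/fini_ve_route6() above allocate a private main (and optionally local) FIB6 table for each container, roll back partial allocations on failure, and restore the previous execution environment on every exit path. A reduced model of that allocate-or-roll-back shape, with placeholder structures instead of the real fib6 tables:

#include <errno.h>
#include <stdlib.h>

struct env_tables_model {
        void *fib_main;
        void *fib_local;
};

static int init_env_tables_model(struct env_tables_model *env)
{
        env->fib_main = calloc(1, 64);
        if (!env->fib_main)
                return -ENOMEM;

        env->fib_local = calloc(1, 64);
        if (!env->fib_local) {
                free(env->fib_main);            /* roll back the first allocation */
                env->fib_main = NULL;
                return -ENOMEM;
        }
        return 0;
}

static void fini_env_tables_model(struct env_tables_model *env)
{
        if (!env->fib_main)                     /* init never ran, or already torn down */
                return;
        free(env->fib_local);
        free(env->fib_main);
        env->fib_main = env->fib_local = NULL;
}
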
== NULL) break; } if (i==100) @@ -177,7 +201,7 @@ static struct ip_tunnel * ipip6_tunnel_l dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); if (dev == NULL) return NULL; - + dev->nd_net = get_exec_env()->ve_ns->net_ns; nt = netdev_priv(dev); dev->init = ipip6_tunnel_init; nt->parms = *parms; @@ -619,9 +643,12 @@ ipip6_tunnel_ioctl (struct net_device *d case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) +#ifdef CONFIG_VE + && !capable(CAP_VE_NET_ADMIN) +#endif + ) goto done; - err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; @@ -672,7 +699,11 @@ ipip6_tunnel_ioctl (struct net_device *d case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN) +#ifdef CONFIG_VE + && !capable(CAP_VE_NET_ADMIN) +#endif + ) goto done; if (dev == ipip6_fb_tunnel_dev) { @@ -727,6 +758,9 @@ static void ipip6_tunnel_setup(struct ne dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; +#ifdef CONFIG_VE + dev->features |= NETIF_F_VIRTUAL; +#endif } static int ipip6_tunnel_init(struct net_device *dev) @@ -760,7 +794,7 @@ static int ipip6_tunnel_init(struct net_ } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + tdev = __dev_get_by_index(get_exec_env()->ve_ns->net_ns, tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); @@ -773,7 +807,7 @@ static int ipip6_tunnel_init(struct net_ return 0; } -static int __init ipip6_fb_tunnel_init(struct net_device *dev) +static int ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -797,7 +831,7 @@ static struct xfrm_tunnel sit_handler = .priority = 1, }; -static void __exit sit_destroy_tunnels(void) +static void sit_destroy_tunnels(void) { int prio; @@ -811,14 +845,93 @@ static void __exit sit_destroy_tunnels(v } } +#ifdef CONFIG_VE +static int sit_ve_start(void *data) +{ + struct ve_struct *ve = data; + struct ve_sit_tunnels *st; + int err; + + if (!ve_is_super(ve)) + __module_get(THIS_MODULE); + + st = kzalloc(sizeof(struct ve_sit_tunnels), GFP_KERNEL_UBC); + if (!st) { + err = -ENOMEM; + goto out; + } + st->_tunnels[0] = st->_tunnels_wc; + st->_tunnels[1] = st->_tunnels_l; + st->_tunnels[2] = st->_tunnels_r; + st->_tunnels[3] = st->_tunnels_r_l; + rwlock_init(&st->_ipip6_lock); + + ve->_sit_tunnels = st; + if (ve_is_super(ve)) + goto out_ok; + + st->_ipip6_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), + "sit0", ipip6_tunnel_setup); + if (!st->_ipip6_fb_tunnel_dev) { + err = -ENOMEM; + goto free_tunnel; + } + st->_ipip6_fb_tunnel_dev->nd_net = get_exec_env()->ve_ns->net_ns; + st->_ipip6_fb_tunnel_dev->init = ipip6_fb_tunnel_init; + err = register_netdev(st->_ipip6_fb_tunnel_dev); + if (err < 0) + goto free_netdev; +out_ok: + return 0; + +free_netdev: + free_netdev(st->_ipip6_fb_tunnel_dev); +free_tunnel: + kfree(st); + if (!ve_is_super(ve)) + module_put(THIS_MODULE); +out: + return err; +} + +static void sit_ve_stop(void *data) +{ + struct ve_struct *ve = data; + + if (ve->_sit_tunnels == NULL) + return; + if (!ve_is_super(ve)) { + rtnl_lock(); + sit_destroy_tunnels(); + unregister_netdevice(ipip6_fb_tunnel_dev); + rtnl_unlock(); + } + kfree(ve->_sit_tunnels); + ve->_sit_tunnels = NULL; + if (!ve_is_super(ve)) + module_put(THIS_MODULE); +} + +static struct ve_hook sit_ve_hook = { + .init = sit_ve_start, + .fini = sit_ve_stop, + .owner = 
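
[Illustrative note, not part of the patch] The ve_sit_tunnels block above uses a common OpenVZ trick: formerly global tunnel tables become fields of a per-container structure, and macros keep the old global names working by resolving them through the current execution environment. A tiny userspace model of that "global behind a macro" redirection, with illustrative names only:

#include <stdio.h>

struct per_env_state { int counter; };

static struct per_env_state host_state, guest_state;
static __thread struct per_env_state *cur_state = &host_state;

/* old code keeps writing "counter"; the token now resolves per environment */
#define counter (cur_state->counter)

int main(void)
{
        counter = 1;                    /* writes host_state's copy */
        cur_state = &guest_state;
        counter = 5;                    /* writes guest_state's copy */
        printf("guest view: %d\n", counter);    /* 5 */
        cur_state = &host_state;
        printf("host view:  %d\n", counter);    /* 1 */
        return 0;
}
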
THIS_MODULE, + .priority = HOOK_PRIO_NET_POST, +}; +#endif + static void __exit sit_cleanup(void) { + ve_hook_unregister(&sit_ve_hook); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); rtnl_lock(); sit_destroy_tunnels(); unregister_netdevice(ipip6_fb_tunnel_dev); rtnl_unlock(); +#ifdef CONFIG_VE + sit_ve_stop(get_exec_env()); +#endif } static int __init sit_init(void) @@ -832,23 +945,35 @@ static int __init sit_init(void) return -EAGAIN; } +#ifdef CONFIG_VE + err = sit_ve_start(get_exec_env()); + if (err) + goto err1; +#endif + ipip6_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", ipip6_tunnel_setup); if (!ipip6_fb_tunnel_dev) { err = -ENOMEM; - goto err1; + goto err2; } ipip6_fb_tunnel_dev->init = ipip6_fb_tunnel_init; if ((err = register_netdev(ipip6_fb_tunnel_dev))) - goto err2; + goto err3; + + ve_hook_register(VE_SS_CHAIN, &sit_ve_hook); out: return err; - err2: + err3: free_netdev(ipip6_fb_tunnel_dev); - err1: + err2: +#ifdef CONFIG_VE + sit_ve_stop(get_exec_env()); +err1: +#endif xfrm4_tunnel_deregister(&sit_handler, AF_INET6); goto out; } diff -uprN linux-2.6.24/net/ipv6/tcp_ipv6.c linux-2.6.24.ovz/net/ipv6/tcp_ipv6.c --- linux-2.6.24/net/ipv6/tcp_ipv6.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/tcp_ipv6.c 2008-03-25 18:53:59.000000000 -0500 @@ -61,6 +61,8 @@ #include #include +#include + #include #include @@ -79,7 +81,7 @@ static void tcp_v6_send_check(struct soc static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static struct inet_connection_sock_af_ops ipv6_mapped; +struct inet_connection_sock_af_ops ipv6_mapped; static struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static struct tcp_sock_af_ops tcp_sock_ipv6_specific; @@ -1552,6 +1554,7 @@ static int tcp_v6_do_rcv(struct sock *sk struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; struct sk_buff *opt_skb = NULL; + struct user_beancounter *ub; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. 
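
[Illustrative note, not part of the patch] sit_ve_start()/sit_ve_stop() above pin the module with an extra reference for every guest container that gets private tunnel state, and drop it only after that state is freed, so the module cannot unload while a guest still depends on it. A rough model of that start/stop pairing; the reference counter and state struct are stand-ins for the real module and ve objects:

#include <errno.h>
#include <stdlib.h>

static int module_refs = 1;             /* the module's own baseline reference */

struct ve_state_model {
        int is_host;
        void *tunnels;
};

static int ve_start_model(struct ve_state_model *ve)
{
        if (!ve->is_host)
                module_refs++;          /* __module_get() equivalent */

        ve->tunnels = calloc(1, 128);
        if (!ve->tunnels) {
                if (!ve->is_host)
                        module_refs--;  /* undo the grab on the error path */
                return -ENOMEM;
        }
        return 0;
}

static void ve_stop_model(struct ve_state_model *ve)
{
        if (!ve->tunnels)
                return;                 /* start never succeeded for this env */
        free(ve->tunnels);
        ve->tunnels = NULL;
        if (!ve->is_host)
                module_refs--;          /* module_put() equivalent */
}
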
@@ -1564,6 +1567,8 @@ static int tcp_v6_do_rcv(struct sock *sk if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + ub = set_exec_ub(sock_bc(sk)->ub); + #ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash (sk, skb)) goto discard; @@ -1600,7 +1605,7 @@ static int tcp_v6_do_rcv(struct sock *sk TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; - return 0; + goto restore_context; } if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) @@ -1621,7 +1626,7 @@ static int tcp_v6_do_rcv(struct sock *sk goto reset; if (opt_skb) __kfree_skb(opt_skb); - return 0; + goto restore_context; } } @@ -1631,6 +1636,9 @@ static int tcp_v6_do_rcv(struct sock *sk TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; + +restore_context: + (void)set_exec_ub(ub); return 0; reset: @@ -1639,7 +1647,7 @@ discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb(skb); - return 0; + goto restore_context; csum_err: TCP_INC_STATS_BH(TCP_MIB_INERRS); goto discard; @@ -1671,7 +1679,7 @@ ipv6_pktoptions: if (opt_skb) kfree_skb(opt_skb); - return 0; + goto restore_context; } static int tcp_v6_rcv(struct sk_buff *skb) @@ -1851,7 +1859,7 @@ static struct tcp_sock_af_ops tcp_sock_i * TCP over IPv4 via INET6 API */ -static struct inet_connection_sock_af_ops ipv6_mapped = { +struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1869,6 +1877,8 @@ static struct inet_connection_sock_af_op #endif }; +EXPORT_SYMBOL_GPL(ipv6_mapped); + #ifdef CONFIG_TCP_MD5SIG static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .md5_lookup = tcp_v4_md5_lookup, diff -uprN linux-2.6.24/net/ipv6/udp.c linux-2.6.24.ovz/net/ipv6/udp.c --- linux-2.6.24/net/ipv6/udp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/udp.c 2008-03-25 18:53:59.000000000 -0500 @@ -65,12 +65,15 @@ static struct sock *__udp6_lib_lookup(st struct hlist_node *node; unsigned short hnum = ntohs(dport); int badness = -1; + struct ve_struct *ve; + ve = get_exec_env(); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + sk_for_each(sk, node, &udptable[udp_hashfn(hnum, VEID(ve))]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && sk->sk_family == PF_INET6) { + if (inet->num == hnum && sk->sk_family == PF_INET6 && + ve_accessible_strict(sk->owner_env, ve)) { struct ipv6_pinfo *np = inet6_sk(sk); int score = 0; if (inet->dport) { @@ -349,7 +352,7 @@ static int __udp6_lib_mcast_deliver(stru int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest), VEID(skb->owner_env))]); dif = inet6_iif(skb); sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) { diff -uprN linux-2.6.24/net/ipv6/xfrm6_policy.c linux-2.6.24.ovz/net/ipv6/xfrm6_policy.c --- linux-2.6.24/net/ipv6/xfrm6_policy.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/ipv6/xfrm6_policy.c 2008-03-25 18:53:59.000000000 -0500 @@ -362,7 +362,8 @@ static void xfrm6_dst_ifdown(struct dst_ xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { - struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev); + struct inet6_dev *loopback_idev = + in6_dev_get(dev->nd_net->loopback_dev); BUG_ON(!loopback_idev); do { diff -uprN linux-2.6.24/net/netfilter/core.c linux-2.6.24.ovz/net/netfilter/core.c --- linux-2.6.24/net/netfilter/core.c 2008-01-24 17:58:37.000000000 
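
[Illustrative note, not part of the patch] The tcp_v6_do_rcv() hunk above saves the socket owner's accounting context on entry and then turns every "return 0" into a jump to one restore_context label, so the saved context is put back on all paths. A compact model of that single-exit restore shape, with an integer standing in for the beancounter:

#include <stdio.h>

static __thread int cur_ctx;

static int set_ctx(int c) { int old = cur_ctx; cur_ctx = c; return old; }

static int handle_packet_model(int sock_ctx, int bad)
{
        int ret = 0;
        int old = set_ctx(sock_ctx);    /* charge the work to the socket's owner */

        if (bad) {
                ret = -1;
                goto restore;           /* never return without restoring */
        }
        printf("processed in ctx %d\n", cur_ctx);

restore:
        (void)set_ctx(old);
        return ret;
}
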
-0500 +++ linux-2.6.24.ovz/net/netfilter/core.c 2008-03-25 18:53:59.000000000 -0500 @@ -59,16 +59,34 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo); struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); static DEFINE_MUTEX(nf_hook_mutex); +#ifdef CONFIG_VE_IPTABLES +#define VE_NF_HOOKS(env, x, y) \ + ((struct list_head (*)[NF_MAX_HOOKS])(env->_nf_hooks))[x][y] +#else +#define VE_NF_HOOKS(env, x, y) nf_hooks[x][y] +#endif int nf_register_hook(struct nf_hook_ops *reg) { struct list_head *i; + struct ve_struct *env; int err; + env = get_exec_env(); + if (!ve_is_super(env)) { + struct nf_hook_ops *tmp; + tmp = kmemdup(reg, sizeof(struct nf_hook_ops), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + reg = tmp; + } + err = mutex_lock_interruptible(&nf_hook_mutex); - if (err < 0) + if (err < 0) { + kfree(reg); return err; - list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { + } + list_for_each(i, &VE_NF_HOOKS(env, reg->pf, reg->hooknum)) { if (reg->priority < ((struct nf_hook_ops *)i)->priority) break; } @@ -80,11 +98,29 @@ EXPORT_SYMBOL(nf_register_hook); void nf_unregister_hook(struct nf_hook_ops *reg) { + struct nf_hook_ops *i; + struct ve_struct *env; + + env = get_exec_env(); + if (!ve_is_super(env)) { + list_for_each_entry_rcu(i, + &VE_NF_HOOKS(env, reg->pf, reg->hooknum), list) { + if (reg->hook == i->hook) { + reg = i; + break; + } + } + if (reg != i) + return; + } + mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); synchronize_net(); + if (!ve_is_super(env)) + kfree(reg); } EXPORT_SYMBOL(nf_unregister_hook); @@ -169,13 +205,15 @@ int nf_hook_slow(int pf, unsigned int ho struct list_head *elem; unsigned int verdict; int ret = 0; + struct ve_struct *ve; /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = &nf_hooks[pf][hook]; + ve = get_exec_env(); + elem = &VE_NF_HOOKS(ve, pf, hook); next_hook: - verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, + verdict = nf_iterate(&VE_NF_HOOKS(ve, pf, hook), skb, hook, indev, outdev, &elem, okfn, hook_thresh); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; @@ -275,13 +313,54 @@ struct proc_dir_entry *proc_net_netfilte EXPORT_SYMBOL(proc_net_netfilter); #endif -void __init netfilter_init(void) +void init_nf_hooks(struct list_head (*nh)[NF_MAX_HOOKS]) { int i, h; for (i = 0; i < NPROTO; i++) { for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); + INIT_LIST_HEAD(&nh[i][h]); } +} + +int init_netfilter(void) +{ +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *envid; + + envid = get_exec_env(); + envid->_nf_hooks = kmalloc(sizeof(nf_hooks), GFP_KERNEL); + if (envid->_nf_hooks == NULL) + return -ENOMEM; + + /* FIXME: charge ubc */ + + init_nf_hooks(envid->_nf_hooks); + return 0; +#else + init_nf_hooks(nf_hooks); + return 0; +#endif +} +EXPORT_SYMBOL(init_netfilter); + +#ifdef CONFIG_VE_IPTABLES +void fini_netfilter(void) +{ + struct ve_struct *envid; + + envid = get_exec_env(); + if (envid->_nf_hooks != NULL) + kfree(envid->_nf_hooks); + envid->_nf_hooks = NULL; + + /* FIXME: uncharge ubc */ +} +EXPORT_SYMBOL(fini_netfilter); +#endif + +void __init netfilter_init(void) +{ + init_netfilter(); #ifdef CONFIG_PROC_FS proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_core.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_core.c --- linux-2.6.24/net/netfilter/nf_conntrack_core.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_core.c 
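
[Illustrative note, not part of the patch] VE_NF_HOOKS() above keeps the netfilter hook lists as a [protocol][hook] table but gives each container its own table, so a hook registered from inside a container only sees that container's packets. A toy model of the per-environment table indexing; the list heads are reduced to counters and all names are illustrative:

#include <stdio.h>

#define NPROTO_MODEL   4
#define NHOOKS_MODEL   8

struct env_hooks {
        int hooks[NPROTO_MODEL][NHOOKS_MODEL];  /* stands in for list_head[][] */
};

static struct env_hooks host_hooks, guest_hooks;
static __thread struct env_hooks *cur_hooks = &host_hooks;

#define ENV_HOOKS(pf, num) (cur_hooks->hooks[pf][num])

static void register_hook_model(int pf, int num)
{
        ENV_HOOKS(pf, num)++;           /* insert into the current env's list */
}

int main(void)
{
        register_hook_model(2, 1);      /* host registration */
        cur_hooks = &guest_hooks;
        register_hook_model(2, 1);      /* guest registration, separate table */
        printf("host=%d guest=%d\n", host_hooks.hooks[2][1],
               guest_hooks.hooks[2][1]);        /* prints 1 1 */
        return 0;
}
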
2008-03-25 18:53:59.000000000 -0500 @@ -38,6 +38,9 @@ #include #include +#include +#include + #define NF_CONNTRACK_VERSION "0.5.0" DEFINE_RWLOCK(nf_conntrack_lock); @@ -177,7 +180,14 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; +#ifdef CONFIG_VE_IPTABLES + struct ve_struct *old_ve; + + old_ve = set_exec_env(ct->ct_owner_env); +#endif pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); @@ -186,10 +196,17 @@ destroy_conntrack(struct nf_conntrack *n nf_conntrack_event(IPCT_DESTROY, ct); set_bit(IPS_DYING_BIT, &ct->status); + if (help && help->helper && help->helper->destroy) + help->helper->destroy(ct); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); + if (l3proto && l3proto->destroy) + l3proto->destroy(ct); + l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (l4proto && l4proto->destroy) @@ -220,6 +237,9 @@ destroy_conntrack(struct nf_conntrack *n pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); nf_conntrack_free(ct); +#ifdef CONFIG_VE_IPTABLES + (void)set_exec_env(old); +#endif } static void death_by_timeout(unsigned long ul_conntrack) @@ -253,7 +273,7 @@ __nf_conntrack_find(const struct nf_conn struct hlist_node *n; unsigned int hash = hash_conntrack(tuple); - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); @@ -287,9 +307,9 @@ static void __nf_conntrack_hash_insert(s unsigned int repl_hash) { hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, - &nf_conntrack_hash[hash]); + &ve_nf_conntrack_hash[hash]); hlist_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, - &nf_conntrack_hash[repl_hash]); + &ve_nf_conntrack_hash[repl_hash]); } void nf_conntrack_hash_insert(struct nf_conn *ct) @@ -343,11 +363,11 @@ __nf_conntrack_confirm(struct sk_buff *s /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. 
*/ - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple)) goto out; - hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode) + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[repl_hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple)) goto out; @@ -415,7 +435,7 @@ static int early_drop(unsigned int hash) read_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[hash], hnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) ct = tmp; @@ -442,9 +462,11 @@ static int early_drop(unsigned int hash) } struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_tuple *repl) + const struct nf_conntrack_tuple *repl, + struct user_beancounter *ub) { struct nf_conn *conntrack = NULL; + struct user_beancounter *old_ub; if (unlikely(!nf_conntrack_hash_rnd_initted)) { get_random_bytes(&nf_conntrack_hash_rnd, 4); @@ -452,25 +474,27 @@ struct nf_conn *nf_conntrack_alloc(const } /* We don't want any race condition at early drop stage */ - atomic_inc(&nf_conntrack_count); + atomic_inc(&ve_nf_conntrack_count); - if (nf_conntrack_max - && atomic_read(&nf_conntrack_count) > nf_conntrack_max) { + if (ve_nf_conntrack_max + && atomic_read(&ve_nf_conntrack_count) > ve_nf_conntrack_max) { unsigned int hash = hash_conntrack(orig); if (!early_drop(hash)) { - atomic_dec(&nf_conntrack_count); + atomic_dec(&ve_nf_conntrack_count); if (net_ratelimit()) - printk(KERN_WARNING - "nf_conntrack: table full, dropping" - " packet.\n"); + ve_printk(VE_LOG_BOTH, KERN_WARNING + "nf_conntrack: CT %d: table full, dropping" + " packet.\n", VEID(get_exec_env())); return ERR_PTR(-ENOMEM); } } + old_ub = set_exec_ub(ub); conntrack = kmem_cache_zalloc(nf_conntrack_cachep, GFP_ATOMIC); + (void)set_exec_ub(old_ub); if (conntrack == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); - atomic_dec(&nf_conntrack_count); + atomic_dec(&ve_nf_conntrack_count); return ERR_PTR(-ENOMEM); } @@ -480,6 +504,9 @@ struct nf_conn *nf_conntrack_alloc(const /* Don't set timer yet: wait for confirmation */ setup_timer(&conntrack->timeout, death_by_timeout, (unsigned long)conntrack); +#ifdef CONFIG_VE_IPTABLES + conntrack->ct_owner_env = get_exec_env(); +#endif return conntrack; } @@ -489,7 +516,7 @@ void nf_conntrack_free(struct nf_conn *c { nf_ct_ext_free(conntrack); kmem_cache_free(nf_conntrack_cachep, conntrack); - atomic_dec(&nf_conntrack_count); + atomic_dec(&ve_nf_conntrack_count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -506,13 +533,20 @@ init_conntrack(const struct nf_conntrack struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_expect *exp; + struct user_beancounter *ub = NULL; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { pr_debug("Can't invert tuple.\n"); return NULL; } - conntrack = nf_conntrack_alloc(tuple, &repl_tuple); +#ifdef CONFIG_BEANCOUNTERS + if (skb->dev != NULL) /* received skb */ + ub = netdev_bc(skb->dev)->exec_ub; + else if (skb->sk != NULL) /* sent skb */ + ub = sock_bc(skb->sk)->ub; +#endif + conntrack = nf_conntrack_alloc(tuple, &repl_tuple, ub); if (conntrack == NULL || IS_ERR(conntrack)) { pr_debug("Can't allocate conntrack.\n"); return (struct 
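
[Illustrative note, not part of the patch] The nf_conntrack_alloc() hunk above makes both the entry counter and the table limit per container, and charges the allocation to whichever owner the packet maps to (the receiving device's or the sending socket's beancounter). A simplified model with plain counters in place of the beancounter machinery; all names are stand-ins:

#include <errno.h>
#include <stdlib.h>

struct owner_model {
        int ct_count;
        int ct_max;
};

static void *ct_alloc_model(struct owner_model *owner)
{
        void *ct;

        if (++owner->ct_count > owner->ct_max) {
                owner->ct_count--;
                return NULL;            /* only this container's table is full */
        }
        ct = calloc(1, 256);            /* charged to this owner in the model */
        if (!ct)
                owner->ct_count--;
        return ct;
}

static void ct_free_model(struct owner_model *owner, void *ct)
{
        free(ct);
        owner->ct_count--;
}
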
nf_conntrack_tuple_hash *)conntrack; @@ -560,7 +594,7 @@ init_conntrack(const struct nf_conntrack /* Overload tuple linked list to put us in unconfirmed list. */ hlist_add_head(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].hnode, - &unconfirmed); + &ve_unconfirmed); write_unlock_bh(&nf_conntrack_lock); @@ -901,13 +935,13 @@ get_next_corpse(int (*iter)(struct nf_co write_lock_bh(&nf_conntrack_lock); for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) { + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[*bucket], hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) goto found; } } - hlist_for_each_entry(h, n, &unconfirmed, hnode) { + hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) set_bit(IPS_DYING_BIT, &ct->status); @@ -962,7 +996,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_flush); supposed to kill the mall. */ void nf_conntrack_cleanup(void) { - rcu_assign_pointer(ip_ct_attach, NULL); + struct ve_struct *ve = get_exec_env(); + + if (ve_is_super(ve)) + rcu_assign_pointer(ip_ct_attach, NULL); /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module @@ -972,10 +1009,12 @@ void nf_conntrack_cleanup(void) nf_ct_event_cache_flush(); i_see_dead_people: nf_conntrack_flush(); - if (atomic_read(&nf_conntrack_count) != 0) { + if (atomic_read(&ve_nf_conntrack_count) != 0) { schedule(); goto i_see_dead_people; } + if (!ve_is_super(ve)) + goto skip_ct_cache; /* wait until all references to nf_conntrack_untracked are dropped */ while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) schedule(); @@ -983,12 +1022,17 @@ void nf_conntrack_cleanup(void) rcu_assign_pointer(nf_ct_destroy, NULL); kmem_cache_destroy(nf_conntrack_cachep); - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, - nf_conntrack_htable_size); - - nf_conntrack_proto_fini(); +skip_ct_cache: nf_conntrack_helper_fini(); nf_conntrack_expect_fini(); + + nf_ct_proto_generic_sysctl_cleanup(); + nf_ct_free_hashtable(ve_nf_conntrack_hash, ve_nf_conntrack_vmalloc, + nf_conntrack_htable_size); + nf_conntrack_proto_fini(); +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_nf_conntrack); +#endif } struct hlist_head *nf_ct_alloc_hashtable(int *sizep, int *vmalloced) @@ -999,13 +1043,13 @@ struct hlist_head *nf_ct_alloc_hashtable *vmalloced = 0; size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head)); - hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN, + hash = (void*)__get_free_pages(GFP_KERNEL_UBC|__GFP_NOWARN, get_order(sizeof(struct hlist_head) * size)); if (!hash) { *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); - hash = vmalloc(sizeof(struct hlist_head) * size); + hash = ub_vmalloc(sizeof(struct hlist_head) * size); } if (hash) @@ -1042,8 +1086,8 @@ int nf_conntrack_set_hashsize(const char write_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_conntrack_htable_size; i++) { - while (!hlist_empty(&nf_conntrack_hash[i])) { - h = hlist_entry(nf_conntrack_hash[i].first, + while (!hlist_empty(&ve_nf_conntrack_hash[i])) { + h = hlist_entry(ve_nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnode); hlist_del(&h->hnode); bucket = __hash_conntrack(&h->tuple, hashsize, rnd); @@ -1051,12 +1095,12 @@ int nf_conntrack_set_hashsize(const char } } old_size = nf_conntrack_htable_size; - old_vmalloced = nf_conntrack_vmalloc; - old_hash = nf_conntrack_hash; + old_vmalloced = ve_nf_conntrack_vmalloc; + old_hash = 
ve_nf_conntrack_hash; nf_conntrack_htable_size = hashsize; - nf_conntrack_vmalloc = vmalloced; - nf_conntrack_hash = hash; + ve_nf_conntrack_vmalloc = vmalloced; + ve_nf_conntrack_hash = hash; nf_conntrack_hash_rnd = rnd; write_unlock_bh(&nf_conntrack_lock); @@ -1068,52 +1112,74 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashs module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -int __init nf_conntrack_init(void) +int nf_conntrack_init(void) { + struct ve_struct *ve = get_exec_env(); int max_factor = 8; - int ret; + int ret = 0, i; - /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB - * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ - if (!nf_conntrack_htable_size) { - nf_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) - / sizeof(struct hlist_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; - - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; - } - nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, - &nf_conntrack_vmalloc); - if (!nf_conntrack_hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_out; + if (ve_is_super(ve)) { + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct hlist_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 16384; + if (nf_conntrack_htable_size < 32) + nf_conntrack_htable_size = 32; + + /* Use a max. factor of four by default to get the same + * max as with the old struct list_heads. When a table + * size is given we use the old value of 8 to avoid + * reducing the max. entries. 
*/ + max_factor = 4; + } + nf_conntrack_max = max_factor * nf_conntrack_htable_size; + + printk("nf_conntrack version %s (%u buckets, %d max)\n", + NF_CONNTRACK_VERSION, nf_conntrack_htable_size, + nf_conntrack_max); } - nf_conntrack_max = max_factor * nf_conntrack_htable_size; +#ifdef CONFIG_VE_IPTABLES + ve->_nf_conntrack = kzalloc(sizeof(struct ve_nf_conntrack), GFP_KERNEL); + if (!ve->_nf_conntrack) { + ret = -ENOMEM; + goto out; + } - printk("nf_conntrack version %s (%u buckets, %d max)\n", - NF_CONNTRACK_VERSION, nf_conntrack_htable_size, - nf_conntrack_max); + ve_nf_conntrack_max = nf_conntrack_max; + atomic_set(&ve_nf_conntrack_count, 0); + INIT_HLIST_HEAD(&ve_unconfirmed); +#endif + ve_nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, + &ve_nf_conntrack_vmalloc); + if (!ve_nf_conntrack_hash) { + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_out; + } - nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + if (ve_is_super(ve)) { + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), - 0, 0, NULL); - if (!nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_free_hash; + 0, SLAB_UBC, NULL); + if (!nf_conntrack_cachep) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + goto err_free_hash; + } } - ret = nf_conntrack_proto_init(); + ret = nf_ct_proto_generic_sysctl_init(); if (ret < 0) goto err_free_conntrack_slab; + /* Don't NEED lock here, but good form anyway. */ + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < AF_MAX; i++) + ve_nf_ct_l3protos[i] = &nf_conntrack_l3proto_generic; + write_unlock_bh(&nf_conntrack_lock); ret = nf_conntrack_expect_init(); if (ret < 0) @@ -1123,27 +1189,37 @@ int __init nf_conntrack_init(void) if (ret < 0) goto out_fini_expect; - /* For use by REJECT target */ - rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach); - rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); - - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ - atomic_set(&nf_conntrack_untracked.ct_general.use, 1); - /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + if (ve_is_super(ve)) { + /* For use by REJECT target */ + rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach); + rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + } - return ret; + return 0; +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) + nf_ct_proto_generic_sysctl_cleanup(); +#endif out_fini_expect: nf_conntrack_expect_fini(); out_fini_proto: nf_conntrack_proto_fini(); err_free_conntrack_slab: - kmem_cache_destroy(nf_conntrack_cachep); + if (ve_is_super(ve)) + kmem_cache_destroy(nf_conntrack_cachep); err_free_hash: - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_ct_free_hashtable(ve_nf_conntrack_hash, nf_conntrack_vmalloc, nf_conntrack_htable_size); err_out: - return -ENOMEM; +#ifdef CONFIG_VE_IPTABLES + kfree(ve->_nf_conntrack); +out: +#endif + return ret; } diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_ecache.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_ecache.c --- linux-2.6.24/net/netfilter/nf_conntrack_ecache.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_ecache.c 2008-03-25 
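
[Illustrative note, not part of the patch] The reworked nf_conntrack_init() above splits the work in two: sizing, the slab cache and the fake "untracked" entry are host-only, one-time setup, while the hash table and counters are allocated anew for every container. A compact sketch of that split, with the kernel details dropped and placeholder sizes:

#include <errno.h>
#include <stdlib.h>

static void *shared_cache;              /* host-only, created once */

struct env_ct_model {
        int is_host;
        void *hash;
        int count;
        int max;
};

static int ct_init_model(struct env_ct_model *env)
{
        if (env->is_host && !shared_cache) {
                shared_cache = malloc(4096);    /* kmem cache stand-in */
                if (!shared_cache)
                        return -ENOMEM;
        }

        env->hash = calloc(1024, sizeof(void *));       /* per-env hash table */
        if (!env->hash)
                return -ENOMEM;         /* the shared cache stays for the next attempt */
        env->count = 0;
        env->max = 4096;
        return 0;
}
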
18:53:59.000000000 -0500 @@ -53,6 +53,9 @@ void nf_ct_deliver_cached_events(const s { struct nf_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + local_bh_disable(); ecache = &__get_cpu_var(nf_conntrack_ecache); if (ecache->ct == ct) @@ -66,6 +69,9 @@ void __nf_ct_event_cache_init(struct nf_ { struct nf_conntrack_ecache *ecache; + if (!ve_is_super(get_exec_env())) + return; + /* take care of delivering potentially old events */ ecache = &__get_cpu_var(nf_conntrack_ecache); BUG_ON(ecache->ct == ct); @@ -84,6 +90,9 @@ void nf_ct_event_cache_flush(void) struct nf_conntrack_ecache *ecache; int cpu; + if (!ve_is_super(get_exec_env())) + return; + for_each_possible_cpu(cpu) { ecache = &per_cpu(nf_conntrack_ecache, cpu); if (ecache->ct) diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_expect.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_expect.c --- linux-2.6.24/net/netfilter/nf_conntrack_expect.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_expect.c 2008-03-25 18:53:59.000000000 -0500 @@ -39,6 +39,13 @@ static unsigned int nf_ct_expect_count; unsigned int nf_ct_expect_max __read_mostly; static int nf_ct_expect_hash_rnd_initted __read_mostly; static int nf_ct_expect_vmalloc; +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_ct_expect_count (get_exec_env()->_nf_conntrack->_nf_ct_expect_count) +#define ve_nf_ct_expect_vmalloc (get_exec_env()->_nf_conntrack->_nf_ct_expect_vmalloc) +#else +#define ve_nf_ct_expect_count nf_ct_expect_count +#define ve_nf_ct_expect_vmalloc nf_ct_expect_vmalloc +#endif static struct kmem_cache *nf_ct_expect_cachep __read_mostly; @@ -51,7 +58,7 @@ void nf_ct_unlink_expect(struct nf_connt NF_CT_ASSERT(!timer_pending(&exp->timeout)); hlist_del(&exp->hnode); - nf_ct_expect_count--; + ve_nf_ct_expect_count--; hlist_del(&exp->lnode); master_help->expecting--; @@ -91,11 +98,11 @@ __nf_ct_expect_find(const struct nf_conn struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!ve_nf_ct_expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) return i; } @@ -293,8 +300,8 @@ static void nf_ct_expect_insert(struct n hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting++; - hlist_add_head(&exp->hnode, &nf_ct_expect_hash[h]); - nf_ct_expect_count++; + hlist_add_head(&exp->hnode, &ve_nf_ct_expect_hash[h]); + ve_nf_ct_expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); @@ -350,7 +357,7 @@ int nf_ct_expect_related(struct nf_connt goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. 
*/ if (refresh_timer(i)) { @@ -367,10 +374,10 @@ int nf_ct_expect_related(struct nf_connt master_help->expecting >= master_help->helper->max_expected) evict_oldest_expect(master); - if (nf_ct_expect_count >= nf_ct_expect_max) { + if (ve_nf_ct_expect_count >= ve_nf_ct_expect_max) { if (net_ratelimit()) printk(KERN_WARNING - "nf_conntrack: expectation table full"); + "nf_conntrack: expectation table full\n"); ret = -EMFILE; goto out; } @@ -394,8 +401,8 @@ static struct hlist_node *ct_expect_get_ struct ct_expect_iter_state *st = seq->private; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - if (!hlist_empty(&nf_ct_expect_hash[st->bucket])) - return nf_ct_expect_hash[st->bucket].first; + if (!hlist_empty(&ve_nf_ct_expect_hash[st->bucket])) + return ve_nf_ct_expect_hash[st->bucket].first; } return NULL; } @@ -409,7 +416,7 @@ static struct hlist_node *ct_expect_get_ while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = nf_ct_expect_hash[st->bucket].first; + head = ve_nf_ct_expect_hash[st->bucket].first; } return head; } @@ -485,7 +492,7 @@ static const struct file_operations exp_ }; #endif /* CONFIG_PROC_FS */ -static int __init exp_proc_init(void) +static int exp_proc_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; @@ -506,7 +513,7 @@ static void exp_proc_remove(void) module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); -int __init nf_conntrack_expect_init(void) +int nf_conntrack_expect_init(void) { int err = -ENOMEM; @@ -517,16 +524,20 @@ int __init nf_conntrack_expect_init(void } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, - &nf_ct_expect_vmalloc); - if (nf_ct_expect_hash == NULL) + ve_nf_ct_expect_count = 0; + ve_nf_ct_expect_max = nf_ct_expect_max; + ve_nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, + &ve_nf_ct_expect_vmalloc); + if (ve_nf_ct_expect_hash == NULL) goto err1; - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + if (ve_is_super(get_exec_env())) { + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), - 0, 0, NULL); - if (!nf_ct_expect_cachep) - goto err2; + 0, SLAB_UBC, NULL); + if (!nf_ct_expect_cachep) + goto err2; + } err = exp_proc_init(); if (err < 0) @@ -535,10 +546,11 @@ int __init nf_conntrack_expect_init(void return 0; err3: - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc, nf_ct_expect_hsize); err2: - kmem_cache_destroy(nf_ct_expect_cachep); + if (ve_is_super(get_exec_env())) + kmem_cache_destroy(nf_ct_expect_cachep); err1: return err; } @@ -546,7 +558,8 @@ err1: void nf_conntrack_expect_fini(void) { exp_proc_remove(); - kmem_cache_destroy(nf_ct_expect_cachep); - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + if (ve_is_super(get_exec_env())) + kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc, nf_ct_expect_hsize); } diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_helper.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_helper.c --- linux-2.6.24/net/netfilter/nf_conntrack_helper.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_helper.c 2008-03-25 18:53:59.000000000 -0500 @@ -32,6 +32,13 @@ static struct hlist_head *nf_ct_helper_h static unsigned int nf_ct_helper_hsize __read_mostly; static unsigned int nf_ct_helper_count __read_mostly; static int 
nf_ct_helper_vmalloc; +#ifdef CONFIG_VE_IPTABLES +#define ve_nf_ct_helper_hash (get_exec_env()->_nf_conntrack->_nf_ct_helper_hash) +#define ve_nf_ct_helper_vmalloc (get_exec_env()->_nf_conntrack->_nf_ct_helper_vmalloc) +#else +#define ve_nf_ct_helper_hash nf_ct_helper_hash +#define ve_nf_ct_helper_vmalloc nf_ct_helper_vmalloc +#endif /* Stupid hash, but collision free for the default registrations of the @@ -54,7 +61,7 @@ __nf_ct_helper_find(const struct nf_conn return NULL; h = helper_hash(tuple); - hlist_for_each_entry(helper, n, &nf_ct_helper_hash[h], hnode) { + hlist_for_each_entry(helper, n, &ve_nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) return helper; } @@ -99,7 +106,7 @@ __nf_conntrack_helper_find_byname(const unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { - hlist_for_each_entry(h, n, &nf_ct_helper_hash[i], hnode) { + hlist_for_each_entry(h, n, &ve_nf_ct_helper_hash[i], hnode) { if (!strcmp(h->name, name)) return h; } @@ -141,7 +148,7 @@ int nf_conntrack_helper_register(struct BUG_ON(me->timeout == 0); write_lock_bh(&nf_conntrack_lock); - hlist_add_head(&me->hnode, &nf_ct_helper_hash[h]); + hlist_add_head(&me->hnode, &ve_nf_ct_helper_hash[h]); nf_ct_helper_count++; write_unlock_bh(&nf_conntrack_lock); @@ -164,7 +171,7 @@ void nf_conntrack_helper_unregister(stru /* Get rid of expectations */ for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], hnode) { + &ve_nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((help->helper == me || exp->helper == me) && del_timer(&exp->timeout)) { @@ -175,10 +182,10 @@ void nf_conntrack_helper_unregister(stru } /* Get rid of expecteds, set helpers to NULL. */ - hlist_for_each_entry(h, n, &unconfirmed, hnode) + hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) unhelp(h, me); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode) + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[i], hnode) unhelp(h, me); } write_unlock_bh(&nf_conntrack_lock); @@ -199,26 +206,29 @@ int nf_conntrack_helper_init(void) int err; nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, - &nf_ct_helper_vmalloc); - if (!nf_ct_helper_hash) + ve_nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, + &ve_nf_ct_helper_vmalloc); + if (!ve_nf_ct_helper_hash) return -ENOMEM; - err = nf_ct_extend_register(&helper_extend); - if (err < 0) - goto err1; + if (ve_is_super(get_exec_env())) { + err = nf_ct_extend_register(&helper_extend); + if (err < 0) + goto err1; + } return 0; err1: - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, + nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc, nf_ct_helper_hsize); return err; } void nf_conntrack_helper_fini(void) { - nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, + if (ve_is_super(get_exec_env())) + nf_ct_extend_unregister(&helper_extend); + nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc, nf_ct_helper_hsize); } diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_netlink.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_netlink.c --- linux-2.6.24/net/netfilter/nf_conntrack_netlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_netlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,7 @@ #include #include #include +#include 
#include #include @@ -43,6 +44,8 @@ #include #include +#include +#include MODULE_LICENSE("GPL"); @@ -459,7 +462,7 @@ ctnetlink_dump_table(struct sk_buff *skb last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: - hlist_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[cb->args[0]], hnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -976,14 +979,15 @@ static int ctnetlink_create_conntrack(struct nlattr *cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, - struct nf_conn *master_ct) + struct nf_conn *master_ct, + struct user_beancounter *ub) { struct nf_conn *ct; int err = -EINVAL; struct nf_conn_help *help; struct nf_conntrack_helper *helper; - ct = nf_conntrack_alloc(otuple, rtuple); + ct = nf_conntrack_alloc(otuple, rtuple, ub); if (ct == NULL || IS_ERR(ct)) return -ENOMEM; @@ -1094,11 +1098,19 @@ ctnetlink_new_conntrack(struct sock *ctn write_unlock_bh(&nf_conntrack_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) + if (nlh->nlmsg_flags & NLM_F_CREATE) { + struct user_beancounter *ub = NULL; + +#ifdef CONFIG_BEANCOUNTERS + if (skb->sk) + ub = sock_bc(skb->sk)->ub; +#endif err = ctnetlink_create_conntrack(cda, &otuple, &rtuple, - master_ct); + master_ct, + ub); + } if (err < 0 && master_ct) nf_ct_put(master_ct); @@ -1318,7 +1330,7 @@ ctnetlink_exp_dump_table(struct sk_buff last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]], + hlist_for_each_entry(exp, n, &ve_nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; @@ -1463,7 +1475,7 @@ ctnetlink_del_expect(struct sock *ctnl, } for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &ve_nf_ct_expect_hash[i], hnode) { m_help = nfct_help(exp->master); if (m_help->helper == h @@ -1479,7 +1491,7 @@ ctnetlink_del_expect(struct sock *ctnl, write_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &ve_nf_ct_expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_proto.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto.c --- linux-2.6.24/net/netfilter/nf_conntrack_proto.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto.c 2008-03-25 18:53:59.000000000 -0500 @@ -28,7 +28,7 @@ #include #include -static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_l3protos); @@ -63,10 +63,10 @@ nf_ct_unregister_sysctl(struct ctl_table struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) { - if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) - return &nf_conntrack_l4proto_generic; + if (unlikely(l3proto >= AF_MAX || ve_nf_ct_protos[l3proto] == NULL)) + return ve_nf_conntrack_l4proto_generic; - return rcu_dereference(nf_ct_protos[l3proto][l4proto]); + return rcu_dereference(ve_nf_ct_protos[l3proto][l4proto]); } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); @@ -80,7 +80,7 @@ nf_ct_l4proto_find_get(u_int16_t l3proto rcu_read_lock(); p = 
__nf_ct_l4proto_find(l3proto, l4proto); if (!try_module_get(p->me)) - p = &nf_conntrack_l4proto_generic; + p = ve_nf_conntrack_l4proto_generic; rcu_read_unlock(); return p; @@ -190,7 +190,8 @@ int nf_conntrack_l3proto_register(struct return -EBUSY; mutex_lock(&nf_ct_proto_mutex); - if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { + if (ve_nf_ct_l3protos[proto->l3proto] != + &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -199,7 +200,7 @@ int nf_conntrack_l3proto_register(struct if (ret < 0) goto out_unlock; - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); + rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], proto); out_unlock: mutex_unlock(&nf_ct_proto_mutex); @@ -212,8 +213,8 @@ void nf_conntrack_l3proto_unregister(str BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], + BUG_ON(ve_nf_ct_l3protos[proto->l3proto] != proto); + rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); nf_ct_l3proto_unregister_sysctl(proto); mutex_unlock(&nf_ct_proto_mutex); @@ -281,7 +282,7 @@ int nf_conntrack_l4proto_register(struct return -EBUSY; mutex_lock(&nf_ct_proto_mutex); - if (!nf_ct_protos[l4proto->l3proto]) { + if (!ve_nf_ct_protos[l4proto->l3proto]) { /* l3proto may be loaded latter. */ struct nf_conntrack_l4proto **proto_array; int i; @@ -295,10 +296,10 @@ int nf_conntrack_l4proto_register(struct } for (i = 0; i < MAX_NF_CT_PROTO; i++) - proto_array[i] = &nf_conntrack_l4proto_generic; - nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != - &nf_conntrack_l4proto_generic) { + proto_array[i] = ve_nf_conntrack_l4proto_generic; + ve_nf_ct_protos[l4proto->l3proto] = proto_array; + } else if (ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != + ve_nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -307,7 +308,7 @@ int nf_conntrack_l4proto_register(struct if (ret < 0) goto out_unlock; - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); out_unlock: @@ -321,9 +322,9 @@ void nf_conntrack_l4proto_unregister(str BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], - &nf_conntrack_l4proto_generic); + BUG_ON(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); + rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + ve_nf_conntrack_l4proto_generic); nf_ct_l4proto_unregister_sysctl(l4proto); mutex_unlock(&nf_ct_proto_mutex); @@ -339,12 +340,12 @@ int nf_conntrack_proto_init(void) unsigned int i; int err; - err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + err = nf_ct_l4proto_register_sysctl(ve_nf_conntrack_l4proto_generic); if (err < 0) return err; for (i = 0; i < AF_MAX; i++) - rcu_assign_pointer(nf_ct_l3protos[i], + rcu_assign_pointer(ve_nf_ct_l3protos[i], &nf_conntrack_l3proto_generic); return 0; } @@ -353,9 +354,13 @@ void nf_conntrack_proto_fini(void) { unsigned int i; - nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(ve_nf_conntrack_l4proto_generic); /* free l3proto protocol tables */ for (i = 0; i < PF_MAX; i++) - kfree(nf_ct_protos[i]); + kfree(ve_nf_ct_protos[i]); 
+#ifdef CONFIG_VE_IPTABLES + if (!ve_is_super(get_exec_env())) + kfree(ve_nf_conntrack_l4proto_generic); +#endif } diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_proto_generic.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto_generic.c --- linux-2.6.24/net/netfilter/nf_conntrack_proto_generic.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto_generic.c 2008-03-25 18:53:59.000000000 -0500 @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -55,7 +56,7 @@ static int packet(struct nf_conn *conntr int pf, unsigned int hooknum) { - nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout); + nf_ct_refresh_acct(conntrack, ctinfo, skb, ve_nf_ct_generic_timeout); return NF_ACCEPT; } @@ -115,3 +116,61 @@ struct nf_conntrack_l4proto nf_conntrack #endif #endif }; + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_generic_sysctl_init(void) +{ + struct nf_conntrack_l4proto *generic; + + if (ve_is_super(get_exec_env())) { + generic = &nf_conntrack_l4proto_generic; + goto out; + } + + generic = kmemdup(&nf_conntrack_l4proto_generic, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (generic == NULL) + goto no_mem_ct; + + generic->ctl_table_header = &ve_generic_sysctl_header; + generic->ctl_table = clone_sysctl_template(generic_sysctl_table); + if (generic->ctl_table == NULL) + goto no_mem_sys; + + generic->ctl_table[0].data = &ve_nf_ct_generic_timeout; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + generic->ctl_compat_table_header = ve_generic_compat_sysctl_header; + generic->ctl_compat_table = clone_sysctl_template(generic_compat_sysctl_table); + if (generic->ctl_compat_table == NULL) + goto no_mem_compat; + generic->ctl_compat_table[0].data = &ve_nf_ct_generic_timeout; +#endif +out: + ve_nf_ct_generic_timeout = nf_ct_generic_timeout; + + ve_nf_conntrack_l4proto_generic = generic; + return 0; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +no_mem_compat: + free_sysctl_clone(generic->ctl_table); +#endif +no_mem_sys: + kfree(generic); +no_mem_ct: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_init); + +void nf_ct_proto_generic_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + free_sysctl_clone( + ve_nf_conntrack_l4proto_generic->ctl_compat_table); +#endif + free_sysctl_clone(ve_nf_conntrack_l4proto_generic->ctl_table); + } +} +EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_proto_tcp.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto_tcp.c --- linux-2.6.24/net/netfilter/nf_conntrack_proto_tcp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto_tcp.c 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -31,16 +32,16 @@ static DEFINE_RWLOCK(tcp_lock); /* "Be conservative in what you do, be liberal in what you accept from others." If it's non-zero, we mark only out of window RST segments as INVALID. */ -static int nf_ct_tcp_be_liberal __read_mostly = 0; +int nf_ct_tcp_be_liberal __read_mostly = 0; /* If it is set to zero, we disable picking up already established connections. */ -static int nf_ct_tcp_loose __read_mostly = 1; +int nf_ct_tcp_loose __read_mostly = 1; /* Max number of the retransmitted packets without receiving an (acceptable) ACK from the destination. If this number is reached, a shorter timer will be started. 
*/ -static int nf_ct_tcp_max_retrans __read_mostly = 3; +int nf_ct_tcp_max_retrans __read_mostly = 3; /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ @@ -63,21 +64,21 @@ static const char *tcp_conntrack_names[] #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int nf_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_established __read_mostly = 5 DAYS; -static unsigned int nf_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; -static unsigned int nf_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close __read_mostly = 10 SECS; +unsigned int nf_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; +unsigned int nf_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; +unsigned int nf_ct_tcp_timeout_established __read_mostly = 5 DAYS; +unsigned int nf_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; +unsigned int nf_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; +unsigned int nf_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; +unsigned int nf_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; +unsigned int nf_ct_tcp_timeout_close __read_mostly = 10 SECS; /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ -static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; +unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; -static unsigned int * tcp_timeouts[] = { +unsigned int * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -671,7 +672,7 @@ static int tcp_in_window(struct nf_conn } else { res = 0; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - nf_ct_tcp_be_liberal) + ve_nf_ct_tcp_be_liberal) res = 1; if (!res && LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, @@ -941,9 +942,9 @@ static int tcp_packet(struct nf_conn *co && (new_state == TCP_CONNTRACK_FIN_WAIT || new_state == TCP_CONNTRACK_CLOSE)) conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans - ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + timeout = conntrack->proto.tcp.retrans >= ve_nf_ct_tcp_max_retrans + && ve_nf_ct_tcp_timeouts[new_state] > ve_nf_ct_tcp_timeout_max_retrans + ? ve_nf_ct_tcp_timeout_max_retrans : ve_nf_ct_tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); @@ -1013,7 +1014,7 @@ static int tcp_new(struct nf_conn *connt tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); conntrack->proto.tcp.seen[1].flags = 0; - } else if (nf_ct_tcp_loose == 0) { + } else if (ve_nf_ct_tcp_loose == 0) { /* Don't try to pick up connections. 
*/ return 0; } else { @@ -1408,3 +1409,114 @@ struct nf_conntrack_l4proto nf_conntrack #endif }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_tcp_sysctl_init(void) +{ + struct nf_conntrack_l4proto *tcp4, *tcp6; + + if (ve_is_super(get_exec_env())) { + tcp4 = &nf_conntrack_l4proto_tcp4; + tcp6 = &nf_conntrack_l4proto_tcp6; + goto out; + } + + tcp4 = kmemdup(&nf_conntrack_l4proto_tcp4, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (tcp4 == NULL) + goto no_mem_ct4; + + tcp4->ctl_table_users = &ve_tcp_sysctl_table_users; + tcp4->ctl_table_header = &ve_tcp_sysctl_header; + tcp4->ctl_table = clone_sysctl_template(tcp_sysctl_table); + if (tcp4->ctl_table == NULL) + goto no_mem_sys; + + tcp4->ctl_table[0].data = &ve_nf_ct_tcp_timeouts[1]; + tcp4->ctl_table[1].data = &ve_nf_ct_tcp_timeouts[2]; + tcp4->ctl_table[2].data = &ve_nf_ct_tcp_timeouts[3]; + tcp4->ctl_table[3].data = &ve_nf_ct_tcp_timeouts[4]; + tcp4->ctl_table[4].data = &ve_nf_ct_tcp_timeouts[5]; + tcp4->ctl_table[5].data = &ve_nf_ct_tcp_timeouts[6]; + tcp4->ctl_table[6].data = &ve_nf_ct_tcp_timeouts[7]; + tcp4->ctl_table[7].data = &ve_nf_ct_tcp_timeouts[8]; + tcp4->ctl_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans; + tcp4->ctl_table[9].data = &ve_nf_ct_tcp_loose; + tcp4->ctl_table[10].data = &ve_nf_ct_tcp_be_liberal; + tcp4->ctl_table[11].data = &ve_nf_ct_tcp_max_retrans; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + tcp4->ctl_compat_table_header = ve_tcp_compat_sysctl_header; + tcp4->ctl_compat_table = clone_sysctl_template(tcp_compat_sysctl_table); + if (tcp4->ctl_compat_table == NULL) + goto no_mem_compat; + + tcp4->ctl_compat_table[0].data = &ve_nf_ct_tcp_timeouts[1]; + tcp4->ctl_compat_table[1].data = &ve_nf_ct_tcp_timeouts[2]; + tcp4->ctl_compat_table[2].data = &ve_nf_ct_tcp_timeouts[3]; + tcp4->ctl_compat_table[3].data = &ve_nf_ct_tcp_timeouts[4]; + tcp4->ctl_compat_table[4].data = &ve_nf_ct_tcp_timeouts[5]; + tcp4->ctl_compat_table[5].data = &ve_nf_ct_tcp_timeouts[6]; + tcp4->ctl_compat_table[6].data = &ve_nf_ct_tcp_timeouts[7]; + tcp4->ctl_compat_table[7].data = &ve_nf_ct_tcp_timeouts[8]; + tcp4->ctl_compat_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans; + tcp4->ctl_compat_table[9].data = &ve_nf_ct_tcp_loose; + tcp4->ctl_compat_table[10].data = &ve_nf_ct_tcp_be_liberal; + tcp4->ctl_compat_table[11].data = &ve_nf_ct_tcp_max_retrans; +#endif + + tcp6 = kmemdup(&nf_conntrack_l4proto_tcp6, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!tcp6) + goto no_mem_ct6; + + tcp6->ctl_table_users = &ve_tcp_sysctl_table_users; + tcp6->ctl_table_header = &ve_tcp_sysctl_header; + tcp6->ctl_table = tcp4->ctl_table; +out: + ve_nf_ct_tcp_timeouts[1] = nf_ct_tcp_timeout_syn_sent; + ve_nf_ct_tcp_timeouts[2] = nf_ct_tcp_timeout_syn_recv; + ve_nf_ct_tcp_timeouts[3] = nf_ct_tcp_timeout_established; + ve_nf_ct_tcp_timeouts[4] = nf_ct_tcp_timeout_fin_wait; + ve_nf_ct_tcp_timeouts[5] = nf_ct_tcp_timeout_close_wait; + ve_nf_ct_tcp_timeouts[6] = nf_ct_tcp_timeout_last_ack; + ve_nf_ct_tcp_timeouts[7] = nf_ct_tcp_timeout_time_wait; + ve_nf_ct_tcp_timeouts[8] = nf_ct_tcp_timeout_close; + ve_nf_ct_tcp_timeout_max_retrans = nf_ct_tcp_timeout_max_retrans; + ve_nf_ct_tcp_loose = nf_ct_tcp_loose; + ve_nf_ct_tcp_be_liberal = nf_ct_tcp_be_liberal; + ve_nf_ct_tcp_max_retrans = nf_ct_tcp_max_retrans; + + ve_nf_conntrack_l4proto_tcp4 = tcp4; + ve_nf_conntrack_l4proto_tcp6 = tcp6; + return 0; + +no_mem_ct6: +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + 
free_sysctl_clone(tcp4->ctl_compat_table); +no_mem_compat: +#endif + free_sysctl_clone(tcp4->ctl_table); +no_mem_sys: + kfree(tcp4); +no_mem_ct4: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_init); + +void nf_ct_proto_tcp_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + free_sysctl_clone( + ve_nf_conntrack_l4proto_tcp4->ctl_compat_table); +#endif + free_sysctl_clone(ve_nf_conntrack_l4proto_tcp4->ctl_table); + kfree(ve_nf_conntrack_l4proto_tcp4); + + kfree(ve_nf_conntrack_l4proto_tcp6); + } +} +EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ + diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_proto_udp.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto_udp.c --- linux-2.6.24/net/netfilter/nf_conntrack_proto_udp.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_proto_udp.c 2008-03-25 18:53:59.000000000 -0500 @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -78,12 +79,12 @@ static int udp_packet(struct nf_conn *co stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { nf_ct_refresh_acct(conntrack, ctinfo, skb, - nf_ct_udp_timeout_stream); + ve_nf_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) nf_conntrack_event_cache(IPCT_STATUS, skb); } else - nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout); + nf_ct_refresh_acct(conntrack, ctinfo, skb, ve_nf_ct_udp_timeout); return NF_ACCEPT; } @@ -238,3 +239,84 @@ struct nf_conntrack_l4proto nf_conntrack #endif }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); + +#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) +int nf_ct_proto_udp_sysctl_init(void) +{ + struct nf_conntrack_l4proto *udp4, *udp6; + + if (ve_is_super(get_exec_env())) { + udp4 = &nf_conntrack_l4proto_udp4; + udp6 = &nf_conntrack_l4proto_udp6; + goto out; + } + + udp4 = kmemdup(&nf_conntrack_l4proto_udp4, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (udp4 == NULL) + goto no_mem_ct4; + + udp4->ctl_table_users = &ve_udp_sysctl_table_users; + udp4->ctl_table_header = &ve_udp_sysctl_header; + udp4->ctl_table = clone_sysctl_template(udp_sysctl_table); + if (udp4->ctl_table == NULL) + goto no_mem_sys; + udp4->ctl_table[0].data = &ve_nf_ct_udp_timeout; + udp4->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + udp4->ctl_compat_table_header = ve_udp_compat_sysctl_header; + udp4->ctl_compat_table = clone_sysctl_template(udp_compat_sysctl_table); + if (udp4->ctl_compat_table == NULL) + goto no_mem_compat; + udp4->ctl_compat_table[0].data = &ve_nf_ct_udp_timeout; + udp4->ctl_compat_table[1].data = &ve_nf_ct_udp_timeout_stream; +#endif + + udp6 = kmemdup(&nf_conntrack_l4proto_udp6, + sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); + if (!udp6) + goto no_mem_ct6; + + udp6->ctl_table_users = &ve_udp_sysctl_table_users; + udp6->ctl_table_header = &ve_udp_sysctl_header; + udp6->ctl_table = udp4->ctl_table; + + udp6->ctl_table[0].data = &ve_nf_ct_udp_timeout; + udp6->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream; +out: + ve_nf_ct_udp_timeout = nf_ct_udp_timeout; + ve_nf_ct_udp_timeout_stream = nf_ct_udp_timeout_stream; + + ve_nf_conntrack_l4proto_udp4 = udp4; + ve_nf_conntrack_l4proto_udp6 = udp6; + return 0; + +no_mem_ct6: +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + free_sysctl_clone(udp4->ctl_compat_table); +no_mem_compat: +#endif + 
free_sysctl_clone(udp4->ctl_table); +no_mem_sys: + kfree(udp4); +no_mem_ct4: + return -ENOMEM; +} +EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_init); + +void nf_ct_proto_udp_sysctl_cleanup(void) +{ + if (!ve_is_super(get_exec_env())) { +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + free_sysctl_clone( + ve_nf_conntrack_l4proto_udp4->ctl_compat_table); +#endif + free_sysctl_clone(ve_nf_conntrack_l4proto_udp4->ctl_table); + kfree(ve_nf_conntrack_l4proto_udp4); + + kfree(ve_nf_conntrack_l4proto_udp6); + } +} +EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_cleanup); +#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ diff -uprN linux-2.6.24/net/netfilter/nf_conntrack_standalone.c linux-2.6.24.ovz/net/netfilter/nf_conntrack_standalone.c --- linux-2.6.24/net/netfilter/nf_conntrack_standalone.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_conntrack_standalone.c 2008-03-25 18:53:59.000000000 -0500 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #ifdef CONFIG_SYSCTL #include #endif +#include #include #include @@ -28,6 +30,10 @@ MODULE_LICENSE("GPL"); +int ip_conntrack_disable_ve0 = 0; +module_param(ip_conntrack_disable_ve0, int, 0440); +EXPORT_SYMBOL(ip_conntrack_disable_ve0); + #ifdef CONFIG_PROC_FS int print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, @@ -62,8 +68,8 @@ static struct hlist_node *ct_get_first(s for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - if (!hlist_empty(&nf_conntrack_hash[st->bucket])) - return nf_conntrack_hash[st->bucket].first; + if (!hlist_empty(&ve_nf_conntrack_hash[st->bucket])) + return ve_nf_conntrack_hash[st->bucket].first; } return NULL; } @@ -77,7 +83,7 @@ static struct hlist_node *ct_get_next(st while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = nf_conntrack_hash[st->bucket].first; + head = ve_nf_conntrack_hash[st->bucket].first; } return head; } @@ -244,7 +250,7 @@ static void ct_cpu_seq_stop(struct seq_f static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count); struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -294,6 +300,55 @@ static const struct file_operations ct_c .llseek = seq_lseek, .release = seq_release_private, }; + +static int nf_conntrack_init_ve_proc(struct ve_struct *ve) +{ + struct net *net = ve->ve_ns->net_ns; + struct proc_dir_entry *proc, *proc_stat; + int create_proc_net_stat_nf_conntrack = 1; + + proc = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); + if (!proc) + goto out; +#ifdef CONFIG_VE_IPTABLES + create_proc_net_stat_nf_conntrack = ve_is_super(get_exec_env()); +#endif + if (create_proc_net_stat_nf_conntrack) { + proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, + net->proc_net_stat); + if (!proc_stat) + goto out_rm_nf_conntrack_expect; + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; + } + return 0; +out_rm_nf_conntrack_expect: + proc_net_remove(net, "nf_conntrack"); +out: + return -ENOMEM; +} + +static void nf_conntrack_fini_ve_proc(struct ve_struct *ve) +{ + struct net *net = ve->ve_ns->net_ns; + int remove_proc_net_stat_nf_conntrack = 1; + +#ifdef CONFIG_VE_IPTABLES + remove_proc_net_stat_nf_conntrack = ve_is_super(get_exec_env()); +#endif + if (remove_proc_net_stat_nf_conntrack) + remove_proc_entry("nf_conntrack", net->proc_net_stat); + proc_net_remove(net, "nf_conntrack"); +} +#else +static 
inline int nf_conntrack_init_ve_proc(struct ve_struct *ve) +{ + return 0; +} + +static inline void nf_conntrack_fini_ve_proc(struct ve_struct *ve) +{ +} #endif /* CONFIG_PROC_FS */ /* Sysctl support */ @@ -395,61 +450,113 @@ static ctl_table nf_ct_net_table[] = { EXPORT_SYMBOL_GPL(nf_ct_log_invalid); #endif /* CONFIG_SYSCTL */ -static int __init nf_conntrack_standalone_init(void) +#if defined(CONFIG_SYSCTL) && defined(CONFIG_VE_IPTABLES) +static int nf_conntrack_init_ve_sysctl(struct ve_struct *ve) { -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc, *proc_stat; -#endif - int ret = 0; + ve_nf_ct_net_table = nf_ct_net_table; + ve_nf_ct_netfilter_table = nf_ct_netfilter_table; + ve_nf_ct_sysctl_table = nf_ct_sysctl_table; + + if (!ve_is_super(ve)) { + ve_nf_ct_net_table = clone_sysctl_template(nf_ct_net_table); + if (ve_nf_ct_net_table == NULL) + goto out; + } - ret = nf_conntrack_init(); - if (ret < 0) - return ret; + ve_nf_ct_netfilter_table = ve_nf_ct_net_table[0].child; + ve_nf_ct_netfilter_table[1].data = &ve_nf_conntrack_max; + ve_nf_ct_sysctl_table = ve_nf_ct_netfilter_table[0].child; + ve_nf_ct_sysctl_table[0].data = &ve_nf_conntrack_max; + ve_nf_ct_sysctl_table[1].data = &ve_nf_conntrack_count; + ve_nf_ct_sysctl_table[3].data = &ve_nf_conntrack_checksum; + ve_nf_ct_sysctl_table[4].data = &ve_nf_ct_log_invalid; + ve_nf_ct_sysctl_table[5].data = &ve_nf_ct_expect_max; + + ve_nf_ct_sysctl_header = register_sysctl_table(ve_nf_ct_net_table); + if (!ve_nf_ct_sysctl_header) + goto out_unclone; -#ifdef CONFIG_PROC_FS - proc = proc_net_fops_create(&init_net, "nf_conntrack", 0440, &ct_file_ops); - if (!proc) goto cleanup_init; + return 0; - proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, init_net.proc_net_stat); - if (!proc_stat) - goto cleanup_proc; +out_unclone: + if (!ve_is_super(ve)) + free_sysctl_clone(ve_nf_ct_net_table); +out: + return -ENOMEM; +} - proc_stat->proc_fops = &ct_cpu_seq_fops; - proc_stat->owner = THIS_MODULE; -#endif -#ifdef CONFIG_SYSCTL - nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table); - if (nf_ct_sysctl_header == NULL) { - printk("nf_conntrack: can't register to sysctl.\n"); - ret = -ENOMEM; - goto cleanup_proc_stat; - } -#endif - return ret; +static void nf_conntrack_fini_ve_sysctl(struct ve_struct *ve) +{ + unregister_sysctl_table(ve_nf_ct_sysctl_header); + if (!ve_is_super(ve)) + free_sysctl_clone(ve_nf_ct_net_table); +} +#else +static inline int nf_conntrack_init_ve_sysctl(struct ve_struct *ve) +{ + return 0; +} -#ifdef CONFIG_SYSCTL - cleanup_proc_stat: +static inline void nf_conntrack_fini_ve_sysctl(struct ve_struct *ve) +{ +} #endif -#ifdef CONFIG_PROC_FS - remove_proc_entry("nf_conntrack", init_net. 
proc_net_stat); - cleanup_proc: - proc_net_remove(&init_net, "nf_conntrack"); - cleanup_init: -#endif /* CNFIG_PROC_FS */ + +int nf_conntrack_init_ve(void) +{ + struct ve_struct *ve = get_exec_env(); + int err; + + err = nf_conntrack_init(); + if (err) + goto out; + + ve_nf_conntrack_checksum = nf_conntrack_checksum; + + err = nf_conntrack_init_ve_sysctl(ve); + if (err < 0) + goto out_generic; + + err = nf_conntrack_init_ve_proc(ve); + if (err < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + nf_conntrack_fini_ve_proc(ve); +out_generic: + nf_conntrack_cleanup(); +out: + return err; +} + +void nf_conntrack_cleanup_ve(void) +{ + nf_conntrack_fini_ve_proc(get_exec_env()); + nf_conntrack_fini_ve_sysctl(get_exec_env()); nf_conntrack_cleanup(); - return ret; +} +EXPORT_SYMBOL(nf_conntrack_cleanup_ve); + +static int __init nf_conntrack_standalone_init(void) +{ +#ifdef CONFIG_VE_IPTABLES + KSYMRESOLVE(nf_conntrack_init_ve); + KSYMRESOLVE(nf_conntrack_cleanup_ve); + KSYMMODRESOLVE(nf_conntrack); +#endif + return nf_conntrack_init_ve(); } static void __exit nf_conntrack_standalone_fini(void) { -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(nf_ct_sysctl_header); +#ifdef CONFIG_VE_IPTABLES + KSYMMODUNRESOLVE(nf_conntrack); + KSYMUNRESOLVE(nf_conntrack_init_ve); + KSYMUNRESOLVE(nf_conntrack_cleanup_ve); #endif -#ifdef CONFIG_PROC_FS - remove_proc_entry("nf_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "nf_conntrack"); -#endif /* CNFIG_PROC_FS */ - nf_conntrack_cleanup(); + nf_conntrack_cleanup_ve(); } module_init(nf_conntrack_standalone_init); diff -uprN linux-2.6.24/net/netfilter/nf_queue.c linux-2.6.24.ovz/net/netfilter/nf_queue.c --- linux-2.6.24/net/netfilter/nf_queue.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_queue.c 2008-03-25 18:53:59.000000000 -0500 @@ -236,12 +236,12 @@ void nf_reinject(struct sk_buff *skb, st /* Drop reference to owner of hook which queued us. */ module_put(info->elem->owner); - list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { + list_for_each_rcu(i, &ve_nf_hooks[info->pf][info->hook]) { if (i == elem) break; } - if (i == &nf_hooks[info->pf][info->hook]) { + if (i == &ve_nf_hooks[info->pf][info->hook]) { /* The module which sent it to userspace is gone. 
*/ NFDEBUG("%s: module disappeared, dropping packet.\n", __FUNCTION__); @@ -262,7 +262,7 @@ void nf_reinject(struct sk_buff *skb, st if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + verdict = nf_iterate(&ve_nf_hooks[info->pf][info->hook], skb, info->hook, info->indev, info->outdev, &elem, info->okfn, INT_MIN); diff -uprN linux-2.6.24/net/netfilter/nf_sockopt.c linux-2.6.24.ovz/net/netfilter/nf_sockopt.c --- linux-2.6.24/net/netfilter/nf_sockopt.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nf_sockopt.c 2008-03-25 18:53:59.000000000 -0500 @@ -65,8 +65,10 @@ static struct nf_sockopt_ops *nf_sockopt { struct nf_sockopt_ops *ops; - if (sk->sk_net != &init_net) +#ifdef CONFIG_VE_IPTABLES + if (!get_exec_env()->_nf_hooks) return ERR_PTR(-ENOPROTOOPT); +#endif if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) return ERR_PTR(-EINTR); diff -uprN linux-2.6.24/net/netfilter/nfnetlink.c linux-2.6.24.ovz/net/netfilter/nfnetlink.c --- linux-2.6.24/net/netfilter/nfnetlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nfnetlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -124,7 +124,7 @@ static int nfnetlink_rcv_msg(struct sk_b const struct nfnetlink_subsystem *ss; int type, err; - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; /* All the messages must at least contain nfgenmsg */ diff -uprN linux-2.6.24/net/netfilter/nfnetlink_queue.c linux-2.6.24.ovz/net/netfilter/nfnetlink_queue.c --- linux-2.6.24/net/netfilter/nfnetlink_queue.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/nfnetlink_queue.c 2008-03-25 18:53:59.000000000 -0500 @@ -726,9 +726,6 @@ nfqnl_rcv_dev_event(struct notifier_bloc { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev->ifindex); @@ -757,8 +754,7 @@ nfqnl_rcv_nl_event(struct notifier_block struct hlist_head *head = &instance_table[i]; hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - if ((n->net == &init_net) && - (n->pid == inst->peer_pid)) + if (n->pid == inst->peer_pid) __instance_destroy(inst); } } diff -uprN linux-2.6.24/net/netfilter/x_tables.c linux-2.6.24.ovz/net/netfilter/x_tables.c --- linux-2.6.24/net/netfilter/x_tables.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/x_tables.c 2008-03-25 18:53:59.000000000 -0500 @@ -26,6 +26,10 @@ #include #include +#include + +#include +#include MODULE_LICENSE("GPL"); @@ -44,6 +48,14 @@ struct xt_af { static struct xt_af *xt; +#ifdef CONFIG_VE_IPTABLES +/* include ve.h and define get_exec_env */ +#include +#define xt_tables(af) (get_exec_env()->_xt_tables[af]) +#else +#define xt_tables(af) xt[af].tables +#endif + #ifdef DEBUG_IP_FIREWALL_USER #define duprintf(format, args...) 
printk(format , ## args) #else @@ -62,6 +74,46 @@ static const char *xt_prefix[NPROTO] = { [NF_ARP] = "arp", }; +#ifdef CONFIG_BEANCOUNTERS +static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info) +{ + struct user_beancounter *ub; + + for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent); + return ub; +} + +static void uncharge_xtables(struct xt_table_info *info, unsigned long size) +{ + struct user_beancounter *ub; + + ub = xt_table_ub(info); + uncharge_beancounter(ub, UB_NUMXTENT, size); +} + +static int recharge_xtables(int check_ub, + struct xt_table_info *new, struct xt_table_info *old) +{ + struct user_beancounter *ub; + long change; + + ub = xt_table_ub(new); + BUG_ON(check_ub && ub != xt_table_ub(old)); + + change = (long)new->number - (long)old->number; + if (change > 0) { + if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT)) + return -ENOMEM; + } else if (change < 0) + uncharge_beancounter(ub, UB_NUMXTENT, -change); + + return 0; +} +#else +#define recharge_xtables(c, new, old) (0) +#define uncharge_xtables(info, s) do { } while (0) +#endif /* CONFIG_BEANCOUNTERS */ + /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) @@ -73,7 +125,7 @@ xt_register_target(struct xt_target *tar return ret; list_add(&target->list, &xt[af].target); mutex_unlock(&xt[af].mutex); - return ret; + return 0; } EXPORT_SYMBOL(xt_register_target); @@ -130,7 +182,7 @@ xt_register_match(struct xt_match *match list_add(&match->list, &xt[af].match); mutex_unlock(&xt[af].mutex); - return ret; + return 0; } EXPORT_SYMBOL(xt_register_match); @@ -310,23 +362,23 @@ int xt_check_match(const struct xt_match unsigned short proto, int inv_proto) { if (XT_ALIGN(match->matchsize) != size) { - printk("%s_tables: %s match: invalid size %Zu != %u\n", + ve_printk(VE_LOG, "%s_tables: %s match: invalid size %Zu != %u\n", xt_prefix[family], match->name, XT_ALIGN(match->matchsize), size); return -EINVAL; } if (match->table && strcmp(match->table, table)) { - printk("%s_tables: %s match: only valid in %s table, not %s\n", + ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table, not %s\n", xt_prefix[family], match->name, match->table, table); return -EINVAL; } if (match->hooks && (hook_mask & ~match->hooks) != 0) { - printk("%s_tables: %s match: bad hook_mask %u/%u\n", + ve_printk(VE_LOG, "%s_tables: %s match: bad hook_mask %u/%u\n", xt_prefix[family], match->name, hook_mask, match->hooks); return -EINVAL; } if (match->proto && (match->proto != proto || inv_proto)) { - printk("%s_tables: %s match: only valid for protocol %u\n", + ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol %u\n", xt_prefix[family], match->name, match->proto); return -EINVAL; } @@ -402,24 +454,24 @@ int xt_check_target(const struct xt_targ unsigned short proto, int inv_proto) { if (XT_ALIGN(target->targetsize) != size) { - printk("%s_tables: %s target: invalid size %Zu != %u\n", + ve_printk(VE_LOG, "%s_tables: %s target: invalid size %Zu != %u\n", xt_prefix[family], target->name, XT_ALIGN(target->targetsize), size); return -EINVAL; } if (target->table && strcmp(target->table, table)) { - printk("%s_tables: %s target: only valid in %s table, not %s\n", + ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s table, not %s\n", xt_prefix[family], target->name, target->table, table); return -EINVAL; } if (target->hooks && (hook_mask & ~target->hooks) != 0) { - printk("%s_tables: %s target: bad hook_mask %u/%u\n", + ve_printk(VE_LOG, "%s_tables: %s target: bad hook_mask 
%u/%u\n", xt_prefix[family], target->name, hook_mask, target->hooks); return -EINVAL; } if (target->proto && (target->proto != proto || inv_proto)) { - printk("%s_tables: %s target: only valid for protocol %u\n", + ve_printk(VE_LOG, "%s_tables: %s target: only valid for protocol %u\n", xt_prefix[family], target->name, target->proto); return -EINVAL; } @@ -499,19 +551,19 @@ struct xt_table_info *xt_alloc_table_inf if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) return NULL; - newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); + newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL_UBC); if (!newinfo) return NULL; - newinfo->size = size; + newinfo->alloc_size = newinfo->size = size; for_each_possible_cpu(cpu) { if (size <= PAGE_SIZE) newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, + GFP_KERNEL_UBC, cpu_to_node(cpu)); else - newinfo->entries[cpu] = vmalloc_node(size, + newinfo->entries[cpu] = ub_vmalloc_node(size, cpu_to_node(cpu)); if (newinfo->entries[cpu] == NULL) { @@ -529,7 +581,7 @@ void xt_free_table_info(struct xt_table_ int cpu; for_each_possible_cpu(cpu) { - if (info->size <= PAGE_SIZE) + if (info->alloc_size <= PAGE_SIZE) kfree(info->entries[cpu]); else vfree(info->entries[cpu]); @@ -546,7 +598,7 @@ struct xt_table *xt_find_table_lock(int if (mutex_lock_interruptible(&xt[af].mutex) != 0) return ERR_PTR(-EINTR); - list_for_each_entry(t, &xt[af].tables, list) + list_for_each_entry(t, &xt_tables(af), list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; mutex_unlock(&xt[af].mutex); @@ -594,6 +646,13 @@ xt_replace_table(struct xt_table *table, return NULL; } oldinfo = private; + + if (recharge_xtables(num_counters != 0, newinfo, oldinfo)) { + write_unlock_bh(&table->lock); + *error = -ENOMEM; + return NULL; + } + table->private = newinfo; newinfo->initial_entries = oldinfo->initial_entries; write_unlock_bh(&table->lock); @@ -615,7 +674,7 @@ int xt_register_table(struct xt_table *t return ret; /* Don't autoload: we'd eat our tail... 
*/ - list_for_each_entry(t, &xt[table->af].tables, list) { + list_for_each_entry(t, &xt_tables(table->af), list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto unlock; @@ -634,7 +693,7 @@ int xt_register_table(struct xt_table *t /* save number of initial entries */ private->initial_entries = private->number; - list_add(&table->list, &xt[table->af].tables); + list_add(&table->list, &xt_tables(table->af)); ret = 0; unlock: @@ -643,6 +702,39 @@ int xt_register_table(struct xt_table *t } EXPORT_SYMBOL_GPL(xt_register_table); +struct xt_table * virt_xt_register_table(struct xt_table *table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo) +{ + int ret; + struct module *mod = table->me; + + if (!ve_is_super(get_exec_env())) { + struct xt_table *tmp; + __module_get(mod); + ret = -ENOMEM; + tmp = kmalloc(sizeof(struct xt_table), GFP_KERNEL_UBC); + if (!tmp) + goto nomem; + memcpy(tmp, table, sizeof(struct xt_table)); + table = tmp; + } + + ret = xt_register_table(table, bootstrap, newinfo); + if (ret) + goto out; + + return table; +out: + if (!ve_is_super(get_exec_env())) { + kfree(table); +nomem: + module_put(mod); + } + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(virt_xt_register_table); + void *xt_unregister_table(struct xt_table *table) { struct xt_table_info *private; @@ -652,10 +744,25 @@ void *xt_unregister_table(struct xt_tabl list_del(&table->list); mutex_unlock(&xt[table->af].mutex); + uncharge_xtables(private, private->number); + return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); +void *virt_xt_unregister_table(struct xt_table *table) +{ + void *ret; + + ret = xt_unregister_table(table); + if (!ve_is_super(get_exec_env())) { + module_put(table->me); + kfree(table); + } + return ret; +} +EXPORT_SYMBOL_GPL(virt_xt_unregister_table); + #ifdef CONFIG_PROC_FS static struct list_head *xt_get_idx(struct list_head *list, struct seq_file *seq, loff_t pos) { @@ -684,7 +791,7 @@ static struct list_head *type2list(u_int list = &xt[af].match; break; case TABLE: - list = &xt[af].tables; + list = &xt_tables(af); break; default: list = NULL; @@ -797,6 +904,7 @@ int xt_proto_init(int af) return -EINVAL; + INIT_LIST_HEAD(&xt_tables(af)); #ifdef CONFIG_PROC_FS strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); @@ -885,6 +993,6 @@ static void __exit xt_fini(void) kfree(xt); } -module_init(xt_init); +subsys_initcall(xt_init); module_exit(xt_fini); diff -uprN linux-2.6.24/net/netfilter/xt_MARK.c linux-2.6.24.ovz/net/netfilter/xt_MARK.c --- linux-2.6.24/net/netfilter/xt_MARK.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/xt_MARK.c 2008-03-25 18:53:59.000000000 -0500 @@ -75,7 +75,7 @@ checkentry_v0(const char *tablename, const struct xt_mark_target_info *markinfo = targinfo; if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n"); return false; } return true; @@ -93,12 +93,12 @@ checkentry_v1(const char *tablename, if (markinfo->mode != XT_MARK_SET && markinfo->mode != XT_MARK_AND && markinfo->mode != XT_MARK_OR) { - printk(KERN_WARNING "MARK: unknown mode %u\n", + ve_printk(VE_LOG, KERN_WARNING "MARK: unknown mode %u\n", markinfo->mode); return false; } if (markinfo->mark > 0xffffffff) { - printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n"); return false; } return true; diff -uprN 
linux-2.6.24/net/netfilter/xt_TCPMSS.c linux-2.6.24.ovz/net/netfilter/xt_TCPMSS.c --- linux-2.6.24/net/netfilter/xt_TCPMSS.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/xt_TCPMSS.c 2008-03-25 18:53:59.000000000 -0500 @@ -63,7 +63,7 @@ tcpmss_mangle_packet(struct sk_buff *skb badly. --RR */ if (tcplen != tcph->doff*4) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", skb->len); return -1; } @@ -71,7 +71,7 @@ tcpmss_mangle_packet(struct sk_buff *skb if (info->mss == XT_TCPMSS_CLAMP_PMTU) { if (dst_mtu(skb->dst) <= minlen) { if (net_ratelimit()) - printk(KERN_ERR "xt_TCPMSS: " + ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: " "unknown or invalid path-MTU (%u)\n", dst_mtu(skb->dst)); return -1; @@ -217,13 +217,13 @@ xt_tcpmss_checkentry4(const char *tablen (hook_mask & ~((1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " + ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; } if (IPT_MATCH_ITERATE(e, find_syn_match)) return true; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); return false; } @@ -242,13 +242,13 @@ xt_tcpmss_checkentry6(const char *tablen (hook_mask & ~((1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) != 0) { - printk("xt_TCPMSS: path-MTU clamping only supported in " + ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; } if (IP6T_MATCH_ITERATE(e, find_syn_match)) return true; - printk("xt_TCPMSS: Only works on TCP SYN packets\n"); + ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); return false; } #endif diff -uprN linux-2.6.24/net/netfilter/xt_hashlimit.c linux-2.6.24.ovz/net/netfilter/xt_hashlimit.c --- linux-2.6.24/net/netfilter/xt_hashlimit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/xt_hashlimit.c 2008-03-25 18:53:59.000000000 -0500 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -36,8 +37,13 @@ MODULE_ALIAS("ipt_hashlimit"); MODULE_ALIAS("ip6t_hashlimit"); /* need to declare this at the top */ +#ifdef CONFIG_VE_IPTABLES +#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4) +#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6) +#else static struct proc_dir_entry *hashlimit_procdir4; static struct proc_dir_entry *hashlimit_procdir6; +#endif static const struct file_operations dl_file_ops; /* hash table crap */ @@ -92,7 +98,11 @@ struct xt_hashlimit_htable { static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +#ifdef CONFIG_VE_IPTABLES +#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables) +#else static HLIST_HEAD(hashlimit_htables); +#endif static struct kmem_cache *hashlimit_cachep __read_mostly; static inline bool dst_cmp(const struct dsthash_ent *ent, @@ -440,6 +450,9 @@ hashlimit_init_dst(const struct xt_hashl return 0; } +static int init_xt_hashlimit(struct ve_struct *ve); +static void fini_xt_hashlimit(struct ve_struct *ve); + static bool hashlimit_match(const struct sk_buff *skb, const struct net_device *in, @@ -528,6 +541,9 @@ hashlimit_checkentry(const char *tablena if 
(r->name[sizeof(r->name) - 1] != '\0') return false; + if (init_xt_hashlimit(get_exec_env())) + return 0; + /* This is the best we've got: We cannot release and re-grab lock, * since checkentry() is called before x_tables.c grabs xt_mutex. * We also cannot grab the hashtable spinlock, since htable_create will @@ -553,6 +569,8 @@ hashlimit_destroy(const struct xt_match const struct xt_hashlimit_info *r = matchinfo; htable_put(r->hinfo); + if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) + fini_xt_hashlimit(get_exec_env()); } #ifdef CONFIG_COMPAT @@ -728,6 +746,59 @@ static const struct file_operations dl_f .release = seq_release }; +static int init_xt_hashlimit(struct ve_struct *ve) +{ + struct proc_dir_entry *proc_net = ve->ve_ns->net_ns->proc_net; + +#if defined(CONFIG_VE_IPTABLES) + if (ve->_xt_hashlimit) + return 0; + + ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL); + if (!ve->_xt_hashlimit) + goto err1; +#endif + INIT_HLIST_HEAD(&hashlimit_htables); + + hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net); + if (!hashlimit_procdir4) { + printk(KERN_ERR "xt_hashlimit: unable to create proc dir " + "entry\n"); + goto err2; + } + hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net); + if (!hashlimit_procdir6) { + printk(KERN_ERR "xt_hashlimit: unable to create proc dir " + "entry\n"); + goto err3; + } + + return 0; + +err3: + remove_proc_entry("ipt_hashlimit", proc_net); +err2: +#if defined(CONFIG_VE_IPTABLES) + kfree(ve->_xt_hashlimit); + ve->_xt_hashlimit = NULL; +#endif +err1: + return -ENOMEM; +} + +static void fini_xt_hashlimit(struct ve_struct *ve) +{ + struct proc_dir_entry *proc_net = ve->ve_ns->net_ns->proc_net; + + remove_proc_entry("ip6t_hashlimit", proc_net); + remove_proc_entry("ipt_hashlimit", proc_net); + +#if defined(CONFIG_VE_IPTABLES) + kfree(ve->_xt_hashlimit); + ve->_xt_hashlimit = NULL; +#endif +} + static int __init xt_hashlimit_init(void) { int err; @@ -744,21 +815,10 @@ static int __init xt_hashlimit_init(void printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n"); goto err2; } - hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net); - if (!hashlimit_procdir4) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); + err = init_xt_hashlimit(get_exec_env()); + if (err) goto err3; - } - hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net); - if (!hashlimit_procdir6) { - printk(KERN_ERR "xt_hashlimit: unable to create proc dir " - "entry\n"); - goto err4; - } return 0; -err4: - remove_proc_entry("ipt_hashlimit", init_net.proc_net); err3: kmem_cache_destroy(hashlimit_cachep); err2: @@ -770,8 +830,7 @@ err1: static void __exit xt_hashlimit_fini(void) { - remove_proc_entry("ipt_hashlimit", init_net.proc_net); - remove_proc_entry("ip6t_hashlimit", init_net.proc_net); + fini_xt_hashlimit(get_exec_env()); kmem_cache_destroy(hashlimit_cachep); xt_unregister_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); } diff -uprN linux-2.6.24/net/netfilter/xt_limit.c linux-2.6.24.ovz/net/netfilter/xt_limit.c --- linux-2.6.24/net/netfilter/xt_limit.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netfilter/xt_limit.c 2008-03-25 18:53:59.000000000 -0500 @@ -111,7 +111,7 @@ ipt_limit_checkentry(const char *tablena /* Check for overflow. 
*/ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Overflow in xt_limit, try lower: %u/%u\n", + ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n", r->avg, r->burst); return false; } diff -uprN linux-2.6.24/net/netlink/af_netlink.c linux-2.6.24.ovz/net/netlink/af_netlink.c --- linux-2.6.24/net/netlink/af_netlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netlink/af_netlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -61,29 +61,14 @@ #include #include #include +#include + +#include +#include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 pid; - u32 dst_pid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - struct module *module; -}; - #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 @@ -225,7 +210,9 @@ static __inline__ struct sock *netlink_l read_lock(&nl_table_lock); head = nl_pid_hashfn(hash, pid); sk_for_each(sk, node, head) { - if ((sk->sk_net == net) && (nlk_sk(sk)->pid == pid)) { + /* VEs should find sockets, created by kernel */ + if (nlk_sk(sk)->pid == pid && (netlink_is_kernel(sk) || + ve_accessible_strict(sk->owner_env, get_exec_env()))) { sock_hold(sk); goto found; } @@ -345,7 +332,8 @@ static int netlink_insert(struct sock *s head = nl_pid_hashfn(hash, pid); len = 0; sk_for_each(osk, node, head) { - if ((osk->sk_net == net) && (nlk_sk(osk)->pid == pid)) + if ((osk->sk_net == net) && (nlk_sk(osk)->pid == pid) && + ve_accessible_strict(osk->owner_env, get_exec_env())) break; len++; } @@ -399,6 +387,8 @@ static int __netlink_create(struct net * sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); if (!sk) return -ENOMEM; + if (ub_other_sock_charge(sk)) + goto out_free; sock_init_data(sock, sk); @@ -414,6 +404,10 @@ static int __netlink_create(struct net * sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; + +out_free: + sk_free(sk); + return -ENOMEM; } static int netlink_create(struct net *net, struct socket *sock, int protocol) @@ -516,7 +510,7 @@ static int netlink_autobind(struct socke struct hlist_head *head; struct sock *osk; struct hlist_node *node; - s32 pid = current->tgid; + s32 pid = task_tgid_vnr(current); int err; static s32 rover = -4097; @@ -527,6 +521,8 @@ retry: sk_for_each(osk, node, head) { if ((osk->sk_net != net)) continue; + if (!ve_accessible_strict(osk->owner_env, get_exec_env())) + continue; if (nlk_sk(osk)->pid == pid) { /* Bind collision, search negative pid values. 
*/ pid = rover--; @@ -552,7 +548,7 @@ retry: static inline int netlink_capable(struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || - capable(CAP_NET_ADMIN); + capable(CAP_VE_NET_ADMIN); } static void @@ -755,12 +751,20 @@ int netlink_attachskb(struct sock *sk, s long *timeo, struct sock *ssk) { struct netlink_sock *nlk; + unsigned long chargesize; + int no_ubc; nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + chargesize = skb_charge_fullsize(skb); + no_ubc = ub_sock_getwres_other(sk, chargesize); + if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); + + if (!no_ubc) + ub_sock_retwres_other(sk, chargesize, + SOCK_MIN_UBCSPACE_CH); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); @@ -772,13 +776,20 @@ int netlink_attachskb(struct sock *sk, s __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); + /* this if can't be moved upper because ub_sock_snd_queue_add() + * may change task state to TASK_RUNNING */ + if (no_ubc) + ub_sock_sndqueueadd_other(sk, chargesize); + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && + test_bit(0, &nlk->state) || no_ubc) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); + if (no_ubc) + ub_sock_sndqueuedel(sk); sock_put(sk); if (signal_pending(current)) { @@ -788,6 +799,7 @@ int netlink_attachskb(struct sock *sk, s return 1; } skb_set_owner_r(skb, sk); + ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); return 0; } @@ -944,7 +956,7 @@ static inline int do_one_broadcast(struc !test_bit(p->group - 1, nlk->groups)) goto out; - if ((sk->sk_net != p->net)) + if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) goto out; if (p->failure) { @@ -1049,6 +1061,9 @@ static inline int do_one_set_err(struct !test_bit(p->group - 1, nlk->groups)) goto out; + if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) + goto out; + sk->sk_err = p->code; sk->sk_error_report(sk); out: @@ -1485,6 +1500,10 @@ static int netlink_dump(struct sock *sk) skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); if (!skb) goto errout; + if (ub_nlrcvbuf_charge(skb, sk) < 0) { + kfree_skb(skb); + return -EACCES; + } mutex_lock(nlk->cb_mutex); diff -uprN linux-2.6.24/net/netlink/attr.c linux-2.6.24.ovz/net/netlink/attr.c --- linux-2.6.24/net/netlink/attr.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netlink/attr.c 2008-03-25 18:53:59.000000000 -0500 @@ -163,7 +163,7 @@ int nla_parse(struct nlattr *tb[], int m } if (unlikely(rem > 0)) - printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing " "attributes.\n", rem); err = 0; diff -uprN linux-2.6.24/net/netlink/genetlink.c linux-2.6.24.ovz/net/netlink/genetlink.c --- linux-2.6.24/net/netlink/genetlink.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/netlink/genetlink.c 2008-03-25 18:53:59.000000000 -0500 @@ -439,7 +439,7 @@ static int genl_rcv_msg(struct sk_buff * return -EOPNOTSUPP; if ((ops->flags & GENL_ADMIN_PERM) && - security_netlink_recv(skb, CAP_NET_ADMIN)) + security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if (nlh->nlmsg_flags & NLM_F_DUMP) { diff -uprN linux-2.6.24/net/packet/af_packet.c linux-2.6.24.ovz/net/packet/af_packet.c --- linux-2.6.24/net/packet/af_packet.c 2008-01-24 
17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/packet/af_packet.c 2008-03-25 18:53:59.000000000 -0500 @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -80,6 +81,8 @@ #include #include +#include + #ifdef CONFIG_INET #include #endif @@ -246,9 +249,6 @@ static int packet_rcv_spkt(struct sk_buf struct sock *sk; struct sockaddr_pkt *spkt; - if (dev->nd_net != &init_net) - goto out; - /* * When we registered the protocol we saved the socket in the data * field for just this event. @@ -267,7 +267,8 @@ static int packet_rcv_spkt(struct sk_buf * so that this procedure is noop. */ - if (skb->pkt_type == PACKET_LOOPBACK) + if (skb->pkt_type == PACKET_LOOPBACK || + !ve_accessible(skb->owner_env, sk->owner_env)) goto out; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) @@ -341,7 +342,7 @@ static int packet_sendmsg_spkt(struct ki */ saddr->spkt_device[13] = 0; - dev = dev_get_by_name(&init_net, saddr->spkt_device); + dev = dev_get_by_name(current->nsproxy->net_ns, saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; @@ -449,15 +450,17 @@ static int packet_rcv(struct sk_buff *sk int skb_len = skb->len; unsigned int snaplen, res; - if (dev->nd_net != &init_net) - goto drop; - if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); + if (!ve_accessible(skb->owner_env, sk->owner_env)) + goto drop; + + skb_orphan(skb); + skb->dev = dev; if (dev->header_ops) { @@ -521,6 +524,9 @@ static int packet_rcv(struct sk_buff *sk if (pskb_trim(skb, snaplen)) goto drop_n_acct; + if (ub_sockrcvbuf_charge(sk, skb)) + goto drop_n_acct; + skb_set_owner_r(skb, sk); skb->dev = NULL; dst_release(skb->dst); @@ -566,15 +572,17 @@ static int tpacket_rcv(struct sk_buff *s struct sk_buff *copy_skb = NULL; struct timeval tv; - if (dev->nd_net != &init_net) - goto drop; - if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); + if (!ve_accessible(skb->owner_env, sk->owner_env)) + goto drop; + + skb_orphan(skb); + if (dev->header_ops) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); @@ -621,6 +629,12 @@ static int tpacket_rcv(struct sk_buff *s snaplen = 0; } + if (copy_skb && + ub_sockrcvbuf_charge(sk, copy_skb)) { + spin_lock(&sk->sk_receive_queue.lock); + goto ring_is_full; + } + spin_lock(&sk->sk_receive_queue.lock); h = packet_lookup_frame(po, po->head); @@ -732,7 +746,7 @@ static int packet_sendmsg(struct kiocb * } - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, ifindex); err = -ENXIO; if (dev == NULL) goto out_unlock; @@ -916,7 +930,7 @@ static int packet_bind_spkt(struct socke return -EINVAL; strlcpy(name,uaddr->sa_data,sizeof(name)); - dev = dev_get_by_name(&init_net, name); + dev = dev_get_by_name(current->nsproxy->net_ns, name); if (dev) { err = packet_do_bind(sk, dev, pkt_sk(sk)->num); dev_put(dev); @@ -943,7 +957,7 @@ static int packet_bind(struct socket *so if (sll->sll_ifindex) { err = -ENODEV; - dev = dev_get_by_index(&init_net, sll->sll_ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, sll->sll_ifindex); if (dev == NULL) goto out; } @@ -972,9 +986,6 @@ static int packet_create(struct net *net __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; - if (net != &init_net) - return -EAFNOSUPPORT; - if (!capable(CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && @@ -987,6 +998,8 @@ static int packet_create(struct net *net sk = sk_alloc(net, PF_PACKET, 
GFP_KERNEL, &packet_proto); if (sk == NULL) goto out; + if (ub_other_sock_charge(sk)) + goto out_free; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) @@ -1024,6 +1037,9 @@ static int packet_create(struct net *net sk_add_node(sk, &packet_sklist); write_unlock_bh(&packet_sklist_lock); return(0); + +out_free: + sk_free(sk); out: return err; } @@ -1140,7 +1156,7 @@ static int packet_getname_spkt(struct so return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - dev = dev_get_by_index(&init_net, pkt_sk(sk)->ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, pkt_sk(sk)->ifindex); if (dev) { strlcpy(uaddr->sa_data, dev->name, 15); dev_put(dev); @@ -1165,7 +1181,7 @@ static int packet_getname(struct socket sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; sll->sll_protocol = po->num; - dev = dev_get_by_index(&init_net, po->ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, po->ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; @@ -1217,7 +1233,7 @@ static int packet_mc_add(struct sock *sk rtnl_lock(); err = -ENODEV; - dev = __dev_get_by_index(&init_net, mreq->mr_ifindex); + dev = __dev_get_by_index(current->nsproxy->net_ns, mreq->mr_ifindex); if (!dev) goto done; @@ -1271,7 +1287,7 @@ static int packet_mc_drop(struct sock *s if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; - dev = dev_get_by_index(&init_net, ml->ifindex); + dev = dev_get_by_index(current->nsproxy->net_ns, ml->ifindex); if (dev) { packet_dev_mc(dev, ml, -1); dev_put(dev); @@ -1299,7 +1315,7 @@ static void packet_flush_mclist(struct s struct net_device *dev; po->mclist = ml->next; - if ((dev = dev_get_by_index(&init_net, ml->ifindex)) != NULL) { + if ((dev = dev_get_by_index(current->nsproxy->net_ns, ml->ifindex)) != NULL) { packet_dev_mc(dev, ml, -1); dev_put(dev); } @@ -1455,14 +1471,16 @@ static int packet_notifier(struct notifi struct sock *sk; struct hlist_node *node; struct net_device *dev = data; + struct ve_struct *ve; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - + ve = get_exec_env(); read_lock(&packet_sklist_lock); sk_for_each(sk, node, &packet_sklist) { struct packet_sock *po = pkt_sk(sk); + if (!ve_accessible_strict(sk->owner_env, ve)) + continue; + switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) @@ -1868,6 +1886,8 @@ static inline struct sock *packet_seq_id struct hlist_node *node; sk_for_each(s, node, &packet_sklist) { + if (!ve_accessible(s->owner_env, get_exec_env())) + continue; if (!off--) return s; } @@ -1883,9 +1903,14 @@ static void *packet_seq_start(struct seq static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return (v == SEQ_START_TOKEN) - ? sk_head(&packet_sklist) - : sk_next((struct sock*)v) ; + do { + v = (v == SEQ_START_TOKEN) + ? 
sk_head(&packet_sklist) + : sk_next((struct sock*)v); + } while (v != NULL && + !ve_accessible(((struct sock*)v)->owner_env, + get_exec_env())); + return v; } static void packet_seq_stop(struct seq_file *seq, void *v) diff -uprN linux-2.6.24/net/sched/sch_cbq.c linux-2.6.24.ovz/net/sched/sch_cbq.c --- linux-2.6.24/net/sched/sch_cbq.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sched/sch_cbq.c 2008-03-25 18:53:59.000000000 -0500 @@ -905,8 +905,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int if (cl->deficit <= 0) { q->active[prio] = cl; - cl = cl->next_alive; cl->deficit += cl->quantum; + cl = cl->next_alive; } return skb; @@ -1078,17 +1078,19 @@ static void cbq_normalize_quanta(struct for (h=0; h<16; h++) { for (cl = q->classes[h]; cl; cl = cl->next) { + long mtu; /* BUGGGG... Beware! This expression suffer of arithmetic overflows! */ if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); - cl->quantum = cl->qdisc->dev->mtu/2 + 1; + cl->quantum = (cl->weight * cl->allot) / + (q->quanta[prio] / q->nclasses[prio]); } + mtu = cl->qdisc->dev->mtu; + if (cl->quantum <= mtu/2) + cl->quantum = mtu/2 + 1; + else if (cl->quantum > 32*mtu) + cl->quantum = 32*mtu; } } } diff -uprN linux-2.6.24/net/sched/sch_generic.c linux-2.6.24.ovz/net/sched/sch_generic.c --- linux-2.6.24/net/sched/sch_generic.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sched/sch_generic.c 2008-03-25 18:53:59.000000000 -0500 @@ -135,11 +135,13 @@ static inline int qdisc_restart(struct n struct Qdisc *q = dev->qdisc; struct sk_buff *skb; int ret = NETDEV_TX_BUSY; + struct ve_struct *old_ve; /* Dequeue packet */ if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL)) return 0; + old_ve = set_exec_env(skb->owner_env); /* And release queue */ spin_unlock(&dev->queue_lock); @@ -173,6 +175,8 @@ static inline int qdisc_restart(struct n break; } + (void)set_exec_env(old_ve); + return ret; } diff -uprN linux-2.6.24/net/sched/sch_teql.c linux-2.6.24.ovz/net/sched/sch_teql.c --- linux-2.6.24/net/sched/sch_teql.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sched/sch_teql.c 2008-03-25 18:53:59.000000000 -0500 @@ -174,6 +174,9 @@ static int teql_qdisc_init(struct Qdisc struct teql_master *m = (struct teql_master*)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (dev->hard_header_len > m->dev->hard_header_len) return -EINVAL; diff -uprN linux-2.6.24/net/socket.c linux-2.6.24.ovz/net/socket.c --- linux-2.6.24/net/socket.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/socket.c 2008-03-25 18:53:59.000000000 -0500 @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -155,15 +156,6 @@ static DEFINE_PER_CPU(int, sockets_in_us * divide and look after the messy bits. */ -#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - - 16 for IP, 16 for IPX, - 24 for IPv6, - about 80 for AX.25 - must be at least one bigger than - the AF_UNIX size (see net/unix/af_unix.c - :unix_mkname()). - */ - /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space @@ -492,6 +484,8 @@ static struct socket *sock_alloc(void) return sock; } +EXPORT_SYMBOL(sock_alloc); + /* * In theory you can't get an open on this inode, but /proc provides * a back door. 
Remember to keep it shut otherwise you'll let the @@ -739,7 +733,6 @@ static ssize_t sock_aio_read(struct kioc if (iocb->ki_left == 0) /* Match SYS5 behaviour */ return 0; - x = alloc_sock_iocb(iocb, &siocb); if (!x) return -ENOMEM; @@ -1076,6 +1069,48 @@ call_kill: return 0; } +int vz_security_family_check(int family) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (family) { + case PF_UNSPEC: + case PF_PACKET: + case PF_NETLINK: + case PF_UNIX: + case PF_INET: + case PF_INET6: + break; + default: + return -EAFNOSUPPORT; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(vz_security_family_check); + +int vz_security_protocol_check(int protocol) +{ +#ifdef CONFIG_VE + if (ve_is_super(get_exec_env())) + return 0; + + switch (protocol) { + case IPPROTO_IP: + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_RAW: + break; + default: + return -EAFNOSUPPORT; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(vz_security_protocol_check); + static int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { @@ -1106,6 +1141,11 @@ static int __sock_create(struct net *net family = PF_PACKET; } + /* VZ compatibility layer */ + err = vz_security_family_check(family); + if (err < 0) + return err; + err = security_socket_create(family, type, protocol, kern); if (err) return err; @@ -2311,9 +2351,12 @@ int kernel_sock_ioctl(struct socket *soc { mm_segment_t oldfs = get_fs(); int err; + struct ve_struct *old_env; set_fs(KERNEL_DS); + old_env = set_exec_env(sock->sk->owner_env); err = sock->ops->ioctl(sock, cmd, arg); + (void)set_exec_env(old_env); set_fs(oldfs); return err; diff -uprN linux-2.6.24/net/sunrpc/clnt.c linux-2.6.24.ovz/net/sunrpc/clnt.c --- linux-2.6.24/net/sunrpc/clnt.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/clnt.c 2008-03-25 18:53:59.000000000 -0500 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,35 @@ static void rpc_unregister_client(struct spin_unlock(&rpc_client_lock); } +/* + * Grand abort timeout (stop the client if occures) + */ +int xprt_abort_timeout = RPC_MAX_ABORT_TIMEOUT; + +static int rpc_abort_hard(struct rpc_task *task) +{ + struct rpc_clnt *clnt; + clnt = task->tk_client; + + if (clnt->cl_pr_time == 0) { + clnt->cl_pr_time = jiffies; + return 0; + } + if (xprt_abort_timeout == RPC_MAX_ABORT_TIMEOUT) + return 0; + if (time_before(jiffies, clnt->cl_pr_time + xprt_abort_timeout * HZ)) + return 0; + + clnt->cl_broken = 1; + rpc_killall_tasks(clnt); + return -ETIMEDOUT; +} + +static void rpc_abort_clear(struct rpc_task *task) +{ + task->tk_client->cl_pr_time = 0; +} + static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { @@ -253,6 +283,7 @@ struct rpc_clnt *rpc_create(struct rpc_c if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; + xprt->owner_env = get_ve(get_exec_env()); /* * If the caller chooses not to specify a hostname, whip * up a string representation of the passed-in address. 
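
The net/socket.c hunk above adds vz_security_family_check()/vz_security_protocol_check(): inside a container (a non-super VE) only a fixed set of address families and IP protocols is admitted, and __sock_create() rejects everything else with -EAFNOSUPPORT before the LSM hook runs. Below is a minimal, stand-alone userspace sketch of that whitelist idea, not the kernel code itself; the helper name and the example families printed in main() are illustrative only.

/* Stand-alone illustration of the switch-based family whitelist used by
 * vz_security_family_check() above.  Compile with: cc -o famcheck famcheck.c */
#include <stdio.h>
#include <errno.h>
#include <sys/socket.h>

static int family_allowed_in_container(int family)
{
        switch (family) {
        case AF_UNSPEC:
        case AF_PACKET:
        case AF_NETLINK:
        case AF_UNIX:
        case AF_INET:
        case AF_INET6:
                return 0;                /* permitted inside a VE */
        default:
                return -EAFNOSUPPORT;    /* e.g. AF_AX25, AF_APPLETALK, ... */
        }
}

int main(void)
{
        printf("AF_INET -> %d\n", family_allowed_in_container(AF_INET));
        printf("AF_AX25 -> %d\n", family_allowed_in_container(AF_AX25));
        return 0;
}
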
@@ -277,13 +308,16 @@ struct rpc_clnt *rpc_create(struct rpc_c clnt = rpc_new_client(xprt, args->servername, args->program, args->version, args->authflavor); - if (IS_ERR(clnt)) + if (IS_ERR(clnt)) { + put_ve(xprt->owner_env); return clnt; + } if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR); if (err != 0) { rpc_shutdown_client(clnt); + put_ve(xprt->owner_env); return ERR_PTR(err); } } @@ -322,6 +356,7 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; INIT_LIST_HEAD(&new->cl_tasks); spin_lock_init(&new->cl_lock); + new->cl_broken = 0; rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); new->cl_metrics = rpc_alloc_iostats(clnt); if (new->cl_metrics == NULL) @@ -528,6 +563,9 @@ struct rpc_task *rpc_do_run_task(struct struct rpc_task *task, *ret; sigset_t oldset; + if (clnt->cl_broken) + return ERR_PTR(-EIO); + task = rpc_new_task(clnt, flags, ops, data); if (task == NULL) { rpc_release_calldata(ops, data); @@ -944,6 +982,7 @@ call_bind_status(struct rpc_task *task) if (task->tk_status >= 0) { dprint_status(task); + rpc_abort_clear(task); task->tk_status = 0; task->tk_action = call_connect; return; @@ -969,6 +1008,10 @@ call_bind_status(struct rpc_task *task) case -ETIMEDOUT: dprintk("RPC: %5u rpcbind request timed out\n", task->tk_pid); + if (rpc_abort_hard(task)) { + status = -EIO; + break; + } goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ @@ -1034,6 +1077,8 @@ call_connect_status(struct rpc_task *tas /* Something failed: remote service port may have changed */ rpc_force_rebind(clnt); + if (rpc_abort_hard(task)) + goto exit; switch (status) { case -ENOTCONN: @@ -1046,6 +1091,7 @@ call_connect_status(struct rpc_task *tas task->tk_action = call_timeout; return; } +exit: rpc_exit(task, -EIO); } @@ -1176,7 +1222,7 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); task->tk_timeouts++; - if (RPC_IS_SOFT(task)) { + if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) { printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); @@ -1217,7 +1263,7 @@ call_decode(struct rpc_task *task) } if (task->tk_status < 12) { - if (!RPC_IS_SOFT(task)) { + if (!RPC_IS_SOFT(task) && !rpc_abort_hard(task)) { task->tk_action = call_bind; clnt->cl_stats->rpcretrans++; goto out_retry; @@ -1228,6 +1274,7 @@ call_decode(struct rpc_task *task) goto out_retry; } + rpc_abort_clear(task); /* * Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_received. 
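
The sunrpc/clnt.c hunks above and below implement a "grand abort timeout": the first soft failure stamps clnt->cl_pr_time, later failures only turn into a hard abort once xprt_abort_timeout seconds have elapsed (rpc_abort_hard() then marks the client cl_broken and kills its outstanding tasks), a value of RPC_MAX_ABORT_TIMEOUT disables the mechanism, and any success resets the stamp via rpc_abort_clear(). The following is a minimal userspace sketch of that stamp/expire/clear pattern only; the struct and function names are illustrative and time() stands in for jiffies.

/* Stand-alone illustration of the abort-timeout pattern added above.
 * Not the kernel code: no task killing, no sysctl, just the timing logic. */
#include <stdio.h>
#include <time.h>

struct fake_client {
        time_t first_fail;      /* 0 means "no failure window open"     */
        int    broken;          /* set once the abort timeout expires   */
};

static int abort_timeout_secs = 5;      /* stands in for xprt_abort_timeout */

static int abort_hard(struct fake_client *c)
{
        time_t now = time(NULL);

        if (c->first_fail == 0) {
                c->first_fail = now;    /* remember the first failure */
                return 0;
        }
        if (now < c->first_fail + abort_timeout_secs)
                return 0;               /* still within the grace window */

        c->broken = 1;                  /* give up: mark the client dead */
        return -1;
}

static void abort_clear(struct fake_client *c)
{
        c->first_fail = 0;              /* any success resets the clock */
}

int main(void)
{
        struct fake_client c = { 0, 0 };

        printf("first failure: %d\n", abort_hard(&c));  /* 0: window opened */
        abort_clear(&c);                                /* success in between */
        printf("after success: %d\n", abort_hard(&c));  /* 0: window reopened */
        return 0;
}
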
@@ -1563,3 +1610,67 @@ out: spin_unlock(&rpc_client_lock); } #endif + +#ifdef CONFIG_VE +static int ve_sunrpc_start(void *data) +{ + return 0; +} + +void ve_sunrpc_stop(void *data) +{ + struct ve_struct *ve = (struct ve_struct *)data; + struct rpc_clnt *clnt; + struct rpc_task *rovr; + + dprintk("RPC: killing all tasks for VE %d\n", ve->veid); + + spin_lock(&rpc_client_lock); + list_for_each_entry(clnt, &all_clients, cl_clients) { + if (clnt->cl_xprt->owner_env != ve) + continue; + + spin_lock(&clnt->cl_lock); + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(rovr)) + continue; + printk(KERN_WARNING "RPC: Killing task %d client %p\n", + rovr->tk_pid, clnt); + + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + rpc_wake_up_task(rovr); + } + schedule_work(&clnt->cl_xprt->task_cleanup); + spin_unlock(&clnt->cl_lock); + } + spin_unlock(&rpc_client_lock); + + flush_scheduled_work(); +} + +static struct ve_hook sunrpc_hook = { + .init = ve_sunrpc_start, + .fini = ve_sunrpc_stop, + .owner = THIS_MODULE, + .priority = HOOK_PRIO_NET_PRE, +}; + +void ve_sunrpc_hook_register(void) +{ + ve_hook_register(VE_SS_CHAIN, &sunrpc_hook); +} + +void ve_sunrpc_hook_unregister(void) +{ + ve_hook_unregister(&sunrpc_hook); +} +#else +void ve_sunrpc_hook_register(void) +{ +} + +void ve_sunrpc_hook_unregister(void) +{ +} +#endif diff -uprN linux-2.6.24/net/sunrpc/rpc_pipe.c linux-2.6.24.ovz/net/sunrpc/rpc_pipe.c --- linux-2.6.24/net/sunrpc/rpc_pipe.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/rpc_pipe.c 2008-03-25 18:53:59.000000000 -0500 @@ -839,6 +839,7 @@ static struct file_system_type rpc_pipe_ .name = "rpc_pipefs", .get_sb = rpc_get_sb, .kill_sb = kill_litter_super, + .fs_flags = FS_VIRTUALIZED, }; static void diff -uprN linux-2.6.24/net/sunrpc/sched.c linux-2.6.24.ovz/net/sunrpc/sched.c --- linux-2.6.24/net/sunrpc/sched.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/sched.c 2008-03-25 18:53:59.000000000 -0500 @@ -631,7 +631,9 @@ void rpc_release_calldata(const struct r static void __rpc_execute(struct rpc_task *task) { int status = 0; + struct ve_struct *env; + env = set_exec_env(task->tk_client->cl_xprt->owner_env); dprintk("RPC: %5u __rpc_execute flags=0x%x\n", task->tk_pid, task->tk_flags); @@ -681,10 +683,14 @@ static void __rpc_execute(struct rpc_tas rpc_clear_running(task); if (RPC_IS_ASYNC(task)) { /* Careful! we may have raced... 
*/ - if (RPC_IS_QUEUED(task)) + if (RPC_IS_QUEUED(task)) { + (void)set_exec_env(env); return; - if (rpc_test_and_set_running(task)) + } + if (rpc_test_and_set_running(task)) { + (void)set_exec_env(env); return; + } continue; } @@ -714,6 +720,7 @@ static void __rpc_execute(struct rpc_tas task->tk_status); /* Release all resources associated with the task */ rpc_release_task(task); + (void)set_exec_env(env); } /* diff -uprN linux-2.6.24/net/sunrpc/sunrpc_syms.c linux-2.6.24.ovz/net/sunrpc/sunrpc_syms.c --- linux-2.6.24/net/sunrpc/sunrpc_syms.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/sunrpc_syms.c 2008-03-25 18:53:59.000000000 -0500 @@ -132,6 +132,9 @@ EXPORT_SYMBOL(nlm_debug); extern struct cache_detail ip_map_cache, unix_gid_cache; +extern void ve_sunrpc_hook_register(void); +extern void ve_sunrpc_hook_unregister(void); + static int __init init_sunrpc(void) { @@ -153,6 +156,7 @@ init_sunrpc(void) cache_register(&unix_gid_cache); init_socket_xprt(); rpcauth_init_module(); + ve_sunrpc_hook_register(); out: return err; } @@ -160,6 +164,7 @@ out: static void __exit cleanup_sunrpc(void) { + ve_sunrpc_hook_unregister(); rpcauth_remove_module(); cleanup_socket_xprt(); unregister_rpc_pipefs(); diff -uprN linux-2.6.24/net/sunrpc/svcsock.c linux-2.6.24.ovz/net/sunrpc/svcsock.c --- linux-2.6.24/net/sunrpc/svcsock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/svcsock.c 2008-03-25 18:53:59.000000000 -0500 @@ -508,6 +508,9 @@ svc_sendto(struct svc_rqst *rqstp, struc unsigned int pglen = xdr->page_len; unsigned int flags = MSG_MORE; char buf[RPC_MAX_ADDRBUFLEN]; + struct ve_struct *old_env; + + old_env = set_exec_env(sock->sk->owner_env); slen = xdr->len; @@ -568,6 +571,8 @@ out: rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); + (void)set_exec_env(old_env); + return len; } @@ -642,14 +647,18 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) { struct svc_sock *svsk = rqstp->rq_sock; + struct socket *sock = svsk->sk_sock; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; struct sockaddr *sin; int len; + struct ve_struct *old_env; + old_env = set_exec_env(sock->sk->owner_env); len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); + (void)set_exec_env(old_env); /* sock_recvmsg doesn't fill in the name/namelen, so we must.. */ @@ -1795,6 +1804,8 @@ svc_delete_socket(struct svc_sock *svsk) serv = svsk->sk_server; sk = svsk->sk_sk; + /* XXX: serialization? 
*/ + sk->sk_user_data = NULL; sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; diff -uprN linux-2.6.24/net/sunrpc/xprt.c linux-2.6.24.ovz/net/sunrpc/xprt.c --- linux-2.6.24/net/sunrpc/xprt.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/xprt.c 2008-03-25 18:53:59.000000000 -0500 @@ -567,10 +567,13 @@ static void xprt_autoclose(struct work_s { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); xprt_disconnect(xprt); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); + (void)set_exec_env(ve); } /** @@ -1017,6 +1020,7 @@ found: xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; + xprt->owner_env = get_exec_env(); rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); diff -uprN linux-2.6.24/net/sunrpc/xprtsock.c linux-2.6.24.ovz/net/sunrpc/xprtsock.c --- linux-2.6.24/net/sunrpc/xprtsock.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/sunrpc/xprtsock.c 2008-03-25 18:53:59.000000000 -0500 @@ -64,6 +64,8 @@ static unsigned int min_slot_table_size static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; +static int xprt_min_abort_timeout = RPC_MIN_ABORT_TIMEOUT; +static int xprt_max_abort_timeout = RPC_MAX_ABORT_TIMEOUT; static struct ctl_table_header *sunrpc_table_header; @@ -117,6 +119,16 @@ static ctl_table xs_tunables_table[] = { .extra2 = &xprt_max_resvport_limit }, { + .procname = "abort_timeout", + .data = &xprt_abort_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_abort_timeout, + .extra2 = &xprt_max_abort_timeout + }, + { .ctl_name = 0, }, }; @@ -735,18 +747,23 @@ out_release: static void xs_close(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct socket *sock = transport->sock; - struct sock *sk = transport->inet; - - if (!sk) - goto clear_close_wait; + struct socket *sock; + struct sock *sk; dprintk("RPC: xs_close xprt %p\n", xprt); - write_lock_bh(&sk->sk_callback_lock); + spin_lock_bh(&xprt->transport_lock); + if (transport->sock == NULL) { + spin_unlock_bh(&xprt->transport_lock); + goto clear_close_wait; + } + sock = transport->sock; + sk = transport->inet; transport->inet = NULL; transport->sock = NULL; + spin_unlock_bh(&xprt->transport_lock); + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; @@ -1415,7 +1432,12 @@ static void xs_udp_connect_worker4(struc struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1441,6 +1463,8 @@ static void xs_udp_connect_worker4(struc out: xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /** @@ -1456,7 +1480,12 @@ static void xs_udp_connect_worker6(struc struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = 
-EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1482,6 +1511,8 @@ static void xs_udp_connect_worker6(struc out: xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /* @@ -1560,7 +1591,12 @@ static void xs_tcp_connect_worker4(struc struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1621,7 +1657,12 @@ static void xs_tcp_connect_worker6(struc struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; int err, status = -EIO; + struct ve_struct *ve; + ve = set_exec_env(xprt->owner_env); + down_read(&xprt->owner_env->op_sem); + if (!xprt->owner_env->is_running) + goto out; if (xprt->shutdown || !xprt_bound(xprt)) goto out; @@ -1666,6 +1707,8 @@ out: xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); + up_read(&xprt->owner_env->op_sem); + (void)set_exec_env(ve); } /** diff -uprN linux-2.6.24/net/unix/af_unix.c linux-2.6.24.ovz/net/unix/af_unix.c --- linux-2.6.24/net/unix/af_unix.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/unix/af_unix.c 2008-03-25 18:53:59.000000000 -0500 @@ -117,6 +117,9 @@ #include #include +#include +#include + int sysctl_unix_max_dgram_qlen __read_mostly = 10; static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; @@ -270,7 +273,8 @@ static inline void unix_insert_socket(st spin_unlock(&unix_table_lock); } -static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, +static struct sock *__unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash) { struct sock *s; @@ -279,6 +283,9 @@ static struct sock *__unix_find_socket_b sk_for_each(s, node, &unix_socket_table[hash ^ type]) { struct unix_sock *u = unix_sk(s); + if (s->sk_net != net) + continue; + if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) goto found; @@ -288,21 +295,22 @@ found: return s; } -static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, +static inline struct sock *unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash) { struct sock *s; spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(sunname, len, type, hash); + s = __unix_find_socket_byname(net, sunname, len, type, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); return s; } -static struct sock *unix_find_socket_byinode(struct inode *i) +static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) { struct sock *s; struct hlist_node *node; @@ -312,6 +320,9 @@ static struct sock *unix_find_socket_byi &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; + if (s->sk_net != net) + continue; + if(dentry && dentry->d_inode == i) { sock_hold(s); @@ -606,6 +617,8 @@ static struct sock * unix_create1(struct sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); if (!sk) goto out; + if (ub_other_sock_charge(sk)) + goto out_sk_free; sock_init_data(sock,sk); lockdep_set_class(&sk->sk_receive_queue.lock, @@ -627,13 +640,13 @@ out: if (sk == NULL) 
atomic_dec(&unix_nr_socks); return sk; +out_sk_free: + sk_free(sk); + return NULL; } static int unix_create(struct net *net, struct socket *sock, int protocol) { - if (net != &init_net) - return -EAFNOSUPPORT; - if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -677,6 +690,7 @@ static int unix_release(struct socket *s static int unix_autobind(struct socket *sock) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk); static u32 ordernum = 1; struct unix_address * addr; @@ -703,7 +717,7 @@ retry: spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(addr->name, addr->len, sock->type, + if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... */ @@ -723,7 +737,8 @@ out: mutex_unlock(&u->readlock); return err; } -static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash, int *error) { struct sock *u; @@ -741,7 +756,7 @@ static struct sock *unix_find_other(stru err = -ECONNREFUSED; if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) goto put_fail; - u=unix_find_socket_byinode(nd.dentry->d_inode); + u=unix_find_socket_byinode(net, nd.dentry->d_inode); if (!u) goto put_fail; @@ -757,7 +772,7 @@ static struct sock *unix_find_other(stru } } else { err = -ECONNREFUSED; - u=unix_find_socket_byname(sunname, len, type, hash); + u=unix_find_socket_byname(net, sunname, len, type, hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->dentry; @@ -779,6 +794,7 @@ fail: static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct dentry * dentry = NULL; @@ -853,7 +869,7 @@ static int unix_bind(struct socket *sock if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; - if (__unix_find_socket_byname(sunaddr, addr_len, + if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { unix_release_addr(addr); goto out_unlock; @@ -919,6 +935,7 @@ static int unix_dgram_connect(struct soc int alen, int flags) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr; struct sock *other; unsigned hash; @@ -935,7 +952,7 @@ static int unix_dgram_connect(struct soc goto out; restart: - other=unix_find_other(sunaddr, alen, sock->type, hash, &err); + other=unix_find_other(net, sunaddr, alen, sock->type, hash, &err); if (!other) goto out; @@ -1015,6 +1032,7 @@ static int unix_stream_connect(struct so { struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct sock *newsk = NULL; struct sock *other = NULL; @@ -1023,6 +1041,7 @@ static int unix_stream_connect(struct so int st; int err; long timeo; + unsigned long chargesize; err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -1051,10 +1070,14 @@ static int unix_stream_connect(struct so skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; + chargesize = skb_charge_fullsize(skb); + if (ub_sock_getwres_other(newsk, chargesize) < 0) + goto out; + ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF); restart: /* Find listening sock. 
*/ - other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); if (!other) goto out; @@ -1299,7 +1322,7 @@ static void unix_detach_fds(struct scm_c unix_notinflight(scm->fp->fp[i]); } -static void unix_destruct_fds(struct sk_buff *skb) +void unix_destruct_fds(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); @@ -1310,6 +1333,7 @@ static void unix_destruct_fds(struct sk_ scm_destroy(&scm); sock_wfree(skb); } +EXPORT_SYMBOL_GPL(unix_destruct_fds); static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { @@ -1330,6 +1354,7 @@ static int unix_dgram_sendmsg(struct kio { struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr=msg->msg_name; struct sock *other = NULL; @@ -1393,7 +1418,7 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(sunaddr, namelen, sk->sk_type, + other = unix_find_other(net, sunaddr, namelen, sk->sk_type, hash, &err); if (other==NULL) goto out_free; @@ -1522,6 +1547,16 @@ static int unix_stream_sendmsg(struct ki size = len-sent; + if (msg->msg_flags & MSG_DONTWAIT) + ub_sock_makewres_other(sk, skb_charge_size(size)); + if (sock_bc(sk) != NULL && + sock_bc(sk)->poll_reserv >= + SOCK_MIN_UBCSPACE && + skb_charge_size(size) > + sock_bc(sk)->poll_reserv) + size = skb_charge_datalen(sock_bc(sk)->poll_reserv); + + /* Keep two messages in the pipe so it schedules better */ if (size > ((sk->sk_sndbuf >> 1) - 64)) size = (sk->sk_sndbuf >> 1) - 64; @@ -1533,7 +1568,9 @@ static int unix_stream_sendmsg(struct ki * Grab a buffer */ - skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); + + skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE, + msg->msg_flags&MSG_DONTWAIT, &err); if (skb==NULL) goto out_err; @@ -1973,6 +2010,7 @@ static unsigned int unix_poll(struct fil { struct sock *sk = sock->sk; unsigned int mask; + int no_ub_res; poll_wait(file, sk->sk_sleep, wait); mask = 0; @@ -1985,6 +2023,10 @@ static unsigned int unix_poll(struct fil if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLRDHUP; + no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); + if (no_ub_res) + ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); + /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) @@ -1998,7 +2040,7 @@ static unsigned int unix_poll(struct fil * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ - if (unix_writable(sk)) + if (!no_ub_res && unix_writable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; @@ -2006,12 +2048,18 @@ static unsigned int unix_poll(struct fil #ifdef CONFIG_PROC_FS -static struct sock *unix_seq_idx(int *iter, loff_t pos) +struct unix_iter_state { + struct net *net; + int i; +}; +static struct sock *unix_seq_idx(struct unix_iter_state *iter, loff_t pos) { loff_t off = 0; struct sock *s; - for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) { + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (s->sk_net != iter->net) + continue; if (off == pos) return s; ++off; @@ -2022,17 +2070,24 @@ static struct sock *unix_seq_idx(int *it static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { + struct unix_iter_state *iter = seq->private; spin_lock(&unix_table_lock); - return *pos ? 
unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); + return *pos ? unix_seq_idx(iter, *pos - 1) : ((void *) 1); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct unix_iter_state *iter = seq->private; + struct sock *sk = v; ++*pos; if (v == (void *)1) - return first_unix_socket(seq->private); - return next_unix_socket(seq->private, v); + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); + while (sk && (sk->sk_net != iter->net)) + sk = next_unix_socket(&iter->i, sk); + return sk; } static void unix_seq_stop(struct seq_file *seq, void *v) @@ -2094,7 +2149,27 @@ static const struct seq_operations unix_ static int unix_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &unix_seq_ops, sizeof(int)); + struct unix_iter_state *it; + + it = __seq_open_private(file, &unix_seq_ops, + sizeof(struct unix_iter_state)); + if (it == NULL) + return -ENOMEM; + + it->net = get_proc_net(inode); + if (it->net == NULL) { + seq_release_private(inode, file); + return -ENXIO; + } + return 0; +} + +static int unix_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct unix_iter_state *iter = seq->private; + put_net(iter->net); + return seq_release_private(inode, file); } static const struct file_operations unix_seq_fops = { @@ -2102,7 +2177,7 @@ static const struct file_operations unix .open = unix_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = unix_seq_release, }; #endif @@ -2113,6 +2188,30 @@ static struct net_proto_family unix_fami .owner = THIS_MODULE, }; + +static int unix_net_init(struct net *net) +{ + int error = -ENOMEM; + +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) + goto out; +#endif + error = 0; +out: + return 0; +} + +static void unix_net_exit(struct net *net) +{ + proc_net_remove(net, "unix"); +} + +static struct pernet_operations unix_net_ops = { + .init = unix_net_init, + .exit = unix_net_exit, +}; + static int __init af_unix_init(void) { int rc = -1; @@ -2128,9 +2227,7 @@ static int __init af_unix_init(void) } sock_register(&unix_family_ops); -#ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "unix", 0, &unix_seq_fops); -#endif + register_pernet_subsys(&unix_net_ops); unix_sysctl_register(); out: return rc; @@ -2140,8 +2237,8 @@ static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); unix_sysctl_unregister(); - proc_net_remove(&init_net, "unix"); proto_unregister(&unix_proto); + unregister_pernet_subsys(&unix_net_ops); } module_init(af_unix_init); diff -uprN linux-2.6.24/net/unix/garbage.c linux-2.6.24.ovz/net/unix/garbage.c --- linux-2.6.24/net/unix/garbage.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/unix/garbage.c 2008-03-25 18:53:59.000000000 -0500 @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -151,6 +152,7 @@ void unix_notinflight(struct file *fp) spin_unlock(&unix_gc_lock); } } +EXPORT_SYMBOL_GPL(unix_notinflight); static inline struct sk_buff *sock_queue_head(struct sock *sk) { diff -uprN linux-2.6.24/net/xfrm/xfrm_policy.c linux-2.6.24.ovz/net/xfrm/xfrm_policy.c --- linux-2.6.24/net/xfrm/xfrm_policy.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/xfrm/xfrm_policy.c 2008-03-25 18:53:59.000000000 -0500 @@ -1793,7 +1793,7 @@ static int stale_bundle(struct dst_entry void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = dst->child) && dst->xfrm && 
dst->dev == dev) { - dst->dev = init_net.loopback_dev; + dst->dev = dev->nd_net->loopback_dev; dev_hold(dst->dev); dev_put(dev); } @@ -2066,9 +2066,6 @@ static int xfrm_dev_event(struct notifie { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_DOWN: xfrm_flush_bundles(); diff -uprN linux-2.6.24/net/xfrm/xfrm_user.c linux-2.6.24.ovz/net/xfrm/xfrm_user.c --- linux-2.6.24/net/xfrm/xfrm_user.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/net/xfrm/xfrm_user.c 2008-03-25 18:53:59.000000000 -0500 @@ -1865,7 +1865,7 @@ static int xfrm_user_rcv_msg(struct sk_b link = &xfrm_dispatch[type]; /* All operations require privileges, even GET */ - if (security_netlink_recv(skb, CAP_NET_ADMIN)) + if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) return -EPERM; if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || diff -uprN linux-2.6.24/scripts/kconfig/Makefile linux-2.6.24.ovz/scripts/kconfig/Makefile --- linux-2.6.24/scripts/kconfig/Makefile 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/scripts/kconfig/Makefile 2008-03-25 18:53:59.000000000 -0500 @@ -44,6 +44,9 @@ update-po-config: $(obj)/kxgettext $(Q)rm -f arch/um/Kconfig.arch $(Q)rm -f $(obj)/config.pot +nonint_oldconfig: $(obj)/conf + $< -b $(Kconfig) + PHONY += randconfig allyesconfig allnoconfig allmodconfig defconfig randconfig: $(obj)/conf diff -uprN linux-2.6.24/scripts/kconfig/conf.c linux-2.6.24.ovz/scripts/kconfig/conf.c --- linux-2.6.24/scripts/kconfig/conf.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/scripts/kconfig/conf.c 2008-03-25 18:53:59.000000000 -0500 @@ -21,6 +21,7 @@ enum { ask_all, ask_new, ask_silent, + dont_ask, set_default, set_yes, set_mod, @@ -45,6 +46,8 @@ static const char *get_help(struct menu return nohelp_text; } +static int return_value = 0; + static void strip(char *str) { char *p = str; @@ -91,6 +94,12 @@ static int conf_askvalue(struct symbol * } switch (input_mode) { + case dont_ask: + if (!sym_has_value(sym)) { + fprintf(stderr,"CONFIG_%s\n",sym->name); + return_value++; + } + return 0; case set_no: case set_mod: case set_yes: @@ -350,6 +359,10 @@ static int conf_choice(struct menu *menu printf("?"); printf("]: "); switch (input_mode) { + case dont_ask: + cnt = def; + printf("%d\n", cnt); + break; case ask_new: case ask_silent: if (!is_new) { @@ -485,7 +498,10 @@ static void check_conf(struct menu *menu if (!conf_cnt++) printf(_("*\n* Restart config...\n*\n")); rootEntry = menu_get_parent_menu(menu); - conf(rootEntry); + if (input_mode == dont_ask) + fprintf(stderr,"CONFIG_%s\n",sym->name); + else + conf(rootEntry); } } @@ -504,6 +520,9 @@ int main(int ac, char **av) case 'o': input_mode = ask_new; break; + case 'b': + input_mode = dont_ask; + break; case 's': input_mode = ask_silent; valid_stdin = isatty(0) && isatty(1) && isatty(2); @@ -570,6 +589,7 @@ int main(int ac, char **av) } case ask_all: case ask_new: + case dont_ask: conf_read(NULL); break; case set_no: @@ -616,7 +636,7 @@ int main(int ac, char **av) do { conf_cnt = 0; check_conf(&rootmenu); - } while (conf_cnt); + } while ((conf_cnt) && (input_mode != dont_ask)); if (conf_write(NULL)) { fprintf(stderr, _("\n*** Error during writing of the kernel configuration.\n\n")); return 1; @@ -627,5 +647,5 @@ skip_check: return 1; } - return 0; + return return_value; } diff -uprN linux-2.6.24/scripts/mod/file2alias.c linux-2.6.24.ovz/scripts/mod/file2alias.c --- linux-2.6.24/scripts/mod/file2alias.c 2008-01-24 17:58:37.000000000 -0500 +++ 
linux-2.6.24.ovz/scripts/mod/file2alias.c 2008-03-25 18:53:59.000000000 -0500 @@ -155,7 +155,7 @@ static void do_usb_entry_multi(struct us * Some modules (visor) have empty slots as placeholder for * run-time specification that results in catch-all alias */ - if (!(id->idVendor | id->bDeviceClass | id->bInterfaceClass)) + if (!(id->idVendor | id->idProduct | id->bDeviceClass | id->bInterfaceClass)) return; /* Convert numeric bcdDevice range into fnmatch-able pattern(s) */ diff -uprN linux-2.6.24/security/Kconfig linux-2.6.24.ovz/security/Kconfig --- linux-2.6.24/security/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/security/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -4,6 +4,8 @@ menu "Security options" +source grsecurity/Kconfig + config KEYS bool "Enable access key retention support" help @@ -41,7 +43,7 @@ config KEYS_DEBUG_PROC_KEYS config SECURITY bool "Enable different security models" - depends on SYSFS + depends on SYSFS && !VE help This allows you to choose different security modules to be configured into your kernel. diff -uprN linux-2.6.24/security/commoncap.c linux-2.6.24.ovz/security/commoncap.c --- linux-2.6.24/security/commoncap.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/security/commoncap.c 2008-03-25 18:53:59.000000000 -0500 @@ -36,8 +36,10 @@ # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ +#ifndef CONFIG_VE kernel_cap_t cap_bset = CAP_INIT_BSET; /* systemwide capability bound */ EXPORT_SYMBOL(cap_bset); +#endif /* Global security state */ @@ -52,6 +54,10 @@ int cap_netlink_send(struct sock *sk, st int cap_netlink_recv(struct sk_buff *skb, int cap) { + if (likely(cap == CAP_VE_NET_ADMIN) && + cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) + return 0; + if (!cap_raised(NETLINK_CB(skb).eff_cap, cap)) return -EPERM; return 0; @@ -384,7 +390,7 @@ int cap_inode_setxattr(struct dentry *de return 0; } else if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -397,7 +403,7 @@ int cap_inode_removexattr(struct dentry return 0; } else if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) return -EPERM; return 0; } @@ -593,7 +599,7 @@ void cap_task_reparent_to_init (struct t int cap_syslog (int type) { - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN)) return -EPERM; return 0; } diff -uprN linux-2.6.24/security/selinux/Kconfig linux-2.6.24.ovz/security/selinux/Kconfig --- linux-2.6.24/security/selinux/Kconfig 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/security/selinux/Kconfig 2008-03-25 18:53:59.000000000 -0500 @@ -1,6 +1,6 @@ config SECURITY_SELINUX bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET + depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE select NETWORK_SECMARK default n help diff -uprN linux-2.6.24/security/selinux/hooks.c linux-2.6.24.ovz/security/selinux/hooks.c --- linux-2.6.24/security/selinux/hooks.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/security/selinux/hooks.c 2008-03-25 18:53:59.000000000 -0500 @@ -4665,12 +4665,12 @@ static int selinux_setprocattr(struct ta struct task_struct *g, *t; struct mm_struct *mm = p->mm; read_lock(&tasklist_lock); - do_each_thread(g, t) + do_each_thread_ve(g, t) if (t->mm 
== mm && t != p) { read_unlock(&tasklist_lock); return -EPERM; } - while_each_thread(g, t); + while_each_thread_ve(g, t); read_unlock(&tasklist_lock); } diff -uprN linux-2.6.24/security/selinux/ss/services.c linux-2.6.24.ovz/security/selinux/ss/services.c --- linux-2.6.24/security/selinux/ss/services.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/security/selinux/ss/services.c 2008-03-25 18:53:59.000000000 -0500 @@ -1744,6 +1744,9 @@ int security_genfs_sid(const char *fstyp struct ocontext *c; int rc = 0, cmp = 0; + while (path[0] == '/' && path[1] == '/') + path++; + POLICY_RDLOCK; for (genfs = policydb.genfs; genfs; genfs = genfs->next) { diff -uprN linux-2.6.24/sound/core/info.c linux-2.6.24.ovz/sound/core/info.c --- linux-2.6.24/sound/core/info.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/sound/core/info.c 2008-03-25 18:53:59.000000000 -0500 @@ -545,7 +545,7 @@ int __init snd_info_init(void) { struct proc_dir_entry *p; - p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, &proc_root); + p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL); if (p == NULL) return -ENOMEM; snd_proc_root = p; @@ -595,7 +595,7 @@ int __exit snd_info_done(void) #ifdef CONFIG_SND_OSSEMUL snd_info_free_entry(snd_oss_root); #endif - snd_remove_proc_entry(&proc_root, snd_proc_root); + snd_remove_proc_entry(NULL, snd_proc_root); } return 0; } diff -uprN linux-2.6.24/sound/oss/via82cxxx_audio.c linux-2.6.24.ovz/sound/oss/via82cxxx_audio.c --- linux-2.6.24/sound/oss/via82cxxx_audio.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/sound/oss/via82cxxx_audio.c 2008-03-25 18:54:00.000000000 -0500 @@ -2104,6 +2104,7 @@ static struct page * via_mm_nopage (stru { struct via_info *card = vma->vm_private_data; struct via_channel *chan = &card->ch_out; + unsigned long max_bufs; struct page *dmapage; unsigned long pgoff; int rd, wr; @@ -2127,14 +2128,11 @@ static struct page * via_mm_nopage (stru rd = card->ch_in.is_mapped; wr = card->ch_out.is_mapped; -#ifndef VIA_NDEBUG - { - unsigned long max_bufs = chan->frag_number; - if (rd && wr) max_bufs *= 2; - /* via_dsp_mmap() should ensure this */ - assert (pgoff < max_bufs); - } -#endif + max_bufs = chan->frag_number; + if (rd && wr) + max_bufs *= 2; + if (pgoff >= max_bufs) + return NOPAGE_SIGBUS; /* if full-duplex (read+write) and we have two sets of bufs, * then the playback buffers come first, sez soundcard.c */ diff -uprN linux-2.6.24/sound/usb/usx2y/usX2Yhwdep.c linux-2.6.24.ovz/sound/usb/usx2y/usX2Yhwdep.c --- linux-2.6.24/sound/usb/usx2y/usX2Yhwdep.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/sound/usb/usx2y/usX2Yhwdep.c 2008-03-25 18:54:00.000000000 -0500 @@ -88,7 +88,7 @@ static int snd_us428ctls_mmap(struct snd us428->us428ctls_sharedmem->CtlSnapShotLast = -2; } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; area->vm_private_data = hw->private_data; return 0; } diff -uprN linux-2.6.24/sound/usb/usx2y/usx2yhwdeppcm.c linux-2.6.24.ovz/sound/usb/usx2y/usx2yhwdeppcm.c --- linux-2.6.24/sound/usb/usx2y/usx2yhwdeppcm.c 2008-01-24 17:58:37.000000000 -0500 +++ linux-2.6.24.ovz/sound/usb/usx2y/usx2yhwdeppcm.c 2008-03-25 18:54:00.000000000 -0500 @@ -728,7 +728,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(stru return -ENODEV; } area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; area->vm_private_data = hw->private_data; return 0; }
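
A note on the recurring idiom in the sunrpc hunks above (xprt_autoclose, __rpc_execute, svc_sendto, svc_recvfrom and the xs_*_connect_worker functions): the transport records which VE owns it at setup time (xprt->owner_env = get_exec_env()) and every deferred or worker-thread path temporarily switches into that VE with set_exec_env(), restoring the previous context on all return paths. A minimal sketch of that idiom, not part of the patch, is below; my_object, my_object_setup, my_work_handler and the <linux/ve.h> include are illustrative assumptions, only get_exec_env()/set_exec_env() and struct ve_struct are taken from the patch itself.

	#include <linux/workqueue.h>
	#include <linux/sched.h>
	#include <linux/ve.h>	/* assumed header: struct ve_struct, get_exec_env(), set_exec_env() */

	struct my_object {
		struct ve_struct	*owner_env;	/* VE recorded at creation time */
		struct work_struct	work;
	};

	/* Deferred side: run the work in the owning VE, then restore. */
	static void my_work_handler(struct work_struct *work)
	{
		struct my_object *obj = container_of(work, struct my_object, work);
		struct ve_struct *old_env;

		old_env = set_exec_env(obj->owner_env);

		/* ... per-VE processing goes here ... */

		(void)set_exec_env(old_env);	/* restore on every exit path */
	}

	/* Creation side: remember which VE the creating process runs in. */
	static void my_object_setup(struct my_object *obj)
	{
		obj->owner_env = get_exec_env();
		INIT_WORK(&obj->work, my_work_handler);
	}

Restoring the previous environment on every exit path matters because the same worker thread goes on to service work items owned by other VEs, which is exactly why the __rpc_execute hunk restores it before each early return.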
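
The ve_sunrpc_start/ve_sunrpc_stop pair is attached to per-VE start/stop through the VE_SS_CHAIN hook list. A generic sketch of that registration pattern follows; my_ve_init, my_ve_fini, my_mod_init and the <linux/ve_proto.h> include are hypothetical names, while struct ve_hook, VE_SS_CHAIN, HOOK_PRIO_NET_PRE and the register/unregister calls are used as they appear in the patch.

	#include <linux/module.h>
	#include <linux/ve_proto.h>	/* assumed header: struct ve_hook, ve_hook_register() */

	static int my_ve_init(void *data)
	{
		struct ve_struct *ve = data;

		/* set up per-VE state for this subsystem; 0 means success */
		(void)ve;
		return 0;
	}

	static void my_ve_fini(void *data)
	{
		struct ve_struct *ve = data;

		/* release whatever is still held on behalf of this VE */
		(void)ve;
	}

	static struct ve_hook my_ve_hook = {
		.init		= my_ve_init,
		.fini		= my_ve_fini,
		.owner		= THIS_MODULE,
		.priority	= HOOK_PRIO_NET_PRE,
	};

	static int __init my_mod_init(void)
	{
		ve_hook_register(VE_SS_CHAIN, &my_ve_hook);
		return 0;
	}

	static void __exit my_mod_exit(void)
	{
		ve_hook_unregister(&my_ve_hook);
	}

	module_init(my_mod_init);
	module_exit(my_mod_exit);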
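
The af_unix hunks convert the global /proc/net/unix entry to the 2.6.24 per-network-namespace helpers. Distilled to the bare registration pattern it looks like the sketch below; the example_* names are placeholders and the seq_file operations are elided. One difference worth noting: unix_net_init above computes an -ENOMEM error value but returns 0 on both paths, whereas the sketch propagates the error from the .init method.

	#include <linux/module.h>
	#include <linux/proc_fs.h>
	#include <net/net_namespace.h>

	static const struct file_operations example_fops;	/* seq_file ops elided */

	static int example_net_init(struct net *net)
	{
		if (!proc_net_fops_create(net, "example", 0, &example_fops))
			return -ENOMEM;
		return 0;
	}

	static void example_net_exit(struct net *net)
	{
		proc_net_remove(net, "example");
	}

	static struct pernet_operations example_net_ops = {
		.init	= example_net_init,
		.exit	= example_net_exit,
	};

	static int __init example_init(void)
	{
		return register_pernet_subsys(&example_net_ops);
	}

	static void __exit example_exit(void)
	{
		unregister_pernet_subsys(&example_net_ops);
	}

	module_init(example_init);
	module_exit(example_exit);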
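
The unix_poll change gates POLLOUT on the socket's beancounter having room for at least a minimal send. Reduced to its core the pattern is sketched below; my_poll, the surrounding socket type and the <bc/sock.h> include are placeholders, while ub_sock_makewres_other(), ub_sock_sndqueueadd_other() and SOCK_MIN_UBCSPACE_CH are used with the same arguments as in the hunk above.

	#include <linux/poll.h>
	#include <net/sock.h>
	#include <bc/sock.h>	/* assumed header for the ub_sock_* helpers */

	static unsigned int my_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
	{
		struct sock *sk = sock->sk;
		unsigned int mask = 0;
		int no_ub_res;

		poll_wait(file, sk->sk_sleep, wait);

		/* Try to pre-reserve a minimal amount of send space in the owning
		 * beancounter; on failure, queue the socket for a wakeup when
		 * space becomes available and do not report it writable. */
		no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
		if (no_ub_res)
			ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);

		if (!no_ub_res && sock_writeable(sk))
			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

		return mask;
	}

The effect is that a container whose socket-buffer beancounter is exhausted sees the socket as not writable instead of being allowed to queue data the host would have to account against another VE.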